diff --git a/src/impl/sse.c b/src/impl/sse.c index 550bdb0..45300f4 100644 --- a/src/impl/sse.c +++ b/src/impl/sse.c @@ -13,10 +13,11 @@ float *compute_jacobi(int n, float init_value, float threshold, borders b, int * float *new_x; float *tmp_x; float max_diff, new_value; - int i, j; + int i, j, k; int nb = n + 2; // n plus the border int n_mult = (n / 4) * 4; - __m128 new_value_vec, tmp_vec; + __m128 new_value_vec, tmp_vec, diff_vec; + float diffs[] = {0.0, 0.0, 0.0, 0.0}; /* Initialize boundary regions */ x = create_sa_matrix(nb, nb); new_x = create_sa_matrix(nb, nb); @@ -52,10 +53,14 @@ float *compute_jacobi(int n, float init_value, float threshold, borders b, int * tmp_vec = _mm_set1_ps(0.25); new_value_vec = _mm_mul_ps(new_value_vec, tmp_vec); _mm_storeu_ps(&new_x[IDX(nb, i, j)], new_value_vec); - max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)])); - max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 1)] - x[IDX(nb, i, j + 1)])); - max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 2)] - x[IDX(nb, i, j + 2)])); - max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 3)] - x[IDX(nb, i, j + 3)])); + + diff_vec = _mm_loadu_ps(&new_x[IDX(nb, i, j)]); + tmp_vec = _mm_loadu_ps(&x[IDX(nb, i, j)]); + diff_vec = _mm_sub_ps(diff_vec, tmp_vec); + _mm_storeu_ps(diffs, diff_vec); + for (k = 0; k < 4; k++) { + max_diff = (float) fmax(max_diff, fabs(diffs[k])); + } } for (j = n_mult; j <= n; j++) { new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);