diff --git a/src/impl/sse.c b/src/impl/sse.c
index 550bdb0..45300f4 100644
--- a/src/impl/sse.c
+++ b/src/impl/sse.c
@@ -13,10 +13,11 @@ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *
   float *new_x;
   float *tmp_x;
   float max_diff, new_value;
-  int i, j;
+  int i, j, k;
   int nb = n + 2; // n plus the border
   int n_mult = (n / 4) * 4;
-  __m128 new_value_vec, tmp_vec;
+  __m128 new_value_vec, tmp_vec, diff_vec;
+  float diffs[] = {0.0, 0.0, 0.0, 0.0};
   /* Initialize boundary regions */
   x = create_sa_matrix(nb, nb);
   new_x = create_sa_matrix(nb, nb);
@@ -52,10 +53,14 @@ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *
         tmp_vec = _mm_set1_ps(0.25);
         new_value_vec = _mm_mul_ps(new_value_vec, tmp_vec);
         _mm_storeu_ps(&new_x[IDX(nb, i, j)], new_value_vec);
-        max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
-        max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 1)] - x[IDX(nb, i, j + 1)]));
-        max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 2)] - x[IDX(nb, i, j + 2)]));
-        max_diff = (float) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 3)] - x[IDX(nb, i, j + 3)]));
+
+        diff_vec = _mm_loadu_ps(&new_x[IDX(nb, i, j)]);
+        tmp_vec = _mm_loadu_ps(&x[IDX(nb, i, j)]);
+        diff_vec = _mm_sub_ps(diff_vec, tmp_vec);
+        _mm_storeu_ps(diffs, diff_vec);
+        for (k = 0; k < 4; k++) {
+          max_diff = (float) fmax(max_diff, fabs(diffs[k]));
+        }
       }
       for (j = n_mult; j <= n; j++) {
         new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);