Added SSE Implementation

2016-12-13 18:13:22 +01:00 · 2016-12-13 18:13:22 +01:00 · 600e5eb149
commit 600e5eb149
parent ec073db31d
3 changed files with 81 additions and 3 deletions
--- a/10
+++ b/10
@ -6,7 +6,7 @@ SRC=src
 BUILD=build
 BIN=bin

-all: sequential mpi_line mpi_line_async omp
+all: sequential mpi_line mpi_line_async omp sse

 sequential: config utils main
 	${CC} ${CFLAGS} \
@ -16,6 +16,14 @@ sequential: config utils main
 		${SRC}/impl/sequential.c \
 	-o ${BIN}/jacobi_sequential

+sse: config utils main
+	${CC_OMP} ${CFLAGS} \
+		${BUILD}/config.o \
+		${BUILD}/utils.o \
+		${BUILD}/main.o \
+		${SRC}/impl/sse.c \
+	-o ${BIN}/jacobi_sse
+
 omp: config utils main
 	${CC_OMP} ${CFLAGS} \
 		${BUILD}/config.o \
--- a/src/impl/omp.c
+++ b/src/impl/omp.c
@ -39,9 +39,9 @@ double *compute_jacobi(int n, double init_value, double threshold, borders b, in
  *iterations = 0;
  do {
    max_diff = 0;
-    #pragma omp parallel for schedule(static) \
+    #pragma omp parallel for \
      reduction (max:max_diff) \
-      default(none) private(new_value, j) firstprivate(n, nb) shared(x, new_x)
+      default(none) private(new_value, j) shared(x, new_x, n, nb)
    for (i = 1; i <= n; i++) {
      for (j = 1; j <= n; j++) {
        new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
--- a/src/impl/sse.c
+++ b/src/impl/sse.c
@ -0,0 +1,70 @@
+/*
+ * SSE version.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <emmintrin.h>
+#include "../config.h"
+#include "../utils.h"
+
+double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations) {
+  double *x;
+  double *new_x;
+  double *tmp_x;
+  double max_diff, new_value;
+  int i, j;
+  int nb = n + 2; // n plus the border
+  int n_mult = (n % 2 == 0) ? n : n - 1;
+  /* Initialize boundary regions */
+  x = create_sa_matrix(nb, nb);
+  new_x = create_sa_matrix(nb, nb);
+  for (i = 0; i < nb; i++) {
+    x[IDX(nb, 0, i)] = b.north;
+    x[IDX(nb, n + 1, i)] = b.south;
+    x[IDX(nb, i, 0)] = b.west;
+    x[IDX(nb, i, n + 1)] = b.east;
+    new_x[IDX(nb, 0, i)] = b.north;
+    new_x[IDX(nb, n + 1, i)] = b.south;
+    new_x[IDX(nb, i, 0)] = b.west;
+    new_x[IDX(nb, i, n + 1)] = b.east;
+  }
+  /* Initialize the rest of the matrix */
+  for (i = 1; i <= n; i++) {
+    for (j = 1; j <= n; j++) {
+      x[IDX(nb, i, j)] = init_value;
+    }
+  }
+  /* Iterative refinement of x until values converge */
+  *iterations = 0;
+  do {
+    max_diff = 0;
+    for (i = 1; i <= n; i++) {
+      for (j = 1; j <= n_mult; j += 2) {
+        __m128d new_value_vec, tmp_vec;
+        new_value_vec = _mm_loadu_pd(&x[IDX(nb, i - 1, j)]);
+        tmp_vec = _mm_loadu_pd(&x[IDX(nb, i + 1, j)]);
+        new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
+        tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j - 1)]);
+        new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
+        tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j + 1)]);
+        new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
+        tmp_vec = _mm_set1_pd(0.25);
+        new_value_vec = _mm_mul_pd(new_value_vec, tmp_vec);
+        _mm_storeu_pd(&new_x[IDX(nb, i, j)], new_value_vec);
+        max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
+        max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 1)] - x[IDX(nb, i, j + 1)]));
+      }
+      for (j = n_mult; j <= n; j++) {
+        new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
+        max_diff = (double) fmax(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
+        new_x[IDX(nb, i, j)] = new_value;
+      }
+    }
+    tmp_x = new_x;
+    new_x = x;
+    x = tmp_x;
+    (*iterations)++;
+  } while (max_diff > threshold);
+  return x;
+}