Added SSE Implementation

2016-12-13 18:13:22 +01:00 · 2016-12-13 18:13:22 +01:00 · 600e5eb149
commit 600e5eb149
parent ec073db31d
3 changed files with 81 additions and 3 deletions
--- a/10
+++ b/10
@ -6,7 +6,7 @@ SRC=src
 BUILD=build
 BIN=bin
-all: sequential mpi_line mpi_line_async omp
+all: sequential mpi_line mpi_line_async omp sse
 sequential: config utils main
 	${CC} ${CFLAGS} \
@ -16,6 +16,14 @@ sequential: config utils main
 		${SRC}/impl/sequential.c \
 	-o ${BIN}/jacobi_sequential
 # Build the SSE variant of the Jacobi solver (src/impl/sse.c) into ${BIN}/jacobi_sse.
 # NOTE(review): this rule is built with ${CC_OMP}, same as the omp target, although
 # src/impl/sse.c shows no OpenMP pragmas - confirm whether ${CC} would be enough.
 sse: config utils main
 	${CC_OMP} ${CFLAGS} \
 		${BUILD}/config.o \
 		${BUILD}/utils.o \
 		${BUILD}/main.o \
 		${SRC}/impl/sse.c \
 	-o ${BIN}/jacobi_sse
 omp: config utils main
 	${CC_OMP} ${CFLAGS} \
 		${BUILD}/config.o \
--- a/src/impl/omp.c
+++ b/src/impl/omp.c
@ -39,9 +39,9 @@ double *compute_jacobi(int n, double init_value, double threshold, borders b, in
  *iterations = 0;
  do {
    max_diff = 0;
-    #pragma omp parallel for schedule(static) \
+    #pragma omp parallel for \
      reduction (max:max_diff) \
-      default(none) private(new_value, j) firstprivate(n, nb) shared(x, new_x)
+      default(none) private(new_value, j) shared(x, new_x, n, nb)
    for (i = 1; i <= n; i++) {
      for (j = 1; j <= n; j++) {
        new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
--- a/src/impl/sse.c
+++ b/src/impl/sse.c
@ -0,0 +1,70 @@
 /*
 * SSE version.
 */
 #include <stdio.h>
 #include <math.h>
 #include <emmintrin.h>
 #include "../config.h"
 #include "../utils.h"
 /*
 * Jacobi iteration on an n x n grid, two columns at a time with SSE2.
 *
 * Two (n + 2) x (n + 2) matrices are obtained from create_sa_matrix(); their
 * outer ring is filled from b.north / b.south / b.west / b.east and the
 * interior of the first one is set to init_value. Each sweep writes, for
 * every interior cell, a quarter of the sum of its four neighbours into the
 * other matrix, then the two matrices swap roles. Sweeping stops once the
 * largest absolute change of a sweep is no longer greater than threshold.
 *
 * n            number of interior rows/columns
 * init_value   starting value of every interior cell
 * threshold    convergence bound on the largest per-sweep change
 * b            border values (north, south, west, east)
 * iterations   out: number of sweeps performed (always at least 1)
 *
 * Returns the matrix holding the values of the last sweep.
 */
 double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations) {
  double *x;
  double *new_x;
  double *tmp_x;
  double max_diff, new_value;
  double diff[2];
  int i, j;
  int nb = n + 2; // n plus the border
  int n_mult = (n % 2 == 0) ? n : n - 1; // columns 1..n_mult are processed in pairs
  const __m128d quarter = _mm_set1_pd(0.25); // loop invariant, built once

  /* Initialize boundary regions */
  x = create_sa_matrix(nb, nb);
  new_x = create_sa_matrix(nb, nb);

  for (i = 0; i < nb; i++) {
    x[IDX(nb, 0, i)] = b.north;
    x[IDX(nb, n + 1, i)] = b.south;
    x[IDX(nb, i, 0)] = b.west;
    x[IDX(nb, i, n + 1)] = b.east;

    new_x[IDX(nb, 0, i)] = b.north;
    new_x[IDX(nb, n + 1, i)] = b.south;
    new_x[IDX(nb, i, 0)] = b.west;
    new_x[IDX(nb, i, n + 1)] = b.east;
  }

  /* Initialize the rest of the matrix. The interior of new_x needs no
   * initial value: every interior cell is written during the first sweep. */
  for (i = 1; i <= n; i++) {
    for (j = 1; j <= n; j++) {
      x[IDX(nb, i, j)] = init_value;
    }
  }

  /* Iterative refinement of x until values converge */
  *iterations = 0;
  do {
    max_diff = 0;

    for (i = 1; i <= n; i++) {
      /* Vector part: cells (i, j) and (i, j + 1) in one step. Unaligned
       * loads/stores are used because j - 1, j and j + 1 cannot all be
       * 16-byte aligned. */
      for (j = 1; j <= n_mult; j += 2) {
        __m128d sum;

        sum = _mm_add_pd(_mm_loadu_pd(&x[IDX(nb, i - 1, j)]),
                         _mm_loadu_pd(&x[IDX(nb, i + 1, j)]));
        sum = _mm_add_pd(sum, _mm_loadu_pd(&x[IDX(nb, i, j - 1)]));
        sum = _mm_add_pd(sum, _mm_loadu_pd(&x[IDX(nb, i, j + 1)]));
        sum = _mm_mul_pd(sum, quarter);
        _mm_storeu_pd(&new_x[IDX(nb, i, j)], sum);

        /* Per-lane change, taken from the registers instead of reading the
         * freshly stored values back from new_x. */
        _mm_storeu_pd(diff, _mm_sub_pd(sum, _mm_loadu_pd(&x[IDX(nb, i, j)])));
        max_diff = fmax(max_diff, fabs(diff[0]));
        max_diff = fmax(max_diff, fabs(diff[1]));
      }

      /* Scalar tail: only the column left over when n is odd. It starts at
       * n_mult + 1 because column n_mult was already produced by the vector
       * loop; starting at n_mult would recompute that cell. */
      for (j = n_mult + 1; j <= n; j++) {
        new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
        max_diff = fmax(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
        new_x[IDX(nb, i, j)] = new_value;
      }
    }

    /* The freshly written matrix becomes the input of the next sweep. */
    tmp_x = new_x;
    new_x = x;
    x = tmp_x;

    (*iterations)++;
  } while (max_diff > threshold);

  /* NOTE(review): the scratch matrix still referenced by new_x is not
   * released here; it should be freed with whatever deallocator matches
   * create_sa_matrix() - confirm against utils. */
  return x;
 }