diff --git a/Makefile b/Makefile index f6f1f76..655e1fd 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ SRC=src BUILD=build BIN=bin -all: sequential mpi_line mpi_line_async omp +all: sequential mpi_line mpi_line_async omp sse sequential: config utils main ${CC} ${CFLAGS} \ @@ -16,6 +16,14 @@ sequential: config utils main ${SRC}/impl/sequential.c \ -o ${BIN}/jacobi_sequential +sse: config utils main + ${CC_OMP} ${CFLAGS} \ + ${BUILD}/config.o \ + ${BUILD}/utils.o \ + ${BUILD}/main.o \ + ${SRC}/impl/sse.c \ + -o ${BIN}/jacobi_sse + omp: config utils main ${CC_OMP} ${CFLAGS} \ ${BUILD}/config.o \ diff --git a/src/impl/omp.c b/src/impl/omp.c index e098185..5888122 100644 --- a/src/impl/omp.c +++ b/src/impl/omp.c @@ -39,9 +39,9 @@ double *compute_jacobi(int n, double init_value, double threshold, borders b, in *iterations = 0; do { max_diff = 0; - #pragma omp parallel for schedule(static) \ + #pragma omp parallel for \ reduction (max:max_diff) \ - default(none) private(new_value, j) firstprivate(n, nb) shared(x, new_x) + default(none) private(new_value, j) shared(x, new_x, n, nb) for (i = 1; i <= n; i++) { for (j = 1; j <= n; j++) { new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]); diff --git a/src/impl/sse.c b/src/impl/sse.c new file mode 100644 index 0000000..b16ca8a --- /dev/null +++ b/src/impl/sse.c @@ -0,0 +1,70 @@ +/* + * SSE version. + */ + +#include +#include +#include +#include "../config.h" +#include "../utils.h" + +double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations) { + double *x; + double *new_x; + double *tmp_x; + double max_diff, new_value; + int i, j; + int nb = n + 2; // n plus the border + int n_mult = (n % 2 == 0) ? n : n - 1; + /* Initialize boundary regions */ + x = create_sa_matrix(nb, nb); + new_x = create_sa_matrix(nb, nb); + for (i = 0; i < nb; i++) { + x[IDX(nb, 0, i)] = b.north; + x[IDX(nb, n + 1, i)] = b.south; + x[IDX(nb, i, 0)] = b.west; + x[IDX(nb, i, n + 1)] = b.east; + new_x[IDX(nb, 0, i)] = b.north; + new_x[IDX(nb, n + 1, i)] = b.south; + new_x[IDX(nb, i, 0)] = b.west; + new_x[IDX(nb, i, n + 1)] = b.east; + } + /* Initialize the rest of the matrix */ + for (i = 1; i <= n; i++) { + for (j = 1; j <= n; j++) { + x[IDX(nb, i, j)] = init_value; + } + } + /* Iterative refinement of x until values converge */ + *iterations = 0; + do { + max_diff = 0; + for (i = 1; i <= n; i++) { + for (j = 1; j <= n_mult; j += 2) { + __m128d new_value_vec, tmp_vec; + new_value_vec = _mm_loadu_pd(&x[IDX(nb, i - 1, j)]); + tmp_vec = _mm_loadu_pd(&x[IDX(nb, i + 1, j)]); + new_value_vec = _mm_add_pd(new_value_vec, tmp_vec); + tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j - 1)]); + new_value_vec = _mm_add_pd(new_value_vec, tmp_vec); + tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j + 1)]); + new_value_vec = _mm_add_pd(new_value_vec, tmp_vec); + tmp_vec = _mm_set1_pd(0.25); + new_value_vec = _mm_mul_pd(new_value_vec, tmp_vec); + _mm_storeu_pd(&new_x[IDX(nb, i, j)], new_value_vec); + max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)])); + max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 1)] - x[IDX(nb, i, j + 1)])); + } + for (j = n_mult; j <= n; j++) { + new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]); + max_diff = (double) fmax(max_diff, fabs(new_value - x[IDX(nb, i, j)])); + new_x[IDX(nb, i, j)] = new_value; + } + } + tmp_x = new_x; + new_x = x; + x = tmp_x; + (*iterations)++; + } while (max_diff > threshold); + return x; +}