Added SSE Implementation

This commit is contained in:
Fabio Salvini 2016-12-13 18:13:22 +01:00
parent ec073db31d
commit 600e5eb149
3 changed files with 81 additions and 3 deletions

View File

@ -6,7 +6,7 @@ SRC=src
BUILD=build BUILD=build
BIN=bin BIN=bin
all: sequential mpi_line mpi_line_async omp all: sequential mpi_line mpi_line_async omp sse
sequential: config utils main sequential: config utils main
${CC} ${CFLAGS} \ ${CC} ${CFLAGS} \
@ -16,6 +16,14 @@ sequential: config utils main
${SRC}/impl/sequential.c \ ${SRC}/impl/sequential.c \
-o ${BIN}/jacobi_sequential -o ${BIN}/jacobi_sequential
sse: config utils main
${CC_OMP} ${CFLAGS} \
${BUILD}/config.o \
${BUILD}/utils.o \
${BUILD}/main.o \
${SRC}/impl/sse.c \
-o ${BIN}/jacobi_sse
omp: config utils main omp: config utils main
${CC_OMP} ${CFLAGS} \ ${CC_OMP} ${CFLAGS} \
${BUILD}/config.o \ ${BUILD}/config.o \

View File

@ -39,9 +39,9 @@ double *compute_jacobi(int n, double init_value, double threshold, borders b, in
*iterations = 0; *iterations = 0;
do { do {
max_diff = 0; max_diff = 0;
#pragma omp parallel for schedule(static) \ #pragma omp parallel for \
reduction (max:max_diff) \ reduction (max:max_diff) \
default(none) private(new_value, j) firstprivate(n, nb) shared(x, new_x) default(none) private(new_value, j) shared(x, new_x, n, nb)
for (i = 1; i <= n; i++) { for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++) { for (j = 1; j <= n; j++) {
new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]); new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);

70
src/impl/sse.c Normal file
View File

@ -0,0 +1,70 @@
/*
* SSE version.
*/
#include <stdio.h>
#include <math.h>
#include <emmintrin.h>
#include "../config.h"
#include "../utils.h"
double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations) {
double *x;
double *new_x;
double *tmp_x;
double max_diff, new_value;
int i, j;
int nb = n + 2; // n plus the border
int n_mult = (n % 2 == 0) ? n : n - 1;
/* Initialize boundary regions */
x = create_sa_matrix(nb, nb);
new_x = create_sa_matrix(nb, nb);
for (i = 0; i < nb; i++) {
x[IDX(nb, 0, i)] = b.north;
x[IDX(nb, n + 1, i)] = b.south;
x[IDX(nb, i, 0)] = b.west;
x[IDX(nb, i, n + 1)] = b.east;
new_x[IDX(nb, 0, i)] = b.north;
new_x[IDX(nb, n + 1, i)] = b.south;
new_x[IDX(nb, i, 0)] = b.west;
new_x[IDX(nb, i, n + 1)] = b.east;
}
/* Initialize the rest of the matrix */
for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++) {
x[IDX(nb, i, j)] = init_value;
}
}
/* Iterative refinement of x until values converge */
*iterations = 0;
do {
max_diff = 0;
for (i = 1; i <= n; i++) {
for (j = 1; j <= n_mult; j += 2) {
__m128d new_value_vec, tmp_vec;
new_value_vec = _mm_loadu_pd(&x[IDX(nb, i - 1, j)]);
tmp_vec = _mm_loadu_pd(&x[IDX(nb, i + 1, j)]);
new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j - 1)]);
new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
tmp_vec = _mm_loadu_pd(&x[IDX(nb, i, j + 1)]);
new_value_vec = _mm_add_pd(new_value_vec, tmp_vec);
tmp_vec = _mm_set1_pd(0.25);
new_value_vec = _mm_mul_pd(new_value_vec, tmp_vec);
_mm_storeu_pd(&new_x[IDX(nb, i, j)], new_value_vec);
max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
max_diff = (double) fmax(max_diff, fabs(new_x[IDX(nb, i, j + 1)] - x[IDX(nb, i, j + 1)]));
}
for (j = n_mult; j <= n; j++) {
new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
max_diff = (double) fmax(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
new_x[IDX(nb, i, j)] = new_value;
}
}
tmp_x = new_x;
new_x = x;
x = tmp_x;
(*iterations)++;
} while (max_diff > threshold);
return x;
}