Last commit: a3be6d5298 (parent 7f60341812)
Makefile (45 lines changed)
@@ -1,12 +1,14 @@
 CC=gcc
 CC_OMP=gcc -fopenmp
 CC_MPI=mpicc
+CC_CUDA=nvcc
 CFLAGS=-Wall -lm -std=c99
 SRC=src
 BUILD=build
+BUILD_CUDA=build_cuda
 BIN=bin
 
-all: sequential mpi_line mpi_line_async omp sse
+all: sequential mpi omp sse cuda
 
 sequential: config utils main
 	${CC} ${CFLAGS} \
@@ -16,8 +18,16 @@ sequential: config utils main
 		${SRC}/impl/sequential.c \
 		-o ${BIN}/jacobi_sequential
 
+cuda: config_cuda utils_cuda main_cuda
+	${CC_CUDA} \
+		${BUILD_CUDA}/config.o \
+		${BUILD_CUDA}/utils.o \
+		${BUILD_CUDA}/main.o \
+		${SRC}/impl/cuda.cu \
+		-o ${BIN}/jacobi_cuda
+
 sse: config utils main
-	${CC_OMP} ${CFLAGS} \
+	${CC} ${CFLAGS} \
 		${BUILD}/config.o \
 		${BUILD}/utils.o \
 		${BUILD}/main.o \
@@ -32,21 +42,13 @@ omp: config utils main
 		${SRC}/impl/omp.c \
 		-o ${BIN}/jacobi_omp
 
-mpi_line: config utils main_mpi
+mpi: config utils main_mpi
 	${CC_MPI} ${CFLAGS} \
 		${BUILD}/config.o \
 		${BUILD}/utils.o \
 		${BUILD}/main_mpi.o \
-		${SRC}/impl/mpi_line.c \
-		-o ${BIN}/jacobi_mpi_line
-
-mpi_line_async: config utils main_mpi
-	${CC_MPI} ${CFLAGS} \
-		${BUILD}/config.o \
-		${BUILD}/utils.o \
-		${BUILD}/main_mpi.o \
-		${SRC}/impl/mpi_line_async.c \
-		-o ${BIN}/jacobi_mpi_line_async
+		${SRC}/impl/mpi.c \
+		-o ${BIN}/jacobi_mpi
 
 main: ${SRC}/main/main.c
 	${CC_MPI} -c ${CFLAGS} \
@@ -58,16 +60,31 @@ main_mpi: ${SRC}/main/main_mpi.c
 		${SRC}/main/main_mpi.c \
 		-o ${BUILD}/main_mpi.o
 
+main_cuda: ${SRC}/main/main.cu
+	${CC_CUDA} -c \
+		${SRC}/main/main.cu \
+		-o ${BUILD_CUDA}/main.o
+
 config: ${SRC}/config.c
 	${CC_MPI} -c ${CFLAGS} \
 		${SRC}/config.c \
 		-o ${BUILD}/config.o
 
+config_cuda: ${SRC}/config.cu
+	${CC_CUDA} -c \
+		${SRC}/config.cu \
+		-o ${BUILD_CUDA}/config.o
+
 utils: ${SRC}/utils.c
 	${CC_MPI} -c ${CFLAGS} \
 		${SRC}/utils.c \
 		-o ${BUILD}/utils.o
 
+utils_cuda: ${SRC}/utils.cu
+	${CC_CUDA} -c \
+		${SRC}/utils.cu \
+		-o ${BUILD_CUDA}/utils.o
+
 .PHONY: clean
 clean:
-	rm build/[^.]* bin/[^.]*
+	rm build/[^.]* build_cuda/[^.]* bin/[^.]*
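Note: the new CUDA rules invoke ${CC_CUDA} without ${CFLAGS}, presumably because nvcc's C++ front end does not accept C-only flags such as -std=c99; the cuda target also compiles ${SRC}/impl/cuda.cu directly in the link step rather than producing an object under build_cuda. Object files for the CUDA build land in the new build_cuda directory, which the updated clean rule now empties as well.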
benchmarks/b1/cuda.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+../../bin/jacobi_cuda > results/stdout_cuda.txt

benchmarks/b1/jacobi.conf (new file, 16 lines)
@@ -0,0 +1,16 @@
+# Configuration file for the Jacobi project.
+
+# The size of the matrix (borders excluded).
+N 1000
+
+# The value at each border.
+NORTH 0.0
+EAST 0.0
+SOUTH 300.0
+WEST 0.0
+
+# The initial value to assign to each internal cell.
+INIT_VALUE 0.0
+
+# The threshold that determines convergence.
+THRESHOLD 1.0
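THRESHOLD sets the stopping rule shared by all implementations in this commit: the solver keeps sweeping until no cell changes by more than the threshold in one iteration. A minimal C sketch of that rule, borrowing the names (fmaxf, IDX, nb) used in the sources below; a hedged illustration, not the committed code:

    /* Sketch of the convergence test implied by THRESHOLD: keep sweeping
     * until the largest per-cell change is no bigger than the threshold. */
    float max_diff;
    int iterations = 0;
    do {
        max_diff = 0;
        /* ... one Jacobi sweep writing new_x from x ... */
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= n; j++) {
                max_diff = fmaxf(max_diff, fabsf(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
            }
        }
        iterations++;
    } while (max_diff > threshold);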
benchmarks/b1/mpi_1_1.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_1.txt
+#SBATCH -e results/stderr_mpi_1_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b1/mpi_1_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_10.txt
+#SBATCH -e results/stderr_mpi_1_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b1/mpi_2_5.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_2_5.txt
+#SBATCH -e results/stderr_mpi_2_5.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 2
+#SBATCH --tasks-per-node=5
+#SBATCH --nodelist=c6,c7
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b1/omp_1.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_1.txt
+#SBATCH -e results/stderr_omp_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=1
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b1/omp_10.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_10.txt
+#SBATCH -e results/stderr_omp_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=10
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b1/sequential.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sequential
+#SBATCH -o results/stdout_sequential.txt
+#SBATCH -e results/stderr_sequential.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sequential

benchmarks/b1/sse.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sse
+#SBATCH -o results/stdout_sse.txt
+#SBATCH -e results/stderr_sse.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sse
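Note on running the benchmarks: load_config (src/config.cu below) opens jacobi.conf relative to the working directory, and cuda.sh calls ../../bin/jacobi_cuda, so each script is meant to be launched from its own benchmarks/bN directory — the SLURM scripts via sbatch (e.g. sbatch sequential.sh), cuda.sh directly with sh. The b2–b5 directories that follow repeat the same scripts, varying only the matrix size N.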
benchmarks/b2/cuda.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+../../bin/jacobi_cuda > results/stdout_cuda.txt

benchmarks/b2/jacobi.conf (new file, 16 lines)
@@ -0,0 +1,16 @@
+# Configuration file for the Jacobi project.
+
+# The size of the matrix (borders excluded).
+N 2000
+
+# The value at each border.
+NORTH 0.0
+EAST 0.0
+SOUTH 300.0
+WEST 0.0
+
+# The initial value to assign to each internal cell.
+INIT_VALUE 0.0
+
+# The threshold that determines convergence.
+THRESHOLD 1.0

benchmarks/b2/mpi_1_1.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_1.txt
+#SBATCH -e results/stderr_mpi_1_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b2/mpi_1_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_10.txt
+#SBATCH -e results/stderr_mpi_1_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b2/mpi_2_5.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_2_5.txt
+#SBATCH -e results/stderr_mpi_2_5.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 2
+#SBATCH --tasks-per-node=5
+#SBATCH --nodelist=c6,c7
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b2/omp_1.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_1.txt
+#SBATCH -e results/stderr_omp_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=1
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b2/omp_10.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_10.txt
+#SBATCH -e results/stderr_omp_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=10
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b2/sequential.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sequential
+#SBATCH -o results/stdout_sequential.txt
+#SBATCH -e results/stderr_sequential.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sequential

benchmarks/b2/sse.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sse
+#SBATCH -o results/stdout_sse.txt
+#SBATCH -e results/stderr_sse.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sse
benchmarks/b3/cuda.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+../../bin/jacobi_cuda > results/stdout_cuda.txt

benchmarks/b3/jacobi.conf (new file, 16 lines)
@@ -0,0 +1,16 @@
+# Configuration file for the Jacobi project.
+
+# The size of the matrix (borders excluded).
+N 5000
+
+# The value at each border.
+NORTH 0.0
+EAST 0.0
+SOUTH 300.0
+WEST 0.0
+
+# The initial value to assign to each internal cell.
+INIT_VALUE 0.0
+
+# The threshold that determines convergence.
+THRESHOLD 1.0

benchmarks/b3/mpi_1_1.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_1.txt
+#SBATCH -e results/stderr_mpi_1_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b3/mpi_1_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_10.txt
+#SBATCH -e results/stderr_mpi_1_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b3/mpi_2_5.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_2_5.txt
+#SBATCH -e results/stderr_mpi_2_5.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 2
+#SBATCH --tasks-per-node=5
+#SBATCH --nodelist=c6,c7
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b3/omp_1.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_1.txt
+#SBATCH -e results/stderr_omp_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=1
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b3/omp_10.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_10.txt
+#SBATCH -e results/stderr_omp_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=10
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b3/sequential.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sequential
+#SBATCH -o results/stdout_sequential.txt
+#SBATCH -e results/stderr_sequential.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sequential

benchmarks/b3/sse.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sse
+#SBATCH -o results/stdout_sse.txt
+#SBATCH -e results/stderr_sse.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sse
benchmarks/b4/cuda.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+../../bin/jacobi_cuda > results/stdout_cuda.txt

benchmarks/b4/jacobi.conf (new file, 16 lines)
@@ -0,0 +1,16 @@
+# Configuration file for the Jacobi project.
+
+# The size of the matrix (borders excluded).
+N 10000
+
+# The value at each border.
+NORTH 0.0
+EAST 0.0
+SOUTH 300.0
+WEST 0.0
+
+# The initial value to assign to each internal cell.
+INIT_VALUE 0.0
+
+# The threshold that determines convergence.
+THRESHOLD 1.0

benchmarks/b4/mpi_1_1.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_1.txt
+#SBATCH -e results/stderr_mpi_1_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b4/mpi_1_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_10.txt
+#SBATCH -e results/stderr_mpi_1_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b4/mpi_5_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_5_10.txt
+#SBATCH -e results/stderr_mpi_5_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 5
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c2,c3,c4,c6,c7
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b4/omp_1.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_1.txt
+#SBATCH -e results/stderr_omp_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=1
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b4/omp_10.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_10.txt
+#SBATCH -e results/stderr_omp_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=10
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b4/sequential.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sequential
+#SBATCH -o results/stdout_sequential.txt
+#SBATCH -e results/stderr_sequential.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sequential

benchmarks/b4/sse.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sse
+#SBATCH -o results/stdout_sse.txt
+#SBATCH -e results/stderr_sse.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sse
benchmarks/b5/cuda.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+../../bin/jacobi_cuda > results/stdout_cuda.txt

benchmarks/b5/jacobi.conf (new file, 16 lines)
@@ -0,0 +1,16 @@
+# Configuration file for the Jacobi project.
+
+# The size of the matrix (borders excluded).
+N 20000
+
+# The value at each border.
+NORTH 0.0
+EAST 0.0
+SOUTH 300.0
+WEST 0.0
+
+# The initial value to assign to each internal cell.
+INIT_VALUE 0.0
+
+# The threshold that determines convergence.
+THRESHOLD 1.0

benchmarks/b5/mpi_1_1.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_1.txt
+#SBATCH -e results/stderr_mpi_1_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b5/mpi_1_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_1_10.txt
+#SBATCH -e results/stderr_mpi_1_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c6
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b5/mpi_5_10.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J mpi
+#SBATCH -o results/stdout_mpi_5_10.txt
+#SBATCH -e results/stderr_mpi_5_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 5
+#SBATCH --tasks-per-node=10
+#SBATCH --nodelist=c2,c3,c4,c6,c7
+
+mpirun ~/JacobiHPC/bin/jacobi_mpi

benchmarks/b5/omp_1.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_1.txt
+#SBATCH -e results/stderr_omp_1.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=1
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b5/omp_10.sh (new file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH -J omp
+#SBATCH -o results/stdout_omp_10.txt
+#SBATCH -e results/stderr_omp_10.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+export OMP_NUM_THREADS=10
+
+~/JacobiHPC/bin/jacobi_omp

benchmarks/b5/sequential.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sequential
+#SBATCH -o results/stdout_sequential.txt
+#SBATCH -e results/stderr_sequential.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sequential

benchmarks/b5/sse.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
+#!/bin/sh
+#SBATCH -J sse
+#SBATCH -o results/stdout_sse.txt
+#SBATCH -e results/stderr_sse.txt
+#SBATCH -t 00:30:00
+#SBATCH -N 1
+#SBATCH --tasks-per-node=1
+#SBATCH --nodelist=c6
+
+~/JacobiHPC/bin/jacobi_sse
build_cuda/.gitignore (new file, vendored, 4 lines)
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
src/config.c
@@ -11,6 +11,7 @@ typedef struct configuration {
 	float threshold;
 } configuration;
 
+
 int load_config(configuration *config) {
 	char property[100];
 	char *value;
src/config.cu (new file, 79 lines)
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+typedef struct configuration {
+	int n;
+	float north;
+	float east;
+	float south;
+	float west;
+	float init_value;
+	float threshold;
+} configuration;
+
+__host__ int old_load_config(configuration *config) {
+	config->n = 5;
+	config->north = 0.0;
+	config->east = 0.0;
+	config->west = 0.0;
+	config->south = 300.0;
+	config->init_value = 0.0;
+	config->threshold = 1.0;
+	return 0;
+}
+
+__host__ int load_config(configuration *config) {
+	char property[100];
+	char *value;
+	FILE *fp;
+
+	fp = fopen("jacobi.conf", "r");
+	if (fp == NULL) {
+		perror("Error opening file jacobi.conf");
+		return 1;
+	}
+	while (fgets(property, 100, fp) != NULL) {
+		if (property[0] == '\n' || property[0] == '#') {
+			/* Skip empty lines and comments */
+			continue;
+		}
+		value = strchr(property, ' ');
+		if (value == NULL) {
+			fclose(fp);
+			perror("Error reading file jacobi.conf");
+			return 1;
+		}
+		value[0] = '\0';
+		value += sizeof(char);
+		value[strlen(value) - 1] = '\0';
+		if (strcmp(property, "N") == 0) {
+			sscanf(value, "%d", &(config->n));
+		}
+		else if (strcmp(property, "NORTH") == 0) {
+			sscanf(value, "%f", &(config->north));
+		}
+		else if (strcmp(property, "EAST") == 0) {
+			sscanf(value, "%f", &(config->east));
+		}
+		else if (strcmp(property, "SOUTH") == 0) {
+			sscanf(value, "%f", &(config->south));
+		}
+		else if (strcmp(property, "WEST") == 0) {
+			sscanf(value, "%f", &(config->west));
+		}
+		else if (strcmp(property, "INIT_VALUE") == 0) {
+			sscanf(value, "%f", &(config->init_value));
+		}
+		else if (strcmp(property, "THRESHOLD") == 0) {
+			sscanf(value, "%f", &(config->threshold));
+		}
+		else {
+			printf("Unknown property %s\n", property);
+		}
+	}
+	fclose(fp);
+	return 0;
+}
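A minimal host-side sketch of how load_config is consumed (this mirrors what src/main/main.cu below does; compiling with nvcc and linking against build_cuda/config.o is assumed):

    #include <stdio.h>
    #include "config.cuh"

    int main(void) {
        configuration config;
        /* load_config reads jacobi.conf from the current working directory */
        if (load_config(&config) != 0) {
            return 1;
        }
        printf("N=%d SOUTH=%.1f THRESHOLD=%.1f\n", config.n, config.south, config.threshold);
        return 0;
    }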
src/config.cuh (new file, 14 lines)
@@ -0,0 +1,14 @@
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+typedef struct configuration {
+	int n;
+	float north;
+	float east;
+	float south;
+	float west;
+	float init_value;
+	float threshold;
+} configuration;
+
+__host__ int load_config(configuration *config);
src/impl/cuda.cu (new file, 118 lines)
@@ -0,0 +1,118 @@
+/*
+ * CUDA version.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include "../config.cuh"
+#include "../utils.cuh"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#define THREADS_BLOCK 256
+
+__host__ void check_status(cudaError_t cuda_status, char *msg) {
+	if (cuda_status != cudaSuccess) {
+		fprintf(stderr, "%s", msg);
+		fprintf(stderr, ": ");
+		fprintf(stderr, "%s", cudaGetErrorString(cuda_status));
+		fprintf(stderr, " (error code: %d)\n", cuda_status);
+		exit(EXIT_FAILURE);
+	}
+}
+
+__global__ void initialize_matrix_on_gpu(float *x, int n, float init_value, borders b, cudaError_t *cuda_status) {
+	int i, j;
+	int nb = n + 2;
+
+	/* Initialize borders */
+	for (i = 0; i < nb; i++) {
+		x[IDX(nb, 0, i)] = b.north;
+		x[IDX(nb, n + 1, i)] = b.south;
+		x[IDX(nb, i, 0)] = b.west;
+		x[IDX(nb, i, n + 1)] = b.east;
+	}
+	/* Initialize the rest of the matrix */
+	for (i = 1; i <= n; i++) {
+		for (j = 1; j <= n; j++) {
+			x[IDX(nb, i, j)] = init_value;
+		}
+	}
+}
+
+__global__ void iterate(int n, float *x, float *new_x) {
+	int idx, nb;
+	int i, j;
+
+	nb = n + 2;
+	idx = blockDim.x * blockIdx.x + threadIdx.x;
+	i = idx / nb;
+	j = idx % nb;
+	if (i >= 1 && i <= n && j >= 1 && j <= n) {
+		new_x[idx] = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
+	}
+}
+
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations) {
+	float *x, *new_x;
+	float *x_gpu, *new_x_gpu;
+	float *tmp_x;
+	float max_diff;
+	int i, j;
+	int nb = n + 2; // n plus the border
+	int blocks_number;
+	int threads_block = THREADS_BLOCK;
+	cudaError_t cuda_status;
+
+	// Select the GPU
+	check_status(cudaSetDevice(0), "cudaSetDevice failed!");
+
+	/* Create the matrices on the GPU */
+	x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+	new_x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+
+	/* Initialize the matrices */
+	initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, &cuda_status);
+	check_status(cuda_status, "initialize_matrix_on_gpu failed!");
+	initialize_matrix_on_gpu<<<1, 1>>>(new_x_gpu, n, init_value, b, &cuda_status);
+	check_status(cuda_status, "initialize_matrix_on_gpu failed!");
+
+	/* Iterative refinement of x until values converge */
+	x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
+	check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+
+	blocks_number = nb * nb / threads_block + 1;	/* one thread per cell of the bordered matrix */
+	*iterations = 0;
+	do {
+		iterate<<<blocks_number, threads_block>>>(n, x_gpu, new_x_gpu);
+		new_x = retrieve_sa_matrix_from_gpu(new_x_gpu, nb, nb, &cuda_status);
+		check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+		max_diff = 0;
+		for (i = 1; i <= n; i++) {
+			for (j = 1; j <= n; j++) {
+				max_diff = fmaxf(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
+			}
+		}
+
+		tmp_x = new_x;
+		new_x = x;
+		x = tmp_x;
+
+		tmp_x = new_x_gpu;
+		new_x_gpu = x_gpu;
+		x_gpu = tmp_x;
+
+		(*iterations)++;
+	} while (max_diff > threshold);
+
+	x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
+	check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+
+	destroy_sa_matrix_on_gpu(x_gpu);
+	destroy_sa_matrix_on_gpu(new_x_gpu);
+
+	return x;
+}
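Note: initialize_matrix_on_gpu receives a host pointer cuda_status that a __global__ kernel cannot meaningfully write, so the check_status calls after the <<<1, 1>>> launches test a value the kernel never set; the single-thread launch also serializes initialization onto one GPU thread. A hedged sketch of the conventional launch check, using only standard CUDA runtime calls and the names from this file:

    /* Sketch: standard error checking around a kernel launch.
     * cudaGetLastError() catches launch-configuration errors;
     * cudaDeviceSynchronize() surfaces errors raised while the kernel runs. */
    initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, NULL);
    check_status(cudaGetLastError(), "initialize_matrix_on_gpu launch failed!");
    check_status(cudaDeviceSynchronize(), "initialize_matrix_on_gpu failed!");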
(deleted file, 123 lines)
@@ -1,123 +0,0 @@
-/*
- * MPI version with the matrix subdivided by "lines".
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <mpi.h>
-#include "../config.h"
-#include "../utils.h"
-
-#define TAG_BORDER 0
-#define TAG_MATRIX 1
-
-float *compute_jacobi(int rank, int numprocs, int n, float init_value, float threshold, borders b, int *iterations) {
-	float *complete_x;
-	float *x;
-	float *new_x;
-	float *tmp_x;
-	float max_diff, global_max_diff, new_value;
-	int i, j;
-	int nb = n + 2; // n plus the border
-	int rows, rows_to_transmit;
-	int receive_pos;
-	MPI_Request request_north;
-	MPI_Request request_south;
-
-	if (rank == 0) {
-		rows = n - (n / numprocs) * (numprocs - 1);
-	} else {
-		rows = n / numprocs;
-	}
-	LOG(printf("[Process %d/%d] rows: %d\n", rank, numprocs, rows));
-	/* LOG(printf("[Process %d/%d] initializing matrix\n", rank, numprocs)); */
-	/* Initialize the matrix */
-	x = create_sa_matrix(rows + 2, nb);
-	new_x = create_sa_matrix(rows + 2, nb);
-	for (i = 0; i < rows + 2; i++) {
-		for (j = 1; j <= n; j++) {
-			x[IDX(nb, i, j)] = init_value;
-			new_x[IDX(nb, i, j)] = init_value;
-		}
-	}
-	/* Initialize boundary regions */
-	for (i = 0; i < rows + 2; i++) {
-		x[IDX(nb, i, 0)] = b.west;
-		x[IDX(nb, i, n + 1)] = b.east;
-		new_x[IDX(nb, i, 0)] = b.west;
-		new_x[IDX(nb, i, n + 1)] = b.east;
-	}
-	if (rank == 0) {
-		for (i = 1; i <= n + 1; i++) {
-			x[IDX(nb, 0, i)] = b.north;
-			new_x[IDX(nb, 0, i)] = b.north;
-		}
-	}
-	if (rank == numprocs - 1){
-		for (i = 1; i < n + 1; i++) {
-			x[IDX(nb, rows + 1, i)] = b.south;
-			new_x[IDX(nb, rows + 1, i)] = b.south;
-		}
-	}
-	/* LOG(printf("[Process %d/%d] matrix initialized\n", rank, numprocs)); */
-	/* Iterative refinement of x until values converge */
-	*iterations = 0;
-	do {
-		if (rank != numprocs - 1) {
-			// Send south border
-			MPI_Isend(&x[IDX(nb, rows, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &request_south);
-		}
-		if (rank != 0) {
-			// Send north border
-			MPI_Isend(&x[IDX(nb, 1, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &request_north);
-		}
-		max_diff = 0;
-		global_max_diff = 0;
-		for (i = 1; i <= rows; i++) {
-			for (j = 1; j <= n; j++) {
-				new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
-				max_diff = fmaxf(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
-				new_x[IDX(nb, i, j)] = new_value;
-			}
-		}
-		tmp_x = new_x;
-		new_x = x;
-		x = tmp_x;
-		if (rank != numprocs - 1) {
-			// Receive south border
-			MPI_Recv(&x[IDX(nb, rows + 1, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		}
-		if (rank != 0) {
-			// Receive north border
-			MPI_Recv(&x[IDX(nb, 0, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		}
-		LOG(printf("[Process %d/%d] max_diff: %f\n", rank, numprocs, max_diff));
-		MPI_Allreduce(&max_diff, &global_max_diff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
-		/* LOG(printf("[Process %d/%d] global_max_diff: %f\n", rank, numprocs, global_max_diff)); */
-		(*iterations)++;
-	} while (global_max_diff > threshold);
-
-	if (rank == 0) {
-		complete_x = create_sa_matrix(nb, nb);
-		memcpy(complete_x, x, (rows + ((rank == numprocs - 1) ? 2 : 1)) * (nb) * sizeof(float));
-		rows_to_transmit = n / numprocs;
-		receive_pos = rows + 1;
-		for (i = 1; i < numprocs; i++) {
-			if (i == numprocs - 1) {
-				rows_to_transmit++;
-			}
-			MPI_Recv(&complete_x[IDX(nb, receive_pos, 0)], rows_to_transmit * (nb), MPI_FLOAT, i, TAG_MATRIX, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-			receive_pos += n / numprocs;
-		}
-	} else {
-		complete_x = NULL;
-		rows_to_transmit = rows;
-		if (rank == numprocs - 1) {
-			rows_to_transmit++;
-		}
-		MPI_Send(&x[IDX(nb, 1, 0)], rows_to_transmit * (nb), MPI_FLOAT, 0, TAG_MATRIX, MPI_COMM_WORLD);
-	}
-
-	return complete_x;
-}
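The row split in the deleted implementation gives every rank n / numprocs rows and lets rank 0 absorb the integer-division remainder: rows = n - (n / numprocs) * (numprocs - 1). A quick worked check in plain C (values are illustrative):

    #include <stdio.h>

    /* Worked check of the row partition used above: ranks 1..numprocs-1
     * each get n / numprocs rows; rank 0 takes whatever is left. */
    int main(void) {
        int n = 1000, numprocs = 3;
        int base = n / numprocs;                  /* 333 rows for ranks 1..2 */
        int rank0 = n - base * (numprocs - 1);    /* 334 rows for rank 0 */
        printf("rank 0: %d rows, others: %d rows, total: %d\n",
               rank0, base, rank0 + base * (numprocs - 1));
        return 0;
    }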
src/main/main.cu (new file, 52 lines)
@@ -0,0 +1,52 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+
+#include "../config.cuh"
+#include "../utils.cuh"
+
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations);
+
+__host__ int main(int argc, char* argv[]) {
+	int n;
+	float init_value, threshold;
+	borders b;
+	int config_loaded;
+	configuration config;
+	float *x;
+	int iterations;
+	struct timeval start, end;
+	long secs_used, micros_used;
+
+	config_loaded = load_config(&config);
+	if (config_loaded != 0) {
+		return 1;
+	}
+	n = config.n;
+	threshold = config.threshold;
+	init_value = config.init_value;
+	b.north = config.north;
+	b.south = config.south;
+	b.east = config.east;
+	b.west = config.west;
+
+	gettimeofday(&start, NULL);
+	x = compute_jacobi(n, init_value, threshold, b, &iterations);
+	gettimeofday(&end, NULL);
+
+	secs_used = (end.tv_sec - start.tv_sec);
+	micros_used = ((secs_used * 1000000) + end.tv_usec) - (start.tv_usec);
+	printf("Wall clock time: %fs\n", (float)micros_used / 1000000);
+	printf("Iterations: %d\n", iterations);
+	if (n < 10) {
+		print_sa_matrix(x, n + 2, n + 2);
+	}
+	destroy_sa_matrix(x);
+
+	return 0;
+}
src/utils.c
@@ -54,3 +54,12 @@ void print_matrix(float **x, int rows, int cols)
 	}
 	fflush(stdout);
 }
+
+
+float fmaxf(float a, float b) {
+	if (a > b) {
+		return a;
+	} else {
+		return b;
+	}
+}
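Note: fmaxf is already a standard C99 function declared in <math.h>, which this project compiles against (CFLAGS uses -std=c99 -lm), so this definition was presumably added because the target toolchain did not expose it; on toolchains that do, the competing definition may collide with the library declaration. A hedged alternative that sidesteps the name:

    /* A differently named helper avoids redefining the C99 fmaxf from <math.h>. */
    static inline float max_float(float a, float b) {
        return (a > b) ? a : b;
    }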
src/utils.cu (new file, 79 lines)
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.cuh"
+
+__host__ float *create_sa_matrix(int rows, int cols) {
+	float *x;
+
+	x = (float *)malloc(rows * cols * sizeof(float));
+	return x;
+}
+
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status) {
+	float *x = NULL;
+
+	*cuda_status = cudaMalloc((void**)&x, rows * cols * sizeof(float));
+	return x;
+}
+
+__host__ void destroy_sa_matrix(float *x) {
+	free(x);
+}
+
+__host__ void destroy_sa_matrix_on_gpu(float *x) {
+	cudaFree(x);
+}
+
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+	*cuda_status = cudaMemset(x, 0, rows * cols * sizeof(float));
+}
+
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+	float *x_host;
+
+	x_host = create_sa_matrix(rows, cols);
+	*cuda_status = cudaMemcpy(x_host, x, rows * cols * sizeof(float), cudaMemcpyDeviceToHost);
+	return x_host;
+}
+
+__host__ void print_sa_matrix(float *x, int rows, int cols) {
+	int i, j;
+	for (i = 0; i < rows; i++) {
+		for (j = 0; j < cols; j++) {
+			printf("%f\t", x[IDX(cols, i, j)]);
+		}
+		printf("\n");
+	}
+	fflush(stdout);
+}
+
+__host__ float **create_matrix(int rows, int cols) {
+	int i;
+	float **x;
+
+	x = (float **)malloc(rows * sizeof(float *));	/* sizeof(float *): an array of row pointers */
+	for (i = 0; i < rows; i++) {
+		x[i] = (float *)malloc(cols * sizeof(float));
+	}
+	return x;
+}
+
+__host__ void destroy_matrix(float **x, int rows) {
+	int i;
+
+	for (i = 0; i < rows; i++) {
+		free(x[i]);
+	}
+	free(x);
+}
+
+__host__ void print_matrix(float **x, int rows, int cols) {
+	int i, j;
+	for (i = 0; i < rows; i++) {
+		for (j = 0; j < cols; j++) {
+			printf("%f\t", x[i][j]);
+		}
+		printf("\n");
+	}
+	fflush(stdout);
+}
src/utils.cuh (new file, 56 lines)
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/* #define ENABLE_LOG */
+
+#ifdef ENABLE_LOG
+#	define LOG(x) x
+#else
+#	define LOG(x) (void) 0
+#endif
+
+/*
+ * Macro used with single array matrices to
+ * get the array index given the number of columns,
+ * the row index and the column index.
+ */
+#define IDX(cols, r, c) ((r) * (cols) + (c))
+
+typedef struct borders {
+	float north;
+	float east;
+	float south;
+	float west;
+} borders;
+
+
+/*
+ * Create a matrix stored in a single array.
+ */
+__host__ float *create_sa_matrix(int rows, int cols);
+
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status);
+
+/*
+ * Destroy a single array matrix.
+ */
+__host__ void destroy_sa_matrix(float *x);
+
+__host__ void destroy_sa_matrix_on_gpu(float *x);
+
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+
+/*
+ * Print a single array matrix.
+ */
+__host__ void print_sa_matrix(float *x, int rows, int cols);
+
+__host__ float **create_matrix(int rows, int cols);
+__host__ void destroy_matrix(float **x, int rows);
+__host__ void print_matrix(float **x, int rows, int cols);
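The IDX macro above linearizes a row-major (rows x cols) matrix stored in one array. A minimal host-side sketch of how the rest of the code uses it (sizes are illustrative):

    /* Sketch: addressing the bordered (n+2) x (n+2) matrix through IDX. */
    int n = 4;
    int nb = n + 2;                      /* n plus the border */
    float *x = create_sa_matrix(nb, nb);
    x[IDX(nb, 0, 3)] = 300.0f;           /* row 0, column 3: a north-border cell */
    x[IDX(nb, 2, 1)] = 0.25f * (x[IDX(nb, 1, 1)] + x[IDX(nb, 2, 2)]
                              + x[IDX(nb, 3, 1)] + x[IDX(nb, 2, 0)]);  /* Jacobi stencil */
    destroy_sa_matrix(x);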
src/utils.h
@@ -44,3 +44,5 @@ void print_sa_matrix(float *x, int rows, int cols);
 float **create_matrix(int rows, int cols);
 void destroy_matrix(float **x, int rows);
 void print_matrix(float **x, int rows, int cols);
+
+float fmaxf(float a, float b);