From a3be6d52986d8fdd6e4adfea0fe25a904b56ffff Mon Sep 17 00:00:00 2001 From: Fabio Salvini Date: Sun, 22 Jan 2017 01:02:17 +0100 Subject: [PATCH] Last commit --- Makefile | 45 ++++++++---- benchmarks/b1/cuda.sh | 3 + benchmarks/b1/jacobi.conf | 16 +++++ benchmarks/b1/mpi_1_1.sh | 10 +++ benchmarks/b1/mpi_1_10.sh | 10 +++ benchmarks/b1/mpi_2_5.sh | 10 +++ benchmarks/b1/omp_1.sh | 12 ++++ benchmarks/b1/omp_10.sh | 12 ++++ benchmarks/b1/sequential.sh | 10 +++ benchmarks/b1/sse.sh | 10 +++ benchmarks/b2/cuda.sh | 3 + benchmarks/b2/jacobi.conf | 16 +++++ benchmarks/b2/mpi_1_1.sh | 10 +++ benchmarks/b2/mpi_1_10.sh | 10 +++ benchmarks/b2/mpi_2_5.sh | 10 +++ benchmarks/b2/omp_1.sh | 12 ++++ benchmarks/b2/omp_10.sh | 12 ++++ benchmarks/b2/sequential.sh | 10 +++ benchmarks/b2/sse.sh | 10 +++ benchmarks/b3/cuda.sh | 3 + benchmarks/b3/jacobi.conf | 16 +++++ benchmarks/b3/mpi_1_1.sh | 10 +++ benchmarks/b3/mpi_1_10.sh | 10 +++ benchmarks/b3/mpi_2_5.sh | 10 +++ benchmarks/b3/omp_1.sh | 12 ++++ benchmarks/b3/omp_10.sh | 12 ++++ benchmarks/b3/sequential.sh | 10 +++ benchmarks/b3/sse.sh | 10 +++ benchmarks/b4/cuda.sh | 3 + benchmarks/b4/jacobi.conf | 16 +++++ benchmarks/b4/mpi_1_1.sh | 10 +++ benchmarks/b4/mpi_1_10.sh | 10 +++ benchmarks/b4/mpi_5_10.sh | 10 +++ benchmarks/b4/omp_1.sh | 12 ++++ benchmarks/b4/omp_10.sh | 12 ++++ benchmarks/b4/sequential.sh | 10 +++ benchmarks/b4/sse.sh | 10 +++ benchmarks/b5/cuda.sh | 3 + benchmarks/b5/jacobi.conf | 16 +++++ benchmarks/b5/mpi_1_1.sh | 10 +++ benchmarks/b5/mpi_1_10.sh | 10 +++ benchmarks/b5/mpi_5_10.sh | 10 +++ benchmarks/b5/omp_1.sh | 12 ++++ benchmarks/b5/omp_10.sh | 12 ++++ benchmarks/b5/sequential.sh | 10 +++ benchmarks/b5/sse.sh | 10 +++ build_cuda/.gitignore | 4 ++ src/config.c | 1 + src/config.cu | 79 +++++++++++++++++++++ src/config.cuh | 14 ++++ src/impl/cuda.cu | 118 +++++++++++++++++++++++++++++++ src/impl/{mpi_line.c => mpi.c} | 0 src/impl/mpi_line_async.c | 123 --------------------------------- src/main/main.cu | 52 ++++++++++++++ src/utils.c | 9 +++ src/utils.cu | 79 +++++++++++++++++++++ src/utils.cuh | 56 +++++++++++++++ src/utils.h | 2 + 58 files changed, 910 insertions(+), 137 deletions(-) create mode 100644 benchmarks/b1/cuda.sh create mode 100644 benchmarks/b1/jacobi.conf create mode 100644 benchmarks/b1/mpi_1_1.sh create mode 100644 benchmarks/b1/mpi_1_10.sh create mode 100644 benchmarks/b1/mpi_2_5.sh create mode 100644 benchmarks/b1/omp_1.sh create mode 100644 benchmarks/b1/omp_10.sh create mode 100644 benchmarks/b1/sequential.sh create mode 100644 benchmarks/b1/sse.sh create mode 100644 benchmarks/b2/cuda.sh create mode 100644 benchmarks/b2/jacobi.conf create mode 100644 benchmarks/b2/mpi_1_1.sh create mode 100644 benchmarks/b2/mpi_1_10.sh create mode 100644 benchmarks/b2/mpi_2_5.sh create mode 100644 benchmarks/b2/omp_1.sh create mode 100644 benchmarks/b2/omp_10.sh create mode 100644 benchmarks/b2/sequential.sh create mode 100644 benchmarks/b2/sse.sh create mode 100644 benchmarks/b3/cuda.sh create mode 100644 benchmarks/b3/jacobi.conf create mode 100644 benchmarks/b3/mpi_1_1.sh create mode 100644 benchmarks/b3/mpi_1_10.sh create mode 100644 benchmarks/b3/mpi_2_5.sh create mode 100644 benchmarks/b3/omp_1.sh create mode 100644 benchmarks/b3/omp_10.sh create mode 100644 benchmarks/b3/sequential.sh create mode 100644 benchmarks/b3/sse.sh create mode 100644 benchmarks/b4/cuda.sh create mode 100644 benchmarks/b4/jacobi.conf create mode 100644 benchmarks/b4/mpi_1_1.sh create mode 100644 benchmarks/b4/mpi_1_10.sh create mode 100644 
benchmarks/b4/mpi_5_10.sh create mode 100644 benchmarks/b4/omp_1.sh create mode 100644 benchmarks/b4/omp_10.sh create mode 100644 benchmarks/b4/sequential.sh create mode 100644 benchmarks/b4/sse.sh create mode 100644 benchmarks/b5/cuda.sh create mode 100644 benchmarks/b5/jacobi.conf create mode 100644 benchmarks/b5/mpi_1_1.sh create mode 100644 benchmarks/b5/mpi_1_10.sh create mode 100644 benchmarks/b5/mpi_5_10.sh create mode 100644 benchmarks/b5/omp_1.sh create mode 100644 benchmarks/b5/omp_10.sh create mode 100644 benchmarks/b5/sequential.sh create mode 100644 benchmarks/b5/sse.sh create mode 100644 build_cuda/.gitignore create mode 100644 src/config.cu create mode 100644 src/config.cuh create mode 100644 src/impl/cuda.cu rename src/impl/{mpi_line.c => mpi.c} (100%) delete mode 100644 src/impl/mpi_line_async.c create mode 100644 src/main/main.cu create mode 100644 src/utils.cu create mode 100644 src/utils.cuh diff --git a/Makefile b/Makefile index 655e1fd..b111a40 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,14 @@ CC=gcc CC_OMP=gcc -fopenmp CC_MPI=mpicc +CC_CUDA=nvcc CFLAGS=-Wall -lm -std=c99 SRC=src BUILD=build +BUILD_CUDA=build_cuda BIN=bin -all: sequential mpi_line mpi_line_async omp sse +all: sequential mpi omp sse cuda sequential: config utils main ${CC} ${CFLAGS} \ @@ -16,8 +18,16 @@ sequential: config utils main ${SRC}/impl/sequential.c \ -o ${BIN}/jacobi_sequential +cuda: config_cuda utils_cuda main_cuda + ${CC_CUDA} \ + ${BUILD_CUDA}/config.o \ + ${BUILD_CUDA}/utils.o \ + ${BUILD_CUDA}/main.o \ + ${SRC}/impl/cuda.cu \ + -o ${BIN}/jacobi_cuda + sse: config utils main - ${CC_OMP} ${CFLAGS} \ + ${CC} ${CFLAGS} \ ${BUILD}/config.o \ ${BUILD}/utils.o \ ${BUILD}/main.o \ @@ -32,21 +42,13 @@ omp: config utils main ${SRC}/impl/omp.c \ -o ${BIN}/jacobi_omp -mpi_line: config utils main_mpi +mpi: config utils main_mpi ${CC_MPI} ${CFLAGS} \ ${BUILD}/config.o \ ${BUILD}/utils.o \ ${BUILD}/main_mpi.o \ - ${SRC}/impl/mpi_line.c \ - -o ${BIN}/jacobi_mpi_line - -mpi_line_async: config utils main_mpi - ${CC_MPI} ${CFLAGS} \ - ${BUILD}/config.o \ - ${BUILD}/utils.o \ - ${BUILD}/main_mpi.o \ - ${SRC}/impl/mpi_line_async.c \ - -o ${BIN}/jacobi_mpi_line_async + ${SRC}/impl/mpi.c \ + -o ${BIN}/jacobi_mpi main: ${SRC}/main/main.c ${CC_MPI} -c ${CFLAGS} \ @@ -58,16 +60,31 @@ main_mpi: ${SRC}/main/main_mpi.c ${SRC}/main/main_mpi.c \ -o ${BUILD}/main_mpi.o +main_cuda: ${SRC}/main/main.cu + ${CC_CUDA} -c \ + ${SRC}/main/main.cu \ + -o ${BUILD_CUDA}/main.o + config: ${SRC}/config.c ${CC_MPI} -c ${CFLAGS} \ ${SRC}/config.c \ -o ${BUILD}/config.o +config_cuda: ${SRC}/config.cu + ${CC_CUDA} -c \ + ${SRC}/config.cu \ + -o ${BUILD_CUDA}/config.o + utils: ${SRC}/utils.c ${CC_MPI} -c ${CFLAGS} \ ${SRC}/utils.c \ -o ${BUILD}/utils.o +utils_cuda: ${SRC}/utils.cu + ${CC_CUDA} -c \ + ${SRC}/utils.cu \ + -o ${BUILD_CUDA}/utils.o + .PHONY: clean clean: - rm build/[^.]* bin/[^.]* + rm build/[^.]* build_cuda/[^.]* bin/[^.]* diff --git a/benchmarks/b1/cuda.sh b/benchmarks/b1/cuda.sh new file mode 100644 index 0000000..b7f71e4 --- /dev/null +++ b/benchmarks/b1/cuda.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +../../bin/jacobi_cuda > results/stdout_cuda.txt diff --git a/benchmarks/b1/jacobi.conf b/benchmarks/b1/jacobi.conf new file mode 100644 index 0000000..cfdfd05 --- /dev/null +++ b/benchmarks/b1/jacobi.conf @@ -0,0 +1,16 @@ +# Configuration file for the Jacobi project. + +# The size of the matrix (borders excluded). +N 1000 + +# The value at each border. 
+NORTH 0.0 +EAST 0.0 +SOUTH 300.0 +WEST 0.0 + +# The initial value to assign at each internal cell. +INIT_VALUE 0.0 + +# The threshold that determines the convergence. +THRESHOLD 1.0 diff --git a/benchmarks/b1/mpi_1_1.sh b/benchmarks/b1/mpi_1_1.sh new file mode 100644 index 0000000..9b02a7d --- /dev/null +++ b/benchmarks/b1/mpi_1_1.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_1.txt +#SBATCH -e results/stderr_mpi_1_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b1/mpi_1_10.sh b/benchmarks/b1/mpi_1_10.sh new file mode 100644 index 0000000..a96073f --- /dev/null +++ b/benchmarks/b1/mpi_1_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_10.txt +#SBATCH -e results/stderr_mpi_1_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b1/mpi_2_5.sh b/benchmarks/b1/mpi_2_5.sh new file mode 100644 index 0000000..8040615 --- /dev/null +++ b/benchmarks/b1/mpi_2_5.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_2_5.txt +#SBATCH -e results/stderr_mpi_2_5.txt +#SBATCH -t 00:30:00 +#SBATCH -N 2 +#SBATCH --tasks-per-node=5 +#SBATCH --nodelist=c6,c7 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b1/omp_1.sh b/benchmarks/b1/omp_1.sh new file mode 100644 index 0000000..3df57e2 --- /dev/null +++ b/benchmarks/b1/omp_1.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_1.txt +#SBATCH -e results/stderr_omp_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=1 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b1/omp_10.sh b/benchmarks/b1/omp_10.sh new file mode 100644 index 0000000..2c1127e --- /dev/null +++ b/benchmarks/b1/omp_10.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_10.txt +#SBATCH -e results/stderr_omp_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=10 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b1/sequential.sh b/benchmarks/b1/sequential.sh new file mode 100644 index 0000000..3d27e07 --- /dev/null +++ b/benchmarks/b1/sequential.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sequential +#SBATCH -o results/stdout_sequential.txt +#SBATCH -e results/stderr_sequential.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sequential diff --git a/benchmarks/b1/sse.sh b/benchmarks/b1/sse.sh new file mode 100644 index 0000000..450a3f6 --- /dev/null +++ b/benchmarks/b1/sse.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sse +#SBATCH -o results/stdout_sse.txt +#SBATCH -e results/stderr_sse.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sse diff --git a/benchmarks/b2/cuda.sh b/benchmarks/b2/cuda.sh new file mode 100644 index 0000000..b7f71e4 --- /dev/null +++ b/benchmarks/b2/cuda.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +../../bin/jacobi_cuda > results/stdout_cuda.txt diff --git a/benchmarks/b2/jacobi.conf b/benchmarks/b2/jacobi.conf new file mode 100644 index 0000000..898893a --- /dev/null +++ b/benchmarks/b2/jacobi.conf @@ -0,0 +1,16 @@ +# Configuration file for the Jacobi project. + +# The size of the matrix (borders excluded). +N 2000 + +# The value at each border. 
+NORTH 0.0 +EAST 0.0 +SOUTH 300.0 +WEST 0.0 + +# The initial value to assign at each internal cell. +INIT_VALUE 0.0 + +# The threshold that determines the convergence. +THRESHOLD 1.0 diff --git a/benchmarks/b2/mpi_1_1.sh b/benchmarks/b2/mpi_1_1.sh new file mode 100644 index 0000000..9b02a7d --- /dev/null +++ b/benchmarks/b2/mpi_1_1.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_1.txt +#SBATCH -e results/stderr_mpi_1_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b2/mpi_1_10.sh b/benchmarks/b2/mpi_1_10.sh new file mode 100644 index 0000000..a96073f --- /dev/null +++ b/benchmarks/b2/mpi_1_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_10.txt +#SBATCH -e results/stderr_mpi_1_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b2/mpi_2_5.sh b/benchmarks/b2/mpi_2_5.sh new file mode 100644 index 0000000..8040615 --- /dev/null +++ b/benchmarks/b2/mpi_2_5.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_2_5.txt +#SBATCH -e results/stderr_mpi_2_5.txt +#SBATCH -t 00:30:00 +#SBATCH -N 2 +#SBATCH --tasks-per-node=5 +#SBATCH --nodelist=c6,c7 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b2/omp_1.sh b/benchmarks/b2/omp_1.sh new file mode 100644 index 0000000..3df57e2 --- /dev/null +++ b/benchmarks/b2/omp_1.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_1.txt +#SBATCH -e results/stderr_omp_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=1 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b2/omp_10.sh b/benchmarks/b2/omp_10.sh new file mode 100644 index 0000000..2c1127e --- /dev/null +++ b/benchmarks/b2/omp_10.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_10.txt +#SBATCH -e results/stderr_omp_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=10 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b2/sequential.sh b/benchmarks/b2/sequential.sh new file mode 100644 index 0000000..3d27e07 --- /dev/null +++ b/benchmarks/b2/sequential.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sequential +#SBATCH -o results/stdout_sequential.txt +#SBATCH -e results/stderr_sequential.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sequential diff --git a/benchmarks/b2/sse.sh b/benchmarks/b2/sse.sh new file mode 100644 index 0000000..450a3f6 --- /dev/null +++ b/benchmarks/b2/sse.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sse +#SBATCH -o results/stdout_sse.txt +#SBATCH -e results/stderr_sse.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sse diff --git a/benchmarks/b3/cuda.sh b/benchmarks/b3/cuda.sh new file mode 100644 index 0000000..b7f71e4 --- /dev/null +++ b/benchmarks/b3/cuda.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +../../bin/jacobi_cuda > results/stdout_cuda.txt diff --git a/benchmarks/b3/jacobi.conf b/benchmarks/b3/jacobi.conf new file mode 100644 index 0000000..f1121d4 --- /dev/null +++ b/benchmarks/b3/jacobi.conf @@ -0,0 +1,16 @@ +# Configuration file for the Jacobi project. + +# The size of the matrix (borders excluded). +N 5000 + +# The value at each border. 
+NORTH 0.0 +EAST 0.0 +SOUTH 300.0 +WEST 0.0 + +# The initial value to assign at each internal cell. +INIT_VALUE 0.0 + +# The threshold that determines the convergence. +THRESHOLD 1.0 diff --git a/benchmarks/b3/mpi_1_1.sh b/benchmarks/b3/mpi_1_1.sh new file mode 100644 index 0000000..9b02a7d --- /dev/null +++ b/benchmarks/b3/mpi_1_1.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_1.txt +#SBATCH -e results/stderr_mpi_1_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b3/mpi_1_10.sh b/benchmarks/b3/mpi_1_10.sh new file mode 100644 index 0000000..a96073f --- /dev/null +++ b/benchmarks/b3/mpi_1_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_10.txt +#SBATCH -e results/stderr_mpi_1_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b3/mpi_2_5.sh b/benchmarks/b3/mpi_2_5.sh new file mode 100644 index 0000000..8040615 --- /dev/null +++ b/benchmarks/b3/mpi_2_5.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_2_5.txt +#SBATCH -e results/stderr_mpi_2_5.txt +#SBATCH -t 00:30:00 +#SBATCH -N 2 +#SBATCH --tasks-per-node=5 +#SBATCH --nodelist=c6,c7 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b3/omp_1.sh b/benchmarks/b3/omp_1.sh new file mode 100644 index 0000000..3df57e2 --- /dev/null +++ b/benchmarks/b3/omp_1.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_1.txt +#SBATCH -e results/stderr_omp_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=1 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b3/omp_10.sh b/benchmarks/b3/omp_10.sh new file mode 100644 index 0000000..2c1127e --- /dev/null +++ b/benchmarks/b3/omp_10.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_10.txt +#SBATCH -e results/stderr_omp_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=10 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b3/sequential.sh b/benchmarks/b3/sequential.sh new file mode 100644 index 0000000..3d27e07 --- /dev/null +++ b/benchmarks/b3/sequential.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sequential +#SBATCH -o results/stdout_sequential.txt +#SBATCH -e results/stderr_sequential.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sequential diff --git a/benchmarks/b3/sse.sh b/benchmarks/b3/sse.sh new file mode 100644 index 0000000..450a3f6 --- /dev/null +++ b/benchmarks/b3/sse.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sse +#SBATCH -o results/stdout_sse.txt +#SBATCH -e results/stderr_sse.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sse diff --git a/benchmarks/b4/cuda.sh b/benchmarks/b4/cuda.sh new file mode 100644 index 0000000..b7f71e4 --- /dev/null +++ b/benchmarks/b4/cuda.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +../../bin/jacobi_cuda > results/stdout_cuda.txt diff --git a/benchmarks/b4/jacobi.conf b/benchmarks/b4/jacobi.conf new file mode 100644 index 0000000..37689f1 --- /dev/null +++ b/benchmarks/b4/jacobi.conf @@ -0,0 +1,16 @@ +# Configuration file for the Jacobi project. + +# The size of the matrix (borders excluded). +N 10000 + +# The value at each border. 
+NORTH 0.0 +EAST 0.0 +SOUTH 300.0 +WEST 0.0 + +# The initial value to assign at each internal cell. +INIT_VALUE 0.0 + +# The threshold that determines the convergence. +THRESHOLD 1.0 diff --git a/benchmarks/b4/mpi_1_1.sh b/benchmarks/b4/mpi_1_1.sh new file mode 100644 index 0000000..9b02a7d --- /dev/null +++ b/benchmarks/b4/mpi_1_1.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_1.txt +#SBATCH -e results/stderr_mpi_1_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b4/mpi_1_10.sh b/benchmarks/b4/mpi_1_10.sh new file mode 100644 index 0000000..a96073f --- /dev/null +++ b/benchmarks/b4/mpi_1_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_10.txt +#SBATCH -e results/stderr_mpi_1_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b4/mpi_5_10.sh b/benchmarks/b4/mpi_5_10.sh new file mode 100644 index 0000000..927bb69 --- /dev/null +++ b/benchmarks/b4/mpi_5_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_5_10.txt +#SBATCH -e results/stderr_mpi_5_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 5 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c2,c3,c4,c6,c7 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b4/omp_1.sh b/benchmarks/b4/omp_1.sh new file mode 100644 index 0000000..3df57e2 --- /dev/null +++ b/benchmarks/b4/omp_1.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_1.txt +#SBATCH -e results/stderr_omp_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=1 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b4/omp_10.sh b/benchmarks/b4/omp_10.sh new file mode 100644 index 0000000..2c1127e --- /dev/null +++ b/benchmarks/b4/omp_10.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_10.txt +#SBATCH -e results/stderr_omp_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=10 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b4/sequential.sh b/benchmarks/b4/sequential.sh new file mode 100644 index 0000000..3d27e07 --- /dev/null +++ b/benchmarks/b4/sequential.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sequential +#SBATCH -o results/stdout_sequential.txt +#SBATCH -e results/stderr_sequential.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sequential diff --git a/benchmarks/b4/sse.sh b/benchmarks/b4/sse.sh new file mode 100644 index 0000000..450a3f6 --- /dev/null +++ b/benchmarks/b4/sse.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sse +#SBATCH -o results/stdout_sse.txt +#SBATCH -e results/stderr_sse.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sse diff --git a/benchmarks/b5/cuda.sh b/benchmarks/b5/cuda.sh new file mode 100644 index 0000000..b7f71e4 --- /dev/null +++ b/benchmarks/b5/cuda.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +../../bin/jacobi_cuda > results/stdout_cuda.txt diff --git a/benchmarks/b5/jacobi.conf b/benchmarks/b5/jacobi.conf new file mode 100644 index 0000000..0e0e207 --- /dev/null +++ b/benchmarks/b5/jacobi.conf @@ -0,0 +1,16 @@ +# Configuration file for the Jacobi project. + +# The size of the matrix (borders excluded). +N 20000 + +# The value at each border. 
+NORTH 0.0 +EAST 0.0 +SOUTH 300.0 +WEST 0.0 + +# The initial value to assign at each internal cell. +INIT_VALUE 0.0 + +# The threshold that determines the convergence. +THRESHOLD 1.0 diff --git a/benchmarks/b5/mpi_1_1.sh b/benchmarks/b5/mpi_1_1.sh new file mode 100644 index 0000000..9b02a7d --- /dev/null +++ b/benchmarks/b5/mpi_1_1.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_1.txt +#SBATCH -e results/stderr_mpi_1_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b5/mpi_1_10.sh b/benchmarks/b5/mpi_1_10.sh new file mode 100644 index 0000000..a96073f --- /dev/null +++ b/benchmarks/b5/mpi_1_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_1_10.txt +#SBATCH -e results/stderr_mpi_1_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c6 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b5/mpi_5_10.sh b/benchmarks/b5/mpi_5_10.sh new file mode 100644 index 0000000..927bb69 --- /dev/null +++ b/benchmarks/b5/mpi_5_10.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J mpi +#SBATCH -o results/stdout_mpi_5_10.txt +#SBATCH -e results/stderr_mpi_5_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 5 +#SBATCH --tasks-per-node=10 +#SBATCH --nodelist=c2,c3,c4,c6,c7 + +mpirun ~/JacobiHPC/bin/jacobi_mpi diff --git a/benchmarks/b5/omp_1.sh b/benchmarks/b5/omp_1.sh new file mode 100644 index 0000000..3df57e2 --- /dev/null +++ b/benchmarks/b5/omp_1.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_1.txt +#SBATCH -e results/stderr_omp_1.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=1 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b5/omp_10.sh b/benchmarks/b5/omp_10.sh new file mode 100644 index 0000000..2c1127e --- /dev/null +++ b/benchmarks/b5/omp_10.sh @@ -0,0 +1,12 @@ +#!/bin/sh +#SBATCH -J omp +#SBATCH -o results/stdout_omp_10.txt +#SBATCH -e results/stderr_omp_10.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +export OMP_NUM_THREADS=10 + +~/JacobiHPC/bin/jacobi_omp diff --git a/benchmarks/b5/sequential.sh b/benchmarks/b5/sequential.sh new file mode 100644 index 0000000..3d27e07 --- /dev/null +++ b/benchmarks/b5/sequential.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sequential +#SBATCH -o results/stdout_sequential.txt +#SBATCH -e results/stderr_sequential.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sequential diff --git a/benchmarks/b5/sse.sh b/benchmarks/b5/sse.sh new file mode 100644 index 0000000..450a3f6 --- /dev/null +++ b/benchmarks/b5/sse.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -J sse +#SBATCH -o results/stdout_sse.txt +#SBATCH -e results/stderr_sse.txt +#SBATCH -t 00:30:00 +#SBATCH -N 1 +#SBATCH --tasks-per-node=1 +#SBATCH --nodelist=c6 + +~/JacobiHPC/bin/jacobi_sse diff --git a/build_cuda/.gitignore b/build_cuda/.gitignore new file mode 100644 index 0000000..5e7d273 --- /dev/null +++ b/build_cuda/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/src/config.c b/src/config.c index 0c590a4..871e5b4 100644 --- a/src/config.c +++ b/src/config.c @@ -11,6 +11,7 @@ typedef struct configuration { float threshold; } configuration; + int load_config(configuration *config) { char property[100]; char *value; diff --git a/src/config.cu 
b/src/config.cu
new file mode 100644
index 0000000..60ebb95
--- /dev/null
+++ b/src/config.cu
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+typedef struct configuration {
+    int n;
+    float north;
+    float east;
+    float south;
+    float west;
+    float init_value;
+    float threshold;
+} configuration;
+
+__host__ int old_load_config(configuration *config) {
+    config->n = 5;
+    config->north = 0.0;
+    config->east = 0.0;
+    config->west = 0.0;
+    config->south = 300.0;
+    config->init_value = 0.0;
+    config->threshold = 1.0;
+    return 0;
+}
+
+__host__ int load_config(configuration *config) {
+    char property[100];
+    char *value;
+    FILE *fp;
+
+    fp = fopen("jacobi.conf", "r");
+    if (fp == NULL) {
+        perror("Error opening file jacobi.conf");
+        return 1;
+    }
+    while (fgets(property, 100, fp) != NULL) {
+        if (property[0] == '\n' || property[0] == '#') {
+            /* Skip empty lines and comments */
+            continue;
+        }
+        value = strchr(property, ' ');
+        if (value == NULL) {
+            fclose(fp);
+            perror("Error reading file jacobi.conf");
+            return 1;
+        }
+        value[0] = '\0';
+        value += sizeof(char);
+        value[strlen(value) - 1] = '\0';
+        if (strcmp(property, "N") == 0) {
+            sscanf(value, "%d", &(config->n));
+        }
+        else if (strcmp(property, "NORTH") == 0) {
+            sscanf(value, "%f", &(config->north));
+        }
+        else if (strcmp(property, "EAST") == 0) {
+            sscanf(value, "%f", &(config->east));
+        }
+        else if (strcmp(property, "SOUTH") == 0) {
+            sscanf(value, "%f", &(config->south));
+        }
+        else if (strcmp(property, "WEST") == 0) {
+            sscanf(value, "%f", &(config->west));
+        }
+        else if (strcmp(property, "INIT_VALUE") == 0) {
+            sscanf(value, "%f", &(config->init_value));
+        }
+        else if (strcmp(property, "THRESHOLD") == 0) {
+            sscanf(value, "%f", &(config->threshold));
+        }
+        else {
+            printf("Unknown property %s\n", property);
+        }
+    }
+    fclose(fp);
+    return 0;
+}
diff --git a/src/config.cuh b/src/config.cuh
new file mode 100644
index 0000000..58c8b69
--- /dev/null
+++ b/src/config.cuh
@@ -0,0 +1,14 @@
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+typedef struct configuration {
+    int n;
+    float north;
+    float east;
+    float south;
+    float west;
+    float init_value;
+    float threshold;
+} configuration;
+
+__host__ int load_config(configuration *config);
\ No newline at end of file
diff --git a/src/impl/cuda.cu b/src/impl/cuda.cu
new file mode 100644
index 0000000..c49bf62
--- /dev/null
+++ b/src/impl/cuda.cu
@@ -0,0 +1,118 @@
+/*
+* CUDA version.
+*/
+
+#include <stdio.h>
+#include <math.h>
+#include "../config.cuh"
+#include "../utils.cuh"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#define THREADS_BLOCK 256
+
+__host__ void check_status(cudaError_t cuda_status, char *msg) {
+    if (cuda_status != cudaSuccess) {
+        fprintf(stderr, msg);
+        fprintf(stderr, ": ");
+        fprintf(stderr, cudaGetErrorString(cuda_status));
+        fprintf(stderr, " (error code: %d)\n", cuda_status);
+        exit(EXIT_FAILURE);
+    }
+}
+
+__global__ void initialize_matrix_on_gpu(float *x, int n, float init_value, borders b, cudaError_t *cuda_status) {
+    int i, j;
+    int nb = n + 2;
+
+    /* Initialize borders */
+    for (i = 0; i < nb; i++) {
+        x[IDX(nb, 0, i)] = b.north;
+        x[IDX(nb, n + 1, i)] = b.south;
+        x[IDX(nb, i, 0)] = b.west;
+        x[IDX(nb, i, n + 1)] = b.east;
+    }
+    /* Initialize the rest of the matrix */
+    for (i = 1; i <= n; i++) {
+        for (j = 1; j <= n; j++) {
+            x[IDX(nb, i, j)] = init_value;
+        }
+    }
+}
+
+__global__ void iterate(int n, float *x, float *new_x) {
+    int idx, nb;
+    int i, j;
+
+    nb = n + 2;
+    idx = blockDim.x * blockIdx.x + threadIdx.x;
+    i = idx / nb;
+    j = idx % nb;
+    if (i >= 1 && i <= n && j >= 1 && j <= n) {
+        new_x[idx] = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
+    }
+}
+
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations) {
+    float *x, *new_x;
+    float *x_gpu, *new_x_gpu;
+    float *tmp_x;
+    float max_diff;
+    int i, j;
+    int nb = n + 2; // n plus the border
+    int blocks_number;
+    int threads_block = THREADS_BLOCK;
+    cudaError_t cuda_status;
+
+    // Select the GPU
+    check_status(cudaSetDevice(0), "cudaSetDevice failed!");
+
+    /* Create the matrices on the GPU */
+    x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+    check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+    new_x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+    check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+
+    /* Initialize the matrices */
+    initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, &cuda_status);
+    check_status(cuda_status, "initialize_matrix_on_gpu failed!");
+    initialize_matrix_on_gpu<<<1, 1>>>(new_x_gpu, n, init_value, b, &cuda_status);
+    check_status(cuda_status, "initialize_matrix_on_gpu failed!");
+
+    /* Iterative refinement of x until values converge */
+    x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
+    check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+
+    blocks_number = nb * nb / threads_block + 1; /* one thread per cell of the (n+2)x(n+2) matrix */
+    *iterations = 0;
+    do {
+        iterate<<<blocks_number, threads_block>>>(n, x_gpu, new_x_gpu);
+        new_x = retrieve_sa_matrix_from_gpu(new_x_gpu, nb, nb, &cuda_status);
+        check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+        max_diff = 0;
+        for (i = 1; i <= n; i++) {
+            for (j = 1; j <= n; j++) {
+                max_diff = fmaxf(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
+            }
+        }
+
+        tmp_x = new_x;
+        new_x = x;
+        x = tmp_x;
+
+        tmp_x = new_x_gpu;
+        new_x_gpu = x_gpu;
+        x_gpu = tmp_x;
+
+        (*iterations)++;
+    } while (max_diff > threshold);
+
+    x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
+    check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+
+    destroy_sa_matrix_on_gpu(x_gpu);
+    destroy_sa_matrix_on_gpu(new_x_gpu);
+
+    return x;
+}
diff --git a/src/impl/mpi_line.c b/src/impl/mpi.c
similarity index 100%
rename from src/impl/mpi_line.c
rename to src/impl/mpi.c
diff --git a/src/impl/mpi_line_async.c b/src/impl/mpi_line_async.c
deleted file mode 100644
index 85ea882..0000000
--- a/src/impl/mpi_line_async.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * MPI version with the matrix subdivided by "lines".
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <mpi.h>
-#include "../config.h"
-#include "../utils.h"
-
-#define TAG_BORDER 0
-#define TAG_MATRIX 1
-
-float *compute_jacobi(int rank, int numprocs, int n, float init_value, float threshold, borders b, int *iterations) {
-    float *complete_x;
-    float *x;
-    float *new_x;
-    float *tmp_x;
-    float max_diff, global_max_diff, new_value;
-    int i, j;
-    int nb = n + 2; // n plus the border
-    int rows, rows_to_transmit;
-    int receive_pos;
-    MPI_Request request_north;
-    MPI_Request request_south;
-
-    if (rank == 0) {
-        rows = n - (n / numprocs) * (numprocs - 1);
-    } else {
-        rows = n / numprocs;
-    }
-    LOG(printf("[Process %d/%d] rows: %d\n", rank, numprocs, rows));
-    /* LOG(printf("[Process %d/%d] initializing matrix\n", rank, numprocs)); */
-    /* Initialize the matrix */
-    x = create_sa_matrix(rows + 2, nb);
-    new_x = create_sa_matrix(rows + 2, nb);
-    for (i = 0; i < rows + 2; i++) {
-        for (j = 1; j <= n; j++) {
-            x[IDX(nb, i, j)] = init_value;
-            new_x[IDX(nb, i, j)] = init_value;
-        }
-    }
-    /* Initialize boundary regions */
-    for (i = 0; i < rows + 2; i++) {
-        x[IDX(nb, i, 0)] = b.west;
-        x[IDX(nb, i, n + 1)] = b.east;
-        new_x[IDX(nb, i, 0)] = b.west;
-        new_x[IDX(nb, i, n + 1)] = b.east;
-    }
-    if (rank == 0) {
-        for (i = 1; i <= n + 1; i++) {
-            x[IDX(nb, 0, i)] = b.north;
-            new_x[IDX(nb, 0, i)] = b.north;
-        }
-    }
-    if (rank == numprocs - 1){
-        for (i = 1; i < n + 1; i++) {
-            x[IDX(nb, rows + 1, i)] = b.south;
-            new_x[IDX(nb, rows + 1, i)] = b.south;
-        }
-    }
-    /* LOG(printf("[Process %d/%d] matrix initialized\n", rank, numprocs)); */
-    /* Iterative refinement of x until values converge */
-    *iterations = 0;
-    do {
-        if (rank != numprocs - 1) {
-            // Send south border
-            MPI_Isend(&x[IDX(nb, rows, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &request_south);
-        }
-        if (rank != 0) {
-            // Send north border
-            MPI_Isend(&x[IDX(nb, 1, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &request_north);
-        }
-        max_diff = 0;
-        global_max_diff = 0;
-        for (i = 1; i <= rows; i++) {
-            for (j = 1; j <= n; j++) {
-                new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
-                max_diff = fmaxf(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
-                new_x[IDX(nb, i, j)] = new_value;
-            }
-        }
-        tmp_x = new_x;
-        new_x = x;
-        x = tmp_x;
-        if (rank != numprocs - 1) {
-            // Receive south border
-            MPI_Recv(&x[IDX(nb, rows + 1, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        }
-        if (rank != 0) {
-            // Receive north border
-            MPI_Recv(&x[IDX(nb, 0, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        }
-        LOG(printf("[Process %d/%d] max_diff: %f\n", rank, numprocs, max_diff));
-        MPI_Allreduce(&max_diff, &global_max_diff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
-        /* LOG(printf("[Process %d/%d] global_max_diff: %f\n", rank, numprocs, global_max_diff)); */
-        (*iterations)++;
-    } while (global_max_diff > threshold);
-
-    if (rank == 0) {
-        complete_x = create_sa_matrix(nb, nb);
-        memcpy(complete_x, x, (rows + ((rank == numprocs - 1) ? 2 : 1)) * (nb) * sizeof(float));
-        rows_to_transmit = n / numprocs;
-        receive_pos = rows + 1;
-        for (i = 1; i < numprocs; i++) {
-            if (i == numprocs - 1) {
-                rows_to_transmit++;
-            }
-            MPI_Recv(&complete_x[IDX(nb, receive_pos, 0)], rows_to_transmit * (nb), MPI_FLOAT, i, TAG_MATRIX, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-            receive_pos += n / numprocs;
-        }
-    } else {
-        complete_x = NULL;
-        rows_to_transmit = rows;
-        if (rank == numprocs - 1) {
-            rows_to_transmit++;
-        }
-        MPI_Send(&x[IDX(nb, 1, 0)], rows_to_transmit * (nb), MPI_FLOAT, 0, TAG_MATRIX, MPI_COMM_WORLD);
-    }
-
-    return complete_x;
-}
diff --git a/src/main/main.cu b/src/main/main.cu
new file mode 100644
index 0000000..2c927a0
--- /dev/null
+++ b/src/main/main.cu
@@ -0,0 +1,52 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "../config.cuh"
+#include "../utils.cuh"
+
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations);
+
+__host__ int main(int argc, char* argv[]) {
+    int n;
+    float init_value, threshold;
+    borders b;
+    int config_loaded;
+    configuration config;
+    float *x;
+    int iterations;
+    struct timeval start, end;
+    long secs_used, micros_used;
+
+    config_loaded = load_config(&config);
+    if (config_loaded != 0) {
+        return 1;
+    }
+    n = config.n;
+    threshold = config.threshold;
+    init_value = config.init_value;
+    b.north = config.north;
+    b.south = config.south;
+    b.east = config.east;
+    b.west = config.west;
+
+    gettimeofday(&start, NULL);
+    x = compute_jacobi(n, init_value, threshold, b, &iterations);
+    gettimeofday(&end, NULL);
+
+    secs_used = (end.tv_sec - start.tv_sec);
+    micros_used = ((secs_used * 1000000) + end.tv_usec) - (start.tv_usec);
+    printf("Wall clock time: %fs\n", (float)micros_used / 1000000);
+    printf("Iterations: %d\n", iterations);
+    if (n < 10) {
+        print_sa_matrix(x, n + 2, n + 2);
+    }
+    destroy_sa_matrix(x);
+
+    return 0;
+}
diff --git a/src/utils.c b/src/utils.c
index f2115a7..1432e1e 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -54,3 +54,12 @@ void print_matrix(float **x, int rows, int cols) {
     }
     fflush(stdout);
 }
+
+
+float fmaxf(float a, float b) {
+    if (a > b) {
+        return a;
+    } else {
+        return b;
+    }
+}
diff --git a/src/utils.cu b/src/utils.cu
new file mode 100644
index 0000000..df4a82f
--- /dev/null
+++ b/src/utils.cu
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.cuh"
+
+__host__ float *create_sa_matrix(int rows, int cols) {
+    float *x;
+
+    x = (float *)malloc(rows * cols * sizeof(float));
+    return x;
+}
+
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status) {
+    float *x = NULL;
+
+    *cuda_status = cudaMalloc((void**)&x, rows * cols * sizeof(float));
+    return x;
+}
+
+__host__ void destroy_sa_matrix(float *x) {
+    free(x);
+}
+
+__host__ void destroy_sa_matrix_on_gpu(float *x) {
+    cudaFree(x);
+}
+
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+    *cuda_status = cudaMemset(x, 0, rows * cols * sizeof(float));
+}
+
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+    float *x_host;
+
+    x_host = create_sa_matrix(rows, cols);
+    *cuda_status = cudaMemcpy(x_host, x, rows * cols * sizeof(float), cudaMemcpyDeviceToHost);
+    return x_host;
+}
+
+__host__ void print_sa_matrix(float *x, int rows, int cols) {
+    int i, j;
+    for (i = 0; i < rows; i++) {
+        for (j = 0; j < cols; j++) {
+            printf("%f\t", x[IDX(cols, i, j)]);
+        }
+        printf("\n");
+    }
+    fflush(stdout);
+}
+
+__host__ float **create_matrix(int rows, int cols) {
+    int i;
+    float **x;
+
+    x = (float **)malloc(rows * sizeof(float *));
+    for (i = 0; i < rows; i++) {
+        x[i] = (float *)malloc(cols * sizeof(float));
+    }
+    return x;
+}
+
+__host__ void destroy_matrix(float **x, int rows) {
+    int i;
+
+    for (i = 0; i < rows; i++) {
+        free(x[i]);
+    }
+    free(x);
+}
+
+__host__ void print_matrix(float **x, int rows, int cols) {
+    int i, j;
+    for (i = 0; i < rows; i++) {
+        for (j = 0; j < cols; j++) {
+            printf("%f\t", x[i][j]);
+        }
+        printf("\n");
+    }
+    fflush(stdout);
+}
diff --git a/src/utils.cuh b/src/utils.cuh
new file mode 100644
index 0000000..b8baf59
--- /dev/null
+++ b/src/utils.cuh
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/* #define ENABLE_LOG */
+
+#ifdef ENABLE_LOG
+# define LOG(x) x
+#else
+# define LOG(x) (void) 0
+#endif
+
+/*
+* Macro used with single array matrices to
+* get the array index given the number of columns,
+* the row index and the column index.
+*/
+#define IDX(cols, r, c) ((r) * (cols) + (c))
+
+typedef struct borders {
+    float north;
+    float east;
+    float south;
+    float west;
+} borders;
+
+
+/*
+* Create a matrix stored in a single array.
+*/
+__host__ float *create_sa_matrix(int rows, int cols);
+
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status);
+
+/*
+* Destroy a single array matrix.
+*/
+__host__ void destroy_sa_matrix(float *x);
+
+__host__ void destroy_sa_matrix_on_gpu(float *x);
+
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+
+/*
+* Print a single array matrix.
+*/
+__host__ void print_sa_matrix(float *x, int rows, int cols);
+
+__host__ float **create_matrix(int rows, int cols);
+__host__ void destroy_matrix(float **x, int rows);
+__host__ void print_matrix(float **x, int rows, int cols);
\ No newline at end of file
diff --git a/src/utils.h b/src/utils.h
index ac3cbdb..b87a68a 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -44,3 +44,5 @@ void print_sa_matrix(float *x, int rows, int cols);
 float **create_matrix(int rows, int cols);
 void destroy_matrix(float **x, int rows);
 void print_matrix(float **x, int rows, int cols);
+
+float fmaxf(float a, float b);