Last commit

2017-01-22 01:02:17 +01:00
parent 7f60341812
commit a3be6d5298
58 changed files with 910 additions and 137 deletions
--- a/src/config.c
+++ b/src/config.c
@@ -11,6 +11,7 @@ typedef struct configuration {
  float threshold;
 } configuration;

+
 int load_config(configuration *config) {
  char property[100];
  char *value;
--- a/src/config.cu
+++ b/src/config.cu
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/*
+ * Run parameters filled in by load_config() / old_load_config().
+ * NOTE(review): config.cuh carries an identical definition of this struct;
+ * the two copies have to be kept in sync by hand.
+ */
+typedef struct configuration {
+	int n;            /* value of the N property */
+	float north;      /* value of the NORTH property */
+	float east;       /* value of the EAST property */
+	float south;      /* value of the SOUTH property */
+	float west;       /* value of the WEST property */
+	float init_value; /* value of the INIT_VALUE property */
+	float threshold;  /* value of the THRESHOLD property */
+} configuration;
+
+/*
+ * Fill *config with fixed built-in values instead of reading them from a
+ * file. Always returns 0.
+ */
+__host__ int old_load_config(configuration *config) {
+	/* Initialisers follow the declaration order of the struct fields */
+	const configuration builtin = {
+		5,      /* n */
+		0.0f,   /* north */
+		0.0f,   /* east */
+		300.0f, /* south */
+		0.0f,   /* west */
+		0.0f,   /* init_value */
+		1.0f    /* threshold */
+	};
+
+	*config = builtin;
+	return 0;
+}
+
+/*
+ * Load the run parameters from the file "jacobi.conf", opened relative to
+ * the current working directory.
+ *
+ * Each meaningful line has the form "<PROPERTY> <value>", the two parts
+ * being separated by a space. Empty lines and lines starting with '#' are
+ * skipped. Recognised properties are N, NORTH, EAST, SOUTH, WEST,
+ * INIT_VALUE and THRESHOLD; an unknown property is reported on stdout and
+ * ignored, a value that cannot be parsed is reported on stderr and ignored.
+ * Fields whose property does not appear in the file are left untouched.
+ *
+ * Returns 0 on success, 1 if the file cannot be opened or a line contains
+ * no space separator.
+ */
+__host__ int load_config(configuration *config) {
+	char property[100];
+	char *value;
+	FILE *fp;
+	int parsed;
+
+	fp = fopen("jacobi.conf", "r");
+	if (fp == NULL) {
+		perror("Error opening file jacobi.conf");
+		return 1;
+	}
+	while (fgets(property, sizeof(property), fp) != NULL) {
+		if (property[0] == '\n' || property[0] == '\r' || property[0] == '#') {
+			/* Skip empty lines and comments */
+			continue;
+		}
+		value = strchr(property, ' ');
+		if (value == NULL) {
+			/* A syntax problem sets no errno, so perror() would print a bogus reason */
+			fprintf(stderr, "Error reading file jacobi.conf: line without a value\n");
+			fclose(fp);
+			return 1;
+		}
+		/* Split the line in place: property name before the space, value after it */
+		*value = '\0';
+		value++;
+		/*
+		 * Cut the line terminator only if there is one: the last line of the
+		 * file may lack it, and blindly dropping the final character would
+		 * then eat a digit of the value.
+		 */
+		value[strcspn(value, "\r\n")] = '\0';
+
+		parsed = 1;
+		if (strcmp(property, "N") == 0) {
+			parsed = sscanf(value, "%d", &(config->n));
+		}
+		else if (strcmp(property, "NORTH") == 0) {
+			parsed = sscanf(value, "%f", &(config->north));
+		}
+		else if (strcmp(property, "EAST") == 0) {
+			parsed = sscanf(value, "%f", &(config->east));
+		}
+		else if (strcmp(property, "SOUTH") == 0) {
+			parsed = sscanf(value, "%f", &(config->south));
+		}
+		else if (strcmp(property, "WEST") == 0) {
+			parsed = sscanf(value, "%f", &(config->west));
+		}
+		else if (strcmp(property, "INIT_VALUE") == 0) {
+			parsed = sscanf(value, "%f", &(config->init_value));
+		}
+		else if (strcmp(property, "THRESHOLD") == 0) {
+			parsed = sscanf(value, "%f", &(config->threshold));
+		}
+		else {
+			printf("Unknown property %s\n", property);
+		}
+		if (parsed != 1) {
+			/* Keep going, but make the ignored value visible */
+			fprintf(stderr, "Invalid value \"%s\" for property %s\n", value, property);
+		}
+	}
+	fclose(fp);
+	return 0;
+}
--- a/src/config.cuh
+++ b/src/config.cuh
@@ -0,0 +1,14 @@
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/*
+ * Run parameters filled in by load_config().
+ * NOTE(review): config.cu carries an identical definition of this struct;
+ * the two copies have to be kept in sync by hand.
+ */
+typedef struct configuration {
+	int n;            /* value of the N property */
+	float north;      /* value of the NORTH property */
+	float east;       /* value of the EAST property */
+	float south;      /* value of the SOUTH property */
+	float west;       /* value of the WEST property */
+	float init_value; /* value of the INIT_VALUE property */
+	float threshold;  /* value of the THRESHOLD property */
+} configuration;
+
+/*
+ * Read the file "jacobi.conf" into *config.
+ * Returns 0 on success and 1 on failure.
+ */
+__host__ int load_config(configuration *config);
--- a/src/impl/cuda.cu
+++ b/src/impl/cuda.cu
@@ -0,0 +1,118 @@
+/*
+* CUDA version.
+*/
+
+#include <stdio.h>
+#include <math.h>
+#include "../config.cuh"
+#include "../utils.cuh"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/* Threads per block used when launching the iterate kernel. */
+#define THREADS_BLOCK 256
+
+/*
+ * Terminate the program if a CUDA call did not succeed.
+ *
+ * cuda_status: status code to examine.
+ * msg:         text printed in front of the CUDA error description.
+ *
+ * When cuda_status differs from cudaSuccess the line
+ * "<msg>: <CUDA error string> (error code: <number>)" is written to stderr
+ * and the process exits with EXIT_FAILURE. Otherwise nothing happens.
+ */
+__host__ void check_status(cudaError_t cuda_status, char *msg) {
+	if (cuda_status != cudaSuccess) {
+		/*
+		 * Both texts are passed as arguments, never as the format string,
+		 * so a '%' inside them cannot be interpreted as a conversion.
+		 */
+		fprintf(stderr, "%s: %s (error code: %d)\n", msg, cudaGetErrorString(cuda_status), (int)cuda_status);
+		exit(EXIT_FAILURE);
+	}
+}
+
+/*
+ * Kernel that fills an (n + 2) x (n + 2) single array matrix: the outer
+ * frame receives the border values of b, the n x n interior receives
+ * init_value.
+ *
+ * The body never looks at threadIdx/blockIdx, so every launched thread
+ * performs the complete fill; compute_jacobi launches it as <<<1, 1>>>.
+ * cuda_status is accepted but neither read nor written.
+ */
+__global__ void initialize_matrix_on_gpu(float *x, int n, float init_value, borders b, cudaError_t *cuda_status) {
+	const int side = n + 2;
+	const int last = n + 1;
+	int k, row, col;
+
+	/* Interior cells (disjoint from the frame written below) */
+	for (row = 1; row < last; row++) {
+		for (col = 1; col < last; col++) {
+			x[IDX(side, row, col)] = init_value;
+		}
+	}
+
+	/*
+	 * Frame. The four stores overlap in the corners, so their order within
+	 * an iteration decides which border value each corner ends up with.
+	 */
+	k = 0;
+	while (k < side) {
+		x[IDX(side, 0, k)] = b.north;
+		x[IDX(side, last, k)] = b.south;
+		x[IDX(side, k, 0)] = b.west;
+		x[IDX(side, k, last)] = b.east;
+		k++;
+	}
+}
+
+/*
+ * One Jacobi sweep: every interior cell of new_x is set to the average of
+ * the four neighbours (north, east, south, west) of the same cell in x.
+ * Border cells of new_x are not written.
+ *
+ * n:     side of the interior; both matrices are (n + 2) x (n + 2) single
+ *        array matrices.
+ * x:     matrix that is read.
+ * new_x: matrix that is written.
+ *
+ * Launch layout: one-dimensional grid of one-dimensional blocks. The
+ * grid-stride loop walks all (n + 2) * (n + 2) cells whatever the number of
+ * launched threads is, so a grid that is smaller than the matrix still
+ * updates every interior cell.
+ */
+__global__ void iterate(int n, float *x, float *new_x) {
+	const int nb = n + 2;
+	const int total = nb * nb;
+	const int stride = gridDim.x * blockDim.x;
+	int idx, i, j;
+
+	for (idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
+		i = idx / nb;
+		j = idx % nb;
+		if (i >= 1 && i <= n && j >= 1 && j <= n) {
+			/* 0.25f keeps the arithmetic in single precision */
+			new_x[idx] = 0.25f * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
+		}
+	}
+}
+
+/*
+ * Run the Jacobi iteration on GPU 0 until it converges.
+ *
+ * n:          side of the interior; the working matrix is (n + 2) x (n + 2)
+ *             because it includes the border.
+ * init_value: initial value of every interior cell.
+ * threshold:  sweeps stop once the largest absolute change of an interior
+ *             cell during a sweep is no longer greater than this value.
+ * b:          values stored in the border cells.
+ * iterations: output, number of sweeps performed.
+ *
+ * Returns a host single array matrix of (n + 2) x (n + 2) floats with the
+ * final state; the caller releases it with destroy_sa_matrix(). Any failure
+ * terminates the program.
+ */
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations) {
+	float *x, *new_x;
+	float *x_gpu, *new_x_gpu;
+	float *tmp_x;
+	float max_diff;
+	int i, j;
+	int nb = n + 2; // n plus the border
+	size_t matrix_bytes = (size_t)nb * (size_t)nb * sizeof(float);
+	int blocks_number;
+	int threads_block = THREADS_BLOCK;
+	cudaError_t cuda_status;
+
+	// Select the GPU
+	check_status(cudaSetDevice(0), "cudaSetDevice failed!");
+
+	/* Create the matrixes on the GPU */
+	x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+	new_x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
+	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
+
+	/*
+	 * Initialize the matrixes. A kernel launch returns no status and the
+	 * kernel never touches its last argument, so the launch result has to
+	 * be fetched with cudaGetLastError().
+	 */
+	initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, (cudaError_t *)NULL);
+	check_status(cudaGetLastError(), "initialize_matrix_on_gpu failed!");
+	initialize_matrix_on_gpu<<<1, 1>>>(new_x_gpu, n, init_value, b, (cudaError_t *)NULL);
+	check_status(cudaGetLastError(), "initialize_matrix_on_gpu failed!");
+
+	/* Host copy of the initial state plus one reusable buffer for the next state */
+	x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
+	check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
+	new_x = create_sa_matrix(nb, nb);
+	if (new_x == NULL) {
+		fprintf(stderr, "create_sa_matrix failed!\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* One thread per matrix cell (nb * nb of them), rounded up to whole blocks */
+	blocks_number = (nb * nb + threads_block - 1) / threads_block;
+
+	/* Iterative refinement of x until values converge */
+	*iterations = 0;
+	do {
+		iterate<<<blocks_number, threads_block>>>(n, x_gpu, new_x_gpu);
+		check_status(cudaGetLastError(), "iterate failed!");
+		/*
+		 * Copy into the buffer allocated once above instead of allocating a
+		 * fresh host matrix on every sweep. The blocking copy also waits
+		 * for the kernel issued just before it.
+		 */
+		check_status(cudaMemcpy(new_x, new_x_gpu, matrix_bytes, cudaMemcpyDeviceToHost), "retrieve_sa_matrix_from_gpu failed!");
+		max_diff = 0;
+		for (i = 1; i <= n; i++) {
+			for (j = 1; j <= n; j++) {
+				max_diff = fmaxf(max_diff, fabsf(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
+			}
+		}
+
+		/* Swap roles on the host and on the device alike */
+		tmp_x = new_x;
+		new_x = x;
+		x = tmp_x;
+
+		tmp_x = new_x_gpu;
+		new_x_gpu = x_gpu;
+		x_gpu = tmp_x;
+
+		(*iterations)++;
+	} while (max_diff > threshold);
+
+	/* After the swap x already holds the newest state: no further copy is needed */
+	destroy_sa_matrix(new_x);
+	destroy_sa_matrix_on_gpu(x_gpu);
+	destroy_sa_matrix_on_gpu(new_x_gpu);
+
+	return x;
+}
--- a/src/impl/mpi_line.c
+++ b/src/impl/mpi_line.c
--- a/src/impl/mpi_line_async.c
+++ b/src/impl/mpi_line_async.c
@@ -1,123 +0,0 @@
-/*
- * MPI version with the matrix subdivided by "lines".
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <mpi.h>
-#include "../config.h"
-#include "../utils.h"
-
-#define TAG_BORDER 0
-#define TAG_MATRIX 1
-
-float *compute_jacobi(int rank, int numprocs, int n, float init_value, float threshold, borders b, int *iterations) {
-  float *complete_x;
-  float *x;
-  float *new_x;
-  float *tmp_x;
-  float max_diff, global_max_diff, new_value;
-  int i, j;
-  int nb = n + 2; // n plus the border
-  int rows, rows_to_transmit;
-  int receive_pos;
-  MPI_Request request_north;
-  MPI_Request request_south;
-
-  if (rank == 0) {
-    rows = n - (n / numprocs) * (numprocs - 1);
-  } else {
-    rows = n / numprocs;
-  }
-  LOG(printf("[Process %d/%d] rows: %d\n", rank, numprocs, rows));
-  /* LOG(printf("[Process %d/%d] initializing matrix\n", rank, numprocs)); */
-  /* Initialize the matrix */
-  x = create_sa_matrix(rows + 2, nb);
-  new_x = create_sa_matrix(rows + 2, nb);
-  for (i = 0; i < rows + 2; i++) {
-    for (j = 1; j <= n; j++) {
-      x[IDX(nb, i, j)] = init_value;
-      new_x[IDX(nb, i, j)] = init_value;
-    }
-  }
-  /* Initialize boundary regions */
-  for (i = 0; i < rows + 2; i++) {
-    x[IDX(nb, i, 0)] = b.west;
-    x[IDX(nb, i, n + 1)] = b.east;
-    new_x[IDX(nb, i, 0)] = b.west;
-    new_x[IDX(nb, i, n + 1)] = b.east;
-  }
-  if (rank == 0) {
-    for (i = 1; i <= n + 1; i++) {
-      x[IDX(nb, 0, i)] = b.north;
-      new_x[IDX(nb, 0, i)] = b.north;
-    }
-  }
-  if (rank == numprocs - 1){
-    for (i = 1; i < n + 1; i++) {
-      x[IDX(nb, rows + 1, i)] = b.south;
-      new_x[IDX(nb, rows + 1, i)] = b.south;
-    }
-  }
-  /* LOG(printf("[Process %d/%d] matrix initialized\n", rank, numprocs)); */
-  /* Iterative refinement of x until values converge */
-  *iterations = 0;
-  do {
-    if (rank != numprocs - 1) {
-      // Send south border
-      MPI_Isend(&x[IDX(nb, rows, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &request_south);
-    }
-    if (rank != 0) {
-      // Send north border
-      MPI_Isend(&x[IDX(nb, 1, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &request_north);
-    }
-    max_diff = 0;
-    global_max_diff = 0;
-    for (i = 1; i <= rows; i++) {
-      for (j = 1; j <= n; j++) {
-        new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
-        max_diff = fmaxf(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
-        new_x[IDX(nb, i, j)] = new_value;
-      }
-    }
-    tmp_x = new_x;
-    new_x = x;
-    x = tmp_x;
-    if (rank != numprocs - 1) {
-      // Receive south border
-      MPI_Recv(&x[IDX(nb, rows + 1, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-    }
-    if (rank != 0) {
-      // Receive north border
-      MPI_Recv(&x[IDX(nb, 0, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-    }
-    LOG(printf("[Process %d/%d] max_diff: %f\n", rank, numprocs, max_diff));
-    MPI_Allreduce(&max_diff, &global_max_diff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
-    /* LOG(printf("[Process %d/%d] global_max_diff: %f\n", rank, numprocs, global_max_diff)); */
-    (*iterations)++;
-  } while (global_max_diff > threshold);
-
-  if (rank == 0) {
-    complete_x = create_sa_matrix(nb, nb);
-    memcpy(complete_x, x, (rows + ((rank == numprocs - 1) ? 2 : 1)) * (nb) * sizeof(float));
-    rows_to_transmit = n / numprocs;
-    receive_pos = rows + 1;
-    for (i = 1; i < numprocs; i++) {
-      if (i == numprocs - 1) {
-        rows_to_transmit++;
-      }
-      MPI_Recv(&complete_x[IDX(nb, receive_pos, 0)], rows_to_transmit * (nb), MPI_FLOAT, i, TAG_MATRIX, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-      receive_pos += n / numprocs;
-    }
-  } else {
-    complete_x = NULL;
-    rows_to_transmit = rows;
-    if (rank == numprocs - 1) {
-      rows_to_transmit++;
-    }
-    MPI_Send(&x[IDX(nb, 1, 0)], rows_to_transmit * (nb), MPI_FLOAT, 0, TAG_MATRIX, MPI_COMM_WORLD);
-  }
-
-  return complete_x;
-}
--- a/src/main/main.cu
+++ b/src/main/main.cu
@@ -0,0 +1,52 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+
+#include "../config.cuh"
+#include "../utils.cuh"
+
+/*
+ * Jacobi solver defined in impl/cuda.cu. Returns the final matrix as a host
+ * single array matrix and stores the number of sweeps in *iterations.
+ */
+__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations);
+
+/*
+ * Program entry point: load the configuration, run the Jacobi solver while
+ * measuring the wall clock time, then print the elapsed time, the number of
+ * iterations and, when n is smaller than 10, the resulting matrix.
+ *
+ * Returns 1 if the configuration cannot be loaded, 0 otherwise.
+ */
+__host__ int main(int argc, char* argv[]) {
+	int n;
+	float init_value, threshold;
+	borders b;
+	int config_loaded;
+	configuration config;
+	float *x;
+	int iterations;
+	struct timeval start, end;
+	long secs_used, micros_used;
+
+	config_loaded = load_config(&config);
+	if (config_loaded != 0) {
+		return 1;
+	}
+	n = config.n;
+	threshold = config.threshold;
+	init_value = config.init_value;
+	b.north = config.north;
+	b.south = config.south;
+	b.east = config.east;
+	b.west = config.west;
+
+	gettimeofday(&start, NULL);
+	x = compute_jacobi(n, init_value, threshold, b, &iterations);
+	gettimeofday(&end, NULL);
+
+	secs_used = (end.tv_sec - start.tv_sec);
+	micros_used = ((secs_used * 1000000) + end.tv_usec) - (start.tv_usec);
+	/* double keeps microsecond resolution for long runs; float does not */
+	printf("Wall clock time: %fs\n", (double)micros_used / 1000000.0);
+	/* The format needs a conversion, otherwise the count is never printed */
+	printf("Iterations: %d\n", iterations);
+	if (n < 10) {
+		print_sa_matrix(x, n + 2, n + 2);
+	}
+	destroy_sa_matrix(x);
+
+	return 0;
+}
--- a/src/utils.c
+++ b/src/utils.c
@@ -54,3 +54,12 @@ void print_matrix(float **x, int rows, int cols) {
  }
  fflush(stdout);
 }
+
+
+/*
+ * Return a when a > b holds and b in every other case (equal values or a
+ * comparison that involves a NaN).
+ */
+float fmaxf(float a, float b) {
+  return (a > b) ? a : b;
+}
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -0,0 +1,79 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.cuh"
+
+/*
+ * Allocate a rows x cols matrix of floats on the host as one contiguous
+ * array. The contents are not initialised.
+ *
+ * Returns the array, or NULL when the allocation fails. Release it with
+ * destroy_sa_matrix().
+ */
+__host__ float *create_sa_matrix(int rows, int cols) {
+	/* Widen before multiplying so that rows * cols cannot overflow int */
+	size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+
+	return (float *)malloc(bytes);
+}
+
+/*
+ * Allocate a rows x cols matrix of floats on the device with cudaMalloc.
+ *
+ * cuda_status: output, status returned by cudaMalloc.
+ *
+ * Returns the device pointer; it is still NULL when cudaMalloc did not
+ * store an address. Release it with destroy_sa_matrix_on_gpu().
+ */
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status) {
+	float *x = NULL;
+	/* Widen before multiplying so that rows * cols cannot overflow int */
+	size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+
+	*cuda_status = cudaMalloc((void**)&x, bytes);
+	return x;
+}
+
+/*
+ * Release a host single array matrix by handing x to free().
+ */
+__host__ void destroy_sa_matrix(float *x) {
+	free(x);
+}
+
+/*
+ * Release a device single array matrix by handing x to cudaFree().
+ * The status returned by cudaFree() is discarded.
+ */
+__host__ void destroy_sa_matrix_on_gpu(float *x) {
+	cudaFree(x);
+}
+
+/*
+ * Set every byte of a rows x cols device matrix of floats to zero with
+ * cudaMemset.
+ *
+ * cuda_status: output, status returned by cudaMemset.
+ */
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+	/* Widen before multiplying so that rows * cols cannot overflow int */
+	size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+
+	*cuda_status = cudaMemset(x, 0, bytes);
+}
+
+/*
+ * Copy a rows x cols device matrix of floats into a newly allocated host
+ * matrix.
+ *
+ * x:           device matrix to read.
+ * cuda_status: output; status of the copy, or cudaErrorMemoryAllocation
+ *              when the host buffer cannot be allocated.
+ *
+ * Returns the host matrix, owned by the caller and released with
+ * destroy_sa_matrix(). Returns NULL, with nothing left to release, whenever
+ * *cuda_status is not cudaSuccess.
+ */
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status) {
+	float *x_host;
+	/* Widen before multiplying so that rows * cols cannot overflow int */
+	size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+
+	x_host = create_sa_matrix(rows, cols);
+	if (x_host == NULL && bytes != 0) {
+		/* Never hand a NULL destination to cudaMemcpy */
+		*cuda_status = cudaErrorMemoryAllocation;
+		return NULL;
+	}
+	*cuda_status = cudaMemcpy(x_host, x, bytes, cudaMemcpyDeviceToHost);
+	if (*cuda_status != cudaSuccess) {
+		/* Do not leak the buffer, and do not return unspecified contents */
+		destroy_sa_matrix(x_host);
+		return NULL;
+	}
+	return x_host;
+}
+
+/*
+ * Write a rows x cols single array matrix to stdout: every value is
+ * followed by a tab, every row ends with a newline, and stdout is flushed
+ * at the end.
+ */
+__host__ void print_sa_matrix(float *x, int rows, int cols) {
+	int r, c;
+	int pos = 0; /* always equal to IDX(cols, r, c) of the cell being printed */
+
+	for (r = 0; r < rows; r++) {
+		for (c = 0; c < cols; c++) {
+			printf("%f\t", x[pos]);
+			pos++;
+		}
+		printf("\n");
+	}
+	fflush(stdout);
+}
+
+/*
+ * Allocate a rows x cols matrix of floats on the host as an array of row
+ * pointers, each row being a separate allocation. The contents are not
+ * initialised.
+ *
+ * Returns the matrix, or NULL when an allocation fails (rows that were
+ * already allocated are released first). Release it with destroy_matrix().
+ */
+__host__ float **create_matrix(int rows, int cols) {
+	int i;
+	float **x;
+
+	/* The outer array stores pointers: size it with sizeof(float *), not sizeof(float) */
+	x = (float **)malloc((size_t)rows * sizeof(float *));
+	if (x == NULL) {
+		return NULL;
+	}
+	for (i = 0; i < rows; i++) {
+		x[i] = (float *)malloc((size_t)cols * sizeof(float));
+		if (x[i] == NULL && cols > 0) {
+			/* Undo the partial construction */
+			while (i > 0) {
+				i--;
+				free(x[i]);
+			}
+			free(x);
+			return NULL;
+		}
+	}
+	return x;
+}
+
+/*
+ * Release a matrix made of an array of row pointers: each of the rows
+ * row allocations is freed, then the pointer array itself.
+ */
+__host__ void destroy_matrix(float **x, int rows) {
+	int remaining = rows;
+
+	/* Rows are released from the last one down to the first */
+	while (remaining > 0) {
+		remaining--;
+		free(x[remaining]);
+	}
+	free(x);
+}
+
+/*
+ * Write a rows x cols matrix made of row pointers to stdout: every value
+ * is followed by a tab, every row ends with a newline, and stdout is
+ * flushed at the end.
+ */
+__host__ void print_matrix(float **x, int rows, int cols) {
+	int r = 0;
+	int c;
+
+	while (r < rows) {
+		c = 0;
+		while (c < cols) {
+			printf("%f\t", x[r][c]);
+			c++;
+		}
+		printf("\n");
+		r++;
+	}
+	fflush(stdout);
+}
--- a/src/utils.cuh
+++ b/src/utils.cuh
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+/*
+* Debug logging switch. With ENABLE_LOG defined, LOG(x) expands to x
+* itself; without it, LOG(x) expands to the no-op expression (void) 0.
+* Uncomment the define below to turn logging on.
+*/
+/* #define ENABLE_LOG */
+
+#ifdef ENABLE_LOG
+#  define LOG(x) x
+#else
+#  define LOG(x) (void) 0
+#endif
+
+/*
+* Macro used with single array matrices to
+* get the array index given the number of columns,
+* the row index and the column index.
+* Rows are stored one after the other: index = r * cols + c.
+* Every argument is parenthesised in the expansion, so expressions
+* such as IDX(nb, i - 1, j) expand as intended.
+*/
+#define IDX(cols, r, c) ((r) * (cols) + (c))
+
+/*
+* Values stored in the four border lines of the matrix.
+*/
+typedef struct borders {
+	float north; /* written to row 0 */
+	float east;  /* written to the last column */
+	float south; /* written to the last row */
+	float west;  /* written to column 0 */
+} borders;
+
+
+/*
+* Create a matrix stored in a single array.
+* The memory comes from malloc() and is not initialised.
+*/
+__host__ float *create_sa_matrix(int rows, int cols);
+
+/*
+* Create a single array matrix on the device with cudaMalloc().
+* The status of the allocation is stored in *cuda_status.
+*/
+__host__ float *create_sa_matrix_on_gpu(int rows, int cols, cudaError_t *cuda_status);
+
+/*
+* Destroy a single array matrix.
+*/
+__host__ void destroy_sa_matrix(float *x);
+
+/*
+* Destroy a single array matrix that lives on the device.
+*/
+__host__ void destroy_sa_matrix_on_gpu(float *x);
+
+/*
+* Zero-fill a single array matrix on the device with cudaMemset().
+* The status of the call is stored in *cuda_status.
+*/
+__host__ void initialize_matrix_on_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+/*
+* Copy a device single array matrix into a newly allocated host matrix
+* and return the host matrix. The status of the copy is stored in
+* *cuda_status.
+*/
+__host__ float *retrieve_sa_matrix_from_gpu(float *x, int rows, int cols, cudaError_t *cuda_status);
+
+
+/*
+* Print a single array matrix.
+*/
+__host__ void print_sa_matrix(float *x, int rows, int cols);
+
+/*
+* Same operations for matrices stored as an array of row pointers:
+* create, destroy and print.
+*/
+__host__ float **create_matrix(int rows, int cols);
+__host__ void destroy_matrix(float **x, int rows);
+__host__ void print_matrix(float **x, int rows, int cols);
--- a/src/utils.h
+++ b/src/utils.h
@@ -44,3 +44,5 @@ void print_sa_matrix(float *x, int rows, int cols);
 float **create_matrix(int rows, int cols);
 void destroy_matrix(float **x, int rows);
 void print_matrix(float **x, int rows, int cols);
+
+/*
+ * Return the larger of a and b (defined in utils.c).
+ * NOTE(review): same name and signature as the C99 <math.h> function.
+ */
+float fmaxf(float a, float b);