JacobiHPC/src/impl/cuda.cu

/*
* CUDA version.
*/

#include <stdio.h>
#include <math.h>
#include "../config.cuh"
#include "../utils.cuh"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define THREADS_BLOCK 256;

__host__ void check_status(cudaError_t cuda_status, char *msg) {
	if (cuda_status != cudaSuccess) {
		fprintf(stderr, msg);
		fprintf(stderr, ": ");
		fprintf(stderr, cudaGetErrorString(cuda_status));
		fprintf(stderr, " (error code: %d)\n", cuda_status);
		exit(EXIT_FAILURE);
	}
}

__global__ void initialize_matrix_on_gpu(float *x, int n, float init_value, borders b, cudaError_t *cuda_status) {
	int i, j;
	int nb = n + 2;

	/* Initialize borders */
	for (i = 0; i < nb; i++) {
		x[IDX(nb, 0, i)] = b.north;
		x[IDX(nb, n + 1, i)] = b.south;
		x[IDX(nb, i, 0)] = b.west;
		x[IDX(nb, i, n + 1)] = b.east;
	}
	/* Initialize the rest of the matrix */
	for (i = 1; i <= n; i++) {
		for (j = 1; j <= n; j++) {
			x[IDX(nb, i, j)] = init_value;
		}
	}
}

__global__ void iterate(int n, float *x, float *new_x) {
	int idx, nb;
	int i, j;

	nb = n + 2;
	idx = blockDim.x * blockIdx.x + threadIdx.x;
	i = idx / nb;
	j = idx % nb;
	if (i >= 1 && i <= n && j >= 1 && j <= n) {
		new_x[idx] = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
	}
}

__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations) {
	float *x, *new_x;
	float *x_gpu, *new_x_gpu;
	float *tmp_x;
	float max_diff;
	int i, j;
	int nb = n + 2; // n plus the border
	int blocks_number;
	int threads_block = THREADS_BLOCK;
	cudaError_t cuda_status;

	// Select the GPU
	check_status(cudaSetDevice(0), "cudaSetDevice failed!");

	/* Create the matrixes on the GPU */
	x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
	new_x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
	check_status(cuda_status, "create_sa_matrix_on_gpu failed!");

	/* Initialize the matrixes */
	initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, &cuda_status);
	check_status(cuda_status, "initialize_matrix_on_gpu failed!");
	initialize_matrix_on_gpu<<<1, 1>>>(new_x_gpu, n, init_value, b, &cuda_status);
	check_status(cuda_status, "initialize_matrix_on_gpu failed!");

	/* Iterative refinement of x until values converge */
	x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
	check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");

	blocks_number = nb / threads_block + 1;
	*iterations = 0;
	do {
		iterate<<<blocks_number, threads_block>>>(n, x_gpu, new_x_gpu);
		new_x = retrieve_sa_matrix_from_gpu(new_x_gpu, nb, nb, &cuda_status);
		check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");
		max_diff = 0;
		for (i = 1; i <= n; i++) {
			for (j = 1; j <= n; j++) {
				max_diff = fmaxf(max_diff, fabs(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
			}
		}

		tmp_x = new_x;
		new_x = x;
		x = tmp_x;

		tmp_x = new_x_gpu;
		new_x_gpu = x_gpu;
		x_gpu = tmp_x;

		(*iterations)++;
	} while (max_diff > threshold);

	x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
	check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");

	destroy_sa_matrix_on_gpu(x_gpu);
	destroy_sa_matrix_on_gpu(new_x_gpu);

	return x;
}