Last commit
This commit is contained in:
118
src/impl/cuda.cu
Normal file
118
src/impl/cuda.cu
Normal file
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* CUDA version.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "../config.cuh"
|
||||
#include "../utils.cuh"
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
/* Threads per block for 1-D kernel launches.
 * No trailing semicolon: an object-like macro must expand to a bare token so it
 * can be used inside expressions (e.g. `(n + THREADS_BLOCK - 1) / THREADS_BLOCK`);
 * the previous `256;` would inject a stray statement terminator at every use. */
#define THREADS_BLOCK 256
|
||||
|
||||
/*
 * Aborts the program when a CUDA API call has failed.
 *
 * cuda_status: status code returned by a CUDA runtime call.
 * msg:         caller-supplied context string printed before the CUDA error text.
 *
 * No-op when cuda_status == cudaSuccess; otherwise prints
 * "<msg>: <error string> (error code: <n>)" to stderr and exits.
 */
__host__ void check_status(cudaError_t cuda_status, char *msg) {
    if (cuda_status != cudaSuccess) {
        /* Never pass caller-supplied text as a printf FORMAT string: a '%' in
         * msg (or in the CUDA error string) would trigger undefined behavior.
         * Print both through "%s" instead. */
        fprintf(stderr, "%s: %s (error code: %d)\n",
                msg, cudaGetErrorString(cuda_status), cuda_status);
        exit(EXIT_FAILURE);
    }
}
|
||||
|
||||
/*
 * Fills an (n+2) x (n+2) device matrix: the one-cell frame gets the four
 * border values from `b`, the n x n interior gets `init_value`.
 *
 * x:           device pointer to the matrix, row-major, indexed via IDX.
 * n:           interior size (matrix is (n+2) x (n+2) including the border).
 * init_value:  value written to every interior cell.
 * b:           border values (north/south rows, west/east columns).
 * cuda_status: UNUSED by this kernel. NOTE(review): callers pass the address
 *              of a host variable here; it must never be dereferenced on the
 *              device (host pointer). Consider removing the parameter.
 *
 * NOTE(review): this kernel is sequential (plain loops, no threadIdx use) and
 * is launched <<<1, 1>>> by compute_jacobi — correct, but single-threaded.
 */
__global__ void initialize_matrix_on_gpu(float *x, int n, float init_value, borders b, cudaError_t *cuda_status) {
    int i, j;
    int nb = n + 2;  /* matrix side including the border frame */

    /* Initialize borders: i sweeps a full side, so the frame rows/columns are
     * written corner to corner (corners end up with the west/east values,
     * which are written last). */
    for (i = 0; i < nb; i++) {
        x[IDX(nb, 0, i)] = b.north;
        x[IDX(nb, n + 1, i)] = b.south;
        x[IDX(nb, i, 0)] = b.west;
        x[IDX(nb, i, n + 1)] = b.east;
    }
    /* Initialize the rest of the matrix (the n x n interior) */
    for (i = 1; i <= n; i++) {
        for (j = 1; j <= n; j++) {
            x[IDX(nb, i, j)] = init_value;
        }
    }
}
|
||||
|
||||
/*
 * One Jacobi sweep: each thread updates one cell of an (n+2) x (n+2) matrix
 * with the average of its four neighbours.
 *
 * Expected launch: 1-D grid with at least (n+2)*(n+2) total threads; the
 * flat thread index is decomposed into (row, column) of the bordered matrix.
 * Threads mapped to border cells (or past the matrix) do nothing, so the
 * boundary values are preserved.
 *
 * x:     device pointer, matrix read from (previous iteration).
 * new_x: device pointer, matrix written to (next iteration).
 */
__global__ void iterate(int n, float *x, float *new_x) {
    int nb = n + 2;  /* matrix side including the border frame */
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = idx / nb;
    int j = idx % nb;

    /* Only interior cells are updated; the guard also bounds-checks threads
     * beyond the last cell. */
    if (i >= 1 && i <= n && j >= 1 && j <= n) {
        /* 0.25f keeps the arithmetic in single precision; the previous bare
         * 0.25 promoted every operation to double. */
        new_x[IDX(nb, i, j)] = 0.25f * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)]
                                      + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
    }
}
|
||||
|
||||
/*
 * Iterative Jacobi solver on the GPU.
 *
 * n:          interior matrix size (the working matrix is (n+2) x (n+2)).
 * init_value: initial value for interior cells.
 * threshold:  convergence threshold on the max absolute per-cell change.
 * b:          fixed border values.
 * iterations: out-parameter, receives the number of sweeps performed.
 *
 * Returns a heap-allocated host copy of the converged matrix (caller frees).
 */
__host__ float *compute_jacobi(int n, float init_value, float threshold, borders b, int *iterations) {
    float *x, *new_x;
    float *x_gpu, *new_x_gpu;
    float *tmp_x;
    float max_diff;
    int i, j;
    int nb = n + 2; // n plus the border
    int blocks_number;
    int threads_block = THREADS_BLOCK;
    cudaError_t cuda_status;

    // Select the GPU
    check_status(cudaSetDevice(0), "cudaSetDevice failed!");

    /* Create the matrices on the GPU */
    x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
    check_status(cuda_status, "create_sa_matrix_on_gpu failed!");
    new_x_gpu = create_sa_matrix_on_gpu(nb, nb, &cuda_status);
    check_status(cuda_status, "create_sa_matrix_on_gpu failed!");

    /* Initialize the matrices. A kernel launch never writes cuda_status (the
     * kernel ignores that parameter), so the previous code checked an
     * UNINITIALIZED variable — query the launch status explicitly instead. */
    initialize_matrix_on_gpu<<<1, 1>>>(x_gpu, n, init_value, b, &cuda_status);
    check_status(cudaGetLastError(), "initialize_matrix_on_gpu failed!");
    initialize_matrix_on_gpu<<<1, 1>>>(new_x_gpu, n, init_value, b, &cuda_status);
    check_status(cudaGetLastError(), "initialize_matrix_on_gpu failed!");

    /* Iterative refinement of x until values converge */
    x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
    check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");

    /* iterate() uses one thread per CELL of the bordered nb x nb matrix, so
     * the grid must cover nb * nb threads. The previous `nb / threads_block + 1`
     * covered only about one row's worth of cells, leaving most of the matrix
     * untouched. Ceil-divide over the full cell count instead. */
    blocks_number = (nb * nb + threads_block - 1) / threads_block;
    *iterations = 0;
    new_x = NULL;  /* no host copy retrieved yet */
    do {
        iterate<<<blocks_number, threads_block>>>(n, x_gpu, new_x_gpu);
        check_status(cudaGetLastError(), "iterate launch failed!");

        /* Free the host copy left over from the previous iteration before
         * retrieving a fresh one — previously every iteration leaked one
         * nb x nb host buffer. (Assumes retrieve_sa_matrix_from_gpu malloc()s
         * the host buffer, matching create/destroy naming — TODO confirm.) */
        if (new_x != NULL) {
            free(new_x);
        }
        new_x = retrieve_sa_matrix_from_gpu(new_x_gpu, nb, nb, &cuda_status);
        check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");

        /* Convergence test on the host: largest absolute change of any
         * interior cell between the two latest sweeps. */
        max_diff = 0;
        for (i = 1; i <= n; i++) {
            for (j = 1; j <= n; j++) {
                max_diff = fmaxf(max_diff, fabsf(new_x[IDX(nb, i, j)] - x[IDX(nb, i, j)]));
            }
        }

        /* Swap the host copies and the device buffers so the next sweep reads
         * the freshest values. */
        tmp_x = new_x;
        new_x = x;
        x = tmp_x;

        tmp_x = new_x_gpu;
        new_x_gpu = x_gpu;
        x_gpu = tmp_x;

        (*iterations)++;
    } while (max_diff > threshold);

    /* Drop the stale host copies, then retrieve the converged matrix. */
    free(x);
    if (new_x != NULL) {
        free(new_x);
    }
    x = retrieve_sa_matrix_from_gpu(x_gpu, nb, nb, &cuda_status);
    check_status(cuda_status, "retrieve_sa_matrix_from_gpu failed!");

    destroy_sa_matrix_on_gpu(x_gpu);
    destroy_sa_matrix_on_gpu(new_x_gpu);

    return x;
}
|
||||
@@ -1,123 +0,0 @@
|
||||
/*
|
||||
* MPI version with the matrix subdivided by "lines".
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <mpi.h>
|
||||
#include "../config.h"
|
||||
#include "../utils.h"
|
||||
|
||||
#define TAG_BORDER 0
|
||||
#define TAG_MATRIX 1
|
||||
|
||||
/*
 * MPI Jacobi solver: the (n+2) x (n+2) matrix is split by rows across
 * `numprocs` ranks; each rank iterates on its strip plus two ghost rows,
 * exchanging border rows with its neighbours every sweep, until the global
 * maximum per-cell change drops to `threshold`.
 *
 * rank/numprocs: this process's rank and the communicator size.
 * n:             interior matrix size.
 * init_value:    initial value for interior cells.
 * threshold:     convergence threshold on the global max absolute change.
 * b:             fixed border values (north/south/west/east).
 * iterations:    out-parameter, receives the number of sweeps performed.
 *
 * Returns the assembled full matrix on rank 0; NULL on every other rank.
 */
float *compute_jacobi(int rank, int numprocs, int n, float init_value, float threshold, borders b, int *iterations) {
    float *complete_x;
    float *x;
    float *new_x;
    float *tmp_x;
    float max_diff, global_max_diff, new_value;
    int i, j;
    int nb = n + 2; // n plus the border
    int rows, rows_to_transmit;
    int receive_pos;
    MPI_Request request_north;
    MPI_Request request_south;

    /* Row partition: every rank gets n/numprocs rows; rank 0 absorbs the
     * remainder when numprocs does not divide n. */
    if (rank == 0) {
        rows = n - (n / numprocs) * (numprocs - 1);
    } else {
        rows = n / numprocs;
    }
    LOG(printf("[Process %d/%d] rows: %d\n", rank, numprocs, rows));
    /* LOG(printf("[Process %d/%d] initializing matrix\n", rank, numprocs)); */
    /* Initialize the matrix: local strip of `rows` rows plus one ghost row
     * above and below (rows + 2 total), each row nb cells wide. */
    x = create_sa_matrix(rows + 2, nb);
    new_x = create_sa_matrix(rows + 2, nb);
    for (i = 0; i < rows + 2; i++) {
        for (j = 1; j <= n; j++) {
            x[IDX(nb, i, j)] = init_value;
            new_x[IDX(nb, i, j)] = init_value;
        }
    }
    /* Initialize boundary regions: west/east columns on every rank... */
    for (i = 0; i < rows + 2; i++) {
        x[IDX(nb, i, 0)] = b.west;
        x[IDX(nb, i, n + 1)] = b.east;
        new_x[IDX(nb, i, 0)] = b.west;
        new_x[IDX(nb, i, n + 1)] = b.east;
    }
    /* ...the north row only on the top rank, the south row only on the
     * bottom rank. NOTE(review): the north loop runs i <= n + 1 while the
     * south loop runs i < n + 1 — asymmetric corner handling; confirm which
     * bound is intended. Column 0 of the north/south rows is never set to
     * north/south (it keeps b.west from the loop above). */
    if (rank == 0) {
        for (i = 1; i <= n + 1; i++) {
            x[IDX(nb, 0, i)] = b.north;
            new_x[IDX(nb, 0, i)] = b.north;
        }
    }
    if (rank == numprocs - 1){
        for (i = 1; i < n + 1; i++) {
            x[IDX(nb, rows + 1, i)] = b.south;
            new_x[IDX(nb, rows + 1, i)] = b.south;
        }
    }
    /* LOG(printf("[Process %d/%d] matrix initialized\n", rank, numprocs)); */
    /* Iterative refinement of x until values converge */
    *iterations = 0;
    do {
        /* Post non-blocking sends of this rank's outermost real rows so
         * neighbours can fill their ghost rows.
         * NOTE(review): request_south/request_north are never completed with
         * MPI_Wait; the send buffer (x) is swapped and rewritten on the next
         * sweep while a send may still be pending — this needs an MPI_Wait
         * (or MPI_Request_free) before the buffer is reused. */
        if (rank != numprocs - 1) {
            // Send south border
            MPI_Isend(&x[IDX(nb, rows, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &request_south);
        }
        if (rank != 0) {
            // Send north border
            MPI_Isend(&x[IDX(nb, 1, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &request_north);
        }
        /* Jacobi sweep over the local strip, tracking the largest change. */
        max_diff = 0;
        global_max_diff = 0;
        for (i = 1; i <= rows; i++) {
            for (j = 1; j <= n; j++) {
                new_value = 0.25 * (x[IDX(nb, i - 1, j)] + x[IDX(nb, i, j + 1)] + x[IDX(nb, i + 1, j)] + x[IDX(nb, i, j - 1)]);
                max_diff = fmaxf(max_diff, fabs(new_value - x[IDX(nb, i, j)]));
                new_x[IDX(nb, i, j)] = new_value;
            }
        }
        /* Swap the buffers: x becomes the freshly computed matrix. */
        tmp_x = new_x;
        new_x = x;
        x = tmp_x;
        /* Fill this rank's ghost rows with the neighbours' border rows
         * (blocking receives pair with the neighbours' Isends above). */
        if (rank != numprocs - 1) {
            // Receive south border
            MPI_Recv(&x[IDX(nb, rows + 1, 0)], nb, MPI_FLOAT, rank + 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        if (rank != 0) {
            // Receive north border
            MPI_Recv(&x[IDX(nb, 0, 0)], nb, MPI_FLOAT, rank - 1, TAG_BORDER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        LOG(printf("[Process %d/%d] max_diff: %f\n", rank, numprocs, max_diff));
        /* Agree on convergence across all ranks. */
        MPI_Allreduce(&max_diff, &global_max_diff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
        /* LOG(printf("[Process %d/%d] global_max_diff: %f\n", rank, numprocs, global_max_diff)); */
        (*iterations)++;
    } while (global_max_diff > threshold);

    /* Gather the strips on rank 0. Rank 0 copies its own strip (including the
     * north ghost/border row, plus the south one only when it is also the
     * last rank, i.e. numprocs == 1); each other rank sends `rows` real rows,
     * the last rank one extra for the south border row. */
    if (rank == 0) {
        complete_x = create_sa_matrix(nb, nb);
        memcpy(complete_x, x, (rows + ((rank == numprocs - 1) ? 2 : 1)) * (nb) * sizeof(float));
        rows_to_transmit = n / numprocs;
        receive_pos = rows + 1;
        for (i = 1; i < numprocs; i++) {
            if (i == numprocs - 1) {
                rows_to_transmit++;
            }
            MPI_Recv(&complete_x[IDX(nb, receive_pos, 0)], rows_to_transmit * (nb), MPI_FLOAT, i, TAG_MATRIX, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            receive_pos += n / numprocs;
        }
    } else {
        complete_x = NULL;
        rows_to_transmit = rows;
        if (rank == numprocs - 1) {
            rows_to_transmit++;
        }
        MPI_Send(&x[IDX(nb, 1, 0)], rows_to_transmit * (nb), MPI_FLOAT, 0, TAG_MATRIX, MPI_COMM_WORLD);
    }

    return complete_x;
}
|
||||
Reference in New Issue
Block a user