/*
 * MPI version with the matrix subdivided by "lines".
 */

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <mpi.h>
#include "../config/config.h"
#include "../utils/utils.h"

#define TAG_BORDER 0
#define TAG_MATRIX 1

double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations);

int rank;
int numprocs;

int main(int argc, char* argv[]) {
  int n;
  double init_value, threshold;
  double north, south, east, west;
  borders b;
  int config_loaded;
  configuration config;
  double *x;
  double startwtime = 0.0, endwtime;
  int iterations;

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

  if (rank == 0) {
    config_loaded = load_config(&config);
    if (config_loaded != 0) {
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    n = config.n;
    threshold = config.threshold;
    init_value = config.init_value;
    north = config.north;
    south = config.south;
    east = config.east;
    west = config.west;
  }
  MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&init_value, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&threshold, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&north, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&south, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&east, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&west, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  b.north = north;
  b.south = south;
  b.east = east;
  b.west = west;

  if (rank == 0) {
    startwtime = MPI_Wtime();
  }
 
  x = compute_jacobi(n, init_value, threshold, b, &iterations);

  if (rank == 0) {
    endwtime = MPI_Wtime();
    printf("Wall clock time: %fs\n", endwtime - startwtime);
    printf("Iterations: %d\n", iterations);
    /* print_sa_matrix(x, n + 2, n + 2); */
  }

  destroy_sa_matrix(x);

  MPI_Finalize();

  return 0;
}

double *compute_jacobi(int n, double init_value, double threshold, borders b, int *iterations) {
  double *complete_x;
  double *x;
  double max_diff, global_max_diff, new_x;
  int i, j;
  int rows, rows_to_transmit;
  int receive_pos;
  MPI_Status status;

  if (rank == 0) {
    rows = n - (n / numprocs) * (numprocs - 1);
  } else {
    rows = n / numprocs;
  }
  LOG(printf("[Process %d/%d] rows: %d\n", rank, numprocs, rows));
  /* LOG(printf("[Process %d/%d] initializing matrix\n", rank, numprocs)); */
  /* Initialize the matrix */
  x = create_sa_matrix(rows + 2, n + 2);
  for (i = 0; i < rows + 2; i++) {
    for (j = 1; j <= n; j++) {
      x[sa_index(n + 2, i, j)] = init_value;
    }
  }
  /* Initialize boundary regions */
  for (i = 0; i < rows + 2; i++) {
    x[sa_index(n + 2, i, 0)] = b.west;
    x[sa_index(n + 2, i, n + 1)] = b.east;
  }
  if (rank == 0) {
    for (i = 1; i <= n + 1; i++) {
      x[sa_index(n + 2, 0, i)] = b.north;
    }
  }
  if (rank == numprocs - 1){
    for (i = 1; i < n + 1; i++) {
      x[sa_index(n + 2, rows + 1, i)] = b.south;
    }
  }
  /* LOG(printf("[Process %d/%d] matrix initialized\n", rank, numprocs)); */
  /* Iterative refinement of x until values converge */
  *iterations = 0;
  do {
    max_diff = 0;
    global_max_diff = 0;
    for (i = 1; i <= rows; i++) {
      for (j = 1; j <= n; j++) {
        new_x = 0.25 * (x[sa_index(n + 2, i - 1, j)] + x[sa_index(n + 2, i, j + 1)] + x[sa_index(n + 2, i + 1, j)] + x[sa_index(n + 2, i, j - 1)]);
        max_diff = (double) fmax(max_diff, fabs(new_x - x[sa_index(n + 2, i, j)]));
        x[sa_index(n + 2, i, j)] = new_x;
      }
    }
    if (rank % 2 == 0) {
      if (rank != numprocs - 1) {
        // Send and receive south border
        MPI_Send(&x[sa_index(n + 2, rows, 0)], n + 2, MPI_DOUBLE, rank + 1, TAG_BORDER, MPI_COMM_WORLD);
        MPI_Recv(&x[sa_index(n + 2, rows + 1, 0)], n + 2, MPI_DOUBLE, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &status);
      }
      if (rank != 0) {
        // Send and receive north border
        MPI_Send(&x[sa_index(n + 2, 1, 0)], n + 2, MPI_DOUBLE, rank - 1, TAG_BORDER, MPI_COMM_WORLD);
        MPI_Recv(&x[sa_index(n + 2, 0, 0)], n + 2, MPI_DOUBLE, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &status);
      }
    } else {
      // Receive and send north border
      MPI_Recv(&x[sa_index(n + 2, 0, 0)], n + 2, MPI_DOUBLE, rank - 1, TAG_BORDER, MPI_COMM_WORLD, &status);
      MPI_Send(&x[sa_index(n + 2, 1, 0)], n + 2, MPI_DOUBLE, rank - 1, TAG_BORDER, MPI_COMM_WORLD);
      if (rank != numprocs - 1) {
        // Receive and send south border
        MPI_Recv(&x[sa_index(n + 2, rows + 1, 0)], n + 2, MPI_DOUBLE, rank + 1, TAG_BORDER, MPI_COMM_WORLD, &status);
        MPI_Send(&x[sa_index(n + 2, rows, 0)], n + 2, MPI_DOUBLE, rank + 1, TAG_BORDER, MPI_COMM_WORLD);
      }
    }
    /* LOG(printf("[Process %d/%d] max_diff: %f\n", rank, numprocs, max_diff)); */
    MPI_Allreduce(&max_diff, &global_max_diff, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    /* LOG(printf("[Process %d/%d] global_max_diff: %f\n", rank, numprocs, global_max_diff)); */
    (*iterations)++;
  } while (global_max_diff > threshold);

  if (rank == 0) {
    complete_x = create_sa_matrix(n + 2, n + 2);
    memcpy(complete_x, x, (rows + 1) * (n + 2) * sizeof(double)); /* rows + 2 if rank == numprocs - 1 */
    rows_to_transmit = n / numprocs;
    receive_pos = rows + 1;
    for (i = 1; i < numprocs; i++) {
      if (i == numprocs - 1) {
        rows_to_transmit++;
      }
      MPI_Recv(&complete_x[sa_index(n + 2, receive_pos, 0)], rows_to_transmit * (n + 2), MPI_DOUBLE, i, TAG_MATRIX, MPI_COMM_WORLD, &status);
      receive_pos += n / numprocs;
    }
  } else {
    complete_x = NULL;
    rows_to_transmit = rows;
    if (rank == numprocs - 1) {
      rows_to_transmit++;
    }
    MPI_Send(&x[sa_index(n + 2, 1, 0)], rows_to_transmit * (n + 2), MPI_DOUBLE, 0, TAG_MATRIX, MPI_COMM_WORLD);
  }

  return complete_x;
}