I’m trying to implement an MPI program that iteratively sets each element of an array to the average of itself and its two neighbors (using the values from the previous timestep) while holding the first and last elements constant. With one process this works fine; however, with multiple processes I am not getting the right answer, and in particular the first array element is always overwritten.
My initialization step seems to be working correctly, at least as far as the “before computation” output is concerned, which prints the same vector regardless of whether the number of processes used is 1 or more.
One thing I’m not entirely sure of is whether I’m using MPI_Request and MPI_Status correctly; the variables to pay attention to are sendL, sendR, and status.
I tried to only include the relevant portions of code; “…” marks where something is missing. Some of these ellipses have comments to explain what was removed. Both the parallel and single-process implementations are given for comparison.
...
#include "mpi.h"
... //definition of f() for initialization
int main(int argc, char **argv) {
int id, p, i, j, k, n, t, m, v, vp,
lbound, ubound, block_size, offset;
double startwtime, endwtime;
float time;
MPI_Request *sendL, *sendR;
MPI_Status *status; /* return status for receive */
double *prev, *cur, *temp;
... // initialize MPI; get PE rank and size
.... // set the following:
// n = vector length, m = num iterations, k = buffer size
// v = verbose (true/false)
// Memory allocation for output from MPI functions
// Note that I never actually initialized these. Is this a problem?
sendL = (MPI_Request *) malloc(sizeof(MPI_Request));
sendR = (MPI_Request *) malloc(sizeof(MPI_Request));
status = (MPI_Status *) malloc(sizeof(MPI_Status));
// Memory allocation for data array.
block_size = (n/p+2*k);
prev = (double *) malloc( sizeof(double) * block_size);
cur = (double *) malloc( sizeof(double) * block_size);
... //malloc error handling
t = 0;
/* The following block is for a single process. It works correctly. */
if(p==1){
// Initialization
startwtime = MPI_Wtime();
for(i=0;i<n;i++) prev[i] = f(i,n);
cur[0] = f(0,n); cur[n-1] = f(n-1,n);
if(v){
printf("Before calculation\n");
for(i=0;i<n;i++) printf("%f ",prev[i]);
printf("\n");
}
while (t < m) {
for ( i=1 ; i < n-1 ; i++ ) {
cur[i] = (prev[i-1]+prev[i]+prev[i+1])/3;
}
temp = prev; prev = cur; cur = temp; t++;
}
if(v){
printf("After calculation:\n");
for(i=0;i<n;i++) printf("%f ",prev[i]);
printf("\n");
}
endwtime = MPI_Wtime();
time = endwtime-startwtime;
printf("Sequential process complete, time: %f\n", time);
return MPI_Finalize();
}
/* Here is my parallel implementation. It has problems. */
else{
if (id == 0){
startwtime = MPI_Wtime();
}
// Initialization
offset = id*(n/p)-k;
for(i=0;i<block_size;i++) prev[i] = f(i+offset,n);
cur[0] = f(0,n); cur[block_size-1] = prev[block_size-1];
if (id == 0){
for (i=0;i<k;i++){
prev[i] = f(0,n);
cur[i] = prev[i];
}
}
if (id == p-1){
for (i=block_size-k;i<block_size;i++){
prev[i] = f(n-1,n);
cur[i] = prev[i];
}
}
if(v && id == 0){
printf("Before calculation:\n");
for(j=k;j<(n/p)+k;j++) printf("%f ",prev[j]);
for(i=1;i<p;i++){
MPI_Recv(prev+k,(n/p),MPI_DOUBLE_PRECISION,i,2,MPI_COMM_WORLD,status);
for(j=k;j<(n/p)+k;j++) printf("%f ",prev[j]);
}
printf("\n");
}
else if (v){
MPI_Isend(prev+k,(n/p),MPI_DOUBLE_PRECISION,0,2,MPI_COMM_WORLD,sendL);
}
lbound = (id == 0) ? (k+1) : (1);
ubound = (id == p-1) ? (block_size-k-2) : (block_size-2);
while (t < m) {
for ( i=lbound ; i < ubound ; i++ ) {
cur[i] = (prev[i-1]+prev[i]+prev[i+1])/3;
}
temp = prev; prev = cur; cur = temp; t++;
if (t%k == 0){
if (id > 0){
// send to left
MPI_Isend(prev+k,k,MPI_DOUBLE_PRECISION,id-1,0,MPI_COMM_WORLD,sendL);
}
if (id < p-1) {
// send to right
MPI_Isend(prev+block_size-2*k,k,
MPI_DOUBLE_PRECISION,id+1,1,MPI_COMM_WORLD,sendR);
}
if (id < p-1){
// receive from right
MPI_Recv(prev+block_size-k,k,
MPI_DOUBLE_PRECISION,id+1,0,MPI_COMM_WORLD,status);
}
if (id > 0) {
// receive from left
MPI_Recv(prev,k,MPI_DOUBLE_PRECISION,id-1,1,MPI_COMM_WORLD,status);
}
}
}
if(v && id == 0){
printf("After calculation\n");
for(j=k;j<(n/p)+k;j++) printf("%f ",prev[j]);
for(i=1;i<p;i++){
MPI_Recv(prev+k,(n/p),MPI_DOUBLE_PRECISION,i,2,MPI_COMM_WORLD,status);
for(j=k;j<(n/p)+k;j++) printf("%f ",prev[j]);
}
printf("\n");
}
else if (v){
MPI_Isend(prev+k,(n/p),MPI_DOUBLE_PRECISION,0,2,MPI_COMM_WORLD,sendL);
}
if (id == 0){
endwtime = MPI_Wtime();
time = endwtime-startwtime;
printf("Process 0 complete, time: %f\n", time);
}
return MPI_Finalize();
}
}
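The “before computation” output step overwrites the data that `prev` points to: rank 0 receives the other ranks' blocks into `prev+k`, on top of its own values. Oops.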