Only very recently did the prospects of parallel programming caught my attention.
Since then I have used a variety of parallel programming libraries. Perhaps my
first stop was the Intel Thread Building Blocks(TBB). But, what often became the bottle-necks were the errors due to factors like Round-Offs and the unpredictable behaviour of these programmes in different processor architectures. Below is a piece of code which calculates the Pearsons Correlation Coefficient of two sets of values. It employs the very basic parallel patterns of the TBB – the *parallel_for* and *parallel_reduce* :
// A programme to calculate Pearsons Correlation coefficient
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/tick_count.h>
using namespace std;
using namespace tbb;
const size_t n=100000;
double global=0;
namespace s //Namesapce for serial part
{
double *a,*b;
int j;
double mean_a,mean_b,sd_a=0,sd_b=0,pcc=0;
double sum_a,sum_b,i;
}
namespace p //Namespace for parallel part
{
double *a,*b;
double mean_a,mean_b,pcc;
double sum_a,sum_b,i;
double sd_a,sd_b;
}
class serials
{
public:
void computemean_serial()
{
using namespace s;
sum_a=0,sum_b=0,i=0;
a=(double*) malloc(n*sizeof(double));
b=(double*) malloc(n*sizeof(double));
for(j=0;j<n;j++,i++)
{
a[j]=sin(i);
b[j]=cos(i);
sum_a=sum_a+a[j];
sum_b=sum_b+b[j];
}
mean_a=sum_a/n;
mean_b=sum_b/n;
cout<<"\nMean of a :"<<mean_a;
cout<<"\nMean of b :"<<mean_b;
}
void computesd_serial()
{
using namespace s;
for(j=0;j<n;j++)
{sd_a=sd_a+pow((a[j]-mean_a),2);
sd_b=sd_b+pow((b[j]-mean_b),2);
}
sd_a=sd_a/n;
sd_a=sqrt(sd_a);
sd_b=sd_b/n;
sd_b=sqrt(sd_b);
cout<<"\nStandard deviation of a :"<<sd_a;
cout<<"\nStandard deviation of b :"<<sd_b;
}
void pearson_correlation_coefficient_serial()
{
using namespace s;
pcc=0;
for(j=0;j<n;j++)
{
pcc+=(a[j]-mean_a)*(b[j]-mean_b);
}
pcc=pcc/(n*sd_a*sd_b);
cout<<"\nPearson Correlation Coefficient: "<<pcc;
}
};
class parallel
{
public:
class compute_mean
{
double *store1,*store2;
public:
double mean_a,mean_b;
void operator()( const blocked_range<size_t>& r)
{
double *a= store1;
double *b= store2;
for(size_t i =r.begin();i!=r.end(); ++i)
{
mean_a+=a[i];
mean_b+=b[i];
}
}
compute_mean( compute_mean& x, split) : store1(x.store1),store2(x.store2),mean_a(0),mean_b(0){}
void join(const compute_mean& y) {mean_a+=y.mean_a;mean_b+=y.mean_b;}
compute_mean(double* a,double* b): store1(a),store2(b),mean_a(0),mean_b(0){}
};
class read_array
{
double *const a,*const b;
public:
read_array(double* vec1, double* vec2) : a(vec1),b(vec2){} // constructor copies the arguments into local store
void operator() (const blocked_range<size_t> &r) const { // opration to be used in parallel_for
for(size_t k = r.begin(); k!=r.end(); k++,global++)
{
a[k]=sin(global);
b[k]=cos(global);
}
}};
void computemean_parallel()
{
using namespace p;
i=0;
a=(double*) malloc(n*sizeof(double));
b=(double*) malloc(n*sizeof(double));
parallel_for(blocked_range<size_t>(0,n,5000),read_array(a,b));
compute_mean sf(a,b);
parallel_reduce(blocked_range<size_t>(0,n,5000),sf);
mean_a=sf.mean_a/n;
mean_b=sf.mean_b/n;
cout<<"\nMean of a :"<<mean_a;
cout<<"\nMean of b :"<<mean_b;
}
class compute_sd
{
double *store1,*store2;
double store3,store4;
public:
double sd_a,sd_b,dif_a,dif_b,temp_pcc;
void operator()( const blocked_range<size_t>& r)
{
double *a= store1;
double *b= store2;
double mean_a=store3;
double mean_b=store4;
for(size_t i =r.begin();i!=r.end(); ++i)
{
dif_a=a[i]-mean_a;
dif_b=b[i]-mean_b;
temp_pcc+=dif_a*dif_b;
sd_a+=pow(dif_a,2);
sd_b+=pow(dif_b,2);
}}
compute_sd( compute_sd& x, split) : store1(x.store1),store2(x.store2),store3(p::mean_a),store4(p::mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
void join(const compute_sd& y) {sd_a+=y.sd_a;sd_b+=y.sd_b;}
compute_sd(double* a,double* b,double mean_a,double mean_b): store1(a),store2(b),store3(mean_a),store4(mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
};
void computesd_and_pearson_correlation_coefficient_parallel()
{
using namespace p;
compute_sd obj2(a,b,mean_a,mean_b);
parallel_reduce(blocked_range<size_t>(0,n,5000),obj2);
sd_a=obj2.sd_a;
sd_b=obj2.sd_b;
sd_a=sd_a/n;
sd_a=sqrt(sd_a);
sd_b=sd_b/n;
sd_b=sqrt(sd_b);
cout<<"\nStandard deviation of a :"<<sd_a;
cout<<"\nStandard deviation of b :"<<sd_b;
pcc=obj2.temp_pcc;
pcc=pcc/(n*sd_a*sd_b);
cout<<"\nPearson Correlation Coefficient: "<<pcc;
}
};
main()
{
serials obj_s;
parallel obj_p;
cout<<"\nSerial Part";
cout<<"\n-----------";
tick_count start_s=tick_count::now();
obj_s.computemean_serial();
obj_s.computesd_serial();
obj_s.pearson_correlation_coefficient_serial();
tick_count end_s=tick_count::now();
cout<<"\n";
task_scheduler_init init;
cout<<"\nParallel Part";
cout<<"\n-------------";
tick_count start_p=tick_count::now();
obj_p.computemean_parallel();
obj_p.computesd_and_pearson_correlation_coefficient_parallel();
tick_count end_p=tick_count::now();
cout<<"\n";
cout<<"\nTime Estimates";
cout<<"\n--------------";
cout<<"\nSerial Time :"<<(end_s-start_s).seconds()<<" Seconds";
cout<<"\nParallel time :"<<(end_p-start_p).seconds()<<" Seconds\n";
}
Well ! it worked fine on a Windows Machine with a Core i5 inside it. It gave me absolutely the same values for every parameters in the output with parallel code manifold faster than the serial code. Here is my output:
OS : Windows 7 Ultimate 64 bit Processor : core i5
Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07
Parallel Part
-------------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07
Time Estimates
--------------
Serial Time : 0.0204829 Seconds
Parallel Time : 0.00939971 Seconds
So what about the other machines ? If I say it will work fine then at least of some of my friends will say “Wait mate! something is fishy.” There were minor differences in the answers(between those produced by the parallel and serial code) in different machines though the parallel code was always faster than the serial one. So what made these differences? The conclusions,we came across, for this abnormal behaviour were the round-off errors which comes at the cost of excessive parallelism and the difference in processor architectures.
This leads to my questions :
- What are the precautions we need to take when we employ the parallel
processing libraries in our codes to take advantage of multi-core
processors? - What are situations were we should not use the parallel approach even
though there is availability of multiple processors? - What is the best thing we can do to avoid the round-off errors?(Let me
specify that I am not talking about enforcing mutexes and barriers which
may sometime put a cap on the extend of parallelism but about
the simple programming tips which can be handy at times)
I’m very happy to see your suggestions regarding these questions. Please be free to answer
the part which best suits you if you have time limitations.
Edit – I have included more results here
OS : Linux Ubuntu 64 bit Processor : core i5
Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07
Parallel Part
-------------
Mean of a :-0.000233041
Mean of b :0.00414375
Standard deviation of a :2.58428
Standard deviation of b :54.6333
Pearson Correlation Coefficient: -0.000538456
Time Estimates
--------------
Serial Time :0.0161237 Seconds
Parallel Time :0.0103125 Seconds
OS : Linux Fedora 64 bit Processor : core i3
Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07
Parallel Part
-------------
Mean of a :-0.00197118
Mean of b :0.00124329
Standard deviation of a :0.707783
Standard deviation of b :0.703951
Pearson Correlation Coefficient: -0.129055
Time Estimates
--------------
Serial Time :0.02257 Seconds
Parallel Time :0.0107966 Seconds
Edit: After the change which timday has suggested
OS :Linux Ubuntu 64 bit Processor : corei5
Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07
Parallel Part
-------------
Mean of a :-0.000304446
Mean of b :0.00172593
Standard deviation of a :0.708465
Standard deviation of b :0.7039
Pearson Correlation Coefficient: -0.140716
Time Estimates
--------------
Serial Time :0.0235391 Seconds
Parallel time :0.00810775 Seconds
Best Regards.
Note1 : I’m not guaranteeing that the above piece of code is correct.I believe so.
Note2 : This piece of code was tested on Linux boxes as well.
Note3 : Different grain size combinations and auto-partition options were tried.
I am deeply suspicious of the commented out
/*,mean_a(0),mean_b(0)*/in thecompute_mean( compute_mean& x, split)constructor. Seems likely your differences could arise from uninitialized data polluting the results. I’m guessing on the machines where you get consistent results, no task splitting is occurring or those members just happen to be on zeroed memory.Similarly your
compute_sd( compute_sd& x, split)leavesstore3andstore4uninitialized.