So I have some neural network simulator code that works correctly on the CPU, and the parallel version agrees with the serial version to at least 6 decimal places with a 32-thread single block on both of my CUDA under Win7 PCs, but with 1 block and 64 threads slightly different values for Wt are generated. Wt values are often no more than 3 decimal places in agreement, and when I attempt to eliminate race conditions by embedding __syncthreads() within the loops, the Wt values appear as Not A Number when copied back to the CPU.
Can someone give me a hint what I might be doing wrong? I’ve included the parallelized code below, and knlBackProp is being called with lSampleQtyReq=10000, o=1, and Option=’R’:
// device-global variables to facilitate data transfer
__device__ __constant__ __align__(8) struct rohanContext devSes;
__device__ __constant__ struct rohanLearningSet devLearn;
__device__ __align__(16) struct rohanNetwork devNet;
__device__ double devdReturn[1024*1024];
__device__ double devdRMSE=0;
__device__ int devlReturn[1024*1024];
__device__ int devlTrainable=0;
extern"C"
int knlBackProp(struct rohanContext& rSes, long lSampleQtyReq, long o, char Option)
{mIDfunc /*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples
int lTotal=0;
cudaMemcpyToSymbol( "devlTrainable", &lTotal, sizeof(int) ); // init return value on both sides
mCheckCudaWorked
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord( start, 0);
mtkBackPropMT<<< rSes.iBpropBlocks , rSes.iBpropThreads >>>( lSampleQtyReq, o, Option);
cudaEventRecord( stop, 0);
mCheckCudaWorked
cudaMemcpyFromSymbol( &lTotal, "devlTrainable", sizeof(long) ); // retrieve return value
mCheckCudaWorked
cudaEventSynchronize( stop);
float elapsedTime;
cudaEventElapsedTime( &elapsedTime, start, stop);
conPrintf("DEVICE: Time to complete BackProp kernel: %3.1f ms\n", elapsedTime);
cudaEventDestroy( start);
cudaEventDestroy( stop);
return lTotal;
}
__global__ __device__ void mtkBackPropMT( long lSampleQtyReq, long o, char Option)
{/*! divides error in yielded values and back-propagates corrections among weights */
// Option S - single sample correction only
// Option E - keep existing weights, count trainable samples only
// Option R - perform corrections for all trainable samples
if(Option=='E' || Option=='e'){ //
devlTrainable=0; // reset global mem trainable counter
subkBackPropEoptMT(lSampleQtyReq, o);
}
if(Option=='S' || Option=='s'){
devlTrainable=0; // reset global mem trainable counter
subkBackPropSoptMT(lSampleQtyReq, false, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
}
if(Option=='R' || Option=='r'){ //
devlTrainable=0; // reset global mem trainable counter
subkBackPropRoptMT(lSampleQtyReq, o);
}
}
__device__ void subkBackPropRoptMT(long lSampleQtyReq, long o)
{/*! flags and counts samples meeting */
long OUTROWLEN=devLearn.iOutputQty+1; // prepare array index and width
//long tIx = threadIdx.x + devSes.iEvalThreads * blockIdx.x; // tIx is thread index over the kernel
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
//long lTotalThreads = devSes.iBpropThreads * devSes.iBpropBlocks; // total number of threads
double maxSquared = devSes.dMAX * devSes.dMAX ; //needed to compart to stored delta squared values
devlTrainable=0; // clear global mem accumulator; out of bound samples will remain at this value
for (long s=0; s<lSampleQtyReq; ++s){ // iterate over samples
if( devLearn.gpudSE1024[IDX2C( o, s, OUTROWLEN )] > maxSquared ){ // if the MAX criterion is exceeded
if(tIx==0)++devlTrainable; // increment the counter
subkBackPropSoptMT( s, true, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval);
}
}
}
__device__ void subkBackPropSoptMT(long s, int o, rohanNetwork& Net, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * Deltas, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{/*! propagates adjustment of weights backwards preceeding layers from the chosen network output. */
// s is sample's index
// o is an optional method selection parameter; print/don't print as of 2/29/12
long index, kindex; // for warpwise loops
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
const cuDoubleComplex cdcZero = { 0, 0 };
/* clear all temp values BP0 */
for (long offset=0; (index =offset+tIx)< MAXNEURONS ; offset+=lTotalThreads){ // index stands for i
Deltas[index]=cdcZero;
Signals[index]=cdcZero;
Zs[index]=cdcZero;
}
/* re-evaluate sample to load temp values. BPI */
subkEvalSampleBetaMT( devSes, s, Net, (s==0), Signals, Zs, Wt, XInputs, YEval, dYEval);
/* begin error calculation. BPII */
cuDoubleComplex Deltastar /* measured error at the chosen network output. */ ;
/* calc top layer deltas. */
long TOP=Net.iLayerQty-1;
int ROWLEN=Net.iNeuronQTY[TOP];
//for(int i=0; i<Net.iNeuronQTY[TOP]; ++i){
for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TOP] ; offset+=lTotalThreads){ // index stands for i
// delta-star = D - Y = Desired output minus actual output from evaluation
// D is the cplx coords of the sector of the desired answer Y is the complex result of evaluation of the given sample, unactivated. */
Deltastar = CxSubtractCxUT(
devLearn.gpuDOutputs[ IDX2C( index, s, ROWLEN ) ],
Signals[Net.iNeuronOfst[TOP]+index] );
/* divide the correction; delta = alpha * delta-star / n+1 (but alpha is always 1 for now). */
//Deltas[Net.iNeuronOfst[TOP]+index] = CxDivideRlUT( Deltastar, Net.iDendrtQTY[TOP] );
Deltas[Net.iNeuronOfst[TOP]+index] = CxMultiplyRlUT( Deltastar, Net.dINV_S[TOP] );
}
__syncthreads();
/* Now distribute the correction to lower layers if any. BPII.1 */
if (Net.iLayerQty>2){ /* remember layer 0 = inputs, layer 1 = bottom row, layer {2..iLayerQty-2} = middle row, layer iLayerQty-1 = top row. */
for (int L=Net.iLayerQty-1; L>1; --L){
long LAY = L; /* setup access to layers. */
long TRIB = L-1; /* trib for tributary.*/
int iTributQTY=Net.iNeuronQTY[TRIB];
//int Sj=Net.iDendrtQTY[TRIB]; if (TRIB==1) Sj=1; // Sj=1 for firest hidden layer
for (int i=1; i<Net.iNeuronQTY[LAY]; ++i) { // skip 0th neuron as its weights are either 1 (div identity) or 0 (div forbidden) and don't change anyway
// k index must begin at 1, neuron zero not valid for correction
//for (int k=1; k<iTributQTY; ++k) { /* the contribution to ith neuron's kth tributary's delta = i's delta/i's weight k. */
for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
Deltas[Net.iNeuronOfst[TRIB]+kindex]
= CxAddCxUT ( Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
CxDivideCxUT(
Deltas[Net.iNeuronOfst[LAY]+i] ,
Wt[IDX2C( Net.iWeightOfst[LAY]+kindex, i, iTributQTY )] ));
}
}
for (long offset=1; ( kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k
//cuDoubleComplex preDiv=Deltas[Net.iNeuronOfst[TRIB]+kindex]; // diagnostic purpose only, remove if removing other diags
//Deltas[Net.iNeuronOfst[TRIB]+kindex]
// = CxDivideRlUT(
// Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
// Sj );
Deltas[Net.iNeuronOfst[TRIB]+kindex]
= CxMultiplyRlUT(
Deltas[Net.iNeuronOfst[TRIB]+kindex] ,
Net.dINV_S[TRIB] );
}
}
}
__syncthreads();
/* error distribution completed */
/* and now update the weights BP III */
/* adj weights on first hidden layer. */
int FHID = 1;
int SIG = 0;
int iSignalQTY=Net.iNeuronQTY[SIG]; //rSes.rLearn->iInputQty+1;
int iHidWidth=Net.iNeuronQTY[FHID];
for (int k=1; k<iHidWidth; ++k){
//for (int i=0; i<iSignalQTY; ++i){
for (long offset=0; ( index =offset+tIx)< iSignalQTY ; offset+=lTotalThreads){ // index stands for i
/* dW=d*xbar/s1/|z|= neuron's delta * input's conjugate / ( dendrites+1 * abs of input i ). */
Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[FHID]+index, k, iSignalQTY )] ,
CxDivideRlUT(
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[FHID]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[SIG]+index] )
) ,
CxAbsUT( Zs[Net.iNeuronOfst[FHID]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
)
);
}
}
__syncthreads();
/* re-evaluate sample to update temp values. */
subkEvalSampleBetaMT( devSes, s, Net, false, Signals, Zs, Wt, XInputs, YEval, dYEval);
if (Net.iLayerQty>2){
/* now use those outputs' conjugates and the deltas to adjust middle layers. BP III.1 */
for (int L=2; L<Net.iLayerQty-1; ++L){
/* setup access to layers. */
long LAY = L;
long TRIB = L-1;
//int iLayWidth=Net.iNeuronQTY[LAY];
int iTribWidth=Net.iNeuronQTY[TRIB];
for (int k=1; k<Net.iNeuronQTY[LAY]; ++k){
//for (int i=0; i<Net.iNeuronQTY[TRIB]; ++i){
for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[TRIB] ; offset+=lTotalThreads){ // index stands for i
/* the adjustment added to kth neuron's ith trib's weight = k's delta * complex conjugate of i's signal / (abs of k's previous-wt product-sum * dendrites+1) . */
Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[LAY]+index, k, iTribWidth )] ,
CxDivideRlUT(
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[LAY]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[TRIB]+index] )
) ,
(
CxAbsUT( Zs[Net.iNeuronOfst[LAY]+k] ) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
)
)
);
}
}
/* layer is complete. */
subkEvalSampleBetaMT( devSes, s, Net, true, Signals, Zs, Wt, XInputs, YEval, dYEval);
}
}
__syncthreads();
/* correct output layer BP III.3 */
long SUB = TOP-1;
//int iTopWidth=Net.iNeuronQTY[TOP];
int iSubWidth=Net.iNeuronQTY[SUB];
for (int k=1; k<Net.iNeuronQTY[TOP]; ++k){
//for (int i=0; i<Net.iNeuronQTY[SUB]; ++i){
for (long offset=0; ( index =offset+tIx)< Net.iNeuronQTY[SUB] ; offset+=lTotalThreads){ // index stands for i
/* For last layer only, adjustment to kth neuron's ith weight = k's delta * complex conjugate of i's signal / ( dendrites+1) . */
Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )]
=CxAddCxUT( Wt[IDX2C( Net.iWeightOfst[TOP]+index, k, iSubWidth )] ,
CxMultiplyCxUT(
Deltas[Net.iNeuronOfst[TOP]+k] ,
CxConjugateUT( Signals[Net.iNeuronOfst[SUB]+index] )
)
); // N+1 denominator factor is considered redundant - JAW & IA 2/27/12
}
}
/* backprop is complete. */
}
__device__ void subkEvalSampleBetaMT(rohanContext& Ses, long s, rohanNetwork& Net, int o, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval )
{// Beta uses fixed length fields instead of nested pointer layers
// delta squared is not updated, since they'll be updated when RMSE is checked at the end of a pass through the learning set
long index, kindex; // for warpwise loops
long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel
long lTotalThreads = gridDim.x * blockDim.x; // total number of threads
const cuDoubleComplex cdcZero = { 0, 0 };
/*! layer zero (inputs) is special. */
long INROWLEN=Net.iNeuronQTY[0];//rSes.rLearn->iInputQty+1;
//for (int i=0; i<INROWLEN; ++i){
for (long offset=0; (index =offset+tIx)< INROWLEN ; offset+=lTotalThreads){ // index stands for i
Signals[Net.iNeuronOfst[0]+index]= XInputs[IDX2C( index, s, INROWLEN )];
}
/*! middle and top layers. */
for (int L=1; L<Net.iLayerQty; ++L){
//struct rohanLayer& lay = Net.rLayer[L];
long LAY=L;
int TRIB=L-1; // index of previous layer
int iNeuronQTY=Net.iNeuronQTY[LAY];
int iSignalQTY=Net.iDendrtQTY[LAY]; // signal qty depends on size of previous layer
//for (int k=0; k<iNeuronQTY; ++k){ //Neuron zero is not skipped, its output should be 1+0i as a check
for (long offset=0; (kindex =offset+tIx)< iNeuronQTY ; offset+=lTotalThreads){ // kindex stands for k
Zs[Net.iNeuronOfst[LAY]+kindex]=cdcZero;
for (int i=0; i<iSignalQTY; ++i){ //walk weights on inputs from previous layer
Zs[Net.iNeuronOfst[LAY]+kindex] =
CxAddCxUT( Zs[Net.iNeuronOfst[LAY]+kindex] ,
CxMultiplyCxUT(
Wt[IDX2C( Net.iWeightOfst[LAY] + i, kindex, iSignalQTY )],
Signals[Net.iNeuronOfst[TRIB]+i] ) ) ;
}
// ACTIVATE //
Signals[Net.iNeuronOfst[LAY]+kindex] = CxActivateUT( Zs[Net.iNeuronOfst[LAY]+kindex]);
}
}
/*! last layer values are converted and stored here */
long TOP = Net.iLayerQty-1;
long OUTROWLEN=Net.iNeuronQTY[TOP];
//for (int i=0; i<Net.iNeuronQTY[TOP]; ++i){ // continuous conversion begins here
for (long offset=0; (index =offset+tIx)< OUTROWLEN ; offset+=lTotalThreads){ // index stands for i
YEval[IDX2C( index, s, OUTROWLEN )]= Signals[Net.iNeuronOfst[TOP]+index] ; // store final complex output(s)
dYEval[IDX2C( index, s, OUTROWLEN )]=FUnitCxUT( YEval[IDX2C( index, s, OUTROWLEN )] ) * Net.iSectorQty; // convert final complex outputs to sectors and store that
if(devLearn.iContOutputs==false) // round off decimal if disc activation is set
dYEval[IDX2C( index, s, OUTROWLEN )]=int(dYEval[IDX2C( index, s, OUTROWLEN )]);
}
/*! end of sample evaluation. */
}
__device__ cuDoubleComplex CxActivateUT(const cuDoubleComplex Z)
{/// applies ContActivation or discrete activation function to cx neuron output and returns Phi(Z)
/// This fn should be phased out in favor of a GPU device vector based fn
cuDoubleComplex phi;
if (devNet.bContActivation) { // apply ContActivation activation function to weighted sum : phi(z)=z/|z|
phi = CxDivideRlUT( Z, CxAbsUT( Z ) );
}
else { // apply Discrete activation function to weighted sum : s=int(arctan(z)*k/2pi), phi(z)=(X(s),Y(s))
double theta = atan2(Z.y, Z.x); // theta = arctan y/x
int iSector = (int)((theta * devNet.dK_DIV_TWO_PI) + devNet.iSectorQty) % devNet.iSectorQty;
phi = devNet.gpuSectorBdry[iSector];
//printf(" %f+%fi %d Activate\n", phi.x, phi.y, threadIdx.x);
}
return phi;
}
So, I’m not going to read all that code, but I can give you a strong hint. The warp size is 32 threads, so the 64-thread case will run two warps/block — in the former case you can’t have any instruction pointer based race conditions, however, in the second case, you will effectively have two groups of threads with different IPs scheduled at different times. You may already know much of this (hence the syncthreads), but the above really makes it almost certain that you simply have one more race condition you haven’t accounted for yet.
Putting in the sync-threads is a good start to try and isolate it. Are you sure that in your loops, the source data of one warp is not overwritten by the other warp? If not try put in syncthreads into your inner loops just for debug purposes to see what may be causing the race condition.