I ported this piece of code:
// CPU reference: propagate local gradients backwards, from the second-to-last
// layer down to layer 0. Each layer's gradients are built from the gradients
// already stored for the layer after it.
if (_layersCount > 1)
{
    for (int layer = _layersCount - 2; layer >= 0; --layer)
    {
        const int nextLayer = layer + 1;

        // One iteration per neuron of `layer`; this is the loop that was ported to a CUDA kernel.
        for (int neuron = 0; neuron < _neuronsPerLayerCount[layer]; ++neuron)
        {
            const int self = indexByLayerAndNeuron(layer, neuron);

            // Weighted sum of the next layer's gradients, accumulated in place.
            localGradients[self] = 0;
            for (int next = 0; next < _neuronsPerLayerCount[nextLayer]; ++next)
            {
                const int weight   = indexByLayerNeuronAndInput(nextLayer, next, neuron);
                const int upstream = indexByLayerAndNeuron(nextLayer, next);
                localGradients[self] += _neuronsInputsWeights[weight] * localGradients[upstream];
            }

            // Scale the sum by this neuron's own derivative value.
            localGradients[self] *= derivatives[self];
        }
    }
}
to CUDA:
// Back-propagate local gradients with one kernel launch per layer, from the
// second-to-last layer down to layer 0. Launches issued to the same stream run
// in order, so the launch for layer i sees the gradients already written for
// layer i + 1.
if (_layersCount > 1)
{
    for (int i = _layersCount - 2; i >= 0; i--)
    {
        // calculateLocalGradientsForAnotherLayers
        const int neuronsInLayer = _neuronsPerLayerCount[i];
        if (neuronsInLayer <= 0)
        {
            continue; // nothing to compute; a grid with zero blocks is an invalid launch
        }

        // Ceiling division: exactly enough blocks to give every neuron of layer i a thread
        // (the previous floor(...) + 1 form launched a spare block for exact multiples).
        blocksCount = (neuronsInLayer + threads.x - 1) / threads.x;
        blocks = dim3(blocksCount, 1);

        // The weights the kernel reads connect layer i to layer i + 1 and are stored with
        // layer i + 1 (the CPU reference uses indexByLayerNeuronAndInput(i + 1, k, j)).
        // The weight offset and the per-neuron input count must therefore be taken for
        // layer i + 1, not layer i.
        // NOTE(review): unlike deviceLocalGradients / deviceDerivatives, _neuronsInputsWeights
        // carries no "device" prefix -- confirm it really is a device pointer.
        calculateLocalGradientsForAnotherLayers <<<blocks, threads>>> (
            deviceLocalGradients,
            _neuronsInputsWeights,
            deviceDerivatives,
            neuronsInLayer,
            _neuronsInPreviousLayers[i],
            _neuronsInPreviousLayers[i + 1],
            _neuronsPerLayerCount[i + 1],
            _inputsInPreviousLayers[i + 1],
            _inputsInCurrentLayer[i + 1]);
    }
}
The calculateLocalGradientsForAnotherLayers kernel:
// Computes the local gradients of one layer from the gradients of the layer after it.
//
// Launch layout: 1D grid of 1D blocks with at least `neuronsCount` threads in total;
// one thread handles one neuron, surplus threads exit immediately.
//
// Parameters:
//   localGradients                     - gradient array; entries of the next layer are read,
//                                        entries of the current layer are overwritten.
//   neuronsInputsWeights               - flat weight array.
//   derivatives                        - per-neuron derivative values, indexed like localGradients.
//   neuronsCount                       - number of neurons in the current layer.
//   neuronsInPreviousLayers            - offset of the current layer's first neuron in
//                                        localGradients / derivatives.
//   neuronsInPreviousLayersWithCurrent - offset of the next layer's first neuron in localGradients.
//   neuronsInNextLayer                 - number of neurons in the next layer.
//   inputsInPreviousLayers             - offset into neuronsInputsWeights of the weight block read here.
//   inputsInCurrentLayer               - stride between consecutive next-layer neurons in that block.
//
// The next layer's entries of localGradients must be complete before this kernel is launched.
__global__ void calculateLocalGradientsForAnotherLayers(double * localGradients, double * neuronsInputsWeights, double * derivatives, int neuronsCount, int neuronsInPreviousLayers, int neuronsInPreviousLayersWithCurrent, int neuronsInNextLayer, int inputsInPreviousLayers, int inputsInCurrentLayer)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < neuronsCount)
    {
        int neuron = neuronsInPreviousLayers + idx;

        // Accumulate in a local variable instead of doing a global-memory
        // read-modify-write on localGradients[neuron] in every iteration.
        double sum = 0;
        for (int k = 0; k < neuronsInNextLayer; k++)
        {
            // Threads with consecutive idx read consecutive weights for a given k.
            sum += neuronsInputsWeights[inputsInPreviousLayers + k * inputsInCurrentLayer + idx]
                 * localGradients[neuronsInPreviousLayersWithCurrent + k];
        }

        // Single store of the finished value, scaled by the neuron's derivative.
        localGradients[neuron] = sum * derivatives[neuron];
    }
}
But the results differ from the CPU version starting at the second decimal place. Why is the error so large? All the other kernels work correctly; only this one is off.
My GPU is an NVIDIA GeForce GT 555M (written "NV GF555M" above), which supports double precision.
In the body of your kernel, you need some kind of inter-block synchronization over the
localGradients array: concurrent read/write accesses may corrupt the actual values of the
localGradients elements. Since there is no synchronization on the reads and writes, you may see random results.