Please look this code:
#include <stdlib.h>
#include <stdio.h>
int N, L, I;
float * inputs;
float * temp;
// first kernel
__global__ void mulKernel ( float * output, float * inputs)///, float * weights)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
output [idx] = inputs [idx] * 3;//weights [idx];
//weights [idx] = 4;
//__syncthreads();
}
//second kernel
__global__ void sumKernel ( float * output, float * input)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
output [idx] = input[idx]*2;
__syncthreads();
}
void printVector (const float *p, const int N) {
for (int i=0; i<N; i++)
printf("%f\n",p[i]);
}
int main(int argc, char *argv[])
{
if(argc < 3)
printf("Usage: cuda <layers> <inputs>\n");
else
{
L = atoi(argv[1]);
N = atoi(argv[2]);
I = atoi(argv[2]);
inputs = (float*)malloc(I*sizeof(float));
float * weights = (float*)malloc(I*sizeof(float));
// and fill with some arbitrary values
for (int i=0; i<I; i++)
{
inputs[i] = 1;
}
for (int i=0; i<I; i++)
{
weights[i] = 1.5;
}
// allocate device memory
float * devInputs = NULL;
float * devTemp = NULL;
float * devWeights = NULL;
cudaMalloc ( (void**)&devInputs, I*sizeof(float) );
cudaMalloc ( (void**)&devTemp, I*sizeof(float) );
cudaMalloc ( (void**)&devWeights, I*sizeof(float) );
// set kernel launch configuration
dim3 threadsMul = dim3(512, 1);
int blocksCount = floor(I / threadsMul.x) + 1;
dim3 blocksMul = dim3(blocksCount, 1);
dim3 threadsSum = dim3(512, 1);
blocksCount = floor(I / threadsSum.x) + 1;
dim3 blocksSum = dim3(blocksCount, 1);
cudaMemcpy ( devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy ( devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice );
//kernels calling in this cycle
for(int j=0;j<L;j++)
{
// copying data to see that's ok
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
// print it
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
// running first kernel
mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights);
// copying and printing data. We can see thats array weights contains a wrong values
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
if(cudaDeviceSynchronize() == cudaSuccess)
printf("threads syncronized\n");
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n");
sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp);
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n\n");
if(cudaDeviceSynchronize() == cudaSuccess)
printf("threads syncronized\n");
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaMemcpy ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );
printf("inputs:\n");
printVector (inputs, N);
printf("weights:\n");
printVector (weights, N);
printf("\n\n");
}
cudaMemcpy ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
cudaFree ( devInputs );
cudaFree ( devTemp );
cudaFree ( devWeights );
printVector (inputs, N);
free(inputs);
free(weights);
}
return 0;
}
And look the output. After callig first kernel, the devWeights array lost its data. But it doesn’t used anywhere. I just copy it to the memory, run kernels (that don’t affects it) and copy back to host. And in output I see that it changed. Why? What am I doing wrong?
In main function you can see cycle for. In it I run two kernels: sumKernel and mulKernel. Before running kernel, after it, and after synchronization threads I copy arrays to host and print it. So, I see wrong data after calling kernel. See comments in code.
I don’t see any error (only cudaSuccess).
Oh, I found the error. I forgot to use if(idx < N) in my kernels and CUDA didn’t print error when gone out array dimensions. So, when I changed inputs array, I also changed data that situated in memory after inputs.