I have a strange problem dealing with 2D array on CUDA device.
#define VR 100 // rows
#define ST 13 // columns
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curand_init ( seed, id, 0, &globalState[id] );
cuPrintf("Thread id: %d \n", id);
float* row = (float*)(((char*)arr) + id * pitch);
for (int j = 0; j < ST; ++j) {
row[j] = generate(globalState, id);
}
}
int main() {
float *d_arr;
float *h_arr = new float[VR*ST];
size_t pitch;
cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);
dim3 dimBlock(VR);
dim3 dimGrid(1,1);
curandState* devStates;
cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
cudaMemcpy(h_arr, d_arr,VR*ST*sizeof(float),cudaMemcpyDeviceToHost);
for (int i=0; i<VR; i++) {
for (int j=0; j<ST; j++) {
cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
}
}
I don’t get evenly distributed numbers, instead they appear in sequence of 13 with bunch of zeros in between. See: http://pastie.org/6106381
The problem is that the original data array is being allocated using
cudaMallocPitchwhereas the copying is being done using ordinarycudaMemcpy. This will give unexpected results because thecudaMallocPitchoperation creates “padded” rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code that I believe has corrections to be functional:Compiling the above code using:
and then running I get what appears to be “normal” output:
(results truncated)