I have a strange problem dealing with 2D array on CUDA device. #define VR

Question

0

Asked: June 18, 2026 (2026-06-18T14:16:25+00:00)

I have a strange problem dealing with 2D array on CUDA device. #define VR

0

I have a strange problem dealing with a 2D array on a CUDA device.

    #define VR 100 // rows
    #define ST 13 // columns
    // Kernel as posted in the question: every thread seeds its own curandState
    // entry and then writes ST generated values into row `id` of `arr`.
    //   arr         - base pointer of the 2D array being filled
    //   globalState - per-thread RNG states, indexed by the global thread id
    //   pitch       - distance between consecutive rows, in bytes
    //   seed        - seed handed to curand_init by every thread
    __global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    // Global 1D thread index; it doubles as the row number and the RNG state slot.
    int id = (blockIdx.x * blockDim.x)  + threadIdx.x;
    // Same seed for all threads, with the thread id passed as the subsequence argument.
    curand_init ( seed, id, 0, &globalState[id] );
    cuPrintf("Thread id: %d \n", id);

    // The row offset is applied on a char* because `pitch` is measured in bytes,
    // not in floats.
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        // generate() is not shown in the question; it is called with the state
        // array and this thread's index.
        row[j] = generate(globalState, id);
    }

}

// Host driver as posted in the question (the snippet ends without main's closing brace).
// Allocates a pitched VR x ST device array, launches `test` with one thread per
// row, copies the result back and prints every element.
int main() {
    float *d_arr;
    // Host buffer is densely packed: VR rows of exactly ST floats.
    float *h_arr = new float[VR*ST];
    size_t pitch;
    // Pitched allocation: the row stride actually used on the device is returned in `pitch`.
    cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);

    // One block of VR threads, so thread ids run from 0 to VR-1.
    dim3 dimBlock(VR); 
    dim3 dimGrid(1,1);

    curandState* devStates;
    // Room for VR*ST states, although the kernel only indexes the array by thread id.
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );

    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    // NOTE(review): this flat copy ignores `pitch`. The kernel wrote rows `pitch`
    // bytes apart, but the copy reads VR*ST contiguous floats, so row padding ends
    // up in h_arr. This mismatch is the problem described in the question.
    cudaMemcpy(h_arr, d_arr,VR*ST*sizeof(float),cudaMemcpyDeviceToHost);

    // Print assuming a dense row-major layout on the host.
    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
        }
    }

I don’t get evenly distributed numbers; instead, they appear in sequences of 13 with a bunch of zeros in between. See: http://pastie.org/6106381

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-18T14:16:26+00:00

The problem is that the device array is allocated with cudaMallocPitch, but it is copied back to the host with an ordinary cudaMemcpy. This gives unexpected results because cudaMallocPitch creates “padded” rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Copying with cudaMemcpy2D, which takes the source pitch as an argument, fixes this. Below is code that I believe contains the corrections needed to make it work:

    #include <stdio.h>
    #include <iostream>
    #include <curand_kernel.h>

    #define VR 100 // rows
    #define ST 13 // columns


// Produces one uniform random float using the RNG state stored at globalState[id].
//
// The state is copied into a local variable, advanced there, and written back
// before returning, so the stored state reflects every draw made here.
// A draw equal to zero is discarded and repeated; the caller never receives 0.
__device__ float generate(curandState* globalState, int id)
{
    curandState state = globalState[id];

    float sample = curand_uniform(&state);
    while (sample == 0) {
        sample = curand_uniform(&state);
    }

    globalState[id] = state;
    return sample;
}


    // Fills a pitched VR x ST float array with random values, one row per thread.
    //
    // Launch layout: 1D blocks in a 1D grid; the thread with global index `id`
    // owns row `id`. Threads whose index is >= VR exit immediately, so the launch
    // may be rounded up to any convenient block size without writing out of bounds.
    //
    //   arr         - base pointer of the pitched allocation (at least VR rows of ST floats)
    //   globalState - RNG state array with at least VR entries; entry `id` is
    //                 initialised here and then used by generate()
    //   pitch       - row stride of `arr` in bytes
    //   seed        - seed shared by all threads; each thread passes its own index
    //                 as the curand_init subsequence argument
    __global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
        const int id = (blockIdx.x * blockDim.x) + threadIdx.x;

        // Guard the grid tail: surplus threads must not touch rows or states
        // beyond the VR that exist.
        if (id >= VR) {
            return;
        }

        curand_init(seed, id, 0, &globalState[id]);

        // `pitch` is in bytes, so step through the allocation as char* and only
        // then reinterpret the row start as float*.
        float* row = (float*)(((char*)arr) + (size_t)id * pitch);
        for (int j = 0; j < ST; ++j) {
            row[j] = generate(globalState, id);
        }
    }

    using namespace std;
// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Host driver: allocates a pitched VR x ST device array, fills it with the
// `test` kernel (one thread per row), copies it back into a dense host buffer
// with cudaMemcpy2D and prints every element.
// Returns 0 on success and 1 if any CUDA call reported an error.
int main() {
    const size_t rowBytes = ST * sizeof(float);  // payload bytes per row, excluding padding

    float* d_arr = NULL;
    curandState* devStates = NULL;
    float* h_arr = new float[VR * ST];
    size_t pitch = 0;

    // The kernel indexes the state array by thread id only, so VR entries suffice.
    bool ok = cudaOk(cudaMallocPitch(&d_arr, &pitch, rowBytes, VR), "cudaMallocPitch")
           && cudaOk(cudaMalloc(&devStates, VR * sizeof(curandState)), "cudaMalloc");

    if (ok) {
        dim3 dimBlock(VR);
        dim3 dimGrid(1, 1);
        test<<<dimGrid, dimBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));

        // A kernel launch returns nothing: launch errors are picked up with
        // cudaGetLastError(), and the blocking copy below reports execution errors.
        // The 2D copy strips the row padding: source rows are `pitch` bytes apart,
        // destination rows are packed `rowBytes` apart.
        ok = cudaOk(cudaGetLastError(), "test kernel launch")
          && cudaOk(cudaMemcpy2D(h_arr, rowBytes, d_arr, pitch, rowBytes, VR,
                                 cudaMemcpyDeviceToHost),
                    "cudaMemcpy2D");
    }

    if (ok) {
        for (int i = 0; i < VR; i++) {
            for (int j = 0; j < ST; j++) {
                std::cout << "N[" << i << "][" << j << "]=" << h_arr[(i * ST) + j] << std::endl;
            }
        }
    }

    // Release everything on both the success and the failure path.
    cudaFree(devStates);
    cudaFree(d_arr);
    delete[] h_arr;

    return ok ? 0 : 1;
}

Compiling the above code using:

nvcc -arch=sm_20 -lcurand  -o t70 t70.cu

and then running I get what appears to be “normal” output:

N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869

(results truncated)

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I have a strange problem dealing with 2D array on CUDA device. #define VR

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply