EDIT In the initial posting’s code snippet (see below) I was not properly sending

Question

0

Asked: June 3, 2026 (2026-06-03T02:57:33+00:00)

EDIT In the initial posting’s code snippet (see below) I was not properly sending

0

EDIT
In the initial posting’s code snippet (see below) I was not properly sending the struct to the device. This has been fixed, but the results are still the same. In my full code this mistake was not present. (There were two mistakes in that command in my initial posting: one, the structure was supposed to be copied HostToDevice, but the copy was actually reversed; two, the size of the copy was also wrong. Apologies; both errors have been fixed, but the recompiled code still displays the zeros phenomenon described below, as does my full code.)

EDIT 2
In the haste of my de-proprietarization rewrite of the code I made a couple of errors which dalekchef kindly pointed out to me (in my rewritten code the copy of the struct to the device was performed BEFORE the allocation on the device, and the device cudaMalloc sizes were not multiplied by the sizeof(...) of the array element type). I added these fixes, recompiled and retested, but it did not fix the problem. I also double-checked my original code — it did not have those mistakes. Apologies again for the confusion.

I’m trying to dump statistics from a large simulations program. A similar pared down code is displayed below. Both codes exhibit the same problem — they output zeroes, when they should be outputting averaged values.

#include "stdio.h"

// Parameter block that main() copies to the device once and that the TEST kernel
// reads on every step.
struct __align__(8) DynamicVals 
{ 
   double a;      // sample value accumulated into sumA by TEST
   double b;      // sample value accumulated into sumB by TEST
   int n1;        // sample value accumulated into sumN1 by TEST
   int n2;        // sample value accumulated into sumN2 by TEST
   int perDump;   // steps per output slot; TEST uses it as divisor and modulus
};

// Pointers to the per-slot result arrays and to the parameter block.
// NOTE(review): main() hands these __device__ variables directly to cudaMalloc/cudaMemcpy
// from host code, and the TEST kernel declares parameters with the same names, which
// hide these globals inside the kernel.
__device__ int *dev_arrN1, *dev_arrN2;
__device__ double *dev_arrA, *dev_arrB;
__device__ DynamicVals *dev_myVals;
// Running state shared by successive TEST launches: for each quantity, the number of
// samples and their sum since the last reset or flush.
__device__ int stepsA, stepsB;
__device__ double sumA, sumB;
__device__ int stepsN1, stepsN2;
__device__ int sumN1, sumN2;

// Folds one simulation step into the per-dump running statistics.
//   step        - step number; step / perDump selects the output slot
//   dev_arrA/B  - per-slot results for the double quantities a and b
//   dev_arrN1/2 - per-slot results for the int quantities n1 and n2
//   dev_myVals  - parameter block holding the current sample values and the dump period
// The array/struct parameters have the same names as the file-scope __device__ pointers
// and hide them inside this function. The sums and step counters are file-scope
// __device__ variables updated without atomics; main() launches this with <<<1,1>>>.
__global__ void TEST
(int step, double dev_arrA[], double dev_arrB[],
 int dev_arrN1[], int dev_arrN2[],DynamicVals *dev_myVals)
{
   // NOTE(review): this condition is true when step is NOT a multiple of perDump, so the
   // slot and every accumulator are cleared on all steps except the first of a period.
   // A once-per-period reset would presumably be (step % perDump) == 0 - confirm intent.
   if (step % dev_myVals->perDump)
   {
      dev_arrN1[step/dev_myVals->perDump] = 0;
      dev_arrN2[step/dev_myVals->perDump] = 0;
      dev_arrA[step/dev_myVals->perDump] = 0.0;
      dev_arrB[step/dev_myVals->perDump] = 0.0;
      stepsA = 0;
      stepsB = 0;
      stepsN1 = 0;
      stepsN2 = 0;
      sumA = 0.0;
      sumB = 0.0;
      sumN1 = 0;
      sumN2 = 0;
   }

   // Accumulate this step's sample into each running sum and count it.
   sumA += dev_myVals->a;
   sumB += dev_myVals->b;
   sumN1 += dev_myVals->n1;
   sumN2 += dev_myVals->n2;
   stepsA++;
   stepsB++;
   stepsN1++;
   stepsN2++;

   // Early flush: once a partial sum passes its threshold, add the partial mean to the
   // slot and restart that partial sum (the accompanying text gives int overflow and
   // loss of double precision as the motivation).
   if ( sumA > 100000000 )
   {
      dev_arrA[step/dev_myVals->perDump] +=
     sumA / stepsA;
      sumA = 0.0;
      stepsA = 0;
   }
   if ( sumB > 100000000 )
   {
      dev_arrB[step/dev_myVals->perDump] +=
     sumB / stepsB;
      sumB = 0.0;
      stepsB = 0;
   }
   if ( sumN1 > 1000000 )
   {
      dev_arrN1[step/dev_myVals->perDump] +=
     sumN1 / stepsN1;
      sumN1 = 0;
      stepsN1 = 0;
   }
   if ( sumN2 > 1000000 )
   {
      dev_arrN2[step/dev_myVals->perDump] +=
     sumN2 / stepsN2;
      sumN2 = 0;
      stepsN2 = 0;
   }

   // NOTE(review): true on every step EXCEPT the last one of a period. On that last step
   // (for perDump > 1) the reset at the top has just zeroed the slot and this block is
   // skipped, so nothing from this block remains in the slot. An end-of-period dump
   // would presumably test ((step+1) % perDump) == 0 - confirm intent.
   // NOTE(review): the divisions use the step counters, which an early flush above may
   // have just set to 0.
   if ((step+1) % dev_myVals->perDump)
   {
      dev_arrA[step/dev_myVals->perDump] +=
     sumA / stepsA;
      dev_arrB[step/dev_myVals->perDump] +=
     sumB / stepsB;
      dev_arrN1[step/dev_myVals->perDump] +=
     sumN1 / stepsN1;
      dev_arrN2[step/dev_myVals->perDump] +=
     sumN2 / stepsN2;
   }
}

// Host driver: allocates the result arrays, launches TEST once per step with a single
// thread, copies the results back and prints every slot.
// NOTE(review): no CUDA call in this function has its return code checked, and neither
// the device nor the host buffers are released before main returns.
int main() 
{
   const int TOTAL_STEPS = 10000000;
   DynamicVals vals;
   int *arrN1, *arrN2;
   double *arrA, *arrB;
   int statCnt;

   vals.perDump = TOTAL_STEPS/10;
   // One slot more than TOTAL_STEPS/perDump. TEST only indexes step/perDump for
   // step < TOTAL_STEPS, so the last slot is never written by the kernel but is
   // still copied back and printed below.
   statCnt = TOTAL_STEPS/vals.perDump+1;
   vals.a = 30000.0;
   vals.b = 60000.0;
   vals.n1 = 10000;
   vals.n2 = 20000;

   // NOTE(review): the dev_* names here are the file-scope __device__ variables, used
   // directly from host code.
   cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) );
   cudaMalloc( (void**)&dev_arrB, statCnt*sizeof(double) );
   cudaMalloc( (void**)&dev_arrN1, statCnt*sizeof(int) );
   cudaMalloc( (void**)&dev_arrN2, statCnt*sizeof(int) );
   cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals));
   cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals), 
          cudaMemcpyHostToDevice);

   // Host-side buffers that receive the per-slot results.
   arrA = (double *)malloc(statCnt * sizeof(double));
   arrB = (double *)malloc(statCnt * sizeof(double));
   arrN1 = (int *)malloc(statCnt * sizeof(int));
   arrN2 = (int *)malloc(statCnt * sizeof(int));

   // One single-thread launch per simulation step.
   for (int i=0; i< TOTAL_STEPS; i++)
      TEST<<<1,1>>>(i, dev_arrA,dev_arrB,dev_arrN1,dev_arrN2,dev_myVals);

   // Copy every slot back to the host.
   cudaMemcpy(arrA,dev_arrA,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
   cudaMemcpy(arrB,dev_arrB,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
   cudaMemcpy(arrN1,dev_arrN1,statCnt * sizeof(int),cudaMemcpyDeviceToHost);
   cudaMemcpy(arrN2,dev_arrN2,statCnt * sizeof(int),cudaMemcpyDeviceToHost);

   // Print one line per slot, labelled with the first step number of that slot.
   for (int i=0; i< statCnt; i++)
   {
      printf("Step: %d   ; A=%g  B=%g  N1=%d  N2=%d\n",
         i*vals.perDump,
         arrA[i], arrB[i], arrN1[i], arrN2[i]);
   }
}

Output:

Step: 0   ; A=0  B=0  N1=0  N2=0
Step: 1000000   ; A=0  B=0  N1=0  N2=0
Step: 2000000   ; A=0  B=0  N1=0  N2=0
Step: 3000000   ; A=0  B=0  N1=0  N2=0
Step: 4000000   ; A=0  B=0  N1=0  N2=0
Step: 5000000   ; A=0  B=0  N1=0  N2=0
Step: 6000000   ; A=0  B=0  N1=0  N2=0
Step: 7000000   ; A=0  B=0  N1=0  N2=0
Step: 8000000   ; A=0  B=0  N1=0  N2=0
Step: 9000000   ; A=0  B=0  N1=0  N2=0
Step: 10000000   ; A=0  B=0  N1=0  N2=0

Now, if I were to use a small period for my dumps or if my #s were smaller, I could get away with just a direct

add
divide by period at the end of period

…algorithm, but I use temporary sums as otherwise my int would overflow (the double wouldn’t overflow, but I was concerned about it losing precision).

If I use the above direct algorithm for smaller values I get correct non-zero values, but the second I use the intermediates (e.g. stepsA, sumA, etc.) the values go to zero.
I know I’m doing something silly here… what am I missing?

Notes:
A.) Yes, I know this code in its above form is not parallel and by itself does not warrant parallelization. It is part of a small statistics collecting portion of a much longer code. In that code it is encased in a thread index specific conditional logic to prevent clashing (making it parallel) and serves as data gathering to a simulations program (which warrants parallelization). Hopefully you can understand where the above code originates and avoid snide comments about its lack of thread-safety. (This disclaimer is added out of past experience receiving unproductive comments from people who didn’t understand I was posting an excerpt, not a full code, despite me writing in less explicit terms as such.)

B.) Yes, I know the names of the variables are ambiguous. That is the point. The code I’m working on is proprietary, though it will eventually be open sourced. I only write this as I have posted similarly anonymized codes in the past and received rude commentary about my naming convention.

C.) Yes, I have read the CUDA manual several times, though I do make errors and I admit there’s some features I don’t understand. I’m not using shared memory here, but I am using shared memory (OF COURSE) in my full code.

D.) Yes, the above code does represent the exact same features as the data dumping portion of my non-working code, with the logic not related to this particular problem removed, and with it the thread safety conditional. The variable names have been changed, but algorithmically it should be unaltered and this is verified by the exact same non-working output (zeroes).

E.) I do realize the “dynamic” struct in the above snippet has non-dynamic values. I named the structure that because in the full code, this struct contains simulations data, and is dynamic. The static nature in the pared-down code should not make the statistics collecting code fail, it will simply mean that the average for each dump should be constant (and non-zero).

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-03T02:57:33+00:00

The biggest problem I see here is one of scope. The way this code is written leads me to conclude that you might not understand how variable scoping in C++ works in general, and how device and host code scope works in CUDA in particular. A couple of observations:

When you do this type of thing in code:

__device__ double *dev_arrA, *dev_arrB; __global__ void TEST(int step, double dev_arrA[], double dev_arrB[], ....)

you have a variable scope problem. dev_arrA is declared at both compilation unit scope and function scope. The two declarations do not refer to the same variable — the function scope declaration (in the kernel) takes precedence over the compilation unit scope declaration inside the kernel. When you modify that variable, you are modifying the kernel scope declaration, not the __device__ variable. This can lead to all sorts of subtle and unexpected behaviour. It is much better to avoid ever having the same variable declared at multiple scopes.
When you declare a variable using the __device__ specifier, it is intended to be exclusively a device context symbol, and should only be used directly in device code. So something like this:

__device__ double *dev_arrA; int main() { .... cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) ); .... }

is illegal. You cannot call an API function like cudaMalloc directly on a __device__ variable. Even though it will compile (because of the hackery involved in the CUDA compilation trajectories for host and device code), it is incorrect to do so. In the above example dev_arrA is a device symbol. You can interact with it via the API symbol manipulation calls, but that is all it is technically legal to do. In your code, variables intended to hold device pointers and be passed as kernel arguments (like dev_arrA) should be declared at main() scope, and passed by value to the kernel.

It is a combination of the above two things which is probably causing your problems.

But the difficulty is that you have chosen to post roughly 150 lines of code (a lot of which is redundant) as a repro case. I doubt anyone cares enough about your problems to go through that much code with a fine-tooth comb and pinpoint where the precise problem is. Further, your habit of doing these nasty “top edits” in your questions quickly turns what might have been reasonably written starting points into unintelligible pseudo-changelogs which are incredibly hard to follow and are unlikely to be of help to anyone. Also, the mildly passive-aggressive notes section serves no real purpose – it adds nothing of value to the question.

So I will leave you with a greatly simplified version of the code you posted which I think has all the basic things which you are trying to do working. I leave it as an “exercise for the reader” to turn it back into whatever it is that you are trying to do.

#include "stdio.h"

// Floating-point type used for the averaged quantity throughout this program.
typedef float Real;
// Parameter block that main() copies to the device once and that the TEST kernel
// reads on every step.
struct __align__(8) DynamicVals 
{ 
    Real a;        // sample value accumulated into sumA by TEST
    int n1;        // sample value accumulated into sumN1 by TEST
    int perDump;   // steps per output slot; TEST uses it as divisor and modulus
};

// Running state shared by successive TEST launches: for each quantity, the number of
// samples and their sum since the last reset. Only touched from device code.
__device__ int stepsA;
__device__ Real sumA;
__device__ int stepsN1;
__device__ int sumN1;

// Folds one simulation step into the running per-period averages.
//
//   step       - zero-based step number; step / perDump selects the output slot
//   dev_arrA   - per-slot mean of DynamicVals::a (one Real per dump period)
//   dev_arrN1  - per-slot mean of DynamicVals::n1 (integer division)
//   dev_myVals - device copy of the current sample values and the dump period
//
// Launch layout: written for a single thread (<<<1,1>>>), one launch per step in
// increasing step order. The accumulators are plain __device__ globals updated without
// atomics, so launching more than one thread would race.
//
// After every step the slot holds the mean of all samples seen so far in that period,
// so after the last step of a period it holds the period average.
__global__ void TEST
(int step, Real dev_arrA[], int dev_arrN1[], DynamicVals *dev_myVals)
{
    const int period = dev_myVals->perDump;

    // A non-positive period would divide by zero below; a negative step has no slot.
    if (period <= 0 || step < 0)
        return;

    const int slot = step / period;

    // Restart the accumulators on the FIRST step of each period. (The test must be
    // "== 0": a bare "step % period" is true on every step except the first.)
    if (step % period == 0)
    {
        stepsA = 0;
        stepsN1 = 0;
        sumA = Real(0);
        sumN1 = 0;
    }

    sumA += dev_myVals->a;
    sumN1 += dev_myVals->n1;
    stepsA++;
    stepsN1++;

    // Store (not add) the running mean: adding it on every step would sum the means,
    // and would also read whatever the freshly allocated slot happened to contain.
    dev_arrA[slot] = sumA / stepsA;
    dev_arrN1[slot] = sumN1 / stepsN1;
}

// Reports a failed CUDA runtime call on stderr and, by default, terminates the program.
//
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default) the process exits with `code` as its exit status
//
// `file` is a pointer to const because __FILE__ is a string literal; binding a literal
// to a plain `char *` is ill-formed in C++11 and later.
inline void gpuAssert(cudaError_t code, const char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      if (abort) exit(code);
   }
}

// Checks the status of a CUDA runtime call, reporting the call site on failure.
// Wrapped in do { } while (0) so that `gpuErrchk(x);` is exactly one statement and
// stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Host driver: allocates the result arrays, launches TEST once per step with a single
// thread, copies the per-period results back and prints one line per period.
// Returns 0 on success and 1 if a host allocation fails; CUDA failures terminate the
// program through gpuErrchk.
int main() 
{
    const int TOTAL_STEPS = 1000;
    DynamicVals vals;
    int *arrN1;
    Real *arrA;
    int statCnt;

    vals.perDump = TOTAL_STEPS/10;
    statCnt = TOTAL_STEPS/vals.perDump;
    vals.a = 30000.0;
    vals.n1 = 10000;

    const size_t bytesA = statCnt * sizeof(Real);
    const size_t bytesN1 = statCnt * sizeof(int);

    // Host buffers first, so a failure here leaves nothing allocated on the device.
    arrA = (Real *)malloc(bytesA);
    arrN1 = (int *)malloc(bytesN1);
    if (arrA == NULL || arrN1 == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(arrA);
        free(arrN1);
        return 1;
    }

    // Device pointers live at main() scope and are passed to the kernel by value.
    Real *dev_arrA = NULL;
    int *dev_arrN1 = NULL;
    DynamicVals *dev_myVals = NULL;

    gpuErrchk( cudaMalloc( (void**)&dev_arrA, bytesA) );
    gpuErrchk( cudaMalloc( (void**)&dev_arrN1, bytesN1) );
    gpuErrchk( cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals)) );

    // cudaMalloc does not clear memory; zero the result slots so that every slot has a
    // defined value even before the kernel has written it.
    gpuErrchk( cudaMemset(dev_arrA, 0, bytesA) );
    gpuErrchk( cudaMemset(dev_arrN1, 0, bytesN1) );

    gpuErrchk( cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals), 
                cudaMemcpyHostToDevice) );

    // One single-thread launch per simulation step; check each launch for
    // configuration errors.
    for (int i=0; i< TOTAL_STEPS; i++) {
        TEST<<<1,1>>>(i, dev_arrA,dev_arrN1,dev_myVals);
        gpuErrchk( cudaPeekAtLastError() );
    }

    // Wait for the queued kernels so that any execution error is reported here rather
    // than being attributed to the copies below.
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(arrA,dev_arrA,bytesA,
                cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(arrN1,dev_arrN1,bytesN1,
                cudaMemcpyDeviceToHost) );

    // One line per dump period, labelled with the first step number of that period.
    for (int i=0; i< statCnt; i++)
    {
        printf("Step: %d   ; A=%g N1=%d\n",
                i*vals.perDump, arrA[i], arrN1[i] );
    }

    gpuErrchk( cudaFree(dev_arrA) );
    gpuErrchk( cudaFree(dev_arrN1) );
    gpuErrchk( cudaFree(dev_myVals) );
    free(arrA);
    free(arrN1);

    return 0;
}

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

EDIT In the initial posting’s code snippet (see below) I was not properly sending

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply