Please look this code: #include <stdlib.h> #include <stdio.h> int N, L, I; float *

Question

0

Asked: June 3, 20262026-06-03T02:54:35+00:00 2026-06-03T02:54:35+00:00

Please look this code: #include <stdlib.h> #include <stdio.h> int N, L, I; float *

0

Please look this code:

#include <stdlib.h>
#include <stdio.h>

int N, L, I;
float * inputs;
float * temp;

// first kernel
__global__ void mulKernel ( float * output, float * inputs)///, float * weights)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   output [idx] = inputs [idx] * 3;//weights [idx];
   //weights [idx] = 4;

   //__syncthreads();
}

//second kernel
__global__ void sumKernel ( float * output, float * input)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   output [idx] = input[idx]*2;

   __syncthreads();
}

void printVector (const float *p, const int N) {
    for (int i=0; i<N; i++)
    printf("%f\n",p[i]);
}

int main(int argc, char *argv[])
{
    if(argc < 3)
        printf("Usage: cuda <layers> <inputs>\n");
    else
    {
        L = atoi(argv[1]);
        N = atoi(argv[2]);
        I = atoi(argv[2]);
        inputs = (float*)malloc(I*sizeof(float));
        float * weights = (float*)malloc(I*sizeof(float));

        // and fill with some arbitrary values
        for (int i=0; i<I; i++)
        {
            inputs[i] = 1;
        }
        for (int i=0; i<I; i++)
        {
            weights[i] = 1.5;
        }

        // allocate device memory
        float * devInputs = NULL;
        float * devTemp = NULL;
        float * devWeights = NULL;

        cudaMalloc ( (void**)&devInputs, I*sizeof(float) );
        cudaMalloc ( (void**)&devTemp, I*sizeof(float) );
        cudaMalloc ( (void**)&devWeights, I*sizeof(float) );

        // set kernel launch configuration
        dim3 threadsMul = dim3(512, 1);
        int blocksCount = floor(I / threadsMul.x) + 1;
        dim3 blocksMul  = dim3(blocksCount, 1);

        dim3 threadsSum = dim3(512, 1);
        blocksCount = floor(I / threadsSum.x) + 1;
        dim3 blocksSum  = dim3(blocksCount, 1);

        cudaMemcpy      ( devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice );
        cudaMemcpy      ( devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice );

        //kernels calling in this cycle
        for(int j=0;j<L;j++)
        {
            // copying data to see that's ok
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

            // print it
          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

            // running first kernel
          mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights);

            // copying and printing data. We can see thats array weights contains a wrong values
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp);

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");
        }

        cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );

        cudaFree         ( devInputs   );
        cudaFree         ( devTemp   );
        cudaFree         ( devWeights   );

        printVector (inputs, N);

        free(inputs);
        free(weights);
    }
    return 0;
}

And look the output. After callig first kernel, the devWeights array lost its data. But it doesn’t used anywhere. I just copy it to the memory, run kernels (that don’t affects it) and copy back to host. And in output I see that it changed. Why? What am I doing wrong?

In main function you can see cycle for. In it I run two kernels: sumKernel and mulKernel. Before running kernel, after it, and after synchronization threads I copy arrays to host and print it. So, I see wrong data after calling kernel. See comments in code.

I don’t see any error (only cudaSuccess).

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-03T02:54:36+00:00

Editorial Team

2026-06-03T02:54:36+00:00Added an answer on June 3, 2026 at 2:54 am

Oh, I found the error. I forgot to use if(idx < N) in my kernels and CUDA didn’t print error when gone out array dimensions. So, when I changed inputs array, I also changed data that situated in memory after inputs.

0

Reply
Share
Share

- Report

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

Please look this code: #include <stdlib.h> #include <stdio.h> int N, L, I; float *

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply