all I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code

Question

0

Editorial Team

Asked: May 23, 20262026-05-23T01:17:06+00:00 2026-05-23T01:17:06+00:00

all I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code

0

all

I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code below.

simpleMultiCopy.cu is an example of operation overlapping in a loop. And mine is similar, it will send a slice of data to GPU to compute each iteration in a loop where I perform the overlapping operation.

This is just a test/demo, don’t care the logic of the kernel(increment_kernel), it was used just to delay some time. The main logic lies in processWithStreams function.
But this program works incorrectly with this out put:

i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.

line 132 is:

CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) ); //this is line 132

I don’t have much ideas about how CUDA works, so please help.

Any help will be appreciate.

Code:

#include <stdio.h>
#include <cutil_inline.h>

float processWithStreams(int streams_used);
#define STREAM_COUNT    2

int N = 1 << 24;

int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

dim3 block(512);
dim3 grid;

int memsize;

__global__ void increment_kernel(int *g_data, int inc_value)
{ 
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   //g_data[idx] = g_data[idx] + inc_value;

   int i = blockDim.x * gridDim.x;
   for(; i > 0; i /= 2)
   {
        if(idx > i)
            g_data[idx]++;
   }
}


int main(int argc, char *argv[])
{
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId());

    h_data_source = (int *)malloc(sizeof(int) * N);
    memset(h_data_source, 0, sizeof(int) * N);

    int i;
    memsize = 1024 * 1024 * sizeof(int);
    for(i = 0; i < STREAM_COUNT; i++)
    {
        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );

        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );

        CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
        CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) ); 

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop) );

    grid.x = N / block.x;
    grid.y = 1;



    float time1 = processWithStreams(STREAM_COUNT);
    printf("time: %f\n", time1);



    free( h_data_source );
    free( h_data_sink );

    for( i = 0; i < STREAM_COUNT; ++i ) {

        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}

float processWithStreams(int streams_used) {
    int current_stream = 0;
    float time;

    cudaEventRecord(start, 0);
    for( int i=0; i < N / 1024 / 1024; ++i ) {
        int next_stream = (current_stream + 1 ) % streams_used;
        printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_in[current_stream], 1);

        // Upload next frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[next_stream], 
            stream[next_stream]) );

        // Download current frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            h_data_out[current_stream], 
            d_data_out[current_stream], 
            memsize, 
            cudaMemcpyDeviceToHost, 
            stream[current_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[current_stream], 
            stream[current_stream]) );

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);    
    cudaEventElapsedTime(&time, start, stop);
    return time;
}

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-23T01:17:07+00:00

The problem is in your kernel. One thing that happens when checking errors in CUDA is that errors that occurred previously and were not checked will be reported next time you check for an error. That line is the first time you check for errors after the kernel launch which returned the error your are seeing.

The error unspecified launch failure is usually associated with out of bounds accesses to memory if I recall correctly.

You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block we have 32767 * 512 + 511 = 16777215. In the first iteration idx < i and in the following ones you are trying to read and write to position 16777215 of g_data when you only allocated space for 1024 * 1024 integers.

edit: just noticed, why the tag operator overloading?

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

all I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply