Sign Up

Sign up for our social questions-and-answers engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Log in to our social questions-and-answers engine to ask questions, answer people’s questions, and connect with other people.

Sign Up Here

Forgot Password?

Don't have an account? Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link by email that lets you create a new password.

Have an account? Sign In Now

You must log in to ask a question.

Forgot Password?

Need an account? Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • Home
  • SEARCH
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 1097927
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: May 17, 2026 (2026-05-17T00:30:15+00:00)

I constructed my own little Opencl example using different sources on the net. The

  • 0

I put together my own small OpenCL example from various sources on the net. The kernel itself works and I get the output I want, but the cleanup functions, which I found in one of the examples, cause segfaults. What did I do wrong?

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

/*
 * Evaluate _expr, which must yield a cl_int status code. If the code is not
 * CL_SUCCESS, print the expression text and the code to stderr and abort().
 * The do { ... } while (0) wrapper makes the macro usable as one statement.
 */
#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = _expr;                                                       \
     if (_err == CL_SUCCESS)                                                    \
       break;                                                                   \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
     abort();                                                                   \
   } while (0)

/*
 * Evaluate _expr and yield its value, aborting if the status is not
 * CL_SUCCESS. The macro declares its own _err, preset to CL_INVALID_VALUE,
 * so _expr has to store the status into it (for example by passing &_err as
 * the error out-parameter); otherwise the check always fails.
 * Relies on the ({ ... }) statement-expression and typeof compiler extensions.
 */
#define CL_CHECK_ERR(_expr)                                                     \
   ({                                                                           \
     cl_int _err = CL_INVALID_VALUE;                                            \
     typeof(_expr) _ret = _expr;                                                \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   })

/*
 * Source of the VectorAdd kernel, split into 7 string fragments that
 * init_opencl() hands to clCreateProgramWithSource with a count of 7.
 * Only the two "//" comment fragments end in '\n'; presumably this is what
 * keeps each line comment from swallowing the fragment that follows it.
 */
const char* OpenCLSource[] = {
       "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
       "{",
       "      // Index of the elements to add \n",
       "      unsigned int n = get_global_id(0);",
       "      // Sum the n’th element of vectors a and b and store in c \n",
       "      c[n] = a[n] + b[n];",
       "}"
};

/*
 * Create the OpenCL objects used by main(): a GPU context, a command queue on
 * the first device of that context, the program built from OpenCLSource and
 * the "VectorAdd" kernel.
 *
 * GPUContext, GPUCommandQueue, cl_forward1 and OpenCLProgram are out-parameters
 * that receive the created context, queue, kernel and program.
 *
 * Returns the malloc()ed device list of the context; the caller owns it.
 *
 * NOTE(review): the status codes of the clCreate* calls are only printed,
 * never checked, and the malloc() result is used without a NULL check.
 */
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context on a GPU-type device (CL_DEVICE_TYPE_GPU)
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\n1-%i\n",_err);
    // Get the list of GPU devices associated with this context:
    // the first call asks for the list size in bytes, the second fills the list
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\n2-%i\n",_err);
    // Create OpenCL program with source code (7 = number of OpenCLSource fragments)
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\n3-%i\n",_err);

    // Build the program; CL_CHECK aborts on a build failure
    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


    // Look up the kernel by name in the built program
     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
               printf("\n7-%i\n",errcode);

    return GPUDevices;
}


/*
 * Adds the vectors x and y with the VectorAdd kernel and prints the result,
 * then releases the OpenCL objects.
 *
 * This is the code as posted in the question; the NOTE(review) comments mark
 * the defects visible in it.
 */
int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    // Two integer source vectors in Host memory
    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    // NOTE(review): sizeof(x), sizeof(y) and sizeof(output) are already the
    // byte sizes of the whole arrays, so each size below is n times too large.
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);

    int size_output = n*sizeof(output); // this changes for the second forward1
    cl_int _err;
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    // NOTE(review): with CL_MEM_COPY_HOST_PTR the buffers are initialized from
    // size_x / size_y bytes of x / y, which is more than the arrays hold.
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                     printf("\n4-%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                     printf("\n5-%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\n6-%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
     // (argument order matches the kernel signature: c, a, b)
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    // 7. Launch OpenCL kernel
    // NOTE(review): localWorkSize is declared but never used.
    size_t localWorkSize[1], globalWorkSize[1];
    //localWorkSize = ;
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU (one work-item per vector element)
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    // Copy the output in GPU memory back to CPU memory
    // NOTE(review): this read stores size_output bytes into output, which only
    // has room for n ints, so it writes past the end of the array. That is
    // the likely source of the crashes reported for the cleanup calls below.

    //float* h_C = (float*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                output, 0, NULL, NULL));
    for (int i=0; i<n;i++){
        printf("\n%i",output[i]);
    }

    // Cleanup (each of the following lines causes a seg fault
    // ******************************
    // NOTE(review): free() returns void, so it cannot be wrapped in CL_CHECK,
    // which assigns the wrapped expression to a cl_int.
    CL_CHECK(free(GPUDevices)); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
    // NOTE(review): the comment opened on the next line is never closed, so
    // the return statement and the closing brace end up inside it.
    /* ****************

    return 0;
}

Merci!

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must log in to add an answer.

Forgot Password?

Need an account? Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    Added an answer on May 17, 2026 at 12:30 am (2026-05-17T00:30:16+00:00)

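    I corrected and changed several small things, so this code should work now. The two changes that matter: `free()` returns `void`, so it must be called directly rather than through `CL_CHECK`; and the result is now read back into a heap buffer of `size_output` bytes instead of the 5-element `output` array, which the original read overflowed.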

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>
    #include <CL/cl.h> //opencl
    
    /*
     * Run an OpenCL call that returns a cl_int status code. Any status other
     * than CL_SUCCESS is reported on stderr, together with the text of the
     * call, and the process is aborted. Usable wherever a statement is.
     */
    #define CL_CHECK(_expr)                                                         \
       do {                                                                         \
         cl_int _err = _expr;                                                       \
         if (_err != CL_SUCCESS) {                                                  \
           fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
           abort();                                                                 \
         }                                                                          \
       } while (0)
    
    /*
     * Evaluate an OpenCL call that returns an object and reports its status
     * through an out-parameter, and yield the returned object.
     *
     * The macro declares its own _err, preset to CL_INVALID_VALUE, so _expr
     * must pass &_err as the status out-parameter, e.g.
     *   cl_mem buf = CL_CHECK_ERR(clCreateBuffer(ctx, flags, size, NULL, &_err));
     * If _err is not CL_SUCCESS afterwards, the call text and the code are
     * printed to stderr and the process is aborted.
     *
     * Uses a GNU statement expression. __typeof__ is spelled with underscores
     * because plain typeof is not a keyword in strict ISO modes (-std=c11).
     */
    #define CL_CHECK_ERR(_expr)                                                     \
       ({                                                                           \
         cl_int _err = CL_INVALID_VALUE;                                            \
         __typeof__(_expr) _ret = (_expr);                                          \
         if (_err != CL_SUCCESS) {                                                  \
           fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
           abort();                                                                 \
         }                                                                          \
         _ret;                                                                      \
       })
    
    /*
     * OpenCL C source of the VectorAdd kernel (c[n] = a[n] + b[n]), stored as
     * separate string fragments that are passed together to
     * clCreateProgramWithSource. The two "//" comment fragments carry their
     * own '\n' terminator.
     */
    const char *OpenCLSource[] = {
        "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
        "{",
        "      // Index of the elements to add \n",
        "      unsigned int n = get_global_id(0);",
        "      // Sum the n’th element of vectors a and b and store in c \n",
        "      c[n] = a[n] + b[n];",
        "}",
    };
    
    cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
    
        // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
        cl_int _err;
        *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
        printf("\nclCreateContextFromType:%i\n",_err);
        // Get the list of GPU devices associated with this context
        size_t ParmDataBytes;
        CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
        cl_device_id* GPUDevices;
        GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
        CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
        // Create a command-queue on the first GPU device
        *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
        printf("\nclCreateCommandQueue:%i\n",_err);
        // Create OpenCL program with source code
        *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
        printf("\nclCreateProgramWithSource:%i\n",_err);
    
        CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
                  NULL, NULL, NULL, NULL));
    
    
         cl_int errcode;
        *cl_forward1 = clCreateKernel(*OpenCLProgram, 
                   "VectorAdd", &errcode);
                   printf("\nclCreateKernel:%i\n",errcode);
    
        return GPUDevices;
    }
    
    
    /* Abort with a diagnostic when an OpenCL call reported a failure status. */
    static void require_cl_success(const char *call, cl_int status)
    {
        if (status != CL_SUCCESS) {
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", call, (int)status);
            abort();
        }
    }

    /*
     * Add two 5-element int vectors with the VectorAdd kernel and print the sums.
     *
     * Sets up OpenCL through init_opencl(), uploads x and y, runs one work-item
     * per element, reads the result back into a heap buffer and releases every
     * object it created. Returns 0 on success; any OpenCL or allocation failure
     * aborts the process.
     */
    int main(int argc, char** argv)
    {
        (void)argc;
        (void)argv;

        cl_context GPUContext;
        cl_command_queue GPUCommandQueue;
        cl_program OpenCLProgram;
        cl_kernel OpenCLVectorAdd;
        cl_device_id* GPUDevices;

        GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

        int n=5 ;
        int x[5]={1,2,4,6,8};
        int y[5]={1,2,4,6,8};

        // sizeof(x) is already the byte size of the whole array. Multiplying it
        // by n again made OpenCL copy far past the end of x and y.
        size_t size_x = sizeof(x);
        size_t size_y = sizeof(y);
        size_t size_output = (size_t)n * sizeof(int);

        cl_int _err;

        // Allocate GPU memory for source vectors AND initialize from CPU memory
        cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                        CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
        printf("\nclCreateBuffer:%i\n",_err);
        require_cl_success("clCreateBuffer(x)", _err);
        cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                        CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
        printf("\nclCreateBuffer:%i\n",_err);
        require_cl_success("clCreateBuffer(y)", _err);

        // Allocate output memory on GPU
        cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                         size_output, NULL, &_err);
        printf("\nclCreateBuffer:%i\n",_err);
        require_cl_success("clCreateBuffer(total)", _err);

        // Bind the buffers to the kernel arguments in signature order: c, a, b
        CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&total_cl));
        CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl));
        CL_CHECK(clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl));

        size_t globalWorkSize[1];
        globalWorkSize[0] = (size_t)n;

        // Launch the Kernel on the GPU, one work-item per element
        CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
        CL_CHECK(clFinish(GPUCommandQueue));

        // Copy the output in GPU memory back to CPU memory. The read is
        // blocking (CL_TRUE), so h_c is complete when the call returns.
        int *h_c = malloc(size_output);
        if (h_c == NULL) {
            fprintf(stderr, "Out of memory while allocating the result buffer\n");
            abort();
        }
        CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
                  total_cl, CL_TRUE, 0, size_output,
                    h_c, 0, NULL, NULL));
        for (int i=0; i<n;i++){
            printf("\noutput[%i]=%i",i,h_c[i]);
        }

        // Cleanup: host memory first, then OpenCL objects in reverse order of
        // creation so that nothing outlives the context it belongs to.
        free(h_c);
        CL_CHECK(clReleaseMemObject(total_cl));
        CL_CHECK(clReleaseMemObject(y_cl));
        CL_CHECK(clReleaseMemObject(x_cl));
        CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
        CL_CHECK(clReleaseProgram(OpenCLProgram));
        CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
        CL_CHECK(clReleaseContext(GPUContext));
        free(GPUDevices);

        return 0;
    }
    
    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

I need a little more help to get how a DI framework like Ninject
I faced a little trouble - I do not know if I can define
For a little background information, I've got an application that's running in a loop,
Sorry if the title is a little vague, I do not know how else
I've created my own custom pseudo enumerations within my domain model to allow me
I've got a LINQ problem thats got me a little stumped. I can see
I have 3 kinds of objects: Agency, BusinessUnit and Client (each with their own
I have a singleton instance that is referenced throughout the project which works like
boost::shared_ptr has an unusual constructor template<class Y> shared_ptr(shared_ptr<Y> const & r, T * p);
I watched John Resig's Best Practices in JavaScript Library Design presentation; one slide suggested

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.