Sign Up

Sign Up to our social questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Log in to our social Questions & Answers Engine to ask questions, answer people’s questions, and connect with other people.

Sign Up Here

Forgot Password?

Don't have account, Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive an email with a link to create a new password.

Have an account? Sign In Now

You must login to ask a question.

Forgot Password?

Need An Account, Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • Home
  • SEARCH
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 7529519
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: May 30, 2026 (2026-05-30T04:40:53+00:00)

I’ve written the following code. I have a loop which iterates between two red

  • 0

I’ve written the following code. I have a loop which alternates between two kernels, red and black. In each iteration I call clEnqueueReadBuffer, which I think is not efficient. Is there a more efficient way to call the kernels repeatedly?
Thanks

#include <stdio.h>
#include <stdlib.h> 
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <ocl

Utils.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif


/* Problem size and iteration controls. Every expansion is parenthesized so
 * the macros stay correct inside larger expressions. */
#define DATA_SIZE (1048576)          /* element count of the SQ x SQ matrix */
#define NANO_TO_MILI 1e6             /* not referenced in the visible code */
#define MAX_ITER 1                   /* not referenced in the visible code */
#define LIMIT 100                    /* upper bound on sweeps in main() */
#define BIG_RANGE (LIMIT*4*100)      /* sentinel stored in newX[0] before the first sweep */

#define EPS 1e-2                     /* threshold compared against norm() in main() */
#define SQ 1024                      /* rows == columns of the matrix */

/* Row-major element access into the flat host array `A`. */
#define A(i,j) A[(i)*SQ+(j)]

using namespace std;

/* OpenCL objects shared by every helper in this file. */
cl_platform_id platforms;            /* filled by clGetPlatformIDs in CreateQueue */
cl_device_id device;                 /* GPU device selected in CreateQueue */
cl_context context;                  /* context created on `device` */
cl_program program1, program2;       /* program1 is built from Red.cl; program2 is not used in the visible code */
cl_command_queue command;            /* queue all work is enqueued on */
cl_int err;                          /* status of the most recent OpenCL call */
cl_kernel kernel_red, kernel_black;  /* the "Red" and "Black" kernels of program1 */
cl_int i;                            /* not used in the visible code (main declares its own i) */
cl_mem input_A,input_b,in_out_X;     /* device buffers for A, b and the solution vector */
cl_event timing_event;               /* not used in the visible code */
cl_ulong time_start, time_end,total_time = 0;  /* not used in the visible code */


/* Build options handed to clBuildProgram. */
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
/* Kernel entry points looked up in program1. Declared const because string
 * literals are not convertible to `char *` in C++11 and later. */
const char *kernel_names[] = {"Red","Black"};

/* Defined outside the visible part of this file. */
float norm (float*,float*,int);
void swap(float **in, float **out); 

/* Host-side helpers; each one calls exit(1) when an OpenCL call fails. */
void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();                     /* defined outside the visible part of this file */

/* Host arrays: A is the flat SQ x SQ matrix; oldX, newX and b hold SQ floats each. */
float *A,*oldX,*newX,*b;

/*
 * Host driver for the red/black iteration.
 *
 * Builds the SQ x SQ tridiagonal matrix A and the right-hand side b on the
 * host, sets up the OpenCL objects through the helpers in this file, then
 * runs the "Red" and "Black" kernels back to back until
 * norm(oldX, newX, SQ) is no longer above EPS or LIMIT sweeps have run.
 *
 * argc/argv are unused. Returns 0 on completion; the setup helpers visible
 * in this file terminate the process with exit(1) on failure.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const unsigned int count = DATA_SIZE;
    int i, j;

    A    = (float*)malloc(sizeof(float)*count);
    newX = (float*)malloc(sizeof(float)*SQ);
    oldX = (float*)malloc(sizeof(float)*SQ);
    b    = (float*)malloc(sizeof(float)*SQ);
    if (!A || !newX || !oldX || !b) {
        fprintf(stderr, "Error: Failed to allocate host memory!\n");
        exit(1);
    }

    const float h = 1.0f/SQ;

    for (i = 0; i < SQ; i++) {
        const float xi = 0.0f + (i+1)*h;   /* grid point of row i */

        oldX[i] = 0.0f;
        newX[i] = 0.0f;                    /* norm() reads all SQ entries */

        if (i != 0) b[i] = -2.0f*xi;
        else        b[i] = -2.0f*xi-1.0f/(h*h)+1.0f/(2.0f*h);

        for (j = 0; j < SQ; j++) A(i,j) = 0.0f;
        A(i,i) = -2.0f/(h*h);
        /* The off-diagonal neighbours do not exist in the first and last
         * rows; writing them would land outside the allocation. */
        if (i != SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h);
        if (i != 0)    A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h);
    }

    /* Make the initial difference large so the first sweep always runs. */
    newX[0] = BIG_RANGE;

    int cnt = 0;

    CreateQueue();
    CreateKernel();
    CreateBuffer(count);

    Kernel_Arg_Set(kernel_red  ,count);
    Kernel_Arg_Set(kernel_black,count);

    Enqueue_Write_Buffer(count);

    /* NOTE(review): assumes norm() measures the difference between its two
     * vectors; its definition is outside this file's visible part. */
    while (norm(oldX,newX,SQ) > EPS && cnt < LIMIT) {
        /* Rotate the host buffers so newX keeps the previous iterate and
         * the read-back below fills oldX with the latest one. */
        float *previous = oldX;
        oldX = newX;
        newX = previous;

        /* Both kernels are enqueued without an intermediate read-back: the
         * host only needs the vector once per sweep, for the norm test. */
        Create_Work_Group(kernel_red, count);
        Create_Work_Group(kernel_black, count);

        Enqueue_Read_Buffer(count);        /* blocking read into oldX */

        cnt++;
    }

    clFinish(command);

    Shutdown();

    free(A);
    free(b);
    free(oldX);
    free(newX);
    return 0;
}




/*
 * Picks the first OpenCL platform and its first GPU device, then creates the
 * global context and a profiling-enabled command queue on that device.
 *
 * Failures are reported on stderr together with the OpenCL status code
 * (OpenCL does not set errno, so perror() would print an unrelated message)
 * and terminate the process with exit(1).
 */
void CreateQueue(){
    err = clGetPlatformIDs(1, &platforms, NULL);
    if (err < 0) {
        fprintf(stderr, "no platform (OpenCL error %d)\n", err);
        getchar();   /* keep the console open until a key is pressed */
        exit(1);
    }

    err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err < 0) {
        fprintf(stderr, "no device (OpenCL error %d)\n", err);
        getchar();
        exit(1);
    }

    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        fprintf(stderr, "Couldn't create a context (OpenCL error %d)\n", err);
        exit(1);
    }

    command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (!command || err < 0) {
        printf("Error: Failed to create a command queue! %d\n", err);
        exit(1);
    }
}

/*
 * Creates the three device buffers and initialises each from its host array:
 *   input_A  - read-only,  `count` floats copied from A
 *   in_out_X - read/write, SQ floats copied from oldX
 *   input_b  - read-only,  SQ floats copied from b
 * Prints a message and exits with status 1 if any buffer handle is null.
 */
void CreateBuffer(unsigned int count){
    const size_t matrix_bytes = sizeof(float) * count;
    const size_t vector_bytes = sizeof(float) * SQ;

    input_A  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              matrix_bytes, A, NULL);
    in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR,
                              vector_bytes, oldX, NULL);
    input_b  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              vector_bytes, b, NULL);

    const bool all_created = input_A && input_b && in_out_X;
    if (all_created)
        return;

    printf("Error: Failed to allocate device memory!\n");
    exit(1);
}


void CreateKernel(){

    FILE *fp;
    size_t program_size;
    string kernel_src;
    fp = fopen("Red.cl", "r");
    fseek(fp, 0, SEEK_END);
    program_size = ftell(fp);
    kernel_src.resize(program_size + 1);
    fseek(fp, 0, SEEK_SET);
    fread(&kernel_src[0], program_size, 1, fp);
    fclose(fp);
    kernel_src[program_size] = '\0';


const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);

if (!program1)
   {
      printf("clCreateProgramWithSource failed\n");
      exit(1);
   }

err =clBuildProgram(program1, 1, &device, options, NULL, NULL);

if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2*2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }



kernel_red   = clCreateKernel(program1, kernel_names[0], &err);

if (!kernel_red || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }


kernel_black   = clCreateKernel(program1, kernel_names[1], &err);

if (!kernel_black || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

}

/*
 * Enqueues `kernel` on the global command queue over a two-dimensional
 * SQ x SQ index space, using 32 x 32 work-groups. `count` is not used.
 * Prints a message and exits with status 1 if the enqueue call fails.
 */
void Create_Work_Group(cl_kernel kernel, unsigned int count){
    /* Only the first two entries are consumed (work dimension is 2). */
    size_t range_total[]     = {SQ, SQ, 0};
    size_t range_per_group[] = {32, 32, 0};

    err = clEnqueueNDRangeKernel(command, kernel, 2, NULL,
                                 range_total, range_per_group,
                                 0, NULL, NULL);
    if (!err)
        return;

    printf("Error: Failed to execute kernel!\n");
    exit(1);
}

/*
 * Binds the three device buffers to `kernel`:
 *   argument 0 = input_A, argument 1 = in_out_X, argument 2 = input_b.
 * `count` is not used.
 *
 * Each call is checked on its own so the printed value is the real OpenCL
 * status of the failing call rather than several codes OR-ed together.
 * Exits with status 1 on the first failure.
 */
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
    (void)count;

    cl_mem *const buffers[] = { &input_A, &in_out_X, &input_b };
    const unsigned int buffer_total = sizeof(buffers) / sizeof(buffers[0]);

    for (unsigned int arg = 0; arg < buffer_total; arg++) {
        err = clSetKernelArg(kernel, arg, sizeof(cl_mem), buffers[arg]);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments! %d\n", err);
            exit(1);
        }
    }
}

/*
 * Blocking read of SQ floats from the device buffer in_out_X into the host
 * array oldX. `count` is not used.
 * Prints the OpenCL status and exits with status 1 if the read fails.
 */
void Enqueue_Read_Buffer(unsigned int count){
    const size_t vector_bytes = sizeof(float) * SQ;

    err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE, 0,
                              vector_bytes, oldX, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        return;

    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
}

/*
 * Enqueues non-blocking uploads of the host arrays to their device buffers:
 *   A    -> input_A  (`count` floats)
 *   b    -> input_b  (SQ floats)
 *   oldX -> in_out_X (SQ floats)
 *
 * The writes are non-blocking, so the host arrays must stay unchanged until
 * the queue has processed them. Later uploads are skipped once one fails,
 * so `err` holds the status of the failing call instead of several codes
 * OR-ed together. Exits with status 1 on failure.
 */
void Enqueue_Write_Buffer(unsigned int count){
    err = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count,   A, 0, NULL,  NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ   ,   b, 0, NULL,  NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ   ,oldX, 0, NULL,  NULL);

    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        fprintf(stderr, "clEnqueueWriteBuffer returned %d\n", err);
        exit(1);
    }

}
  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must login to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-05-30T04:40:55+00:00Added an answer on May 30, 2026 at 4:40 am

    What you do is quite inefficient. You can write the buffer only once, then enqueue as many kernels as you want, with the same buffer as their argument. Of course if you need to compute the norm, you need to read data back. I would suggest something like this:

    1. Create an additional buffer for the norm; check at the beginning of every kernel what the norm is (just by reading its value); if it is smaller than threshold value, return immediately.

    2. Create a new kernel which will compute the norm for you.

    3. Enqueue tasks like:

      • write buffers,
      • kernels: { {red,black}*10, updateNorm}*10
      • read buffers.

      The computation will run 10 times, then the norm will be updated. If the norm is already OK, the computation kernels that are already enqueued will return immediately. After the queue is finished, read the buffers back and check the norm on the CPU. If the norm is still not OK, enqueue the same batch of kernels again.

      In the worst case, you will waste 9 real and 90 immediately returning kernel runs.

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

I have this code: - (void)parser:(NSXMLParser *)parser foundCDATA:(NSData *)CDATABlock { NSString *someString = [[NSString
I have a text area in my form which accepts all possible characters from
I ran into a problem. Wrote the following code snippet: teksti = teksti.Trim() teksti
I have a string like this: La Torre Eiffel paragonata all&#8217;Everest What PHP function
I am trying to loop through a bunch of documents I have to put
link Im having trouble converting the html entites into html characters, (&# 8217;) i
I have just tried to save a simple *.rtf file with some websites and
I am trying to understand how to use SyndicationItem to display feed which is
I used javascript for loading a picture on my website depending on which small
I have a jquery bug and I've been looking for hours now, I can't

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.