__kernel void CKmix(__global short* MCL, __global short* MPCL,__global short *C, int S, int B)
{
unsigned int i=get_global_id(0);
unsigned int ii=get_global_id(1);
MCL[i]+=MPCL[B*ii+i+C[ii]+S];
}
Kernel seams ok, it compiles successfully, and I have obtained the correct results using the CPU as a device, but that was when I had the program release and and recreate my memory objects each time the kernel is called, which for my testing purpose is about 16000 times.
The code I am posting is where I am at now, trying to use pinned memory and mapping.
OpenCLProgram = clCreateProgramWithSource(hContext[Plat-1][Dev-1],11, OpenCLSource, NULL ,NULL);
clBuildProgram(OpenCLProgram, 0,NULL,NULL, NULL,NULL);
ocKernel = clCreateKernel(OpenCLProgram, "CKmix", NULL);
This is also successful. The reason I have a 2d array of contexts is that I iterate through all platforms and devices and allow the user to select the platform and device to use.
WorkSize[0]=SN;
WorkSize[1]=NF;
PinnedCCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE| CL_MEM_ALLOC_HOST_PTR, sizeof(short) *NF, NULL, NULL);
PinnedMCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z*NF, NULL, NULL);
PinnedMO = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z,NULL, NULL);
PinnedMTEMP = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z,NULL, NULL);
DevComboCCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE, sizeof(short) *NF, NULL, NULL);
DevMappedMCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE , sizeof(short) * Z*NF, NULL,NULL);
DevMO = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE , sizeof(short) * Z,NULL, NULL);
MO = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMO, CL_TRUE, CL_MAP_READ, 0, sizeof(short)*Z, 0, NULL, NULL, NULL);
CCL = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedCCL, CL_TRUE, CL_MAP_WRITE, 0, sizeof(short)*NF, 0, NULL, NULL,NULL);
MCL = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMCL, CL_TRUE, CL_MAP_WRITE, 0, sizeof(short)*Z*NF, 0, NULL, NULL, NULL);
MTEMP = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMTEMP, CL_TRUE, CL_MAP_READ, 0, sizeof(short)*Z, 0, NULL, NULL, NULL);
for (n=0; n < Z; ++n) {
MTEMP[n]=0;
}
clSetKernelArg(ocKernel, 0, sizeof(cl_mem), (void*) &DevMO);
clSetKernelArg(ocKernel, 1, sizeof(cl_mem), (void*) &DevMCL);
clSetKernelArg(ocKernel, 2, sizeof(cl_mem), (void*) &DevCCL);
clSetKernelArg(ocKernel, 3, sizeof(int), (void*) &SH);
clSetKernelArg(ocKernel, 4, sizeof(int), (void*) &SN);
The above constitutes my initialization, and the rest below, happens repeatedly.
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevMCL, CL_TRUE, 0, Z*NF*sizeof(short), MCL, 0, NULL, NULL);
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevCCL, CL_TRUE, 0, NF*sizeof(short), CCL, 0, NULL, NULL);
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevMO, CL_TRUE, 0, Z*sizeof(short), MTEMP, 0, NULL, NULL);
clEnqueueNDRangeKernel(hCmdQueue[Plat-1][Dev-1], ocKernel, 2, NULL, WorkSize, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(hCmdQueue[Plat-1][Dev-1],DevMO, CL_TRUE, 0, Z * sizeof(short),(void*) MO , 0, NULL, NULL);
I have checked for errors, and I am not getting any errors. The kernel is launched many times repeatedly with fresh data. I am not sure where I am doing wrong.
NVIDIA 550 ti compute capability 2.1,
latest Dev Driver,
Cuda SDK 4.0,
I don’t know if its the only problem with the code, but this:
is definitely not a good idea. You will generally get multiple threads working on the same
global_id(0), so several threads might try to updateMCL[i]simultaneous (note that+=is not atomic). I would assume that for the CPU there are not enough threads generated to show such a behaviour in most of the cases, while having thousands of threads on the gpu will almost surely lead to problems.The most reasonable way to do this is to have only a 1 dimensional workingset and for each thread accumulate all values which go to one position:
Of course that might or might not be feasible. If it isn’t the fix probably won’t be quite that simple.