I wrote small CUDA code to understand global memory to shared memory transfer transactions. The code is as follows:
#include <iostream>
using namespace std;
__global__ void readUChar4(uchar4* c, uchar4* o){
extern __shared__ uchar4 gc[];
int tid = threadIdx.x;
gc[tid] = c[tid];
o[tid] = gc[tid];
}
int main(){
string a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
uchar4* c;
cudaError_t e1 = cudaMalloc((void**)&c, 128*sizeof(uchar4));
if(e1==cudaSuccess){
uchar4* o;
cudaError_t e11 = cudaMalloc((void**)&o, 128*sizeof(uchar4));
if(e11 == cudaSuccess){
cudaError_t e2 = cudaMemcpy(c, a.c_str(), 128*sizeof(uchar4), cudaMemcpyHostToDevice);
if(e2 == cudaSuccess){
readUChar4<<<1,128, 128*sizeof(uchar4)>>>(c, o);
uchar4* oFromGPU = (uchar4*)malloc(128*sizeof(uchar4));
cudaError_t e22 = cudaMemcpy(oFromGPU, o, 128*sizeof(uchar4), cudaMemcpyDeviceToHost);
if(e22 == cudaSuccess){
for(int i =0; i < 128; i++){
cout << oFromGPU[i].x << " ";
cout << oFromGPU[i].y << " ";
cout << oFromGPU[i].z << " ";
cout << oFromGPU[i].w << " " << endl;
}
}
else{
cout << "Failed to copy from GPU" << endl;
}
}
else{
cout << "Failed to copy" << endl;
}
}
else{
cout << "Failed to allocate output memory" << endl;
}
}
else{
cout << "Failed to allocate memory" << endl;
}
return 0;
}
This code simply copies data from device memory to shared memory and back to device memory. I have the following three questions:
- Is the transfer from device memory to shared memory in this case guaranteed to take 4 memory transactions? I believe it depends on how
cudaMallocallocates memory; if the memory is allocated in a haphazard manner such that the data is scattered over memory, then it will take more than 4 memory transactions. However, ifcudaMallocallocates memory in 128 byte chunks or it allocates memory contiguously, then it should not take more than 4 memory transactions. - Does the above logic also hold for writing data from shared memory to device memory i.e., the transfer will complete in 4 memory transactions?
- Can this code cause bank conflicts. I believe that this code will not cause bank conflicts if threads are assigned ids sequentially. However, if thread 32 and 64 are scheduled to run in the same warp, then this code can cause bank conflicts.
In the code you provided (repeated here) the compiler will completely remove the shared memory store and load since they don’t do anything necessary or beneficial for the code.
Assuming you did something with the shared memory so it was not eliminated, then:
uchar4= 4*8 bits) per thread (total 128 bytes per warp).cudaMallocallocates memory contiguously.