I’ve been writing CUDA code for quite a while, but I’m just now getting up to speed on how to use the texture cache.
Using the simpleTexture example from the Nvidia SDK for inspiration, I coded a simple example that uses the texture cache. The host copies the Lena image to the GPU and binds it as a texture. The kernel just copies the contents of the texture cache into an output array.
Oddly, the result (see the all-gray image below the code) is doesn’t match the input. Any thoughts about what might be going wrong?
Code (look at texCache_dummyKernel):
texture<float, 2, cudaReadModeElementType> tex; //declare texture reference for 2D float texture
//note: tex is global, so no input ptr is needed
__global__ void texCache_dummyKernel(float* out, const int width, const int height){ //copy tex to output
int x = blockIdx.x*blockDim.x + threadIdx.x; //my index into "big image"
int y = blockIdx.y*blockDim.y + threadIdx.y;
int idx = y*width+x;
if(x < width && y < height)
out[idx] = tex2D(tex, y, x);
}
int main(int argc, char **argv){
cv::Mat img = getRawImage("./Lena.pgm");
img.convertTo(img, CV_32FC1);
float* hostImg = (float*)&img.data[0];
int width = img.cols; int height = img.rows;
dim3 grid; dim3 block;
block.x = 16; block.y = 16;
grid.x = width/block.x + 1;
grid.y = height/block.y + 1;
cudaArray *dImg; //cudaArray*, not float*
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
CHECK_CUDART(cudaMallocArray(&dImg, &channelDesc, width, height));
CHECK_CUDART(cudaMemcpyToArray(dImg, 0, 0, hostImg, width*height*sizeof(float), cudaMemcpyHostToDevice));
setTexCacheParams(); //defined below
CHECK_CUDART(cudaBindTextureToArray(tex, dImg, channelDesc)); //Bind the array to the texture
float* dResult; //device memory for output
CHECK_CUDART(cudaMalloc((void**)&dResult, sizeof(float)*width*height));
texCache_dummyKernel<<<grid, block>>>(dResult, width, height); //dImg isn't an input param, since 'tex' is a global variable
CHECK_CUDART(cudaGetLastError()); //make sure kernel didn't crash
float* hostResult = (float*)malloc(sizeof(float)*width*height);
CHECK_CUDART(cudaMemcpy(hostResult, dResult, sizeof(float)*width*height, cudaMemcpyDeviceToHost));
outputProcessedImage(hostResult, width, height, "result.png"); //defined below
}
I should probably provide a couple of helper functions that I used above:
void setTexCacheParams(){ //configuration directly pulled from simpleTexture in nvidia sdk
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true; // access with normalized texture coordinates
}
void outputProcessedImage(float* processedImg, int width, int height, string out_filename){
cv::Mat img = cv::Mat::zeros(height, width, CV_32FC1);
for(int i=0; i<height; i++)
for(int j=0; j<width; j++)
img.at<float>(i,j) = processedImg[i*width + j]; //just grab the 1st of the 4 pixel spaces in a uchar4
img.convertTo(img, CV_8UC1); //float to uchar
vector<int> compression_params;
compression_params.push_back(CV_IMWRITE_PNG_COMPRESSION);
compression_params.push_back(9);
cv::imwrite(out_filename, img, compression_params);
}
Input:

Output:

- Sorry that this post is such a wall of code! I’d appreciate any advice on making code like this more concise.
- I used OpenCV for the file I/O above…hope this isn’t confusing.
- When I change the kernel to read the input image from a 1D
float*array, and I keep pretty much everything else the same, I get the correct result.
In your original code, you had initialised the texture to use normalised coordinates. This means that the texture is addressed on [0,1] in each spatial dimension. So your kernel should look this this:
[Standard disclaimer: written in browser, not compiled or tested, use at own risk]
ie. you should pass coordinates to
tex2Dwhich are normalised by dividing through by the image width and height.Alternatively, as you have discovered, you can change the texture definition to
normalized=falseand use addressing in absolute rather than relative textures coordinates. Even then the texture read in your code should look like this:because texture addressing is always done using floating point coordinates and the texture data is voxel centred, thus 0.5 is added to each coordinate to ensure the read comes from the centroid of each interpolation area or volume within the texture.
You can find a description of texture filtering and addressing modes and their effect on interpolation in one of the appendices of the CUDA C programming guide.