I have the following code in cuda_computation.cu:
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
// Forward declaration; the definition follows main(). Prints msg together with
// the CUDA error string to stderr and exits when cudaGetLastError() does not
// return cudaSuccess.
void checkCUDAError(const char *msg);
/*
 * All-pairs Euclidean distance between 2D points (x[k], y[k]).
 *
 * Launch layout: 1D grid of 1D blocks. blockIdx.x selects the first point of
 * the pair and threadIdx.x selects the second; the distance is written to
 * f[blockIdx.x * blockDim.x + threadIdx.x].
 *
 * Preconditions (there is no bounds check inside the kernel):
 *   - x and y hold at least max(gridDim.x, blockDim.x) floats each;
 *   - f holds at least gridDim.x * blockDim.x floats.
 * No shared memory is used.
 */
__global__ void euclid_kernel(float *x, float* y, float* f)
{
    const int first  = blockIdx.x;   // point chosen by the block
    const int second = threadIdx.x;  // point chosen by the thread within the block
    const int out    = first * blockDim.x + second;

    // Read every coordinate once and reuse the differences.
    const float dx = x[first] - x[second];
    const float dy = y[first] - y[second];

    f[out] = sqrtf(dx * dx + dy * dy);
}
/*
 * Benchmark driver for euclid_kernel.
 *
 * Builds n = 256 points with x[k] = y[k] = k, then performs 10000 runs. Each
 * run perturbs xh[0], uploads x, launches euclid_kernel with n blocks of n
 * threads, and downloads the n * n distance matrix into fh.
 *
 * Returns 0 on success and -1 if a host allocation fails; any CUDA failure
 * terminates the process through checkCUDAError.
 */
int main()
{
    const size_t n = 256;
    const size_t numBlocks = n;
    const size_t numThreadsPerBlock = n;
    const size_t vecSize = n * sizeof(float);
    const size_t memSize = numBlocks * numThreadsPerBlock * sizeof(float);
    const int numRuns = 10000;

    // Host buffers: the two coordinate vectors and the n * n result matrix.
    float *xh = (float *) malloc(vecSize);
    float *yh = (float *) malloc(vecSize);
    float *fh = (float *) malloc(memSize);
    if (xh == NULL || yh == NULL || fh == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(xh);  // free(NULL) is a no-op, so partial failures are handled too
        free(yh);
        free(fh);
        return -1;
    }

    for (size_t ii = 0; ii != n; ++ii)
    {
        xh[ii] = (float) ii;
        yh[ii] = (float) ii;
    }

    // Device buffers are allocated once and reused by every run.
    float *xd = NULL;
    float *yd = NULL;
    float *fd = NULL;
    cudaMalloc( (void **) &xd, vecSize );
    checkCUDAError("cudaMalloc");
    cudaMalloc( (void **) &yd, vecSize );
    checkCUDAError("cudaMalloc");
    cudaMalloc( (void **) &fd, memSize );
    checkCUDAError("cudaMalloc");

    // yh is never modified after initialisation, so a single upload suffices.
    cudaMemcpy( yd, yh, vecSize, cudaMemcpyHostToDevice );
    checkCUDAError("cudaMemcpy");

    // The launch configuration is identical for every run.
    const dim3 dimGrid((unsigned int) numBlocks);
    const dim3 dimBlock((unsigned int) numThreadsPerBlock);

    for (int run = 0; run != numRuns; ++run)
    {
        // change value to avoid optimizations
        xh[0] = ((float) run) / 10000.0f;
        cudaMemcpy( xd, xh, vecSize, cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy");

        euclid_kernel<<< dimGrid, dimBlock >>>( xd, yd, fd );
        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
        // Waiting here lets a kernel failure be reported under its own label
        // instead of being attributed to the copy that follows.
        cudaDeviceSynchronize();
        checkCUDAError("kernel execution");

        cudaMemcpy( fh, fd, memSize, cudaMemcpyDeviceToHost );
        checkCUDAError("cudaMemcpy");
    }

    cudaFree(xd);
    cudaFree(yd);
    cudaFree(fd);
    free(xh);
    free(yh);
    free(fh);
    return 0;
}
/*
 * Abort-on-error helper.
 *
 * Fetches the result of cudaGetLastError(). If it is cudaSuccess the function
 * simply returns; otherwise it prints msg followed by the CUDA error string to
 * stderr and terminates the process with exit(-1).
 *
 * msg: caller-supplied label identifying the operation being checked.
 */
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;  // nothing to report
    }

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
It takes about 6 seconds to run on a Quadro FX 380, while the corresponding serial version using just one i7-870 core takes only about 3 seconds. Am I missing something? Is the code under-optimised in some way? Or is it simply expected behaviour that, for simple calculations (like this all-pairs Euclidean distance), the overhead of moving memory exceeds the computational gain?
I think you are being killed by the time to move the data.
Especially since you are calling the CUDA kernel with individual values, it might be quicker to upload a large set of values as a 1D array and operate on them.
Also, sqrt isn’t done in hardware on CUDA (at least not on my GPU), whereas the CPU has optimized FPU hardware for this and is probably 10x faster than the GPU. For a small job like this, the CPU is probably also keeping all the results in cache between the timing runs.