I ran into this very weird problem when coding up some CUDA code: the same piece of cudaMemcpy from the gpu to cpu memory takes different time to finish in different iterative calls to the subroutine, and this is a huge difference: ~60 ms vs ~0.02 ms.
The code is as follows:
float calc_formation_obj( int formationNo, bool calcObj )
{
int i;
int prev = prevCP[aperIndex];
int next = nextCP[aperIndex];
float ll = formations_l[formationNo];
float rl = formations_r[formationNo];
float obj = 0.0;
float *f_grid = new float[grid_size_voxe];
// use ll and rl
thrust::device_ptr<float> dll(d_leafpos_l);
thrust::device_ptr<float> drl(d_leafpos_r);
dll[rows_per_beam*aperIndex+ rowIndex] = ll;
drl[rows_per_beam*aperIndex+ rowIndex] = rl;
// set all leaf positions between prev/next
set_leafpos<<<grid_size_ncps,BLOCK_SIZE>>> (aperIndex, rowIndex, prev, next, ncps, d_leafpos_l,
d_leafpos_r, ll, rl, rows_per_beam, d_cp_angles);
// copy dose to dose_temp
thrust::device_ptr<float> ddose(d_dose);
thrust::device_ptr<float> dtp(d_dose_temp);
thrust::copy(ddose, ddose+nvoxel, dtp);
// the angles actually being added
if (prev==-1) {
prev = 0;
}
if (next==ncps) {
next = ncps-1;
}
// add dose from all these leaf positions
// if last arg 1 then add
add_remove_dose<<<grid_size_ncps,BLOCK_SIZE>>> (prev,next, rowIndex, d_dose_temp, d_leafpos_l,
d_leafpos_r, d_voxe_b, d_dijs_b, d_voxnumperbixcum, d_flu_cp, rows_per_beam, bix_per_row, beamletSize, 1);
if (!calcObj) {
return(0.0);
}
// initialize
cudaMemset((void*)d_f_voxel, 0, voxesize_f);
cudaMemset((void*)d_f_grid, 0, sizeof(float)*grid_size_voxe);
// then calculate objective
calc_obj_dose<<<grid_size_voxe,BLOCK_SIZE>>>( d_dose_temp, d_f_voxel, d_thresh, d_is_target, nvoxel,
d_f_grid, d_od_wt, d_ud_wt );
// copy results from GPU
time_t time_1,time_2;
float elapse;
time_1=clock();
cudaMemcpy(f_grid, d_f_grid, sizeof(float)*grid_size_voxe, cudaMemcpyDeviceToHost);
time_2 = clock();
elapse = ((float)time_2 - (float)time_1)/1000;
printf( "iter %d copy time: %f ms\n", formationNo, elapse );
obj = 0.0;
for (i=0; i<grid_size_voxe; i++) {
obj += f_grid[i];
}
delete[] f_grid;
return(obj);
}
This subroutine is called many times during the program, and everytime it runs I record the runtime for
cudaMemcpy(f_grid, d_f_grid, sizeof(float)*grid_size_voxe, cudaMemcpyDeviceToHost);
and the results I got look like:
iter 0 copy time: 0.018000 ms
iter 1 copy time: 66.445999 ms
iter 2 copy time: 64.239998 ms
iter 3 copy time: 66.959999 ms
iter 4 copy time: 66.328003 ms
iter 5 copy time: 65.656998 ms
iter 6 copy time: 66.120003 ms
iter 7 copy time: 63.811001 ms
iter 8 copy time: 66.530998 ms
iter 9 copy time: 65.686996 ms
iter 10 copy time: 65.808998 ms
iter 11 copy time: 0.027000 ms
iter 12 copy time: 64.346001 ms
iter 13 copy time: 66.407997 ms
iter 14 copy time: 65.796997 ms
iter 15 copy time: 65.471001 ms
iter 16 copy time: 66.209000 ms
iter 17 copy time: 63.799000 ms
iter 18 copy time: 66.542999 ms
iter 19 copy time: 65.660004 ms
iter 20 copy time: 65.102997 ms
iter 21 copy time: 0.019000 ms
iter 22 copy time: 64.665001 ms
iter 23 copy time: 66.653999 ms
iter 24 copy time: 65.648003 ms
iter 25 copy time: 65.233002 ms
iter 26 copy time: 65.851997 ms
iter 27 copy time: 63.992001 ms
iter 28 copy time: 66.172997 ms
iter 29 copy time: 65.503998 ms
iter 30 copy time: 0.020000 ms
iter 31 copy time: 66.277000 ms
iter 32 copy time: 63.881001 ms
iter 33 copy time: 66.537003 ms
iter 34 copy time: 65.626999 ms
iter 35 copy time: 65.387001 ms
iter 36 copy time: 66.084999 ms
iter 37 copy time: 63.797001 ms
iter 38 copy time: 0.017000 ms
iter 39 copy time: 65.707001 ms
iter 40 copy time: 65.553001 ms
iter 41 copy time: 66.362999 ms
iter 42 copy time: 63.634998 ms
This is run on Mac OSX with a GeForce GTX 285, using CUDA 4.2. I’ve no idea why it’s doing this — should be a straightforward copy. Any help is appreciated!!
The
cudaMemcpyblocks until the previous kernel completes. Are you sure you’re not measuring kernel performance instead of copy performance? Throw in acudaDeviceSynchronize()before you start timing thecudaMemcpy.