My program uses the CUDA Radix Sort class. After updating from CUDA 4.0 to 4.2, an auxiliary init function of the class crashes with the message “Stack around the variable ‘devprop’ was corrupted”. I isolated the problem by commenting out parts of the function and found that the call to cudaGetDeviceProperties is what corrupts devprop. I don’t know why this happens or how to fix it. My setup is CUDA 4.2, dev driver 301.32, Nsight 2.2, and 64-bit Windows 7, compiling for Win32. The following snippet contains the crashing initDeviceParameters() auxiliary function:
namespace nvRadixSort
{
#include "radixsort.h"
#include "cudpp/cudpp.h"
#include <stdio.h>
#include <assert.h>
// Assigned by initDeviceParameters(): true when the current device's
// compute capability (major * 10 + minor) is below 12, i.e. older than sm_12.
bool bManualCoalesce = false;
// Assigned by initDeviceParameters(): true when the current device's
// compute capability (major * 10 + minor) is below 20, i.e. older than sm_20.
bool bUsePersistentCTAs = false;
// Derives bManualCoalesce and bUsePersistentCTAs from the compute capability
// of the current CUDA device.
//
// keysOnly: not read by the code shown here; the elided persistent-CTA branch
//           may depend on it -- TODO confirm against the full source.
//
// If either the current device or its properties cannot be queried, both
// flags keep their previous values. The failing runtime call is not cleared
// here, so a later error check by the caller can still observe it.
//
// NOTE(review): cudaDeviceProp is filled in by the CUDA runtime, so the
// headers used to compile this file must match the runtime that is linked.
void initDeviceParameters(bool keysOnly)
{
    int deviceID = -1;
    if (cudaSuccess != cudaGetDevice(&deviceID))
    {
        return;
    }

    cudaDeviceProp devprop;
    // The result used to be ignored; on failure devprop stayed uninitialized
    // and the flags below were computed from indeterminate values.
    if (cudaSuccess != cudaGetDeviceProperties(&devprop, deviceID))
    {
        return;
    }

    int smVersion = devprop.major * 10 + devprop.minor;

    // sm_12 and later devices don't need help with coalesce in reorderData kernel
    bManualCoalesce = (smVersion < 12);
    bUsePersistentCTAs = (smVersion < 20);

    if (bUsePersistentCTAs)
    {
        //Irrelevant. My setup is 2.1
    }
}
}
And this is the relevant class code:
#include <cuda_runtime_api.h>
#include "cudpp/cudpp.h"
namespace nvRadixSort
{
class RadixSort
{
public:
// Constructs a sorter: zero-initializes every handle and pointer member, then
// calls initialize() to set up device parameters, the CUDPP scan plan and the
// temporary device buffers.
//
// maxElements: element count forwarded to initialize() to size the buffers.
// keysOnly:    forwarded to initialize(); when true the value buffer is not
//              allocated.
RadixSort(unsigned int maxElements, bool keysOnly = false)
    : mCudppContext(0), // was left indeterminate until cudppCreate() succeeded
      mScanPlan(0),
      mNumElements(0),
      mTempKeys(0),
      mTempValues(0),
      mCounters(0),
      mCountersSum(0),
      mBlockOffsets(0)
{
    // Allocate temporary storage
    initialize(maxElements, keysOnly);
}
protected: // data
CUDPPHandle mCudppContext; // CUDPP context handle, filled in by cudppCreate() in initialize()
CUDPPHandle mScanPlan; // CUDPP plan handle for prefix sum
unsigned int mNumElements; // Number of elements of temp storage allocated
unsigned int *mTempKeys; // Intermediate storage for keys
unsigned int *mTempValues; // Intermediate storage for values
unsigned int *mCounters; // Counter for each radix
unsigned int *mCountersSum; // Prefix sum of radix counters
unsigned int *mBlockOffsets; // Global offsets of each radix in each block
protected: // methods
void initialize(unsigned int numElements, bool keysOnly)
{
// initialize parameters based on present CUDA device
initDeviceParameters(keysOnly);
// Allocate temporary storage
mNumElements = numElements;
unsigned int numBlocks = ((numElements % (CTA_SIZE * 4)) == 0) ?
(numElements / (CTA_SIZE * 4)) : (numElements / (CTA_SIZE * 4) + 1);
unsigned int numBlocks2 = ((numElements % (CTA_SIZE * 2)) == 0) ?
(numElements / (CTA_SIZE * 2)) : (numElements / (CTA_SIZE * 2) + 1);
// Initialize scan
cudppCreate(&mCudppContext);
CUDPPConfiguration scanConfig;
scanConfig.algorithm = CUDPP_SCAN;
scanConfig.datatype = CUDPP_UINT;
scanConfig.op = CUDPP_ADD;
scanConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;
cudppPlan(mCudppContext , &mScanPlan, scanConfig, 16 * numBlocks2, 1, 0);
cudaMalloc((void **)&mTempKeys, numElements * sizeof(unsigned int));
if(!keysOnly)
cudaMalloc((void **)&mTempValues, numElements * sizeof(unsigned int));
cudaMalloc((void **)&mCounters, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
cudaMalloc((void **)&mCountersSum, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
cudaMalloc((void **)&mBlockOffsets, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
checkCudaError("RadixSort::initialize()");
}
}
Solved the problem. The cause was that my Visual Studio project was still using the CUDA 4.0 build rules and tools even though CUDA 4.2 was available. I changed the project file to use the new files, and that did the trick.