I’d like to define a pointer type that is shared between device and host code and internally stores the device and host pointer to the shared memory. I want it to determine at compile time, which pointer to actually return:
#define F inline __host__ __device__
class SharedMemory;
/**
*
* Can only be set by allocating shared memory.
*/
template<typename T>
class SharedMemoryPtr {
public:
SharedMemoryPtr() : hptr(0), dptr(0) {}
//F ~ SharedMemoryPtr() {cudaFreeHost(hptr);} // Should be freed explicitly (?)
// TODO: Don't allow copying/overwriting (at least not without freeing memory...)
F T& operator() () {
#ifdef __CUDACC__
return *dptr;
#else
return *hptr;
#endif
};
F T* operator-> () {
#ifdef __CUDACC__
return dptr;
#else
return hptr;
#endif
};
F T& operator * () {
#ifdef __CUDACC__
return *dptr;
#else
return *hptr;
#endif
};
F T& operator[] (__int64 i) const {
#ifdef __CUDACC__
return *(dptr + i);
#else
return *(hptr + i);
#endif
};
friend SharedMemory;
// TODO: Make obsolete (possible?)
T * getHptr() {return hptr;}
T * getDptr() {return dptr;}
private:
T *hptr, *dptr;
};
class SharedMemory {
public:
template<typename T>
static SharedMemoryPtr<T> allocate(int count = 1) {
assert(count > 0);
SharedMemoryPtr<T> sptr;
cutilSafeCall(
cudaHostAlloc(&sptr.hptr, sizeof(T) * count, cudaHostAllocMapped));
assert(sptr.hptr);
cutilSafeCall(
cudaHostGetDevicePointer(&sptr.dptr, sptr.hptr, 0));
assert(sptr.dptr);
return sptr;
}
};
This works fine as long as I use these pointers in code that is either in a cpp file (where __CUDACC__ is never defined) or a .h file (where __CUDACC__ is only defined if the function is used by some function in a cu file). However in a __host__ function in a .cu file, I get the devptr. Obviously, .cu files are processed exclusively by nvcc. Is there some other preprocessor macro that is defined ONLY for __global__ and __device__ functions, not just everything nvcc happens to process? Or do I need to separate my code?
__CUDA_ARCH__is actual only for device code. You can specify device code behavior with it.This macro actually get compute capability of device code (like 200 for 2.0.)