my following minimalist Cuda code returns an incorrect result (all polygons have 0 vertices at the end) while the same code running in serial in C++ is working well. The problem is embarrassingly parallel : no communication, no syncthreads etc., and the Cuda memory allocations are sucessful. Even my dummy variable that stores the content of the input array for debug purpose is 0 for the Cuda version. There is no access out of bounds since my arrays are largely large enough. Replacing the memcpy by a loop in Cuda doesn’t change anything.
I really don’t understand what happens… any idea ? Thanks!
Cuda code:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
class Point2D {
public:
__device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
double x, y;
};
__device__ double dot(const Point2D &A, const Point2D &B) {
return A.x*B.x + A.y*B.y;
}
__device__ Point2D operator*(double a, const Point2D &P) {
return Point2D(a*P.x, a*P.y);
}
__device__ Point2D operator+(Point2D A, const Point2D &B) {
return Point2D(A.x + B.x, A.y + B.y);
}
__device__ Point2D operator-(Point2D A, const Point2D &B) {
return Point2D(A.x - B.x, A.y - B.y);
}
__device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
Point2D M = 0.5*(C+D);
return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}
class Polygon {
public:
__device__ Polygon():nbpts(0){};
__device__ void addPts(Point2D pt) {
pts[nbpts] = pt;
nbpts++;
};
__device__ Polygon& operator=(const Polygon& rhs) {
nbpts = rhs.nbpts;
dummy = rhs.dummy;
memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
return *this;
}
__device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
int new_nbpts = 0;
Point2D newpts[128];
Point2D AB(outside_pt-inside_pt);
Point2D M(0.5*(outside_pt+inside_pt));
double ABM = dot(AB, M);
Point2D S = pts[nbpts-1];
for (int i=0; i<nbpts; i++) {
Point2D E = pts[i];
double ddot = -ABM + dot(AB, E);
if (ddot<0) { // E inside clip edge
double ddot2 = -ABM + dot(AB, S);
if (ddot2>0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
newpts[new_nbpts] = E;
new_nbpts++;
} else {
double ddot2 = -ABM + dot(AB, S);
if (ddot2<0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
}
S = E;
}
memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
nbpts = new_nbpts;
}
//private:
Point2D pts[128];
int nbpts;
float dummy;
};
__global__ void cut_poly(float *a, Polygon* polygons, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];
polygons[idx] = pol;
}
int main(int argc, unsigned char* argv[])
{
const int N = 100;
float a_h[N], *a_d;
Polygon p_h[N/2], *p_d;
size_t size = N * sizeof(float);
size_t size_pol = N/2 * sizeof(Polygon);
cudaError_t err = cudaMalloc((void **) &a_d, size);
cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol);
for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);
for (int i=0; i<N/2; i++)
printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);
cudaFree(a_d);
cudaFree(p_d);
return 0;
}
Same code in C++ that works properly:
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
class Point2D {
public:
Point2D(double xx=0, double yy=0):x(xx),y(yy){};
double x, y;
};
double dot(const Point2D &A, const Point2D &B) {
return A.x*B.x + A.y*B.y;
}
Point2D operator*(double a, const Point2D &P) {
return Point2D(a*P.x, a*P.y);
}
Point2D operator+(Point2D A, const Point2D &B) {
return Point2D(A.x + B.x, A.y + B.y);
}
Point2D operator-(Point2D A, const Point2D &B) {
return Point2D(A.x - B.x, A.y - B.y);
}
Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
Point2D M = 0.5*(C+D);
return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}
class Polygon {
public:
Polygon():nbpts(0){};
void addPts(Point2D pt) {
pts[nbpts] = pt;
nbpts++;
};
Polygon& operator=(const Polygon& rhs) {
nbpts = rhs.nbpts;
dummy = rhs.dummy;
memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
return *this;
}
void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
int new_nbpts = 0;
Point2D newpts[128];
Point2D AB(outside_pt-inside_pt);
Point2D M(0.5*(outside_pt+inside_pt));
double ABM = dot(AB, M);
Point2D S = pts[nbpts-1];
for (int i=0; i<nbpts; i++) {
Point2D E = pts[i];
double ddot = -ABM + dot(AB, E);
if (ddot<0) { // E inside clip edge
double ddot2 = -ABM + dot(AB, S);
if (ddot2>0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
newpts[new_nbpts] = E;
new_nbpts++;
} else {
double ddot2 = -ABM + dot(AB, S);
if (ddot2<0) {
newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
new_nbpts++;
}
}
S = E;
}
memcpy(pts, newpts, std::min(128, new_nbpts)*sizeof(Point2D));
/*for (int i=0; i<128; i++) {
pts[i] = newpts[i];
}*/
nbpts = new_nbpts;
}
//private:
Point2D pts[128];
int nbpts;
float dummy;
};
void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
if (idx>=N/2) return;
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));
Point2D curPt(a[2*idx], a[2*idx+1]);
for (int i=0; i<N/2; i++) {
if (idx==i) continue;
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];
polygons[idx] = pol;
}
int main(int argc, unsigned char* argv[])
{
const int N = 100; // Number of elements in arrays
float a_h[N], *a_d; // Pointer to host & device arrays
Polygon p_h[N/2], *p_d;
for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
for (int idx=0; idx<N; idx++)
cut_poly(idx, a_h, p_h, N);
for (int i=0; i<N/2; i++)
printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);
return 0;
}
Well I guess you can disregard most of my comments. I was by mistake working on a machine I had set up with CUDA 3.2 and it was behaving differently along the lines of the kernel launch failure. When I switched to CUDA 4.1 and CUDA 5.0 things started to make sense. Apologies for my confusion there.
Anyway after getting past that, I pretty quickly noticed that there is a difference between your CPU and GPU implementations. Specifically here (looking at the CPU code):
Referring to the line I have added the comment to above, if you add that exact line of code to the corresponding spot in your GPU code in the
cut_polykernel, then for me anyway the GPU code produces the same printed result as the CPU code.One other observation I would make is that you are needlessly running blocks with just 4 threads. Nothing wrong with that while you’re working out the kinks in the code, but once you have it running for “production” purposes, you will most likely want to target a higher number like 256, and be sure to choose a number that is an integer multiple of 32, for best performance.
In response to a question posted in the comments, I believe that the data is being copied properly, but most likely you are not accessing it correctly on the host. (I don’t know how you’re determining that “my array is not properly returned to the host”). Most of your class definitions were
__device__only. As a result, it’s difficult to access structures within classes on the host (e.g. thePoint2D ptsclass within thePolygonclass). I’m inserting modified code here which I think demonstrates that the data is being transferred back to the host:I would suggest using posting new questions for these things.