I am doing something in CUDA (FFT), but I have no idea why it is generating exceptions when calling the kernel function.
All includes and definitions:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define CPU_ARRAY_SIZE 1024 // 1024, 2048, 4096 8192
#define GPU_ARRAY_SIZE 512 //
#define THREAD_SIZE 16 // fixed
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE) // 32
#define PI 3.14
As I am running it in a NVIDIA GTX480, I thought it could be the shared memory space, although it doesn’t seem to be (as there are “some many” shared variables). So, I aws changing the GPU_ARRAY_SIZE to see how it works, and it was giving me different results when I define it as 32, 64, 256, 512 (in the 512 case, it returns ALL zeros, which I guess CUDA couldn’t make anything – in other cases, it returns weird, as I don’t know the reason why it jumps 16 cells without any calculation). In most cases, in the Output window of my Microsoft Visual Studio, it returns billions of exceptions of the style “First-chance exception at 0x75b9b9bc in .exe: Microsoft C++ exception: cudaError_enum at memory location “. Before you ask me to debug, I cannot debug it, as the VS doesn’t do that for files that are not recognized by VS (like .cpp – at least this theory works in my case).
Do you guys have any idea for the questions:
1. why is it generating exceptions?
2. why is it calculating, what it should do for every cell in every block, just within few cells
How could I solve this problem… any idea?
Kernel function:
__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix)
{
__shared__ double block[THREAD_SIZE][THREAD_SIZE];
__shared__ double spectrum[THREAD_SIZE][THREAD_SIZE];
__shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE];
__shared__ double local_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double local_sin[THREAD_SIZE][THREAD_SIZE];
unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x;
unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y;
int u;
int x=0,y=0;
int tx = threadIdx.x;
int ty = threadIdx.y;
double sum_sines=0.0,sum_cosines=0.0;
double angle=(2*PI)/GPU_ARRAY_SIZE;
block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex];
__syncthreads();
//for every column!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one row is activate. The thread in row adds all element of its column. */
if (ty == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
/* Access each column to add all elements of the column.*/
for (y=0; y<THREAD_SIZE; y++)
{
sum_sines += local_sin[tx][y];
sum_cosines += local_cos[tx][y];
}
//if (sum_sines < 0)
//sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)
+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
block[tx][ty] = spectrum[tx][ty];
__syncthreads();
//for every row!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one column is activate. The thread in colum adds all element of its row. */
if (tx == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
for (x=0; x<THREAD_SIZE; x++)
{
sum_sines += local_sin[x][ty];
sum_cosines += local_cos[x][ty];
}
//if (sum_sines < 0)
//sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
/* Transpose! I think this is not necessary part. */
d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] = spectrum[threadIdx.y][threadIdx.x];
__syncthreads();
}
The main function:
int main(int argc, char** argv)
{
int i,j, w, h, sw, sh;
int numSubblock = CPU_ARRAY_SIZE / GPU_ARRAY_SIZE;
double *d_isub_matrix,*d_osub_matrix;
double *big_matrix = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
cudaEvent_t start,stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (i=0; i<CPU_ARRAY_SIZE; i++)
{
for (j=0; j<CPU_ARRAY_SIZE; j++)
big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j;
}
cudaEventRecord(start,0);
//cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
//cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
for(i = 0; i < numSubblock; i++)
{
for (j=0; j < numSubblock; j++)
{
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
h = i*GPU_ARRAY_SIZE;
w = j*GPU_ARRAY_SIZE;
//printf("h = %d, w=%d",h,w);
//system("PAUSE");
// move subarea of big array into isub array.
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)];
}
}
cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);
//call the cuda kernel
dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
dim3 threads(THREAD_SIZE, THREAD_SIZE);
twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix);
cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw];
printf(" sh %d sw %d %lf \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]);
}
}
printf("passei por aqui algumas vezes\n");
cudaFree(d_osub_matrix);
cudaFree(d_isub_matrix);
}
}
// cudaFree(d_osub_matrix);
// cudaFree(d_isub_matrix);
//Stop the time
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime,start,stop);
//showing the processing time
printf("The processing time took... %fms to execute everything",elapsedtime);
system("PAUSE");
for (sh = 0; sh < CPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <CPU_ARRAY_SIZE; sw++)
{
printf(" sh %d sw %d %lf \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]);
}
}
system("PAUSE");
// I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]".
}
By a short look the problem could and should be the folling lines:
You are allocating just to few memory for your double values on the GPU. Your sub matrix is allocated with 4 byte per point where 8 byte are needed.