I’m doing the matrix multiplication example from the book CUDA C Programming Guide, page 35, for practice, I copied the code and completed the missing code. I understand the logic of the program and how it should work, but I get no the expected result.
Here is the complete code i made, I do not know if the error is mine or from the example?
The code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdio.h>
using namespace std;
#define BLOCK_SIZE 16
typedef struct
{
int width;
int height;
float *elements;
}Matrix;
__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);
void MatMul(const Matrix A,const Matrix B, Matrix C)
{
size_t size;
//Matrix A creation y storage in device memory
Matrix d_A;
d_A.width=A.width;
d_A.height=A.height;
size=A.height*A.width*sizeof(float);
cudaMalloc(&d_A.elements,size);
cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
//Matrix B creation y storage in device memory
Matrix d_B;
d_B.width=B.width;
d_B.height=B.height;
size=B.height*B.width*sizeof(float);
cudaMalloc(&d_B.elements,size);
cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
//Matrix C creation y storage in device memory
Matrix d_C;
d_C.width=C.width;
d_C.height=C.height;
size=C.height*C.width*sizeof(float);
cudaMalloc(&d_C.elements,size);
//
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
//Copy the result in the matrix C from the device to the host.
cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
//edit the missing code.
// for(int i=0;i<BLOCK_SIZE*BLOCK_SIZE;i++){cout<<C.elements[i]<<endl;}
// result in random numbers
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
float Cvalue=0;
int row=blockIdx.y*blockDim.y+threadIdx.y;
int col=blockIdx.x*blockDim.x+threadIdx.x;
for(int e=0;e<A.width;++e)
{
Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
}
C.elements[row*C.width+col]=Cvalue;
}
int main()
{
cout<<"Matrices"<<endl;
//Declarationd of the A,B,C matrix´s
float a[15][15];
float b[15][15];
float c[15][15];
//Fill the matrix whit some numbers.
int cont0=0;
for(int c=0;c<15;c++)
{
for(int v=0;v<15;v++)
{
a[v][c]=cont0;
b[v][c]=cont0;
cont0++;
}
}
//Flatten the matrix for the passing to the kernel
int offset=0;
float a_t[256];
float b_t[256];
for(int y=0;y<15;y++)
{
for(int x=0;x<15;x++)
{
a_t[x+offset]=a[x][y];
b_t[x+offset]=a[x][y];
}
offset=offset+15;
}
float t_C[256];
//Completing the matrix format for the kernel.
Matrix m_A;
m_A.height=15;
m_A.width=15;
m_A.elements=a_t;
Matrix m_B;
m_B.height=15;
m_B.width=15;
m_B.elements=b_t;
Matrix m_C;
m_C.height=15;
m_C.width=15;
m_C.elements=t_C;
//Passing the formated matrix to the kernel.
MatMul(m_A,m_B,m_C);
cout<<"Final"<<endl;
return 0;
}
The program compiles and runs but the result matrix C.elements from: cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
is a random number. I’ve tried to use it like a pointer to a array but i don’t get anything from it and treating it like array does not work either.
I will be glad if anyone can help me to finish this.
Your code has minor miss match between array indexing in kernel and initialization on CPU. Here is the corrected code with debugging suggested by @harrism:
Check the output. If you see the results are wrong, check the kernel error on your system which is reported in output.