I’m doing the matrix multiplication example from the book CUDA C Programming Guide, page

Question

0

Asked: June 12, 20262026-06-12T05:46:35+00:00 2026-06-12T05:46:35+00:00

I’m doing the matrix multiplication example from the book CUDA C Programming Guide, page

0

I’m doing the matrix multiplication example from the book CUDA C Programming Guide, page 35, for practice, I copied the code and completed the missing code. I understand the logic of the program and how it should work, but I get no the expected result.

Here is the complete code i made, I do not know if the error is mine or from the example?

The code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>    
#include <stdio.h>
#include <stdio.h>

using namespace std;
#define BLOCK_SIZE 16

typedef struct
{
    int width;
    int height;
    float *elements;
}Matrix;

__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

void MatMul(const Matrix A,const Matrix B, Matrix C) 
{
    size_t size;
    //Matrix A creation y storage in device memory 
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    size=A.height*A.width*sizeof(float);
    cudaMalloc(&d_A.elements,size);
    cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
    //Matrix B creation y storage in device memory 
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    size=B.height*B.width*sizeof(float);
    cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory         
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //        
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.        
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);  
    //edit the missing code.
    // for(int i=0;i<BLOCK_SIZE*BLOCK_SIZE;i++){cout<<C.elements[i]<<endl;}      
    // result in random numbers
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue=0;
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    for(int e=0;e<A.width;++e)
    {
        Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
    }
    C.elements[row*C.width+col]=Cvalue;
}

int main()
{
    cout<<"Matrices"<<endl;
    //Declarationd of the A,B,C matrix´s
    float a[15][15];        
    float b[15][15];
    float c[15][15];
    //Fill the matrix whit some numbers.
    int cont0=0;
    for(int c=0;c<15;c++)
    {
        for(int v=0;v<15;v++)
        {
            a[v][c]=cont0;
            b[v][c]=cont0;
            cont0++;
        }
    }
    //Flatten the matrix for the passing to the kernel
    int offset=0;
    float a_t[256];
    float b_t[256];
    for(int y=0;y<15;y++)
    {                        
        for(int x=0;x<15;x++)
        {
            a_t[x+offset]=a[x][y];
            b_t[x+offset]=a[x][y];
        }
        offset=offset+15;
    }
    float t_C[256];
    //Completing the matrix format for the kernel.
    Matrix m_A;
    m_A.height=15;
    m_A.width=15;
    m_A.elements=a_t;
    Matrix m_B;
    m_B.height=15;
    m_B.width=15;
    m_B.elements=b_t;
    Matrix m_C;
    m_C.height=15;
    m_C.width=15;
    m_C.elements=t_C;
    //Passing the formated matrix to the kernel.
    MatMul(m_A,m_B,m_C);                
    cout<<"Final"<<endl;        
return 0;
}

The program compiles and runs but the result matrix C.elements from: cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
is a random number. I’ve tried to use it like a pointer to a array but i don’t get anything from it and treating it like array does not work either.

I will be glad if anyone can help me to finish this.

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-12T05:46:36+00:00

Your code has minor miss match between array indexing in kernel and initialization on CPU. Here is the corrected code with debugging suggested by @harrism:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <stdio.h>
    #include <stdio.h>

    using namespace std;
    #define BLOCK_SIZE 16

    typedef struct
    {
        int width;
        int height;
        float *elements;
    }Matrix;

    __global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

    void MatMul(const Matrix A,const Matrix B, Matrix C)
    {
        size_t size;
        //Matrix A creation y storage in device memory
        Matrix d_A;
        d_A.width=A.width;
        d_A.height=A.height;
        size=A.height*A.width*sizeof(float);
        cudaMalloc(&d_A.elements,size);
        cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
        //Matrix B creation y storage in device memory
        Matrix d_B;
        d_B.width=B.width;
        d_B.height=B.height;
        size=B.height*B.width*sizeof(float);
        cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    //cudaMalloc(&d_C,sizeof(Matrix));
    //cudaMemcpy(d_C,C,sizeof(Matrix),cudaMemcpyHostToDevice);
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.
    printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
    //
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
        //printf("%d\n",threadIdx.x);
    float Cvalue=0;
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    for(int e=0;e<A.width;++e)
    {
        Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
    }
    C.elements[row*C.width+col]=Cvalue;
}

int print_matrix(Matrix A){
        printf("Matrix:\n");
        int i;
        for(i=0; i<A.width*A.height; i++){
                if(i%A.width==0) printf("\n");
                printf("%6.4f\t",A.elements[i]);
        }
        printf("\n");
}
int main()
{
    cout<<"Matrices"<<endl;
    //Declarationd of the A,B,C matrix.s
    float a[BLOCK_SIZE][BLOCK_SIZE];
    float b[BLOCK_SIZE][BLOCK_SIZE];
    float c[BLOCK_SIZE][BLOCK_SIZE];
    //Fill the matrix whit some numbers.
    int cont0=0;
    for(int c=0;c<BLOCK_SIZE;c++)
    {
        for(int v=0;v<BLOCK_SIZE;v++)
        {
            a[v][c]=cont0;
            b[v][c]=cont0;
            cont0++;
        }
    }
    //Flatten the matrix for the passing to the kernel
    int offset=0;
    float a_t[BLOCK_SIZE*BLOCK_SIZE];
    float b_t[BLOCK_SIZE*BLOCK_SIZE];
    for(int y=0;y<BLOCK_SIZE;y++)
    {
        for(int x=0;x<BLOCK_SIZE;x++)
        {
            a_t[x+offset]=a[x][y];
            b_t[x+offset]=a[x][y];
        }
        offset=offset+BLOCK_SIZE;
    }
    float t_C[BLOCK_SIZE*BLOCK_SIZE];
    //Completing the matrix format for the kernel.
    Matrix m_A;
    m_A.height=BLOCK_SIZE;
    m_A.width=BLOCK_SIZE;
    m_A.elements=a_t;
    Matrix m_B;
    m_B.height=BLOCK_SIZE;
    m_B.width=BLOCK_SIZE;
    m_B.elements=b_t;
    Matrix m_C;
    m_C.height=BLOCK_SIZE;
    m_C.width=BLOCK_SIZE;
    m_C.elements=t_C;
    //Passing the formated matrix to the kernel.
    print_matrix(m_A);
    print_matrix(m_B);
    MatMul(m_A,m_B,m_C);
    print_matrix(m_C);
    cout<<"Final"<<endl;
return 0;
}

Check the output. If you see the results are wrong, check the kernel error on your system which is reported in output.

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I’m doing the matrix multiplication example from the book CUDA C Programming Guide, page

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply