I am attempting to implement a shared memory based matrix multiplication kernel as outlined

Question

0

Asked: June 16, 20262026-06-16T20:57:58+00:00 2026-06-16T20:57:58+00:00

I am attempting to implement a shared memory based matrix multiplication kernel as outlined

0

I am attempting to implement a shared memory based matrix multiplication kernel as outlined in the CUDA C Programming Guide. The following is the kernel:

 __global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     float * CSub = &C[CColumns * 16 * blockIdx.y + 16 * blockIdx.x];
     float CValue = 0;
 for (int k = 0; k < (AColumns / 16); ++k) {
         float * ASub =  &A[AColumns * 16 * blockIdx.y + 16 * k];
         float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
         __shared__ float As[16][16];
         __shared__ float Bs[16][16];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < 16; ++n)
        CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
     }
     CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
 }

While the following shows the kernel launch:

 dim3 dimBlock(16, 16, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

Unfortunately this seems to produce incorrect results.

Any assistance/explanations would be greatly appreciated.

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-16T20:57:59+00:00

There are at least 2 basic errors in your kernel, both relatively trivial. Where you have this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.y];

You should use this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.x];

And where you have this:

 CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;

You should use this:

 CSub[threadIdx.y*CColumns+threadIdx.x]=CValue;

This should allow you to get basic correctness under the following conditions:

square matrices
matrix dimensions evenly divisible by tile dimension

Fixing the square matrix limitation is not difficult. Fixing the dimension limitation on the tile dimension involves considerable changes to the kernel, in order to:

not process out-of-range elements
properly populate your shared memory area with values that are appropriate in the “border” regions

Since your code doesn’t comprehend any of this, I wasn’t sure if you’re asking about it and chose not to address those issues specifically.

I was able to get the following adaptation of your code working as a basic example:
(note that for the benefit of reduced code size to look at, I have dispensed with usual CUDA error checking. Please don’t use this as a representative example of good coding. Do proper error checking. The point of my answer is not to explain good CUDA error checking but to show an algorithmically correct example.)

#include <stdio.h>
#include <math.h>
#define TILE_DIM 16
#define DIMX 256
#define DIMY 256
#define RES 0.1

__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     float CValue = 0;
     if (((blockIdx.y * blockDim.y + threadIdx.y)< CRows) && ((blockIdx.x * blockDim.x + threadIdx.x) < CColumns)) {
       for (int k = 0; k < (AColumns / TILE_DIM); ++k) {
         float * ASub =  &A[AColumns * TILE_DIM * blockIdx.y + TILE_DIM * k];
         float * BSub = &B[AColumns*TILE_DIM*k + TILE_DIM*blockIdx.x];
         __shared__ float As[TILE_DIM][TILE_DIM];
         __shared__ float Bs[TILE_DIM][TILE_DIM];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < TILE_DIM; ++n)
         CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
       }
       C[((blockIdx.y * blockDim.y + threadIdx.y)*CColumns)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
     }
 }


void matrixMultiplyCPU(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
  for (int i = 0; i<ARows; i++)
    for (int j=0; j<BColumns; j++){
      float Ctemp = 0.0;
      for (int k=0; k<AColumns; k++)
        Ctemp += A[i*AColumns + k] * B[k*BColumns+j];
      C[i*CColumns+j] = Ctemp;
      }

}
int main(){
 int CColumns = DIMY, CRows=DIMX, AColumns=DIMY, ARows=DIMX, BColumns=DIMY, BRows=DIMX;
 dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 float *deviceA, *deviceB, *deviceC;
 float hostA[DIMY][DIMX];
 float hostB[DIMY][DIMX];
 float hostC[DIMY][DIMX];
 float hostCp[DIMY][DIMX];
 for (int x = 0; x<DIMX; x++)
   for (int y = 0; y<DIMY; y++) {
     hostA[y][x] = rand()/(float)RAND_MAX;
     hostB[y][x] = rand()/(float)RAND_MAX;
     }
  cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceB, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceC, DIMX*DIMY*sizeof(float));
  cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(deviceB, hostB, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
  cudaMemcpy(hostC, deviceC, DIMX*DIMY*sizeof(float), cudaMemcpyDeviceToHost);
  matrixMultiplyCPU(&(hostA[0][0]) , &(hostB[0][0]) , &(hostCp[0][0]) , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

 for (int y = 0; y<DIMY; y++)
   for (int x = 0; x<DIMX; x++)
     if (fabs(hostCp[y][x] - hostC[y][x]) > RES)
       {
       printf("Error at offset y=%d,x=%d, CPU = %f, GPU = %f\n", y, x, hostCp[y][x], hostC[y][x]);
       return 1;
       }
 printf("Finished!\n");
 return 0;
}

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I am attempting to implement a shared memory based matrix multiplication kernel as outlined

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply