Sign Up

Sign Up to our social questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Login to our social questions & Answers Engine to ask questions answer people’s questions & connect with other people.

Sign Up Here

Forgot Password?

Don't have account, Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link and will create a new password via email.

Have an account? Sign In Now

You must login to ask a question.

Forgot Password?

Need An Account, Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • Home
  • SEARCH
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 8846229
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: June 14, 20262026-06-14T11:55:48+00:00 2026-06-14T11:55:48+00:00

For a project at university, i’m implementing matrix-vector multiplication using AMD OpenCL. The machine

  • 0

For a project at university, i’m implementing matrix-vector multiplication using AMD OpenCL.
The machine i’m using is a brand new desktop running Ubuntu 12.04, with a Radeon HD 7970 and an AMD FX-4100 quad-core processor. I AMD APP 1.2 and the latest ATI Catalyst drivers for the Radeon.
Here is the kernel I am trying to use.

__kernel void mvKernel(__global float* a, const __global float* x, __global float* y, int m, int n)
{
float sum = 0.0f;
 __global float* A;
int i;
int j = 0;
int indx = get_global_id(0);
__local float xs[2048];
for(i = get_local_id(0); i < n; i+= get_local_size(0)) {
    xs[i] = x[i];
} 
mem_fence(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
A = &a[indx];
for(i = 0; i < n; i++) {
    sum += xs[i] * A[j];
    j += m;
}
y[indx] = sum;
}

When this is run on the GPU for matrix sizes 256 x 256, the results produced are correct and no problems occur. However when I try to increase the matrix size, given as command line arguments, the system will hang, requiring a reboot.
However when I run the code using AMD’s CodeXL debugger/profiler, the code will run most of the time, with no errors.
Here’s the host code I run

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
#include <string.h>

char* readSource(const char* sourceFilename);

void randomInit(float* data, int size)
{
int i =0;
for(i; i < size; i++)
    data[i] = (rand()/(float)RAND_MAX) * 10;
}

void cpuMV (float* y, float* A, float* X, int M, int N)
{
for(int i = 0; i< M; i++) {
    double sum = 0;
    y[i] = 0;
    for(int k = 0; k < N; k++) {
        double a = A[i + k* M];
        double x = X[k];
        sum += a * x;
    }
    y[i] = (float) sum;
 }
}

int main( int argc, char ** argv) {
int M = atoi(argv[1]);//1024;
int N = atoi(argv[2]);//1024;
float *A, *x;
float *y;
A = (float *)malloc(sizeof(float) * M * N);
x = (float *)malloc(sizeof(float) * N);
y = (float *)malloc(sizeof(float) * M);
randomInit(A, M * N);
randomInit(x, N);
int wrong;
wrong = 0;  
cl_int err;
cl_uint numPlatforms;
cl_platform_id *platforms;

err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS) {
    printf("clGetPlatformIDs failed\n");
    exit(-1);
}

if(numPlatforms == 0) {
    printf("No platforms detected.\n");
    exit(-1);   
}
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

clGetPlatformIDs(numPlatforms, platforms, NULL);

printf("%u platforms found\n", numPlatforms);
for(int i =0; i < numPlatforms; i++) {
    char buff[100];
    printf("Platform %u:\n", i);
    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL);
    printf("\tVendor: %s\n", buff);
    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buff), buff, NULL);
    printf("\tName: %s\n", buff);
    if (err != CL_SUCCESS) {
        printf("clGetPlatformInfo failed\n");
        exit(-1);
    }
}
printf("\n");

cl_uint numDevices = 0;
cl_device_id *devices;
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if(err != CL_SUCCESS) {
    printf("clGetDeviceIDs failed\n");
    exit(-1);
}
if (numDevices == 0){
    printf("No devices found\n");
    exit(-1);
}
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
printf("%u devices found\n", numDevices);
for(int i =0; i < numDevices; i++) {
    char buff[100];
    printf("Device %u:\n", i);
    err = clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buff), buff, NULL);
    printf("\tVendor: %s\n", buff);
    err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buff), buff, NULL);
    printf("\tName: %s\n", buff);
    if (err != CL_SUCCESS) {
        printf("clGetDeviceInfo failed\n");
        exit(-1);
    }
}
cl_context context;
context = clCreateContext(NULL, numDevices,devices, NULL, NULL, &err);
if(err != CL_SUCCESS){
    printf("clCreateContext failed\n");
    exit(-1);
}

cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &err);
if(err != CL_SUCCESS) { 
    printf("clCreateCommandQueue failed\n");
    exit(-1);
}

cl_mem d_A, d_x;
cl_mem d_y;
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, M * N * sizeof(float), A, &err);
if (err != CL_SUCCESS) {
    printf("clCreateBuffer for A failed\n");
    exit(-1);
}
d_x = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,  N * sizeof(float), x, &err);
if (err != CL_SUCCESS) {
    printf("clCreateBuffer for x failed\n");
    exit(-1);
}
d_y = clCreateBuffer(context, CL_MEM_READ_WRITE, M * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
    printf("clCreateBuffer for y failed\n");
    exit(-1);
}
cl_program program;
char* source;
const char *sourceFile = "MVM_Kernel2.cl";
source = readSource(sourceFile);
program = clCreateProgramWithSource(context, 1, (const char**) &source, NULL, &err);
if (err != CL_SUCCESS) {
    printf("clCreateProgramFailed");
    exit(-1);
}
cl_int buildErr;
buildErr = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if (buildErr != CL_SUCCESS) {
    printf("Program failed to build,\n");
    cl_build_status buildStatus;
    for(int i = 0; i < numDevices; i++) {
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
        if(buildStatus == CL_SUCCESS) {
            continue;
        }
        char *buildLog;
        size_t buildLogSize;
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
        buildLog = (char *)malloc(buildLogSize);
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,buildLogSize, buildLog, NULL);
        buildLog[buildLogSize -1] = '\0';
        printf("Device %u Build Log:\n%s\n", i, buildLog);
        free(buildLog);
    }
    exit(0);
}
else {
    printf("No build errors\n");
}

cl_kernel kernel;
kernel = clCreateKernel(program, "mvKernel", &err);
if(err != CL_SUCCESS) {
    printf("clCreateKernel failed\n");
    exit(-1);
}
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_A);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_x);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y);
err |= clSetKernelArg(kernel, 3, sizeof(int), &M);
err |= clSetKernelArg(kernel, 4, sizeof(int), &N);

size_t globalWorkSize[1];
globalWorkSize[0] = M * N;
size_t localWorkSize[1];
localWorkSize[0] = 256;

err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clEnqueueReadBuffer(cmdQueue, d_y, CL_TRUE, 0, M * sizeof(float), y, 0, NULL, NULL);
clFlush(cmdQueue);
err = clFinish(cmdQueue);
if(err != CL_SUCCESS) {
    printf("ERROR!!");
    exit(-1);
}
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(d_A);
clReleaseMemObject(d_x);
clReleaseMemObject(d_y);
clReleaseContext(context);
for(int i=0; i < (M <10 ? M : 10); i++)
    printf("vector y = %f\n", y[i]);
float* refY;
refY = (float*)malloc(M*sizeof(float));
cpuMV(refY, A, x, M, N);
for (int i = 0; i < M; ++i) {
    float diff = refY[i] - y[i];
    if (fabsf(diff)/ refY[i] > 1e-4)
        wrong++;
}
printf("There were %d errors!!\n", wrong);
free(A);
free(y);
free(x);
free(source);
free(platforms);    
free(devices);
}

char* readSource(const char *sourceFilename) {
FILE *fp;
int errs;
int size;
char *source;
fp = fopen(sourceFilename, "rb");
errs = fseek(fp, 0, SEEK_END);
if(errs != 0) {
    printf("Error seeking to end of file");
    exit(-1);
}
size = ftell(fp);
if(size<0) {
    printf("Errror getting file position");
    exit(-1);
}
errs = fseek(fp, 0, SEEK_SET);
if(errs != 0){
    printf("Error seeking to start of file\n");
    exit(-1);
}
source = (char*)malloc(size +1);
errs = fread(source, 1, size, fp);
if(errs != size) {
    printf("only read %d bytes\n", errs);
    exit(0);
}
source[size]= '\0';
return source;
}

Eventually this needs to work on matrices of order ~10000
EDIT
I’ve also tried the same code on my laptop which has an Nvidia GT525m, and the program runs fine for matrices upto 352 * 352, any bigger and the answer will just be zero, but it doesn’t crash.

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must login to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-06-14T11:55:50+00:00Added an answer on June 14, 2026 at 11:55 am

    The problem was with globalWorkSize being far too big (M * N) when it should have been just M. This must have been overloading the GPU and causing the system freeze. I now have the code running reliably on both Nvidia and AMD GPUs as well as the AMD CPU

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

We are implementing an university project: a car-pooling service in Java. We need to
I am implementing a small database(university Project) and i am facing the following problem.
I am developing an iPhone application for a university project and I'm new to
I am working on a university project, which involves profile and event management using
I've got a university project to create a site using a framework given to
I am coding POP3 and SMTP servers using Java for an university project. I
I am writing a simple graphic editor for a university project using C#. I
I'm writing a platform game for my university project using the canvas element and
I'm using JavaCV for a university project. It is essentially a motion detector. I
I am currently working on a university project, using HTML5. I have been writing

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.