Sign Up

Sign Up to our social questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Login to our social questions & Answers Engine to ask questions answer people’s questions & connect with other people.

Sign Up Here

Forgot Password?

Don't have account, Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link and will create a new password via email.

Have an account? Sign In Now

You must login to ask a question.

Forgot Password?

Need An Account, Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • Home
  • SEARCH
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 6634525
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: May 25, 20262026-05-25T22:56:25+00:00 2026-05-25T22:56:25+00:00

A previous question asked how to find to find the maximum value of an

  • 0

A previous question asked how to find to find the maximum value of an array in CUDA efficiently: Finding max value in CUDA, the top response provided a link to a NVIDIA presentation on optimizing reduction kernels.

If you are using Visual Studio, simply remove the header reference, and everything between CPU EXECUTION.

I setup a variant which found the max, but it doesn’t match what the CPU is finding:

// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
    int i = 0;
    float max = -100000;
    for(i = 0; i < n; i++)
    {
        if(maxes[i] > max)
            max = maxes[i];
    }

    return max;
}

// Too obvious...
__device__ float MaxOf2(float a, float b)
{
    if(a > b)   return a;
    else            return b;
}


__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
    unsigned int gridSize = BLOCKSIZE*2*gridDim.x;

    sdata[tid] = 0;

    //MMX(index,i)
    //MMX(index,i+blockSize)
    // Final Optimized Kernel
    while (i < n) {
        sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
        i += gridSize;
    }
    __syncthreads();

    if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }

    if (tid < 32) {
        if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
        if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
        if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
        if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
        if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
        if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

I have a giant setup to test this algorithm:

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include "book.h"

#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4

using namespace std;

// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);

// Returns random floating point number
float RandomReal(float low, float high)
{
    float d;

    d = (float) rand() / ((float) RAND_MAX + 1);
    return (low + d * (high - low));
}

int main()
{
    /*****************VARIABLE SETUP*****************/
    // Pointer to CPU numbers
    float *numbers;
    // Pointer to GPU numbers
    float *dev_numbers;
    // Counter
    int i = 0;

    // Randomize
    srand(time(0));

    // Timers
    // Kernel timers
    cudaEvent_t start_kernel, stop_kernel;
    float elapsedTime_kernel;
    HANDLE_ERROR(cudaEventCreate(&start_kernel));
    HANDLE_ERROR(cudaEventCreate(&stop_kernel));
    // cudaMalloc timers
    cudaEvent_t start_malloc, stop_malloc;
    float elapsedTime_malloc;
    HANDLE_ERROR(cudaEventCreate(&start_malloc));
    HANDLE_ERROR(cudaEventCreate(&stop_malloc));
    // CPU timers
    struct timeval start, stop;
    float elapsedTime = 0;
    /*****************VARIABLE SETUP*****************/


    /*****************CPU ARRAY SETUP*****************/
    // Setup CPU array
    HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
    for(i = 0; i < ARRAYSIZE; i++)
        numbers[i] = RandomReal(0, 50000.0);
    /*****************CPU ARRAY SETUP*****************/


    /*****************GPU ARRAY SETUP*****************/
    // Start recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(start_malloc,0));

    // Allocate memory to GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
    // Transfer CPU array to GPU
    HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
    // An array to temporarily store maximum values on the GPU
    float *dev_max;
    HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
    // An array to hold grab the GPU max
    float maxes[GRIDSIZE];
    /*****************GPU ARRAY SETUP*****************/

    /*****************KERNEL EXECUTION*****************/
    // Start recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(start_kernel,0));
    // Run kernel
    MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
    // Transfer maxes over
    HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
    // Print out the max
    cout << GetMax(maxes, GRIDSIZE) << endl;

    // Stop recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
    // Stop recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
    // Print results
    printf("%5.3f\t%5.3f\n", elapsedTime_kernel,  elapsedTime_malloc);
    /*****************KERNEL EXECUTION*****************/


    /*****************CPU EXECUTION*****************/
    // Capture the start time
    gettimeofday(&start, NULL);
    // Call generic P7Viterbi function
    cout << GetMax(numbers, ARRAYSIZE) << endl;
    // Capture the stop time
    gettimeofday(&stop, NULL);
    // Retrieve time elapsed in milliseconds
    long int elapsed_sec = stop.tv_sec - start.tv_sec;
    long int elapsed_usec = stop.tv_usec - start.tv_usec;
    elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
    // Print results
    printf("%5.3f\n", elapsedTime);
    /*****************CPU EXECUTION*****************/

    // Free memory
    cudaFreeHost(numbers);
    cudaFree(dev_numbers);
    cudaFree(dev_max);
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);
    cudaEventDestroy(start_malloc);
    cudaEventDestroy(stop_malloc);

    // Exit program
    return 0;
}

I ran cuda-memcheck on this test program, with -g & -G switches on, and it reports 0 problems. Can anyone spot the issue?

NOTE: Be sure to have book.h from the CUDA by Example book in your current directory when you compile the program. Source link here: http://developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming
Download the source code, and book.h will be under the common directory/folder.

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must login to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-05-25T22:56:26+00:00Added an answer on May 25, 2026 at 10:56 pm

    Your kernel looks broken to me. The thread local search (before the shared memory reduction), should look something like this:

    sdata[tid] = g_idata[i];
    i += gridSize;
    
    while (i < n) {
        sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
        i += gridSize;
    }
    

    shouldn’t it?

    Also note that if you run this on Fermi, the shared memory buffer should be declared volatile, and you will get a noticeable improvement in performance if the thread local search is done with a register variable, rather than in shared memory. There is about an 8 times difference in effective bandwidth between the two.


    EDIT: Here is a simplified, working version of your reduction kernel. You should note a number of differences compared to your original:

    __global__ void MaxReduction(int n, float *g_idata, float *g_odata)
    {
        extern __shared__ volatile float sdata[];
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
        unsigned int gridSize = BLOCKSIZE*gridDim.x;
    
        float val = g_idata[i];
        i += gridSize;
        while (i < n) {
            val = MaxOf2(g_idata[i],val);
            i += gridSize;
        }
        sdata[tid] = val;
        __syncthreads();
    
        // This versions uses a single warp for the shared memory 
        // reduction
    # pragma unroll
        for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
            sdata[tid] = MaxOf2(sdata[tid], sdata[i]);
    
        if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
        if (tid < 8)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
        if (tid < 4)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
        if (tid < 2)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
        if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
    }
    

    This code should also be safe on Fermi. You should also familiarise yourself with the CUDA math library, because there is a fmax(x,y) intrinsic which you should use in place of your MaxOf2 function.

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

a month ago I asked this question: Trying to find the top 3 properties
I asked a similar question on a previous thread ( https://stackoverflow.com/questions/5206633/java-find-out-what-application-window-is-in-focus) but I was
In a previous question I asked, the suggested answer was for me to partition
So in a previous question I asked about implementing a generic interface with a
In a previous question I asked about how to group XML elements logically, and
This stems from a previous question I asked - about a write conflict with
This is a follow up question to a previous question I asked about calculating
This is a related to a previous question I have asked here, see the
In a previous question, I asked what it meant when my program returned an
Unit testing Abstract classes in Groovy I asked a question previous about unit testing

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.