I’m trying to implement the Particle Swarm Optimization on CUDA. I’m partially initializing data

Question

0

Asked: May 29, 20262026-05-29T15:32:57+00:00 2026-05-29T15:32:57+00:00

I’m trying to implement the Particle Swarm Optimization on CUDA. I’m partially initializing data

0

I’m trying to implement the Particle Swarm Optimization on CUDA. I’m partially initializing data arrays on host, then I allocate memory on CUDA and copy it there, and then try to proceed with the initialization.

The problem is, when I’m trying to modify array element like so

__global__ void kernelInit(
    float* X, 
    size_t pitch, 
    int width, 
    float X_high, 
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}

It corrupts the values and gives me 1.#INF00 as array element. When I uncomment the last line *pElement = (X_high - X_low) - X_low; and comment the previous, it works as expected: I get values like 15.36 and so on.

I believe the problem is either with my memory allocation and copying, and/or with adressing the specific array element. I read the CUDA manual about these both topics, but I can’t spot the error: I still get corrupt array if I do anything with the element of the array. For example, *pElement = *pElement * 2 gives unreasonable big results like 779616...00000000.00000 when the initial pElement is expected to be just a float in [0;1].

Here is the full source. Initialization of arrays begins in main (bottom of the source), then f1 function does the work for CUDA and launches the initialization kernel kernelInit:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

const unsigned f_n = 3;
const unsigned n = 2;
const unsigned p = 64;

typedef struct {
    unsigned k_max;
    float c1;
    float c2;
    unsigned p;
    float inertia_factor;
    float Ef;
    float X_low[f_n];
    float X_high[f_n];
    float X_min[n][f_n];
} params_t;

typedef void (*kernelWrapperType) (
    float *X, 
    float *X_highVec, 
    float *V, 
    float *X_best, 
    float *Y, 
    float *Y_best, 
    float *X_swarmBest, 
    bool &termination, 
    const float &inertia, 
    const params_t *params,
    const unsigned &f
);

typedef float (*twoArgsFuncType) (
    float x1, 
    float x2
);

__global__ void kernelInit(
    float* X, 
    size_t pitch, 
    int width, 
    float X_high, 
    float X_low
) {
    // Silly, but pretty reliable way to address array elements
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int r = tid / width;
    int c = tid % width;
    float* pElement = (float*)((char*)X + r * pitch) + c;
    *pElement = *pElement * (X_high - X_low) - X_low;
    //*pElement = (X_high - X_low) - X_low;
}

__device__ float kernelF1(
    float x1, 
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

void f1(
    float *X, 
    float *X_highVec, 
    float *V, 
    float *X_best, 
    float *Y, 
    float *Y_best, 
    float *X_swarmBest, 
    bool &termination, 
    const float &inertia, 
    const params_t *params,
    const unsigned &f
) {
    float *X_d = NULL;
    float *Y_d = NULL;
    unsigned length = n * p;
    const cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
    size_t pitch;
    size_t dpitch;
    cudaError_t err;
    unsigned width = n;
    unsigned height = p;

    err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
    pitch = n * sizeof(float);
    err = cudaMemcpy2D(X_d, dpitch, X, pitch, width * sizeof(float), height, cudaMemcpyHostToDevice);

    err = cudaMalloc (&Y_d, sizeof(float) * p);
    err = cudaMemcpy (Y_d, Y, sizeof(float) * p, cudaMemcpyHostToDevice);

    dim3 threads; threads.x = 32;
    dim3 blocks; blocks.x = (length/threads.x) + 1;

    kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);

    err = cudaMemcpy2D(X, pitch, X_d, dpitch, n*sizeof(float), p, cudaMemcpyDeviceToHost);
    err = cudaFree(X_d);

    err = cudaMemcpy(Y, Y_d, sizeof(float) * p, cudaMemcpyDeviceToHost);
    err = cudaFree(Y_d);
}

float F1(
    float x1, 
    float x2
) {
    float y = pow(x1, 2.f) + pow(x2, 2.f);
    return y;
}

/*
 * Generates random float in [0.0; 1.0]
 */
float frand(){
    return (float)rand()/(float)RAND_MAX;
}

/*
 * This is the main routine which declares and initializes the integer vector, moves it to the device, launches kernel
 * brings the result vector back to host and dumps it on the console.
 */
int main() {
    const params_t params = {
        100, 
        0.5,
        0.5,
        p,
        0.98,
        0.01,
        {-5.12, -2.048, -5.12},
        {5.12, 2.048, 5.12},
        {{0, 1, 0}, {0, 1, 0}}
    };
    float X[p][n];
    float X_highVec[n];
    float V[p][n];
    float X_best[p][n];
    float Y[p] = {0};
    float Y_best[p] = {0};
    float X_swarmBest[n];

    kernelWrapperType F_wrapper[f_n] = {&f1, &f1, &f1};
    twoArgsFuncType F[f_n] = {&F1, &F1, &F1};

    for (unsigned f = 0; f < f_n; f++) {
        printf("Optimizing function #%u\n", f);

        srand ( time(NULL) );
        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                X[i][j] = X_best[i][j] = frand();
        for (int i = 0; i < n; i++)
            X_highVec[i] = params.X_high[f];
        for (unsigned i = 0; i < p; i++)
            for (unsigned j = 0; j < n; j++)
                V[i][j] = frand();
        for (unsigned i = 0; i < p; i++)
            Y_best[i] = F[f](X[i][0], X[i][1]);
        for (unsigned i = 0; i < n; i++)
            X_swarmBest[i] = params.X_high[f];
        float y_swarmBest = F[f](X_highVec[0], X_highVec[1]);

        bool termination = false;
        float inertia = 1.;

        for (unsigned k = 0; k < params.k_max; k++) {
            F_wrapper[f]((float *)X, X_highVec, (float *)V, (float *)X_best, Y, Y_best, X_swarmBest, termination, inertia, &params, f);
        }

        for (unsigned i = 0; i < p; i++)
        {
            for (unsigned j = 0; j < n; j++)
            {
                printf("%f\t", X[i][j]);
            }
            printf("F = %f\n", Y[i]);
        }
        getchar();
    }
}

Update: I tried adding error handling like so

err = cudaMallocPitch (&X_d, &dpitch, width * sizeof(float), height);
if (err != cudaSuccess) {
    fprintf(stderr, cudaGetErrorString(err));
    exit(1);
}

after each API call, but it gave me nothing and didn’t return (I still get all the results and program works to the end).

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-29T15:32:58+00:00

This is an unnecessarily complex piece of code for what should be a simple repro case, but this immediately jumps out:

const unsigned n = 2;
const unsigned p = 64;

unsigned length = n * p

dim3 threads; threads.x = 32;
dim3 blocks; blocks.x = (length/threads.x) + 1;

kernelInit<<<threads,blocks>>>(X_d, dpitch, width, params->X_high[f], params->X_low[f]);

So you are firstly computing the incorrect number of blocks, and then reversing the order of the blocks per grid and threads per block arguments in the kernel launch. That may well lead to out of bounds memory access, either hosing something in GPU memory or causing an unspecified launch failure, which your lack of error handling might not be catching. There is a tool called cuda-memcheck which has been shipped with the toolkit since about CUDA 3.0. If you run it, it will give you valgrind style memory access violation reports. You should get into the habit of using it, if you are not already doing so.

As for infinite values, that is to be expected isn’t it? Your code starts with values in (0,1), and then does

X[i] = X[i] * (5.12--5.12) - -5.12

100 times, which is the rough equivalent of multiplying by 10^100, which is then followed by

X[i] = X[i] * (2.048--2.048) - -2.048

100 times, which is the rough equivalent of multiplying by 4^100, finally followed by

X[i] = X[i] * (5.12--5.12) - -5.12

again. So your results should be of the order of 1E250, which is much larger than the maximum 3.4E38 which is the rough upper limit of representable numbers in IEEE 754 single precision.

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I’m trying to implement the Particle Swarm Optimization on CUDA. I’m partially initializing data

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply