I am working on an application which divides a string into pieces and assigns

Question

0

Asked: June 15, 20262026-06-15T03:46:20+00:00 2026-06-15T03:46:20+00:00

I am working on an application which divides a string into pieces and assigns

0

I am working on an application which divides a string into pieces and assigns each to a block. Within each block the the text is scanned character by character and a shared array of int, D is to be updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfied the condition, a global int array m is set to 1 at the position corresponding to the text. This code was executed on a NVIDIA GEForce Fermi 550, and runs even slower than the CPU version. I have just included the kernel here:

__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
    __shared__ int D[MAX_THREADS+2];
    __shared__ char Text_S[MAX_PATTERN_SIZE];
    __shared__ int DNew[MAX_THREADS+2];
    __shared__ int BB_S[4][MAX_THREADS];
    int w=threadIdx.x+1;

    for(int i=0;i<4;i++)
    {
        BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
    }

    {
        D[threadIdx.x] = 0;
        {
            D[w] = (1<<(k+1)) -1;

            for(int i = 0; i < lc - 1; i++)
            {
                D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
            }
        }
        D[J+1] = (1<<((k+2)*lc)) - 1;
    }
    int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
    int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));

    int copyBlock=(size/J)+ ((size%J)==0?0:1);
    if((threadIdx.x * copyBlock) <= size)
        memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
    memcpy(DNew, D, (J+2)*sizeof(int));
    __syncthreads();
    uint32_t initial = D[1];
    uint32_t x;
    uint32_t mask = 1;
    for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
    for(int i = 0; i < size;i++)
    {
        {
            x =  ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
            DNew[w] = ((D[w]<<1) | mask)
                & (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
                & (((x + mask) ^ x) >> 1)
                & initial;
        }
        __syncthreads();
        memcpy(D, DNew, (J+2)*sizeof(int));
        if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
        {
            matched[startblock+i] = 1;
            D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
        }
    }
}

I am not very familiar with CUDA so I dont quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?

As asked, this is the code where I launch the kernels:

#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {   
            fprintf(stderr, "Cuda error: %s: %s.\n", msg, 
                            cudaGetErrorString( err) );
            exit(EXIT_FAILURE);
    }    
}
char* getTextString() {
 FILE *input, *output;
 char c;
 char * inputbuffer=(char *)malloc(sizeof(char)*TEXT_MAX_LENGTH);

int numchars = 0, index  = 0;

input = fopen("sequence.fasta", "r");
c = fgetc(input);
while(c != EOF)
{
inputbuffer[numchars] = c;
numchars++;
c = fgetc(input);
}
fclose(input);
inputbuffer[numchars] = '\0'; 
return inputbuffer;
}

int main(void) {
const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
char * text_h=getTextString();  //reading text from file, supported upto 200MB currently

int k = 13;
int i;
int count=0;
char *pattern_d, *text_d;     // pointers to device memory
char* text_new_d;
int* matched_d;
int* matched_new_d;
uint32_t* BB_d;
uint32_t* BB_new_d;
int* matched_h = (int*)malloc(sizeof(int)* strlen(text_h));
cudaMalloc((void **) &pattern_d, sizeof(char)*strlen(pattern_h)+1);
cudaMalloc((void **) &text_d, sizeof(char)*strlen(text_h)+1);
cudaMalloc((void **) &matched_d, sizeof(int)*strlen(text_h));
cudaMemcpy(pattern_d, pattern_h, sizeof(char)*strlen(pattern_h)+1, cudaMemcpyHostToDevice);
cudaMemcpy(text_d, text_h, sizeof(char)*strlen(text_h)+1, cudaMemcpyHostToDevice);
cudaMemset(matched_d, 0,sizeof(int)*strlen(text_h));

int m = strlen(pattern_h);
int n = strlen(text_h);

uint32_t* BB_h[4];
    unsigned int maxLc = ((((m-k)*(k+2)) > (31))?(31/(k+2)):(m-k));
unsigned int lc=2;   // Determines the number of threads per block
    // can be varied upto maxLc for tuning performance
if(lc>maxLc)
{
    exit(0);
}
unsigned int noWordorNfa =((m-k)/lc) + (((m-k)%lc)  == 0?0:1);
cudaMalloc((void **) &BB_d, sizeof(int)*noWordorNfa*4);
if(noWordorNfa >= MAX_THREADS)
{
    printf("Error: max threads\n");
    exit(0);
}

calculateBBArray(BB_h,pattern_h,m,k,lc,noWordorNfa);  // not included this function

for(i=0;i<4;i++)
{
    cudaMemcpy(BB_d+ i*noWordorNfa, BB_h[i], sizeof(int)*noWordorNfa, cudaMemcpyHostToDevice);
}
int overlap=m;
int textBlockSize=(((m+k+1)>n)?n:(m+k+1));
cudaStream_t stream[MAX_STREAMS];
for(i=0;i<MAX_STREAMS;i++) {
    cudaStreamCreate( &stream[i] );
    }

int start_addr=0,index=0,maxNoBlocks=0;
if(textBlockSize>n)
{
    maxNoBlocks=1;
}
else
{
     maxNoBlocks=((1 + ((n-textBlockSize)/(textBlockSize-overlap)) + (((n-textBlockSize)%(textBlockSize-overlap)) == 0?0:1)));
}
int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS)?MAX_BLOCKS:maxNoBlocks);
int blocksRemaining =maxNoBlocks;
printf(" maxNoBlocks %d kernel Blocks %d \n",maxNoBlocks,kernelBlocks);
while(blocksRemaining >0)
{
kernelBlocks = ((blocksRemaining > MAX_BLOCKS)?MAX_BLOCKS:blocksRemaining);
printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",kernelBlocks,start_addr,textBlockSize);
match<<<kernelBlocks,noWordorNfa,0,stream[(index++)%MAX_STREAMS]>>>(BB_d,text_d,n,m,k,noWordorNfa,lc,start_addr,textBlockSize,overlap,matched_d);
start_addr+=kernelBlocks*(textBlockSize-overlap);;
blocksRemaining -= kernelBlocks;
}
cudaMemcpy(matched_h, matched_d, sizeof(int)*strlen(text_h), cudaMemcpyDeviceToHost);
checkCUDAError("Matched Function");
for(i=0;i<MAX_STREAMS;i++)
    cudaStreamSynchronize( stream[i] ); 
    // do stuff with matched
    // ....
    // ....
free(matched_h);cudaFree(pattern_d);cudaFree(text_d);cudaFree(matched_d);
    return 0;

}

Number of threads launched per block depends upon the length pattern_h(could be at most maxLc above). I expect it to be around 30 in this case. Shoudn’t that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (=10) at a time since the hardware can schedule only 8 simultaneously

NOTE: I don’t have GUI access.

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-15T03:46:21+00:00

I found that I was copying the whole array Dnew to D in each thread rather than copying only the portion each thread was supposed to update D[w]. This would cause the threads to execute serially, although I don’t know if it could be called a shared memory bank conflict. Now it gives 8-9x speedup for large enough patterns(=more threads). This is much less than what I expected. I will try to increase number of blocks as suggested. I dont know how to increase the # of threads

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I am working on an application which divides a string into pieces and assigns

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply