SOLVED: see EDIT 2 below I am trying to parallelise an algorithm which does

Question

0

Asked: May 22, 20262026-05-22T14:47:52+00:00 2026-05-22T14:47:52+00:00

SOLVED: see EDIT 2 below I am trying to parallelise an algorithm which does

0

SOLVED: see EDIT 2 below

I am trying to parallelise an algorithm which does some operation on a matrix (lets call it blurring for simplicity sake). Once this operation has been done, it finds the biggest change between the old and new matrix (max of absolute difference between old and new matrix on a per element basis). If this maximum difference is above some threshold, then do another iteration of the matrix operation.

So my main program has the following loop:

converged = 0;
for( i = 1; i <= iteration_limit; i++ ){
    max_diff = update( &data_grid );

    if( max_diff < tol ) {
        converged = 1;
        break;
    }
}

update( &data_grid ) then calls the actual implementation of the blurring algorithm. The blurring algorithm then iterates over the matrix, it is this loop that I am trying to parallelise:

for( i = 0; i < width; i++ ) {
    for( j = 0; j <= height; j++ ) {
        g->data[ update ][ i ][ j ] = 
        ONE_QUARTER * ( 
                     g->data[ update ][ i + 1 ][ j     ] +
                     g->data[ update ][ i - 1 ][ j     ] +
                     g->data[ update ][ i     ][ j + 1 ] +
                     g->data[ update ][ i     ][ j - 1 ] +
                     );
        diff = fabs( g->data[ old ][ i ][ j ] - g->data[ update ][ i ][ j ] );
        maxdiff = maxdiff > diff ? maxdiff : diff;
    }
}

I could just stick a parallel region inside update(&data_grid) but that would mean threads would be created and destroyed on each iteration which I am trying to avoid.:

#pragma omp parallel for private(i, j, diff, maxdg) shared(width, height, update, g, dg, chunksize) default(none) schedule(static, chunksize)

I have 2 copies of the grid and write the new answer in the “other one” on every iteration by switching old and update between 0 and 1.

EDIT:

So I’ve made an orphaned omp for loop as per Jonathan Dursi’s suggestion, but for some reason, can’t seem to find the maximum value between threads…

Here is my “outer” code:

  converged = 0;

  #pragma omp parallel shared(i, max_iter, g, tol, maxdg, dg) private(converged) default(none)
  {
      for( i = 1; i <= 40; i++ ){

          maxdg = 0;

          dg = grid_update( &g );

          printf("[%d] dg from a single thread: %f\n", omp_get_thread_num(), dg );


  #pragma omp critical
          {
              if (dg > maxdg) maxdg = dg;
          }

  #pragma omp barrier
  #pragma omp flush

          printf("[%d] maxdg: %f\n", omp_get_thread_num(), maxdg);

          if( maxdg < tol ) {
              converged = 1;
              break;
          }
      }
  }

And the result:

  [11] dg from a single thread: 0.000000
  [3] dg from a single thread: 0.000000
  [4] dg from a single thread: 0.000000
  [5] dg from a single thread: 0.000000
  [0] dg from a single thread: 0.166667
  [6] dg from a single thread: 0.000000
  [7] dg from a single thread: 0.000000
  [8] dg from a single thread: 0.000000
  [9] dg from a single thread: 0.000000
  [15] dg from a single thread: 0.000000
  [10] dg from a single thread: 0.000000
  [1] dg from a single thread: 0.166667
  [12] dg from a single thread: 0.000000
  [13] dg from a single thread: 0.000000
  [14] dg from a single thread: 0.000000
  [2] maxdg: 0.000000
  [3] maxdg: 0.000000
  [0] maxdg: 0.000000
  [8] maxdg: 0.000000
  [9] maxdg: 0.000000
  [4] maxdg: 0.000000
  [5] maxdg: 0.000000
  [6] maxdg: 0.000000
  [7] maxdg: 0.000000
  [1] maxdg: 0.000000
  [14] maxdg: 0.000000
  [11] maxdg: 0.000000
  [15] maxdg: 0.000000
  [10] maxdg: 0.000000
  [12] maxdg: 0.000000
  [13] maxdg: 0.000000

EDIT 2:
Made some mistakes with the private/shared classifiers and forgot a barrier. This is the correct code:

  #pragma omp parallel shared(max_iter, g, tol, maxdg) private(i, dg, converged) default(none)
  {
      for( i = 1; i <= max_iter; i++ ){

  #pragma omp barrier
          maxdg=0;
  /*#pragma omp flush */

          dg = grid_update( &g );

  #pragma omp critical
          {
              if (dg > maxdg) maxdg = dg;
          }

  #pragma omp barrier
  /*#pragma omp flush*/

          if( maxdg < tol ) {
              converged = 1;
              break;
          }
      }
  }

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-22T14:47:53+00:00

There’s no problem with having the parallel section start in another routine before the for, certainly since OpenMP 3.0 (2008) and maybe since OpenMP 2.5. With gcc4.4:

outer.c:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

void update(int n, int iter);

int main(int argc, char **argv) {
    int n=10;

    #pragma omp parallel num_threads(4) default(none) shared(n)
    for (int iter=0; iter<3; iter++)
    {
        #pragma omp single
        printf("---iteration %d---\n", iter);
        update(n, iter);
    }

    return 0;
}

inner.c:

#include <omp.h>
#include <stdio.h>

void update(int n, int iter) {
    int thread = omp_get_thread_num();

    #pragma omp for
    for  (int i=0;i<n;i++) {
        int newthread=omp_get_thread_num();
        printf("%3d: doing loop index %d.\n",newthread,i);
    }
}

Building:

$ make
gcc44 -g -fopenmp -std=c99   -c -o inner.o inner.c
gcc44 -g -fopenmp -std=c99   -c -o outer.o outer.c
gcc44 -o main outer.o inner.o -fopenmp -lgomp
$ ./main 
---iteration 0---
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.
  3: doing loop index 9.
---iteration 1---
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.
  3: doing loop index 9.
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
---iteration 2---
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  3: doing loop index 9.
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.

But as per @jdv-Jan de Vaan, I’d be very surprised if in a up-to-date OpenMP implmentation this led to a significant performance improvement over having the parallel for in update, particularly if update is expensive enough.

BTW, there are issues with just putting a parallel for around the i-loop in the Gauss-Seidel routine in update; you can see that the i steps aren’t independant, and that will lead to race conditions. You will need to do something like Red-Black or Jacobi iteration instead…

Update:

The code sample provided is for a G-S iteration, not Jacobi, but I’ll just assume that’s a typo.

If your question is actually about the reduce and not the orphaned for loop: yes, you sadly have to roll your own min/max reductions in OpenMP, but it’s pretty straightforward, you just use the usual tricks.

Update 2 — yikes, locmax needs to be private, not shared.

outer.c:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int update(int n, int iter);

int main(int argc, char **argv) {
    int n=10;
    int max, locmax;

    max = -999;

    #pragma omp parallel num_threads(4) default(none) shared(n, max) private(locmax)
    for (int iter=0; iter<3; iter++)
    {
        #pragma omp single
            printf("---iteration %d---\n", iter);

        locmax = update(n, iter);

        #pragma omp critical
        {
            if (locmax > max) max=locmax;
        }

        #pragma omp barrier
        #pragma omp flush

        #pragma omp single
            printf("---iteration %d's max value = %d---\n", iter, max);
    }
    return 0;
}

inner.c:

#include <omp.h>
#include <stdio.h>

int update(int n, int iter) {
    int thread = omp_get_thread_num();
    int max = -999;

    #pragma omp for
    for  (int i=0;i<n;i++) {
        printf("%3d: doing loop index %d.\n",thread,i);
        if (i+iter>max) max = i+iter;
    }

    return max;
}

and building:

$ make
gcc44 -g -fopenmp -std=c99   -c -o inner.o inner.c
gcc44 -g -fopenmp -std=c99   -c -o outer.o outer.c
gcc44 -o main outer.o inner.o -fopenmp -lgomp
bash-3.2$ ./main 
---iteration 0---
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.
  3: doing loop index 9.
---iteration 0's max value = 9---
---iteration 1---
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  3: doing loop index 9.
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.
---iteration 1's max value = 10---
---iteration 2---
  0: doing loop index 0.
  0: doing loop index 1.
  0: doing loop index 2.
  1: doing loop index 3.
  1: doing loop index 4.
  1: doing loop index 5.
  3: doing loop index 9.
  2: doing loop index 6.
  2: doing loop index 7.
  2: doing loop index 8.
---iteration 2's max value = 11---

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

SOLVED: see EDIT 2 below I am trying to parallelise an algorithm which does

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply