I’m making a prime number sieve using Eratosthenes Sieve in C++ using multiple threads, but when I use more than one thread, the results are inconsistent. The range it has to be able to go through is 1-2^32. When run with a smaller range, like 1-1024, it comes up with the right number of primes usually, but as the range gets bigger, so does the margin of error. I’m assuming it’s a race condition, since I don’t use mutexes (and would prefer not to, since it shouldn’t be necessary with how the program is set up), or there’s something wrong with how I’m sending data to the thread function. I’m still getting used to C++ and it’s pointers/referencing. The number of primes it finds is always greater than or equal to the actual number, never less than. The index of the bit in a bitmap gets set to 1 for composite numbers. It’s probably just some stupid little bug I’m overlooking from lack of C/C++ experience. Please let me know if I’ve been unclear about anything. Thanks for looking.
#include <iostream>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sched.h>
#include <vector>
#include <math.h>
using namespace std;
#define NUM_OF_THREADS 4
#define UPPER_LIMIT pow(2, 30)
#define SQRT_UPPER_LIMIT sqrt(UPPER_LIMIT)
typedef struct {
int thread_index;
unsigned long long prime;
unsigned long long marked_up_to;
pthread_t thread;
bool thread_is_done;
} thread_info_t;
typedef struct {
unsigned long long x;
unsigned long long y;
}indexes_t;
static const unsigned long bitmasks[] = {
0x8000000000000000, 0x4000000000000000, 0x2000000000000000, 0x1000000000000000,
0x800000000000000, 0x400000000000000, 0x200000000000000, 0x100000000000000,
0x80000000000000, 0x40000000000000, 0x20000000000000, 0x10000000000000,
0x8000000000000, 0x4000000000000, 0x2000000000000, 0x1000000000000,
0x800000000000, 0x400000000000, 0x200000000000, 0x100000000000,
0x80000000000, 0x40000000000, 0x20000000000, 0x10000000000,
0x8000000000, 0x4000000000, 0x2000000000, 0x1000000000,
0x800000000, 0x400000000, 0x200000000, 0x100000000,
0x80000000, 0x40000000, 0x20000000, 0x10000000,
0x8000000, 0x4000000, 0x2000000, 0x1000000,
0x800000, 0x400000, 0x200000, 0x100000,
0x80000, 0x40000, 0x20000, 0x10000,
0x8000, 0x4000, 0x2000, 0x1000,
0x800, 0x400, 0x200, 0x100,
0x80, 0x40, 0x20, 0x10,
0x8, 0x4, 0x2, 0x1
};
clock_t start;
clock_t stop;
static unsigned long *bitmap; //array of longs
static int bits_in_element = sizeof(unsigned long long)*8;
static thread_info_t info[NUM_OF_THREADS];
indexes_t bit_indexes_from_number(unsigned long long number);
void print_bitmap();
bool check_if_bit_index_is_prime(unsigned long long i, unsigned long long j);
static void * threadFunc(void *arg)
{
thread_info_t *thread_info = (thread_info_t *)arg;
unsigned long long prime = thread_info->prime;
unsigned long long comp_number = prime+prime;
int thread_index = thread_info->thread_index;
indexes_t comp_index;
for(; comp_number <= UPPER_LIMIT; comp_number += prime) // get rid of prime multiples
{
comp_index = bit_indexes_from_number(comp_number);
bitmap[comp_index.x] |= bitmasks[comp_index.y];
info[thread_index].marked_up_to = comp_number; // so main thread only checks for primes past what's been marked
}
thread_info->thread_is_done = true;
return NULL;
}
int main (int argc, char * argv[])
{
long ncpus;
double total_time;
unsigned long long num_to_check;
thread_info_t *thread_to_use;
int thread_ret_val;
start = clock();
for(int i = 0; i < NUM_OF_THREADS; i++)
{
info[i].thread_index = i;
info[i].marked_up_to = 2;
info[i].thread_is_done = true;
}
for(int i = 0; i < NUM_OF_THREADS; i++)
{
bitmap = (unsigned long *)malloc(sizeof(unsigned long *) * UPPER_LIMIT/8);
bitmap[0] |= (bitmasks[0] | bitmasks[1]);
}
for(unsigned long long i = 0; i <= SQRT_UPPER_LIMIT/bits_in_element; i++)// go thru elements in array
{
for(unsigned long long j = (i == 0 ? 2 : 0); j < bits_in_element; j++) //go thru bits in elements
{
num_to_check = (i * bits_in_element) + j;
//make sure all threads are past num_to_check
for(int k = 0; ; k++)
{
if(k == NUM_OF_THREADS)
k = 0;
if(info[k].marked_up_to >= num_to_check)
break;
}
if(check_if_bit_index_is_prime(i, j)) //check if bit index is prime
{
for(int k = 0; ; k++) //wait for a finished thread to use
{
if(k == NUM_OF_THREADS)
k = 0;
if(info[k].thread_is_done)
{
thread_to_use = &info[k];
info[k].thread_is_done = false;
info[k].prime = (i * bits_in_element) + j;
break;
}
}
thread_ret_val = pthread_create(&thread_to_use->thread, NULL, threadFunc, (void *)thread_to_use); //thread gets rid of multiples
if(thread_ret_val != 0)
{
cerr << "thread error: " << strerror(thread_ret_val) << "\n";
return -1;
}
}
}
}
for(int i = 0; i < NUM_OF_THREADS; i++)
{
printf("waiting on %d\n", i);
thread_ret_val = pthread_join(info[i].thread, NULL);
if(thread_ret_val != 0)
{
cout << strerror(thread_ret_val);
}
}
stop = clock();
total_time = (double)(stop - start) / (double)CLOCKS_PER_SEC;
ncpus = sysconf(_SC_NPROCESSORS_ONLN);
/* Print performance results */
printf ("Total time using %d threads : %.6f seconds\n",
NUM_OF_THREADS, total_time / (NUM_OF_THREADS < ncpus ? NUM_OF_THREADS : ncpus));
print_bitmap();
return 1;
}
indexes_t bit_indexes_from_number(unsigned long long number)
{
indexes_t indexes;
indexes.x = ceill(number / bits_in_element); //ceiling or not??
if(indexes.x == 0)
indexes.y = number;
else
indexes.y = number - indexes.x*bits_in_element;
return indexes;
}
void print_bitmap()
{
int count = 0;
for(int i = 0; i <= UPPER_LIMIT; i++)
{
if(check_if_bit_index_is_prime(bit_indexes_from_number(i).x, bit_indexes_from_number(i).y))
{
count++;
}
}
cout << "number of primes between 1 and " << UPPER_LIMIT << ": " << count << "\n";
}
bool check_if_bit_index_is_prime(unsigned long long i, unsigned long long j)
{
if(bitmap[i] & bitmasks[j])
return false;
else
return true;
}
The major major issue is your thread_info_t. You need to ensure atomicity of the changing of members in this structure. The fact that this is not atomic is what is most likely causing your issues of too many primes. Ensuring that these operations are atomic is either going to rely on c++11’s std::atomic or platform specific details. Of course you could use locks instead to make sure that only one thread touches a thread_info_t at a time.
There is also some problems with the following code:
There are two errors in this piece of code. First, there is a memory leak, every allocation except the last is lost, with no pointers pointing to it and no freeing possible.
Second, after allocating the memory for bitmap, the data is undefined. Thus bitmap[0] |= (bitmasks[0] | bitmasks[1]); is invalid as bitmap[0] has an undefined value.