I am using std::thread and gcc as my compiler in implementing the parallel-merge as described in Cormen’s Introduction to Algorithms.
I think I got the code to work. It passes all randomly seeded arrays that are not too big. However, when I try to merge two arrays that are large (1e6 elements each), I get the following termination:
terminate called without an active exception
terminate called recursively
terminate called recursively
Using gdb doesn’t help: it becomes corrupted during the run.
I am pretty certain that the run has failed due to too many threads spawned.
What can I do to confirm that this error is due to too many std::threads spawned?
NOTES
- Code works up to n=1e4, fails by n=1e5
-
define DEBUG if you want to see output, but I don’t recommend this except for small n like 10 or 50.
- STRBUF_SIZE/use of fprintf is ugly, but iostream doesn’t flush well in threads – this is hacky, but works (no need to focus here).
- I tried following Barnes53’s suggestion by using a try/catch block around the threads, but this didn’t work, apparently.
- I know that spawning a gazillion threads is a bad thing – at this point, I am just trying to implement what’s in the book and to see if it works, and perhaps discover what its limitations are.
UPDATE
- GManNickG’s answer below helped: not every run, but during some runs of 1e5, I can see that, indeed, resources are gone.
- I will probably look into some kind of k-way parallel sort, where I can control the number of threads spawned, if this algorithm is not salvageable.
CODE
#include <vector>
#include <iostream>
#include <algorithm>
#include <vector>
#include <thread>
#include <cmath>
#include <cstring>
#include <cassert>
#define STRBUF_SIZE 1024
class Random
{
public:
Random( unsigned int seed=::time(nullptr))
: m_seed( seed )
{ }
// between [ 0 .. n-1 ]
unsigned int rand_uint( unsigned int n )
{
return static_cast<unsigned int>
(static_cast<float>(n) * rand_r( &m_seed ) / RAND_MAX);
}
unsigned int getSeed() const { return m_seed; }
private:
unsigned int m_seed;
};
template<typename T>
char* dump( char* line, T it1, T it2 )
{
char buf[80];
line[0] = '\0';
for( T it=it1; it!=it2; ++it )
{
sprintf( buf, "%u ", *it );
strcat( line, buf );
}
return line;
}
template< typename T, class It >
It binary_search_it( It beg, It end, const T& value )
{
auto low = beg;
auto high = std::max( beg, end ); // end+1
while( low < high )
{
auto mid = low + std::distance( low, high ) / 2;
if ( value <= *mid )
high = mid;
else
low = mid + 1;
}
return high;
}
template< class InputIt, class OutputIt >
void p_merge(
char const* msg,
unsigned depth,
unsigned parent_lvl_id,
unsigned lr,
InputIt p1, InputIt r1,
InputIt p2, InputIt r2,
OutputIt p3, OutputIt r3
)
{
#ifdef DEBUG
char buff[STRBUF_SIZE];
#endif
unsigned sum_prev = pow( 2, depth ) - 1;
unsigned lvl_id = 2*parent_lvl_id + lr;
unsigned thread_no = sum_prev + lvl_id + 1;
unsigned limit0 = sum_prev + 1;
unsigned limit1 = pow( 2, depth+1 ) - 1;
#ifdef DEBUG
char msg_dep[256];
sprintf( msg_dep, "%s [%2d] %-10d [%d,%d]", msg, depth, thread_no, limit0, limit1 );
fprintf( stderr, "%s\n", msg_dep );
#endif
if ( thread_no<limit0 || thread_no>limit1 )
{
fprintf( stderr, "OUT OF BOUNDS\n" );
exit( 1 );
}
auto n1 = std::distance( p1, r1 );
auto n2 = std::distance( p2, r2 );
#ifdef DEBUG
fprintf( stderr, "%s dist[v1]=%2ld : %s\n", msg_dep, n1, dump( buff, p1, r1 ) );
fprintf( stderr, "%s dist[v2]=%2ld : %s\n", msg_dep, n2, dump( buff, p2, r2 ) );
#endif
if ( n1<n2 )
{
std::swap( p1, p2 );
std::swap( r1, r2 );
std::swap( n1, n2 );
#ifdef DEBUG
fprintf( stderr, "%s swapped[v1] : %s\n", msg_dep, dump( buff, p1, r1 ));
fprintf( stderr, "%s swapped[v2] : %s\n", msg_dep, dump( buff, p2, r2 ));
#endif
}
if ( n1==0 )
{
#ifdef DEBUG
fprintf( stderr, "%s done \n", msg_dep );
#endif
return;
}
auto q1 = p1 + n1 / 2; // midpoint
auto q2 = binary_search_it( p2, r2, *q1 ); // <q1 q2[q1] >=q1
auto q3 = p3 + std::distance( p1, q1 ) + std::distance( p2, q2 );
*q3 = *q1;
#ifdef DEBUG
fprintf( stderr, "%s q1[median]=%u : %s\n", msg_dep, *q1, dump( buff, p1, r1 ));
fprintf( stderr, "%s q2[fulcrum]=%u : %s\n", msg_dep, *q2, dump( buff, p2, r2 ));
fprintf( stderr, "%s q3(copied)=%u : %s\n", msg_dep, *q3, dump( buff, p3, r3 ));
#endif
#ifdef DEBUG
auto d1 = std::distance( p1, q1-1 );
auto d2 = std::distance( q1+1, r1 );
fprintf( stderr, "%s q1[dist_L]=%ld : %s\n", msg_dep, d1, dump( buff, p1, r1 ));
fprintf( stderr, "%s q1[dist_M]=%ld : %s\n", msg_dep, d2, dump( buff, p1, r1 ));
#endif
try {
std::thread t1{
[&](){ p_merge( "LESS", depth+1, lvl_id, 0, p1, q1, p2, q2, p3, r3 ); }
};
std::thread t2{
[&](){ p_merge( "MORE", depth+1, lvl_id, 1, q1+1, r1, q2, r2, q3+1, r3 ); }
};
t1.join();
t2.join();
}
catch( ... )
{
fprintf( stderr, "OK - I am dying during a std::thread spawn\n" );
exit( 1 );
}
#ifdef DEBUG
fprintf( stderr, "%s synchronized\n", msg_dep );
#endif
}
int
main( int argv, char* argc[] )
{
// ok up to 1e4, fails by 1e5
unsigned n = 1e5;
Random r;
std::vector<unsigned> v1( n ), v2( n ), v3( 2 * n );
#ifdef DEBUG
fprintf( stderr, "SEED = %u\n", r.getSeed() );
#endif
std::generate( v1.begin(), v1.end(), [&]() { return r.rand_uint(n); } );
std::generate( v2.begin(), v2.end(), [&]() { return r.rand_uint(n); } );
#ifdef DEBUG
char buff[STRBUF_SIZE];
fprintf( stderr, "%s\n", dump( buff, v1.begin(), v1.end() ));
fprintf( stderr, "%s\n", dump( buff, v2.begin(), v2.end() ));
#endif
std::sort( v1.begin(), v1.end() );
std::sort( v2.begin(), v2.end() );
p_merge( "TOP ", 0, 0, 0,
v1.begin(), v1.end(), v2.begin(), v2.end(), v3.begin(), v3.end() );
assert( std::is_sorted( v3.begin(), v3.end() ));
#ifdef DEBUG
fprintf( stderr, "FINAL : %s\n", dump( buff, v3.begin(), v3.end() ));
#endif
}
You can catch
std::system_errorand check if the code isresource_unavailable_try_again:(Run this at your own risk!)
According §30.3.1.2/4, this is the error code used to indicate thread creation failure:
Note this could be thrown by your own arguments being copied to the resulting thread. To guarantee against this, you need to pre-construct your arguments, then no-throw move them to your thread function.
That said, you’re much better off putting a limit on thread creation anyway. There’s no point in having more threads running than cores can execute. Use
std::thread::hardware_concurrencyto get that number.