I am trying to write a code that checkpoints multithreaded applications. Since fork function doesn’t work with such applications, I am working on solaris which has forkall function to achieve that.
Following is the code that contains functions checkpoint and restart_from_checkpoint along with their example usage. I call these functions only between two barriers to be safe.
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <semaphore.h>
#include <stdint.h>
#include <pthread.h>
#define NOFTHREADS 4
pid_t checkpoint();
void restart_from_checkpoint( pid_t pid );
sem_t sem;
pthread_barrier_t barrier;
void sig_handler(int signum)
{
printf( ">> sem_post!\n" );
sem_post( &sem );
}
pid_t child_pid;
pid_t par_pid;
void *threadFunc( void *pParam )
{
unsigned int tid = *((unsigned int*)(pParam));
int i;
for( i = 0; i < 20; i++ )
{
if ( !(i % 2) )
{
pthread_barrier_wait(&barrier);
if ( tid == 0 && i == 6 )
{
child_pid = checkpoint();
}
if ( tid == 0 )
printf( "p%d: >> i = %d\n", getpid(), i );
if ( tid == 0 && i == 12 && ( getpid() == par_pid ) )
{
restart_from_checkpoint( child_pid );
}
pthread_barrier_wait(&barrier);
}
printf( "p%d: t%d: i%d\n", getpid(), tid, i );
}
}
int main( int argc, char *argv[] )
{
int i;
pthread_t hThread[NOFTHREADS];
int index[NOFTHREADS];
signal(SIGUSR1, sig_handler);
pthread_barrier_init (&barrier, NULL, NOFTHREADS);
par_pid = getpid();
for( i = 0; i < NOFTHREADS; i++ )
{
index[i] = i;
pthread_create( &hThread[i], NULL, threadFunc, &index[i] );
}
for( i = 0; i < NOFTHREADS; i++ )
pthread_join( hThread[i], NULL );
return 0;
}
pid_t checkpoint()
{
pid_t pid;
int wait_val;
sem_init( &sem, 0, 0 );
switch (pid=forkall())
{
case -1:
perror("fork");
break;
case 0: // child process starts
sem_wait( &sem );
printf( ">> passed sem_wait!\n" );
break; // child process ends
default: // parent process starts
return pid;
}
}
void restart_from_checkpoint( pid_t pid )
{
printf( ">> restart_from_checkpoint!\n" );
kill( pid, SIGUSR1 );
printf( ">> exiting!\n" );
exit( 0 );
printf( ">> should not had been printed!\n" );
}
And following is the output printed on screen…
p1159: >> i = 0
p1159: t0: i0
p1159: t0: i1
p1159: t1: i0
p1159: t1: i1
p1159: t2: i0
p1159: t2: i1
p1159: t3: i0
p1159: t3: i1
p1159: >> i = 2
p1159: t2: i2
p1159: t2: i3
p1159: t0: i2
p1159: t0: i3
p1159: t3: i2
p1159: t3: i3
p1159: t1: i2
p1159: t1: i3
p1159: >> i = 4
p1159: t0: i4
p1159: t0: i5
p1159: t2: i4
p1159: t2: i5
p1159: t1: i4
p1159: t1: i5
p1159: t3: i4
p1159: t3: i5
p1159: >> i = 6
p1159: t2: i6
p1159: t2: i7
p1159: t0: i6
p1159: t0: i7
p1159: t3: i6
p1159: t3: i7
p1159: t1: i6
p1159: t1: i7
p1159: >> i = 8
p1159: t1: i8
p1159: t1: i9
p1159: t2: i8
p1159: t2: i9
p1159: t3: i8
p1159: t3: i9
p1159: t0: i8
p1159: t0: i9
p1159: >> i = 10
p1159: t1: i10
p1159: t1: i11
p1159: t2: i10
p1159: t2: i11
p1159: t3: i10
p1159: t3: i11
p1159: t0: i10
p1159: t0: i11
p1159: >> i = 12
>> restart_from_checkpoint!
>> exiting!
p1159: >> i = 0
p1159: t0: i0
p1159: t0: i1
p1159: t1: i0
p1159: t1: i1
p1159: t2: i0
p1159: t2: i1
p1159: t3: i0
p1159: t3: i1
p1159: >> i = 2
p1159: t2: i2
p1159: t2: i3
p1159: t0: i2
p1159: t0: i3
p1159: t3: i2
p1159: t3: i3
p1159: t1: i2
p1159: t1: i3
p1159: >> i = 4
p1159: t0: i4
p1159: t0: i5
p1159: t2: i4
p1159: t2: i5
p1159: t1: i4
p1159: t1: i5
p1159: t3: i4
p1159: t3: i5
>> sem_post!
>> passed sem_wait!
p1160: >> i = 6
p1160: t0: i6
p1160: t0: i7
p1160: t2: i6
p1160: t2: i7
p1160: t3: i6
p1160: t3: i7
p1160: t1: i6
p1160: t1: i7
p1160: >> i = 8
p1160: t3: i8
p1160: t3: i9
p1160: t2: i8
p1160: t2: i9
p1160: t1: i8
p1160: t1: i9
p1160: t0: i8
p1160: t0: i9
p1160: >> i = 10
p1160: t3: i10
p1160: t3: i11
p1160: t1: i10
p1160: t1: i11
p1160: t0: i10
p1160: t0: i11
p1160: t2: i10
p1160: t2: i11
p1160: >> i = 12
p1160: t3: i12
p1160: t3: i13
p1160: t0: i12
p1160: t0: i13
p1160: t1: i12
p1160: t1: i13
p1160: t2: i12
p1160: t2: i13
p1160: >> i = 14
p1160: t1: i14
p1160: t1: i15
p1160: t2: i14
p1160: t2: i15
p1160: t0: i14
p1160: t0: i15
p1160: t3: i14
p1160: t3: i15
p1160: >> i = 16
p1160: t0: i16
p1160: t0: i17
p1160: t3: i16
p1160: t3: i17
p1160: t1: i16
p1160: t1: i17
p1160: t2: i16
p1160: t2: i17
p1160: >> i = 18
p1160: t1: i18
p1160: t1: i19
p1160: t2: i18
p1160: t2: i19
p1160: t0: i18
p1160: t0: i19
p1160: t3: i18
p1160: t3: i19
Note that parent process ID is 1159 while child process’s ID is 1160. Now my question is, why after exiting the parent process is reexecuting uptill i == 6 (the point where checkpoint was called), See output between >> exiting! and >> sem_post!. Shouldn’t it had exited immediately? What am I doing wrong here?
The
printffunction doesn’t output text to the screen or to the file immediately (in synchronous manner). It does store text to print into libc buffer and the buffer is flushed (written to screen or to file) at some time (at ‘\n’ char or when if will have a lot data).After fork all buffers are copied from parent to child. There is some text in the buffer and both processes will flush buffer.
You should consider adding a
fflush()just before fork() or setting a different buffers rules viasetvbuf, e.g. disable buffering.