Using SSE 2, on Intel core2Duo. The time spent in sse_add() and normal_add() is

Question

0

Asked: June 5, 20262026-06-05T10:25:43+00:00 2026-06-05T10:25:43+00:00

Using SSE 2, on Intel core2Duo. The time spent in sse_add() and normal_add() is

0

Using SSE 2, on Intel core2Duo.

The time spent in sse_add() and normal_add() is not constant in multiple run, and in fact now after several modifications is always coming out as 0.

The program basically finds the sum of each of the columns of the following matrix:

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

  1,2,10,13,15,160,6,19

I have verifies the results and it is coming correct by both the functions:

 results= 8, 16, 80, 104, 120, 1280, 48, 152

Why is it happening? Or is it because I am not measuring the time properly? Can you please run the same code on your machine and verify?

Updated Based on the suggestions, I have put a for loop as show below, but the time is still coming out to be 0 (obviously I have to devide total time by no. of iteration to get correct value, but why I am getting total time 0? ):

 // variable declarations used for time calculation
 double elapTicks;
 double elapMilli ;
 double begin = BeginTimer();

   for(int i=0; i<1000000000;i++)

   {

  //sum of consecutive rows 
  __m128i t1=_mm_adds_epi16(    x1[0] ,   x2[0]  );
  __m128i t2=_mm_adds_epi16(    x3[0] ,   x4[0]  );
  __m128i t3=_mm_adds_epi16(    x5[0] ,   x6[0]  );
  __m128i t4=_mm_adds_epi16(    x7[0] ,   x8[0]  );

 //t5=t1+t2 & t6=t3 + t4
 __m128i t5=_mm_adds_epi16(  t1 ,t2  );
 __m128i t6=_mm_adds_epi16(  t3 ,t4  );


 ///t7=t6+t5, which is final answer 
 __m128i t7=_mm_adds_epi16(  t5 ,t6  );


 }


   printf ("Timer set to: %f\n", begin);
 // variable definitions  to calculate time taken
     elapTicks = EndTimer(begin)-begin;    // stop the timer,and calculate the time 
     taken
     elapMilli = elapTicks/1000;     // milliseconds from Begin to End
     printf ("Time  in SSE in Milliseconds : %f\n", elapMilli);

}

The Original programs are as under.

*UPDATE: Removed all printf and malloc*

Timing functions one by one in separate programs:

SSE version

 clock_t BeginTimer()
 {
 //timer declaration
 clock_t Begin; //initialize Begin
 Begin = clock() * CLK_TCK; //start the timer
 return Begin;
 }
 clock_t EndTimer(clock_t begin)
 {
 clock_t End;
 End = clock() * CLK_TCK;   //stop the timer
 return End;
 }


 int  main( )
 {
   sse_add();
   getch();       
  return 0;
  }


void sse_add()
{

__declspec(align(16)) unsigned short a1[8]={1,2,10,13,15,160,6,19};   
    __declspec(align(16)) unsigned short a2[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a3[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a4[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a5[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a6[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a7[8]={1,2,10,13,15,160,6,19};
__declspec(align(16)) unsigned short a8[8]={1,2,10,13,15,160,6,19};

   //__m128i maps to the XMM[0-7] registers
   __m128i *x1 = (__m128i*) &a1[0];  
   __m128i *x2 = (__m128i*) &a2[0]; 
   __m128i *x3 = (__m128i*) &a3[0];   
   __m128i *x4 = (__m128i*) &a4[0];          
   __m128i *x5 = (__m128i*) &a5[0];   
   __m128i *x6 = (__m128i*) &a6[0];   
   __m128i *x7 = (__m128i*) &a7[0];   
   __m128i *x8 = (__m128i*) &a8[0];   

//  _mm_adds_epi16 : Adds the 8 signed 16-bit integers in a to the 8 signed \
    //16-bit  integers in b and saturates.


 // variable declarations used for time calculation
    float elapTicks;
    float elapMilli ;


   double begin = BeginTimer();
       printf ("Timer set to: %.2f\n", begin); // print the initialised timer (0)


  //sum of consecutive rows 
      __m128i t1=_mm_adds_epi16(    x1[0] ,   x2[0]  );
      __m128i t2=_mm_adds_epi16(    x3[0] ,   x4[0]  );
      __m128i t3=_mm_adds_epi16(    x5[0] ,   x6[0]  );
      __m128i t4=_mm_adds_epi16(    x7[0] ,   x8[0]  );

   //t5=t1+t2 & t6=t3 + t4
     __m128i t5=_mm_adds_epi16(  t1 ,t2  );
     __m128i t6=_mm_adds_epi16(  t3 ,t4  );


   ///t7=t6+t5, which is final answer 
   __m128i t7=_mm_adds_epi16(  t5 ,t6  );


// variable definitions  to calculate time taken
elapTicks = EndTimer(begin);    // stop the timer, and calculate the time taken
elapMilli = elapTicks/1000;     // milliseconds from Begin to End
printf ("Time  in SSE in Milliseconds : %.2f\n", elapMilli);

}

Normal version

 clock_t BeginTimer()
 {
 //timer declaration
 clock_t Begin; //initialize Begin
 Begin = clock() * CLK_TCK; //start the timer
 return Begin;
 }

 clock_t EndTimer(clock_t begin)
{
clock_t End;
End = clock() * CLK_TCK;   //stop the timer
return End;
}


int  main( )
{

normal_add();
   getch();
  return 0;
}


  void normal_add()
  {

 unsigned short a1[8]={1,2,10,13,15,160,6,19};
 unsigned short a2[8]={1,2,10,13,15,160,6,19};
 unsigned short a3[8]={1,2,10,13,15,160,6,19};
 unsigned short a4[8]={1,2,10,13,15,160,6,19};
 unsigned short a5[8]={1,2,10,13,15,160,6,19};
 unsigned short a6[8]={1,2,10,13,15,160,6,19};
 unsigned short a7[8]={1,2,10,13,15,160,6,19};
 unsigned short a8[8]={1,2,10,13,15,160,6,19};

   unsigned short t1[8], t2[8], t3[8], t4[8],t5[8], t6[8], t7[8];   



   float elapTicks;
       float elapMilli ;


double begin1 = BeginTimer();
    printf ("Timer reset to: %f\n", begin1); // print the initialised timer (0)


 for(int i=0; i<8;i++)
 {
 t1[i]=a1[i] +a2[i];
 }


 for(int i=0; i<8;i++)
 {
 t2[i]=a3[i] +a4[i];
 }


 for(int i=0; i<8;i++)
 {
 t3[i]=a5[i] +a6[i];
 }

 for(int i=0; i<8;i++)
 {
 t4[i]=a7[i] +a8[i];
 }

 for(int i=0; i<8;i++)
         {
 t5[i]=t1[i] +t2[i];
 }

 for(int i=0; i<8;i++)
         {
 t6[i]=t3[i] +t4[i];
 }

 for(int i=0; i<8;i++)
 {
 t7[i]=t5[i] +t6[i];
 }

 elapTicks = EndTimer(begin1);    // stop the timer, and calculete the time taken
   elapMilli = elapTicks/1000;     // milliseconds from Begin to End
   printf ("Time spent in normal add in Milliseconds : %.2f\n", elapMilli);


  }

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-05T10:25:45+00:00

The printf() and malloc() calls will easily dominate the execution time of both those functions, and that’s likely where the variation in timing is coming from too.

There is no need to call malloc() there – in fact you have a memory leak, since you call it and then immediately overwrite its return value.

If you want to profile those functions, separate out the addition logic itself from the printing logic, and call the former a large number of times rather than just once. This will amortise random variations caused by external events, like interrupts.

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

Using SSE 2, on Intel core2Duo. The time spent in sse_add() and normal_add() is

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply