i just started learning assembly and making some custom loop for swapping two variables

Question

0

Asked: June 7, 20262026-06-07T20:48:19+00:00 2026-06-07T20:48:19+00:00

i just started learning assembly and making some custom loop for swapping two variables

0

i just started learning assembly and making some custom loop for swapping two variables using C++ ‘s asm{} body with Digital-Mars compiler in C-Free 5.0

Enabled the -o (optimization)

And got the results:

 time of for-loop(cycles)        844
 time of while-loop(cycles)      735
 time of custom-loop-1(cycles)   562
 time of custom-loop-2(cycles)   469

i couldnt find Digital-Mars compiler “asm output” option to compare.
There is no other optimisation options in the build options.
Should i change my compiler? if yes, which one?
Can you look at the codes below and tell me why custom loops are faster?

here is the standard for loop:

t1=clock(); 
for(int i=0;i<200000000;i++)
{
    temp=a;//instruction 1
    a=b;//instruction 2
    b=temp;//3 instructions total   
}   
t2=clock();
printf("\n time of for-loop(increasing) %i  \n",(t2-t1));

here is the standard while loop:

t1=clock();
while(j<200000000)
{
    temp=a;//again it is three instructions
    a=b;
    b=temp; 
            j++;
}
t2=clock();
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));

here is my custom loop 1:

t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b       
        mov b,eax       
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history   
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));

here is my second custom loop:

t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        mov eax,a       //getting variables to registers
        mov ebx,b

        do_it_again2:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        sub eax,ebx         //a is now a-b
        add ebx,eax         //b is now a
        sub eax,ebx         //a is now -b
        xor eax,80000000h   //a is now b and four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again2  // end of loop block

        pop edx         //rollback
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

full code:

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

int main()
{
int j=0;

int a=0,b=0,temp=0;

srand(time(0));
time_t t1=0;
time_t t2=0;


t1=clock(); 
for(int i=0;i<200000000;i++)
{
    temp=a;//instruction 1
    a=b;//instruction 2
    b=temp;//3 instructions total   
}   
t2=clock();
printf("\n time of for-loop(cycles) %i  \n",(t2-t1));


t1=clock();
while(j<200000000)
{
    temp=a;//again it is three instructions
    a=b;
    b=temp; 
    j++;
}
t2=clock();
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));


t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b       
        mov b,eax       
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history   
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));


t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        mov eax,a       //getting variables to registers
        mov ebx,b

        do_it_again2:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        sub eax,ebx         //a is now a-b
        add ebx,eax         //b is now a
        sub eax,ebx         //a is now -b
        xor eax,80000000h   //a is now b and four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again2  // end of loop block

        pop edx         //rollback
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

return 0;

}

i am just learning c++ and assembly and wondered how things going on.
Thank you

windows xp, pentium 4 (2 GHz) Digital-Mars in C-Free

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-07T20:48:21+00:00

It’s a bit hard to guess what your compiler may be doing without seeing the assembly language result it creates. With VC++ 10, I get the following results:

time of for-loop(cycles) 155

time of while-loop(cycles)  158

time of custom-loop-1(cycles)   369

time of custom-loop-2(cycles)  314

I didn’t look at the output, but my immediate guess would be that the difference between the for and while loops is just noise. Both are obviously quite a bit faster than your hand-written assembly code though.

Edit: looking at the assembly code, I was right — the code for the for and the while is identical. It looks like this:

        call    _clock
        mov     ecx, DWORD PTR _a$[ebp]
        cdq
        mov     ebx, edx
        mov     edx, DWORD PTR _b$[ebp]
        mov     edi, eax
        mov     esi, 200000000
$LL2@main:
; Line 28
        dec     esi
; Line 30
        mov     eax, ecx
; Line 31
        mov     ecx, edx
; Line 32
        mov     edx, eax
        jne     SHORT $LL2@main
        mov     DWORD PTR _b$[ebp], edx
        mov     DWORD PTR _a$[ebp], ecx
; Line 35
        call    _clock

While arguably less “clever” than your second loop, modern CPUs tend to do best with simple code. It also just has fewer instructions inside the loop (and doesn’t reference memory inside the loop at all). Those aren’t the sole measures of efficiency by any means, but with this simple of a loop, they’re fairly indicative.

Edit 2:

Just for fun, I wrote a new version that adds the triple-XOR swap, as well as one using the CPU’s xchg instruction (just because that’s how I’d probably write it by hand if I didn’t care much about speed, etc.) Though Intel/AMD generally recommend against the more complex instructions, it doesn’t seem to cause a problem — it seems to be coming out at least as fast as anything else:

 time of for-loop(cycles) 156

 time of while-loop(cycles)  160

 time swap between register and cache  284

 time to swap using add/sub:  308

 time to swap using xchg:  155

 time to swap using triple-xor  233

Source:

// Note: updated source -- it was just too ugly to live. Same results though.
#include<stdlib.h>
#include<time.h>
#include <iostream>
#include <string>
#include <iomanip>
#include <sstream>

namespace { 
    int a, b;
    const int loops = 200000000;
}

template <class swapper>
struct timer {
    timer(std::string const &label) { 
        clock_t t1 = clock();
        swapper()();
        clock_t t2 = clock();
        std::ostringstream buffer;
        buffer << "Time for swap using " << label;
        std::cout << std::left << std::setw(30) << buffer.str() << " = " << (t2-t1) << "\n";
    }
};

struct for_loop {
    void operator()() {
        int temp;
        for(int i=0;i<loops;i++) {
            temp=a;//instruction 1
            a=b;//instruction 2
            b=temp;//3 instructions total   
        }
    }
};

struct while_loop {
    void operator()() { 
        int j = 0;
        int temp;
        while(j<loops) {
            temp=a;//again it is three instructions
            a=b;
            b=temp; 
            j++;
        }
    }
};

struct reg_mem {
    void operator()() {
        int j=loops;//setting the count
        __asm {
            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j
    do_it_again:    //begin to loop
            mov eax,a       //basic swap steps between cpu and mem(cache)
            mov ebx,b       
            mov b,eax       
            mov a,ebx       //four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again  // end of loop block
        }
    }
};

struct add_sub {
    void operator()() { 
        int j=loops;//setting the count
        __asm {
            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j

            mov eax,a       //getting variables to registers
            mov ebx,b

    do_it_again2:   //begin to loop

            //swapping with using only 2 variables(only in cpu)
            sub eax,ebx         //a is now a-b
            add ebx,eax         //b is now a
            sub eax,ebx         //a is now -b
            xor eax,80000000h   //a is now b and four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again2  // end of loop block

            mov a, eax
            mov b, ebx
        }
    }
};

struct xchg {
    void operator()() {
        __asm {
            mov ecx, loops
            mov eax, a
            mov ebx, b
    do_it_again3:
            dec ecx
            xchg eax, ebx
            jne do_it_again3
            mov a, eax
            mov b, ebx
        }
    }
};

struct xor3 {
    void operator()() { 
        _asm { 
            mov ecx, loops
            mov eax, a
            mov edx, b
    do_swap4:
            xor eax, edx
            xor edx, eax
            xor eax, edx
            dec ecx
            jnz do_swap4

            mov a, eax
            mov b, edx
        }
    }
};

int main() {
    timer<for_loop>("for loop");
    timer<while_loop>("while loop");
    timer<reg_mem>("reg<->mem");
    timer<add_sub>("add/sub");
    timer<xchg>("xchg");
    timer<xor3>("triple xor");
    return 0;
}

Bottom line: at least for this trivial of a task, you’re not going to beat a decent compiler by enough to care about (and probably not at all, except possibly in terms of minutely smaller code).

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

i just started learning assembly and making some custom loop for swapping two variables

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply