Sign Up

Sign up for our social Questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Log in to our social Questions & Answers Engine to ask questions, answer people’s questions, and connect with other people.

Sign Up Here

Forgot Password?

Don't have an account? Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link by email and can use it to create a new password.

Have an account? Sign In Now

You must log in to ask a question.

Forgot Password?

Need an account? Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • SEARCH
  • Home
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 8241345
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: June 7, 2026 (2026-06-07T20:48:19+00:00)

I just started learning assembly and am writing some custom loops for swapping two variables

  • 0

I just started learning assembly and wrote some custom loops for swapping two variables, using C++’s __asm {} block with the Digital Mars compiler in C-Free 5.0.

I enabled the -o (optimization) option.

And got the results:

 time of for-loop(cycles)        844
 time of while-loop(cycles)      735
 time of custom-loop-1(cycles)   562
 time of custom-loop-2(cycles)   469

I couldn’t find an “asm output” option in the Digital Mars compiler, so I cannot compare the generated assembly.
There are no other optimisation options in the build options.
Should I change my compiler? If yes, which one?
Can you look at the code below and tell me why the custom loops are faster?

here is the standard for loop:

// Benchmark 1: swap a and b through temp inside a plain counting for loop.
t1=clock();                     // tick count before the loop
for(int i=0;i<200000000;i++)
{
    temp=a; // statement 1: remember the old a
    a=b;    // statement 2: a takes b's value
    b=temp; // statement 3: b takes the remembered a
}
t2=clock();                     // tick count after the loop
// clock() reports clock ticks, not CPU cycles. The difference is cast to long
// so the argument matches %ld regardless of the type t1 and t2 are declared with.
printf("\n time of for-loop(increasing) %ld  \n",(long)(t2-t1));

here is the standard while loop:

// Benchmark 2: the same three-statement swap, driven by a while loop.
// j is expected to be 0 on entry (the full listing initialises it to 0).
t1=clock();                     // tick count before the loop
while(j<200000000)
{
    temp=a; // again three statements: remember the old a,
    a=b;    // a takes b's value,
    b=temp; // b takes the remembered a
    j++;    // count this iteration
}
t2=clock();                     // tick count after the loop
// clock() reports clock ticks, not CPU cycles. The difference is cast to long
// so the argument matches %ld regardless of the type t1 and t2 are declared with.
printf("\n time of while-loop(cycles)  %ld  \n",(long)(t2-t1));

here is my custom loop 1:

// Benchmark 3: hand-written loop that swaps a and b through memory on every pass.
t1=clock();
j=200000000;//iteration limit for the assembly loop
    __asm
    {
        pushf           //save the flags
        push eax        //save the registers used below
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = iteration counter, starts at 0
        mov edx,j       //edx = iteration limit

        do_it_again:    //top of the loop


        mov eax,a       //load a into a register
        mov ebx,b       //load b into a register
        mov b,eax       //store the old a into b
        mov a,ebx       //store the old b into a (four instructions total)

        inc ecx         //count this iteration
        cmp ecx,edx     //counter reached the limit?
        jb do_it_again  //repeat while ecx is below edx (unsigned compare)

        pop edx         //restore in the reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }

t2=clock();
// clock() reports clock ticks, not CPU cycles. The difference is cast to long
// so the argument matches %ld regardless of the type t1 and t2 are declared with.
printf("\n time of custom-loop-1(cycles)   %ld   \n",(long)(t2-t1));

here is my second custom loop:

// Benchmark 4: hand-written loop that swaps two values held in registers
// using subtract/add arithmetic instead of a third register.
t1=clock();
j=200000000;//iteration limit for the assembly loop
    __asm
    {
        pushf           //save the flags
        push eax        //save the registers used below
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = iteration counter, starts at 0
        mov edx,j       //edx = iteration limit

        mov eax,a       //load both values into registers once, before the loop
        mov ebx,b

        do_it_again2:   //top of the loop

        //swap using only the two registers
        sub eax,ebx         //eax = a-b
        add ebx,eax         //ebx = b+(a-b) = a
        sub eax,ebx         //eax = (a-b)-a = -b
        neg eax             //eax = b; flipping only the sign bit with xor would not negate a two's-complement value

        inc ecx         //count this iteration
        cmp ecx,edx     //counter reached the limit?
        jb do_it_again2 //repeat while ecx is below edx (unsigned compare)

        mov a,eax       //write the results back before the pops below overwrite eax and ebx
        mov b,ebx

        pop edx         //restore in the reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }

t2=clock();
// clock() reports clock ticks, not CPU cycles. The difference is cast to long
// so the argument matches %ld regardless of the type t1 and t2 are declared with.
printf("\n time of custom-loop-2(cycles)  %ld   \n",(long)(t2-t1));

full code:

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

// Times four ways of swapping two ints 200000000 times each and prints the
// elapsed clock() ticks for every variant. The printed labels say "cycles",
// but the values are clock ticks as returned by clock().
int main()
{
    int j=0;            // counter for the while loop; later the iteration limit for the asm loops
    int a=0,b=0,temp=0; // the two values being swapped, plus scratch storage

    // clock() returns clock_t, so the timestamps are kept in that type.
    clock_t t1=0;
    clock_t t2=0;

    // 1) plain for loop
    t1=clock();
    for(int i=0;i<200000000;i++)
    {
        temp=a; // statement 1: remember the old a
        a=b;    // statement 2: a takes b's value
        b=temp; // statement 3: b takes the remembered a
    }
    t2=clock();
    // The tick difference is cast to long so the argument matches %ld.
    printf("\n time of for-loop(cycles) %ld  \n",(long)(t2-t1));

    // 2) plain while loop
    t1=clock();
    while(j<200000000)
    {
        temp=a; // again three statements
        a=b;
        b=temp;
        j++;
    }
    t2=clock();
    printf("\n time of while-loop(cycles)  %ld  \n",(long)(t2-t1));

    // 3) assembly loop that swaps through memory on every pass
    t1=clock();
    j=200000000;        // iteration limit for the assembly loop
    __asm
    {
        pushf           //save the flags
        push eax        //save the registers used below
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = iteration counter, starts at 0
        mov edx,j       //edx = iteration limit

        do_it_again:    //top of the loop

        mov eax,a       //load a into a register
        mov ebx,b       //load b into a register
        mov b,eax       //store the old a into b
        mov a,ebx       //store the old b into a (four instructions total)

        inc ecx         //count this iteration
        cmp ecx,edx     //counter reached the limit?
        jb do_it_again  //repeat while ecx is below edx (unsigned compare)

        pop edx         //restore in the reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }
    t2=clock();
    printf("\n time of custom-loop-1(cycles)   %ld   \n",(long)(t2-t1));

    // 4) assembly loop that swaps two registers with subtract/add arithmetic
    t1=clock();
    j=200000000;        // iteration limit for the assembly loop
    __asm
    {
        pushf           //save the flags
        push eax        //save the registers used below
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = iteration counter, starts at 0
        mov edx,j       //edx = iteration limit

        mov eax,a       //load both values into registers once, before the loop
        mov ebx,b

        do_it_again2:   //top of the loop

        //swap using only the two registers
        sub eax,ebx         //eax = a-b
        add ebx,eax         //ebx = b+(a-b) = a
        sub eax,ebx         //eax = (a-b)-a = -b
        neg eax             //eax = b; flipping only the sign bit with xor would not negate a two's-complement value

        inc ecx         //count this iteration
        cmp ecx,edx     //counter reached the limit?
        jb do_it_again2 //repeat while ecx is below edx (unsigned compare)

        mov a,eax       //write the results back before the pops below overwrite eax and ebx
        mov b,ebx

        pop edx         //restore in the reverse order of the pushes
        pop ecx
        pop ebx
        pop eax
        popf
    }
    t2=clock();
    printf("\n time of custom-loop-2(cycles)  %ld   \n",(long)(t2-t1));

    return 0;
}

I am just learning C++ and assembly and wondered what is going on under the hood.
Thank you

windows xp, pentium 4 (2 GHz) Digital-Mars in C-Free

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must log in to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-06-07T20:48:21+00:00 · Added an answer on June 7, 2026 at 8:48 pm

    It’s a bit hard to guess what your compiler may be doing without seeing the assembly language result it creates. With VC++ 10, I get the following results:

    time of for-loop(cycles) 155
    
    time of while-loop(cycles)  158
    
    time of custom-loop-1(cycles)   369
    
    time of custom-loop-2(cycles)  314
    

    I didn’t look at the output, but my immediate guess would be that the difference between the for and while loops is just noise. Both are obviously quite a bit faster than your hand-written assembly code though.

    Edit: looking at the assembly code, I was right — the code for the for and the while is identical. It looks like this:

            call    _clock
            mov     ecx, DWORD PTR _a$[ebp]
            cdq
            mov     ebx, edx
            mov     edx, DWORD PTR _b$[ebp]
            mov     edi, eax
            mov     esi, 200000000
    $LL2@main:
    ; Line 28
            dec     esi
    ; Line 30
            mov     eax, ecx
    ; Line 31
            mov     ecx, edx
    ; Line 32
            mov     edx, eax
            jne     SHORT $LL2@main
            mov     DWORD PTR _b$[ebp], edx
            mov     DWORD PTR _a$[ebp], ecx
    ; Line 35
            call    _clock
    

    While arguably less “clever” than your second loop, modern CPUs tend to do best with simple code. It also just has fewer instructions inside the loop (and doesn’t reference memory inside the loop at all). Those aren’t the sole measures of efficiency by any means, but with this simple of a loop, they’re fairly indicative.

    Edit 2:

    Just for fun, I wrote a new version that adds the triple-XOR swap, as well as one using the CPU’s xchg instruction (just because that’s how I’d probably write it by hand if I didn’t care much about speed, etc.) Though Intel/AMD generally recommend against the more complex instructions, it doesn’t seem to cause a problem — it seems to be coming out at least as fast as anything else:

     time of for-loop(cycles) 156
    
     time of while-loop(cycles)  160
    
     time swap between register and cache  284
    
     time to swap using add/sub:  308
    
     time to swap using xchg:  155
    
     time to swap using triple-xor  233
    

    Source:

    // Note: updated source -- it was just too ugly to live. Same results though.
    #include<stdlib.h>
    #include<time.h>
    #include <iostream>
    #include <string>
    #include <iomanip>
    #include <sstream>
    
    // File-local state used by the benchmark functors in this listing.
    namespace {
        // Number of swaps each benchmark performs.
        const int loops = 200000000;
        // The two values being swapped; both start at zero.
        int a = 0;
        int b = 0;
    }
    
    // Runs the functor type `swapper` once and prints how many clock() ticks
    // the call took. All of the work happens in the constructor, so creating a
    // timer object is what performs and reports the measurement.
    template <class swapper>
    struct timer {
        // label: text appended to "Time for swap using " in the printed line.
        timer(std::string const &label) {
            const clock_t started = clock();
            swapper()();    // build a temporary functor and invoke it
            const clock_t finished = clock();

            // Left-justify the heading in a 30-column field, then the tick count.
            const std::string heading = "Time for swap using " + label;
            std::cout << std::left << std::setw(30) << heading
                      << " = " << (finished - started) << "\n";
        }
    };
    
    // Benchmark: swap a and b through a temporary, `loops` times, in a for loop.
    struct for_loop {
        void operator()() {
            for (int pass = 0; pass < loops; ++pass) {
                const int saved = a;    // remember the old a
                a = b;                  // a takes b's value
                b = saved;              // b takes the remembered a
            }
        }
    };
    
    // Benchmark: the same temporary-based swap, `loops` times, in a while loop.
    struct while_loop {
        void operator()() {
            int done = 0;               // iterations completed so far
            while (done < loops) {
                const int saved = a;    // remember the old a
                a = b;                  // a takes b's value
                b = saved;              // b takes the remembered a
                ++done;
            }
        }
    };
    
    // Benchmark: inline-assembly loop that swaps a and b through memory on
    // every iteration (two loads followed by two crosswise stores).
    struct reg_mem {
        void operator()() {
            int j=loops;// iteration limit, copied to a local so the asm block can read it
            __asm {
                mov ecx,0       // ecx = iteration counter, starts at 0
                mov edx,j       // edx = iteration limit
        do_it_again:    // top of the loop
                mov eax,a       // load a into a register
                mov ebx,b       // load b into a register
                mov b,eax       // store the old a into b
                mov a,ebx       // store the old b into a (four instructions total)
    
                inc ecx         // count this iteration
                cmp ecx,edx     // counter reached the limit?
                jb do_it_again  // repeat while ecx is below edx (unsigned compare)
            }
        }
    };
    
    // Benchmark: inline-assembly loop that swaps two values held in registers
    // using subtract/add arithmetic instead of a third register, then writes
    // the final values back to a and b.
    struct add_sub {
        void operator()() {
            int j=loops;// iteration limit, copied to a local so the asm block can read it
            __asm {
                mov ecx,0       // ecx = iteration counter, starts at 0
                mov edx,j       // edx = iteration limit
    
                mov eax,a       // load both values into registers once, before the loop
                mov ebx,b
    
        do_it_again2:   // top of the loop
    
                // swap using only the two registers
                sub eax,ebx         // eax = a-b
                add ebx,eax         // ebx = b+(a-b) = a
                sub eax,ebx         // eax = (a-b)-a = -b
                neg eax             // eax = b; flipping only the sign bit with xor would not negate a two's-complement value
    
                inc ecx         // count this iteration
                cmp ecx,edx     // counter reached the limit?
                jb do_it_again2 // repeat while ecx is below edx (unsigned compare)
    
                mov a, eax      // write the final values back
                mov b, ebx
            }
        }
    };
    
    // Benchmark: inline-assembly loop that swaps two registers with the xchg
    // instruction, then writes the final values back to a and b.
    struct xchg {
        void operator()() {
            __asm {
                mov ecx, loops      // ecx counts down from loops
                mov eax, a          // load both values into registers once
                mov ebx, b
        do_it_again3:
                dec ecx             // sets the zero flag when the count reaches 0
                xchg eax, ebx       // swap; xchg does not modify flags, so jne below still tests dec's result
                jne do_it_again3    // loop until ecx reaches 0
                mov a, eax          // write the final values back
                mov b, ebx
            }
        }
    };
    
    struct xor3 {
        void operator()() { 
            _asm { 
                mov ecx, loops
                mov eax, a
                mov edx, b
        do_swap4:
                xor eax, edx
                xor edx, eax
                xor eax, edx
                dec ecx
                jnz do_swap4
    
                mov a, eax
                mov b, edx
            }
        }
    };
    
    int main() {
        timer<for_loop>("for loop");
        timer<while_loop>("while loop");
        timer<reg_mem>("reg<->mem");
        timer<add_sub>("add/sub");
        timer<xchg>("xchg");
        timer<xor3>("triple xor");
        return 0;
    }
    

    Bottom line: at least for this trivial of a task, you’re not going to beat a decent compiler by enough to care about (and probably not at all, except possibly in terms of minutely smaller code).

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

Just started learning Scheme. I'm using Dr. Racket as my compiler/interpreter. I need some
I just started learning C++ (coming from Java ) and am having some serious
Hi I just started learning assembly in IA32. Can anyone tell me what these
I just started learning Zend. I managed to get the basic working (using zf
I just started learning C++ and am currently using codeblocks. I want to write
I just started learning how to make a website. I'm making a comment box
I just started learning JS the other day and am (of course) having some
I just started learning Java and I encountered some problems with the tag in
I just started learning some ruby, and I want to do something like this:
Just started learning/using pipes and was wondering how to route file output of an

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.