Sign Up

Sign Up to our social questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Login to our social questions & Answers Engine to ask questions answer people’s questions & connect with other people.

Sign Up Here

Forgot Password?

Don't have account, Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link and will create a new password via email.

Have an account? Sign In Now

You must login to ask a question.

Forgot Password?

Need An Account, Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • SEARCH
  • Home
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 7955393
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: June 4, 20262026-06-04T03:42:10+00:00 2026-06-04T03:42:10+00:00

Updated , original question below the line: I need to compute a median, and

  • 0

Updated, original question below the line:

I need to compute a median, and would like to use the O(N) quickselect algorithm.
It turns out however that when the array is no longer a flat array of doubles, but rather an array of structs (of which one element is the element to use for the median computation) the run time no longer scales with O(N).

The following flat array version has approximately linear runtime:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

double quickselect(unsigned long k, unsigned long n, double *arr)
{
  unsigned long i, ir, j, l, mid;
  double a, temp;

  l=1;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir] < arr[l]) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l] > arr[ir]) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1] > arr[ir]) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l] > arr[l+1]) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i] < a);
        do j--; while (arr[j] > a);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k, l, m;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  double *arr = (double*) malloc(N[4] * sizeof(double));

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        arr[k] = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], arr);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

Gives:

1000 0.00228
10000 0.02014
100000 0.19868
1000000 2.01272
10000000 20.41286

However the following version with structs has non linear runtime:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

typedef struct {
  double x;
  double y;
  double z;
  int id;
} point_t;

point_t* quickselect(unsigned long k, unsigned long n, point_t **arr)
{
  unsigned long i, ir, j, l, mid;
  point_t *a, *temp;

  l=1;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir]->x < arr[l]->x) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l]->x > arr[ir]->x) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1]->x > arr[ir]->x) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l]->x > arr[l+1]->x) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i]->x < a->x);
        do j--; while (arr[j]->x > a->x);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k, l, m;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  point_t **ap, *a;

  ap = (point_t**) malloc(N[4] * sizeof(point_t*));

  if (ap == NULL) printf("Error in ap\n");

  a = (point_t*) malloc(N[4] * sizeof(point_t));

  if (a == NULL) printf("Error in a\n");

  for (i=0; i<N[4]; i++)
  {
    ap[i] = a+i;
  }

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        ap[k]->x = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], ap);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

Gives:

1000 0.00224
10000 0.02587
100000 0.37574
1000000 7.18962
10000000 96.34863

Both versions were compiled with gcc -O2 (but -O0 gives the same scaling).

Where does this change in scaling come from and how can it be fixed?

Note that while I can change the struct, I cannot just median y because I need to know the other parameters corresponding to the median point as well.
Additionally I need the quickselect behavior for the resulting array (e.g. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).


I need to compute a median, and would like to use the O(N) quickselect algorithm.
It turns out however that when the array is no longer a flat array of doubles, but rather an array of structs (of which one element is the element to use for the median computation) the run time no longer scales with O(N).

I use the following implementation:

#define SWAP(a,b) temp=(a); (a)=(b); (b)=temp;

typedef struct point_t point_t;

struct point_t {
  double y;

//  unsigned long something;
//
//  double *something_else;
//
//  double yet_another thing;
//
//  point_t* again_something;
};

void median(point_t *arr, unsigned long n)
{
  unsigned long k = n / 2;
  unsigned long i, ir, j, l, mid;
  point_t a, temp;

  l=0;
  ir=n-1;
  for (;;)
  {
    if (ir <= l+1)
    {
      if (ir == l+1 && arr[ir].y < arr[l].y)
      {
        SWAP(arr[l], arr[ir])
      }
      return arr + k;
    }
    else
    {
      mid = (l + ir) >> 1;
      SWAP(arr[mid], arr[l+1])
      if (arr[l].y > arr[ir].y)
      {
        SWAP(arr[l], arr[ir])
      }
      if (arr[l+1].y > arr[ir].y)
      {
        SWAP(arr[l+1], arr[ir])
      }
      if (arr[l].y > arr[l+1].y)
      {
        SWAP(arr[l], arr[l+1])
      }
      i = l+1;
      j = ir;
      a = arr[l+1];
      for (;;)
      {
        do i++; while (arr[i].y < a.y);
        do j--; while (arr[j].y > a.y);
        if (j < i) break;
        SWAP(arr[i],  arr[j])
      }
      arr[l+1] = arr[j];
      arr[j] = a;
      if (j >= k) ir = j-1;
      if (j <= k) l = i;
    }
  }
}

with -O2 the struct is optimized away (I think, at least the scaling looks the same as with a plain array) and the scaling is linear.
However when uncommenting the other components of the struct the scaling is no longer linear.
How can this be?
And how can this be fixed?

Note that while I can change the struct, I cannot just median y because I need to know the other parameters corresponding to the median point as well.
Additionally I need the quickselect behavior for the resulting array (e.g. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must login to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-06-04T03:42:12+00:00Added an answer on June 4, 2026 at 3:42 am

    I think memory cache mishits explain the non-linear growth of the execution time. In my x86_64 architecture PC (Linux + gcc), sizeof(double) is 8 and sizeof(point_t) is 32 so less elements fit in cache memory. But bigger reason for the non-linear growth is that memory accesses to the point_t structures through the pointer array in your code will be quickly highly randomized and more and more cache misses occur because of this…

    I changed the code as follows:

    --- test2.c
    +++ test3.c
             -80,14 +80,12
         if (a == NULL)
             printf("Error in an");
    
    -    for (i = 0; i < N[4]; i++) {
    -        ap[i] = a + i;
    -    }
    - 
         for (i = 0; i < 5; i++) {
             start = clock();
    
             for (j = 0; j < ntest; j++) {
                for (k = 0; k < N[i]; k++) {
    +                ap[k] = a + k;
                     ap[k]->x = (double) rand() / (double) RAND_MAX;
                 }
    

    and the execution time growth is more linear.

    Original quicselect() with double array:

        1000 0.00000
       10000 0.04000
      100000 0.22000
     1000000 1.98000
    10000000 20.73000
    

    Original quickselect() with point_t * array:

        1000 0.01000
       10000 0.02000
      100000 0.71000
     1000000 8.64000
    10000000 157.77000
    

    Exactly the same quickselect() with point_t * array as above, but making sure the pointers in the array are in consequtive order before calling quickselect() by applying the above patch:

        1000 0.00000
       10000 0.02000
      100000 0.40000
     1000000 4.71000
    10000000 49.80000
    

    Note that even if the modified version does the extra sorting in the timing loop, it is still faster.

    I am running 3.2GHz Pentium(R) Dual-Core CPU E6700, 64-bit Linux, gcc 4.6, optimization -O2. (My machine is not idle, so my benchmark figures have some fluctuations – also I would consider using clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) for increased accuracy in Linux system if I were making more serious benchmarks to calculate out the time when kernel is not scheduling the benchmarked process to run.)

    UPDATE: for example, valgrind (if supported by your platform) can be used to analyse the impact of the cache hits. I modified the program to take two arguments, the array size (corresponding to the elements of the array N[]) and the test count (corresponding to ntest). Execution times without valgrind, wheretest2 is essentially the unmodified program listed in the question and test4 is the modifed version which rearranges the ap[] array before calling the quickselect() function:

    bash$ ./test2 10000000 100
    Array of 10000000 elements, 100 times: 154.40000 seconds
    bash$ ./test4 10000000 100
    Array of 10000000 elements, 100 times: 48.45000 seconds
    

    Here’s the result of running valgrind using cachegrind tool:

    bash$ valgrind --tool=cachegrind ./test2 10000000 100
    ==23563== Cachegrind, a cache and branch-prediction profiler
    ==23563== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
    ==23563== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
    ==23563== Command: ./test2 10000000 100
    ==23563== 
    Array of 10000000 elements, 100 times: 1190.24000 seconds
    ==23563== 
    ==23563== I   refs:      80,075,384,594
    ==23563== I1  misses:             1,091
    ==23563== LLi misses:             1,087
    ==23563== I1  miss rate:           0.00%
    ==23563== LLi miss rate:           0.00%
    ==23563== 
    ==23563== D   refs:      36,670,292,139  (25,550,518,762 rd   + 11,119,773,377 wr)
    ==23563== D1  misses:     4,218,722,190  ( 3,223,975,942 rd   +    994,746,248 wr)
    ==23563== LLd misses:     4,190,889,241  ( 3,198,934,125 rd   +    991,955,116 wr)
    ==23563== D1  miss rate:           11.5% (          12.6%     +            8.9%  )
    ==23563== LLd miss rate:           11.4% (          12.5%     +            8.9%  )
    ==23563== 
    ==23563== LL refs:        4,218,723,281  ( 3,223,977,033 rd   +    994,746,248 wr)
    ==23563== LL misses:      4,190,890,328  ( 3,198,935,212 rd   +    991,955,116 wr)
    ==23563== LL miss rate:             3.5% (           3.0%     +            8.9%  )
    

    and

    bash$ valgrind --tool=cachegrind ./test4 10000000 100
    ==24436== Cachegrind, a cache and branch-prediction profiler
    ==24436== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
    ==24436== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
    ==24436== Command: ./test4 10000000 100
    ==24436== 
    Array of 10000000 elements, 100 times: 862.89000 seconds
    ==24436== 
    ==24436== I   refs:      82,985,384,485
    ==24436== I1  misses:             1,093
    ==24436== LLi misses:             1,089
    ==24436== I1  miss rate:           0.00%
    ==24436== LLi miss rate:           0.00%
    ==24436== 
    ==24436== D   refs:      36,640,292,192  (24,530,518,829 rd   + 12,109,773,363 wr)
    ==24436== D1  misses:     2,814,232,350  ( 2,189,229,679 rd   +    625,002,671 wr)
    ==24436== LLd misses:     2,796,287,872  ( 2,171,294,250 rd   +    624,993,622 wr)
    ==24436== D1  miss rate:            7.6% (           8.9%     +            5.1%  )
    ==24436== LLd miss rate:            7.6% (           8.8%     +            5.1%  )
    ==24436== 
    ==24436== LL refs:        2,814,233,443  ( 2,189,230,772 rd   +    625,002,671 wr)
    ==24436== LL misses:      2,796,288,961  ( 2,171,295,339 rd   +    624,993,622 wr)
    ==24436== LL miss rate:             2.3% (           2.0%     +            5.1%  )
    

    See Valgrind manual for how to read these statistics. An essential point is: “On a modern machine, an L1 miss will typically cost around 10 cycles, an LL miss can cost as much as 200 cycles”. Calculating LLd (last level data cache) misshit cost difference between the two cases (each difference in mishits times the assumed 200 cycles per 3.2e9 cycles/second for 3.2GHz CPU) gives

    bash$ echo $(( (4190889241 - 2796287872) * 200 / 3200000000 )) seconds
    87 seconds
    

    The D1 misses contribute quite little here (given total of 91 seconds if D1 misshit cost is independent of LLd misshit cost); with all our inaccuraces (most notably about the actual cost of LLd misshit in this computer) the D1 misses can be just ignored.

    The run-time differences for test2 and test4 come to about 106 seconds which is reasonably close to the above 86 seconds. This all could be made more accurate, but this seems to demonstrate already the effect of cache misses in the test arrangement.

    P.S. valgrind writes a log file, where I could verify that it seemed to detect the L2 and L1 cache sizes and types correctly.

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Related Questions

Updated question, see below I'm starting a new project and I would like to
ORIGINAL (see UPDATED QUESTION below) I am designing a new laboratory database that tests
UPDATED - please read further details below original question I have a select form
See below the solid line for my original question. I have a folder in
UPDATE: This question is out of date, but left for informational purposes. Original Question
UPDATED See post #3 below. There is a need to upload a file to
SEE EDIT UPDATES BELOW. Original question has been modified! I have a working window
(please read the update section below, I leave the original question too for clarity)
EDIT 3: I am maintaining the original question below for historical reasons. However, I
Original question: Can someone tell me how to use slice lists and the ellipsis?

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.