Updated, original question below the line:
I need to compute a median, and would like to use the O(N) quickselect algorithm.
It turns out, however, that when the array is no longer a flat array of doubles but rather an array of structs (one member of which is the key used for the median computation), the run time no longer scales as O(N).
The following flat array version has approximately linear runtime:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * SWAP(a, b): exchange two lvalues of the same type.
 *
 * Requires a variable named `temp`, assignable from `a` and `b`, to be
 * declared in the calling scope.  Each argument is evaluated twice, so do not
 * pass expressions with side effects.
 *
 * The three assignments are wrapped in a braced block so that an unbraced
 * `if (cond) SWAP(x, y)` guards the whole exchange instead of only the first
 * assignment.  Call sites that omit the trailing semicolon remain valid.
 */
#define SWAP(a,b) { temp=(a); (a)=(b); (b)=temp; }
/* Exchange the doubles stored at *p and *q. */
static void swap_double(double *p, double *q)
{
    double held = *p;
    *p = *q;
    *q = held;
}

/*
 * quickselect: return the element of 0-based rank k among arr[0..n-1].
 *
 * The array is partially reordered in place: on return arr[k] holds the
 * selected value, every element before index k is <= arr[k] and every element
 * after index k is >= arr[k].
 *
 * Parameters:
 *   k    0-based rank to select; must satisfy k < n.
 *   n    number of elements in arr; must be at least 1.
 *   arr  array of n doubles, modified in place.
 *
 * Returns the value of rank k.  Behaviour is undefined if n == 0 or k >= n.
 */
double quickselect(unsigned long k, unsigned long n, double *arr)
{
    /* The active range is [lo, hi], both inclusive.  It must start at index 0:
     * starting at 1 would leave arr[0] out of the selection entirely. */
    unsigned long lo = 0;
    unsigned long hi = n - 1;

    for (;;) {
        if (hi <= lo + 1) {
            /* One or two elements left: order the pair if needed and finish. */
            if (hi == lo + 1 && arr[hi] < arr[lo]) {
                swap_double(&arr[lo], &arr[hi]);
            }
            return arr[k];
        }

        /* Median-of-three: move the middle element to lo+1, then arrange
         * arr[lo] <= arr[lo+1] <= arr[hi].  arr[lo] and arr[hi] then act as
         * sentinels that stop the two scans below inside the range. */
        unsigned long mid = lo + (hi - lo) / 2;
        swap_double(&arr[mid], &arr[lo + 1]);
        if (arr[lo] > arr[hi]) {
            swap_double(&arr[lo], &arr[hi]);
        }
        if (arr[lo + 1] > arr[hi]) {
            swap_double(&arr[lo + 1], &arr[hi]);
        }
        if (arr[lo] > arr[lo + 1]) {
            swap_double(&arr[lo], &arr[lo + 1]);
        }

        unsigned long i = lo + 1;
        unsigned long j = hi;
        const double pivot = arr[lo + 1];

        for (;;) {
            do { i++; } while (arr[i] < pivot);   /* scan up for element >= pivot */
            do { j--; } while (arr[j] > pivot);   /* scan down for element <= pivot */
            if (j < i) {
                break;                            /* scans crossed: partition done */
            }
            swap_double(&arr[i], &arr[j]);
        }

        /* Put the pivot in its final position j. */
        arr[lo + 1] = arr[j];
        arr[j] = pivot;

        /* Keep only the side that contains rank k.  When j == k both updates
         * fire and the next iteration returns immediately. */
        if (j >= k) {
            hi = j - 1;
        }
        if (j <= k) {
            lo = i;
        }
    }
}
/*
 * Benchmark driver for the flat-array quickselect().
 *
 * For each problem size in N, the first N[i] slots of one shared buffer are
 * filled with pseudo-random values, the element at index N[i]/2 is selected,
 * and this is repeated ntest times.  One line "<size> <seconds>" is printed
 * per size, where seconds is the clock() time for all repetitions.  The timed
 * region covers both the fill and the selection.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    enum { NUM_SIZES = 5 };
    const unsigned long ntest = 100;
    const unsigned long N[NUM_SIZES] = {
        1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL
    };
    const int seed = 215342512; /* fixed for reproducibility; time(NULL) would vary it */

    srand(seed);

    /* One buffer sized for the largest case is reused by every smaller case. */
    double *arr = malloc(N[NUM_SIZES - 1] * sizeof *arr);
    if (arr == NULL) {
        fprintf(stderr, "Error allocating arr\n");
        return EXIT_FAILURE;
    }

    for (unsigned long i = 0; i < NUM_SIZES; i++) {
        clock_t start = clock();

        for (unsigned long j = 0; j < ntest; j++) {
            for (unsigned long k = 0; k < N[i]; k++) {
                arr[k] = (double) rand() / (double) RAND_MAX;
            }
            quickselect(N[i] / 2, N[i], arr);
        }

        clock_t diff = clock() - start;
        printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
    }

    free(arr);
    return 0;
}
Gives:
1000 0.00228
10000 0.02014
100000 0.19868
1000000 2.01272
10000000 20.41286
However the following version with structs has non linear runtime:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * SWAP(a, b): exchange two lvalues of the same type.
 *
 * Requires a variable named `temp`, assignable from `a` and `b`, to be
 * declared in the calling scope.  Each argument is evaluated twice, so do not
 * pass expressions with side effects.
 *
 * The three assignments are wrapped in a braced block so that an unbraced
 * `if (cond) SWAP(x, y)` guards the whole exchange instead of only the first
 * assignment.  Call sites that omit the trailing semicolon remain valid.
 */
#define SWAP(a,b) { temp=(a); (a)=(b); (b)=temp; }
/*
 * Record handled through pointers by the struct version of quickselect().
 * Only the x member takes part in the comparisons; the remaining members are
 * carried along untouched by the code shown here.
 */
typedef struct {
double x;   /* selection key: the only member quickselect() compares */
double y;   /* not read or written by the code shown here */
double z;   /* not read or written by the code shown here */
int id;     /* not read or written by the code shown here; presumably an identifier */
} point_t;
/* Exchange the two point_t pointers stored at *p and *q. */
static void swap_point_ptr(point_t **p, point_t **q)
{
    point_t *held = *p;
    *p = *q;
    *q = held;
}

/*
 * quickselect: return the pointer whose target has 0-based rank k by x among
 * arr[0..n-1].
 *
 * Only the pointer array is reordered; the pointed-to structs are not moved
 * or modified.  On return arr[k] is the selected pointer, every entry before
 * index k has x <= arr[k]->x and every entry after index k has
 * x >= arr[k]->x.
 *
 * Parameters:
 *   k    0-based rank to select.
 *   n    number of pointers in arr.
 *   arr  array of n non-NULL point_t pointers, permuted in place.
 *
 * Returns arr[k] after partitioning, or NULL if n == 0 or k >= n.
 */
point_t* quickselect(unsigned long k, unsigned long n, point_t **arr)
{
    /* Without this guard n - 1 would wrap around for n == 0, and k >= n would
     * read past the end of arr. */
    if (n == 0 || k >= n) {
        return NULL;
    }

    /* The active range is [lo, hi], both inclusive.  It must start at index 0:
     * starting at 1 would leave arr[0] out of the selection entirely. */
    unsigned long lo = 0;
    unsigned long hi = n - 1;

    for (;;) {
        if (hi <= lo + 1) {
            /* One or two entries left: order the pair if needed and finish. */
            if (hi == lo + 1 && arr[hi]->x < arr[lo]->x) {
                swap_point_ptr(&arr[lo], &arr[hi]);
            }
            return arr[k];
        }

        /* Median-of-three: move the middle entry to lo+1, then arrange
         * arr[lo]->x <= arr[lo+1]->x <= arr[hi]->x.  arr[lo] and arr[hi] then
         * act as sentinels that stop the two scans below inside the range. */
        unsigned long mid = lo + (hi - lo) / 2;
        swap_point_ptr(&arr[mid], &arr[lo + 1]);
        if (arr[lo]->x > arr[hi]->x) {
            swap_point_ptr(&arr[lo], &arr[hi]);
        }
        if (arr[lo + 1]->x > arr[hi]->x) {
            swap_point_ptr(&arr[lo + 1], &arr[hi]);
        }
        if (arr[lo]->x > arr[lo + 1]->x) {
            swap_point_ptr(&arr[lo], &arr[lo + 1]);
        }

        unsigned long i = lo + 1;
        unsigned long j = hi;
        point_t *pivot = arr[lo + 1];

        for (;;) {
            do { i++; } while (arr[i]->x < pivot->x);   /* scan up for x >= pivot */
            do { j--; } while (arr[j]->x > pivot->x);   /* scan down for x <= pivot */
            if (j < i) {
                break;                                  /* scans crossed: partition done */
            }
            swap_point_ptr(&arr[i], &arr[j]);
        }

        /* Put the pivot pointer in its final position j. */
        arr[lo + 1] = arr[j];
        arr[j] = pivot;

        /* Keep only the side that contains rank k.  When j == k both updates
         * fire and the next iteration returns immediately. */
        if (j >= k) {
            hi = j - 1;
        }
        if (j <= k) {
            lo = i;
        }
    }
}
/*
 * Benchmark driver for the pointer-array quickselect().
 *
 * Allocates one array of point_t and one array of pointers into it, both
 * sized for the largest case.  For each problem size in N, the x member of
 * the first N[i] pointed-to structs is set to a pseudo-random value, the
 * entry at index N[i]/2 is selected, and this is repeated ntest times.  One
 * line "<size> <seconds>" is printed per size, where seconds is the clock()
 * time for all repetitions.
 *
 * Note: ap is put into address order once, before any timing starts.  Every
 * quickselect() call permutes ap and it is never restored, so later
 * repetitions reach the structs through an increasingly shuffled pointer
 * order.  The members y, z and id are never initialised or read here.
 *
 * Returns 0 on success, EXIT_FAILURE if either allocation fails.
 */
int main(void)
{
    enum { NUM_SIZES = 5 };
    const unsigned long ntest = 100;
    const unsigned long N[NUM_SIZES] = {
        1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL
    };
    const unsigned long max_n = N[NUM_SIZES - 1];
    const int seed = 215342512; /* fixed for reproducibility; time(NULL) would vary it */

    srand(seed);

    point_t **ap = malloc(max_n * sizeof *ap);
    if (ap == NULL) {
        fprintf(stderr, "Error in ap\n");
        return EXIT_FAILURE;
    }

    point_t *a = malloc(max_n * sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "Error in a\n");
        free(ap);
        return EXIT_FAILURE;
    }

    /* Start with ap[i] pointing at a[i], i.e. pointers in address order. */
    for (unsigned long i = 0; i < max_n; i++) {
        ap[i] = a + i;
    }

    for (unsigned long i = 0; i < NUM_SIZES; i++) {
        clock_t start = clock();

        for (unsigned long j = 0; j < ntest; j++) {
            for (unsigned long k = 0; k < N[i]; k++) {
                ap[k]->x = (double) rand() / (double) RAND_MAX;
            }
            quickselect(N[i] / 2, N[i], ap);
        }

        clock_t diff = clock() - start;
        printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
    }

    free(a);
    free(ap);
    return 0;
}
Gives:
1000 0.00224
10000 0.02587
100000 0.37574
1000000 7.18962
10000000 96.34863
Both versions were compiled with gcc -O2 (but -O0 gives the same scaling).
Where does this change in scaling come from and how can it be fixed?
Note that while I can change the struct, I cannot just compute the median of the y values on their own, because I also need the other members that belong to the median point.
Additionally, I need the partitioning that quickselect leaves in the resulting array (e.g. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).
I need to compute a median, and would like to use the O(N) quickselect algorithm.
It turns out, however, that when the array is no longer a flat array of doubles but rather an array of structs (one member of which is the key used for the median computation), the run time no longer scales as O(N).
I use the following implementation:
/*
 * SWAP(a, b): exchange two lvalues of the same type.
 *
 * Requires a variable named `temp`, assignable from `a` and `b`, to be
 * declared in the calling scope.  Each argument is evaluated twice, so do not
 * pass expressions with side effects.
 *
 * The three assignments are wrapped in a braced block so that an unbraced
 * `if (cond) SWAP(x, y)` guards the whole exchange instead of only the first
 * assignment.  Call sites that omit the trailing semicolon remain valid.
 */
#define SWAP(a,b) { temp=(a); (a)=(b); (b)=temp; }
/*
 * Element type partitioned in place by median(); y is the comparison key.
 * The commented-out members are placeholders for additional payload.  With
 * them disabled the struct holds a single double.
 */
typedef struct point_t point_t;
struct point_t {
double y;   /* selection key: the only member median() compares */
// unsigned long something;
//
// double *something_else;
//
// double yet_another_thing;
//
// point_t* again_something;
};
/* Exchange the two structs stored at *p and *q by value. */
static void swap_point(point_t *p, point_t *q)
{
    point_t held = *p;
    *p = *q;
    *q = held;
}

/*
 * median: partition arr in place around its element of 0-based rank n / 2
 * by y, and return a pointer to that element.
 *
 * Whole structs are moved, so each element keeps its other members.  On
 * return every element before index n / 2 has y <= the selected y and every
 * element after it has y >= the selected y.
 *
 * Parameters:
 *   arr  array of n point_t, reordered in place.
 *   n    number of elements in arr.
 *
 * Returns arr + n / 2, or NULL if n == 0.  (The function was previously
 * declared void while still executing `return arr + k;`.  Callers that
 * ignore the result are unaffected by the corrected return type.)
 */
point_t *median(point_t *arr, unsigned long n)
{
    /* Without this guard n - 1 would wrap around for an empty array. */
    if (n == 0) {
        return NULL;
    }

    const unsigned long k = n / 2;
    unsigned long lo = 0;       /* active range is [lo, hi], both inclusive */
    unsigned long hi = n - 1;

    for (;;) {
        if (hi <= lo + 1) {
            /* One or two elements left: order the pair if needed and finish. */
            if (hi == lo + 1 && arr[hi].y < arr[lo].y) {
                swap_point(&arr[lo], &arr[hi]);
            }
            return arr + k;
        }

        /* Median-of-three: move the middle element to lo+1, then arrange
         * arr[lo].y <= arr[lo+1].y <= arr[hi].y.  arr[lo] and arr[hi] then act
         * as sentinels that stop the two scans below inside the range. */
        unsigned long mid = lo + (hi - lo) / 2;
        swap_point(&arr[mid], &arr[lo + 1]);
        if (arr[lo].y > arr[hi].y) {
            swap_point(&arr[lo], &arr[hi]);
        }
        if (arr[lo + 1].y > arr[hi].y) {
            swap_point(&arr[lo + 1], &arr[hi]);
        }
        if (arr[lo].y > arr[lo + 1].y) {
            swap_point(&arr[lo], &arr[lo + 1]);
        }

        unsigned long i = lo + 1;
        unsigned long j = hi;
        const point_t pivot = arr[lo + 1];   /* copy: its slot is overwritten below */

        for (;;) {
            do { i++; } while (arr[i].y < pivot.y);   /* scan up for y >= pivot */
            do { j--; } while (arr[j].y > pivot.y);   /* scan down for y <= pivot */
            if (j < i) {
                break;                                /* scans crossed: partition done */
            }
            swap_point(&arr[i], &arr[j]);
        }

        /* Put the pivot in its final position j. */
        arr[lo + 1] = arr[j];
        arr[j] = pivot;

        /* Keep only the side that contains rank k.  When j == k both updates
         * fire and the next iteration returns immediately. */
        if (j >= k) {
            hi = j - 1;
        }
        if (j <= k) {
            lo = i;
        }
    }
}
With -O2 the struct is optimized away (I think; at least the scaling looks the same as with a plain array) and the scaling is linear.
However, when the other members of the struct are uncommented, the scaling is no longer linear.
How can this be?
And how can this be fixed?
Note that while I can change the struct, I cannot just compute the median of the y values on their own, because I also need the other members that belong to the median point.
Additionally, I need the partitioning that quickselect leaves in the resulting array (e.g. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).
I think cache misses explain the non-linear growth of the execution time. On my x86_64 PC (Linux + gcc), `sizeof(double)` is 8 and `sizeof(point_t)` is 32, so fewer elements fit in cache memory. But the bigger reason for the non-linear growth is that memory accesses to the `point_t` structures through the pointer array in your code quickly become highly randomized (each `quickselect()` call shuffles the pointers, and they are never put back into address order), so more and more cache misses occur. I changed the code as follows:
and the execution time growth is more linear.
Original `quickselect()` with `double` array:
Original `quickselect()` with `point_t *` array:
Exactly the same `quickselect()` with `point_t *` array as above, but making sure the pointers in the array are in consecutive order before calling `quickselect()` by applying the above patch:
Note that even though the modified version does the extra sorting inside the timing loop, it is still faster.
I am running a 3.2 GHz Pentium(R) Dual-Core CPU E6700, 64-bit Linux, gcc 4.6, optimization `-O2`. (My machine is not idle, so my benchmark figures fluctuate somewhat. For more serious benchmarks on Linux I would also consider using `clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...)` for increased accuracy, so that the time during which the kernel is not scheduling the benchmarked process is excluded.)
UPDATE: For example, `valgrind` (if supported by your platform) can be used to analyse the impact of cache misses. I modified the program to take two arguments: the array size (corresponding to the elements of the array `N[]`) and the test count (corresponding to `ntest`). Execution times without `valgrind`, where `test2` is essentially the unmodified program listed in the question and `test4` is the modified version which rearranges the `ap[]` array before calling the `quickselect()` function:
Here’s the result of running `valgrind` with the cachegrind tool:
and
See the Valgrind manual for how to read these statistics. An essential point is: “On a modern machine, an L1 miss will typically cost around 10 cycles, an LL miss can cost as much as 200 cycles”. Calculating the difference in LLd (last-level data cache) miss cost between the two cases (the difference in miss counts times the assumed 200 cycles per miss, divided by 3.2e9 cycles/second for the 3.2 GHz CPU) gives
The D1 misses contribute relatively little here (giving a total of 91 seconds if the D1 miss cost is independent of the LLd miss cost); given all our inaccuracies (most notably in the actual cost of an LLd miss on this computer), the D1 misses can simply be ignored.
The run-time difference between `test2` and `test4` comes to about 106 seconds, which is reasonably close to the above 86 seconds. This could all be made more accurate, but it already seems to demonstrate the effect of cache misses in the test arrangement.
P.S. `valgrind` writes a log file, in which I could verify that it seemed to detect the L2 and L1 cache sizes and types correctly.