I said I would post the algorithm along time ago and forgot so here it is. Kinda messy, sorry.
#include "utils.h"
#include <math.h>
#include <stdlib.h>
#define TTNB 9
#define LCUT 4
#define HCUT 9
#define BEST 7.55
#define BCUT 10
#define RMX 31
struct _node {
int d;
struct _node *n;
};
typedef struct _node ll;
void bucket1k(int *a, int *e, int max) {
int shift = max - TTNB, nb = (1 << TTNB);
int *i, *w, d;
ll *c, **j;
ll **bucket = (ll **)calloc(nb, sizeof(ll *));
ll *water = (ll *)malloc((e - a) * sizeof(ll));
for(i = a, c = water; i < e; i++) {
//vec_dst(i + 4, 0x04080000, 1);
j = bucket + (*i >> shift & ((1 << TTNB) - 1));
c->n = *j;
c->d = *i;
*j = c++;
}
for(j = bucket, i = a; j < bucket + nb; j++)
for(c = *j; c; c = c->n) {
// *(i++) = c->d;
d = c->d;
for(w = i++ - 1; *w > d; w--) w[1] = *w;
w[1] = d;
}
free(bucket);
free(water);
}
void count1bit(int *a, int *e, int shift, int times) {
int *i, *j, temp, mask;
i = a - 1; j = e, mask = 1 << shift - 1;
while(1) {
while(!(*++i & mask));
while(*--j & mask);
if (j < i) break;
SWAP(*i, *j);
}
if(times > 1) {
count1bit(a, i, shift - 1, times - 1);
count1bit(i, e, shift - 1, times - 1);
}
else {
bucket1k(a, i, shift - 1);
bucket1k(i, e, shift - 1);
}
}
int pickshift(int r) {
int v = r / BEST + 0.5;
if(v == 0) v = r;
else v = ceil((float)r / v);
if(v > HCUT) v = HCUT;
return v;
}
#define F(x) ((x) >> shift & mask)
void countess(int *a, int *e, int max, int remains) {
int i, shift, k, temp, v, j, m, mask, *s, *c, *p;
v = pickshift(remains);
k = 1 << v;
mask = k - 1;
shift = max - v;
c = (int *)calloc(k, sizeof(int));
s = (int *)malloc((k + 1) * sizeof(int));
//for(i = 0; i < k; i+= 8) vec_dst(c + i, 0x04080000, 0);
for(p = a; p < e; p++) {
vec_dst(p + 8, 0x04080000, 1);
c[F(*p)]++;
}
for(i = 1, *s = 0, s[k] = e - a; i < k; i++) c[i] += s[i] = c[i-1];
for(m = 0; m < k; m++) {
if(s[m] == c[m]) continue;
i = s[m];
v = a[i];
do {
j = --c[F(v)];
SWAP(a[j], v);
vec_dst(a + j - 1, 0x04010000, 0);
} while(j != i);
}
free(c);
v = remains - max + shift;
if(v >= LCUT) for(i = 0; i < k; i++)
countess(a + s[i], a + s[i + 1], shift, v);
else if(v == 0) for(i = 0; i < k; i++)
bucket1k(a + s[i], a + s[i + 1], shift);
else for(i = 0; i < k; i++)
count1bit(a + s[i], a + s[i + 1], shift, v);
free(s);
}
void darren(int *a, int *e) {
int lgn = log10(e - a) / log10(2) + 0.5 - BCUT;
if(lgn <= 0) bucket1k(a, e, RMX);
else if(lgn < LCUT) count1bit(a, e, RMX, lgn);
else countess(a, e, RMX, lgn);
// countess(a, a + n, RMX, 31);
}
If you're not using mac comment out the vec_dst, it just prefetches data, I don't know the other operating systems equivalent. Basically the function 'countess' is the only important one, it is like counting sort but without extra memory usuage.