// radix sort - gt style.  I assume someone has come up with this before me,
// but I don't remember ever coming across it.  Doing a google search just
// now I found the related 'bradsort' but it is significantly different.

// I came up with this a *long long* time ago, but never coded it as I had
// no actual need for the code.  But watching Jon Bentley talk about 'the most
// beautiful code I never wrote' made me think about the neatest algorithm
// I never wrote too.  By coincidence, also a sort.  So here's the first
// rough draft, after taking what I thought would be short and elegant, and
// adding enough real-life code to it to make it work, but before putting the
// effort into simplifying it and making it both robust and efficient. 

// linear time (in array size; plus log_2 factor in data width), linear space
// asymptotically better than quicksort et al, as long as we have sufficient
// ram available and don't start swapping.  memory demands are higher than
// quicksort but linear.  Question: on modern computers with huge rams, at what
// point does the more expensive code in this sort start to pay out over quicksort,
// and is that point reached before we hit the point where we start paging?

// code assumes gcc extensions (because it's programmed in my head in Algol
// and only realised in C since that's the compiler that's available to me)


#include <stdio.h>
#include <stdlib.h>

#ifndef BITSPERBYTE
const int BITSPERBYTE = 8; // should calculate dynamically. Wish C had a BITSPERINT etc.
#endif

// the interface and demo is a simple sort of an array of ints.
// can be trivially generalised to an array of any fixed-size object
// slightly more effort for variable-sized objects such as strings
// left as an exercise for the reader how to handle the case where
// the data is a key and includes a separate pointer to other data
// can also be coded to save space, but the reason I wrote this now
// (and not 30 years ago when I first thought of it) is that big
// rams are the norm so why not use them?

void sort(int array[], int items) // exclusive upper bound
{
    // using gcc extension allowing algol-style nested procedures
  static int nextfreeslot;

  void add(int trie[], int start, int val, int bitsleft) // add an int to a bit-wise trie
  {
    int ptr;
    int bit;

    while (bitsleft > 0) {
      bit = val < 0 ? 1 : 0; // this is unfortunate, but we have to look at the high bit, otherwise
                             // the data we pull out after sorting is bit-reversed. (i.e. if we say bit = val&1)
                             // (could pick the top bit out with a shift but am trying for more portable coding style)
      ptr = trie[start+bit];
      if (ptr == 0) { // unallocated
          ptr = trie[start+bit] = nextfreeslot;
          trie[nextfreeslot++] = 0; trie[nextfreeslot++] = 0; // reserve 2 cells for 0 or 1 links (or count + spare)
      }
      // add(trie, ptr, val<<1, bitsleft-1); // Tail recursion, now transformed into a loop
      start = ptr; val <<= 1; bitsleft -= 1;
    }
    // final slot holds count of values
    trie[start] += 1; // Final trie[start+1] is currently unused.  We *could* use it as a flag that this
                      // is a final node in the trie, and thereby not require 32 cell pairs for every word.
  }

  void walk(int trie[], int start, int array[], int bitsleft, int data)
  {
    static int outputpos;
    if (start == 0) outputpos = 0; // somewhat hacky
    if (bitsleft == 0) {
        while (trie[start]-- > 0) array[outputpos++] = data; // count of items which were equal to this value
        return;
    }
    if (trie[start] != 0) { // a word was present in which this bit was 0
        walk(trie, trie[start], array, bitsleft-1, data<<1);
        // this one cannot be optimised away.  We could use an explicit stack though.  No more that BITSPERWORD deep
    }
    if (trie[start+1] != 0) { // a word was present in which this bit was 1
        walk(trie, trie[start+1], array, bitsleft-1, (data<<1)|1);  // first bit is highest bit.  adds a minor complication
        // again, tail-recursion could be manually optimised away.
    }
  }

  /* Uses gcc extension allowing algol-style dynamic bounds on stack array.  Should use heap for portability */
  int i, size, trie[size = (sizeof(int)*BITSPERBYTE+2)*2 * items]; // I think this is an OK upper bound

  nextfreeslot = 2; trie[0] = 0; trie[1] = 0; // inelegant, but this is only the first hack.

  // first add all the input data to a bit-wise trie

  for (i = 0; i < items; i++) add(trie, 0, array[i], sizeof(int)*BITSPERBYTE);

  // then pre-order traverse the trie to extract the data back into the original input array
  // (we're careful about duplicates)

  // do this for unsigned ints:
  // walk(trie, 0, array, sizeof(unsigned int)*BITSPERBYTE, 0); // reassign sorted values to array

  // do this for signed ints:
  walk(trie, trie[1], array, sizeof(int)*BITSPERBYTE-1, 1); // reassign sorted values to array
  walk(trie, trie[0], array, sizeof(int)*BITSPERBYTE-1, 0); // hack to make negative numbers come first
                                                            // as by default, alg treats numbers as unsigned
}

int main(int argc, char **argv)
{
    int i, data[] = {42, 0x400000, 3, 66, 21, -42, -1, 0};

    fprintf(stdout, "Input:"); for (i = 0; i < sizeof(data)/sizeof(*data); i++) fprintf(stdout, " %d", data[i]); fprintf(stdout, "\n");
    sort(data, 8);
    fprintf(stdout, "Output:"); for (i = 0; i < sizeof(data)/sizeof(*data); i++) fprintf(stdout, " %d", data[i]); fprintf(stdout, "\n");
    exit(0);
}