top300.c

/* This program is by Graham Toal.  It's in the "jjchew"
   directory, because I was inspired to write it while looking
   at his "wordprob.c" program and I thought it would be
   useful to keep the two pograms together, if anyone was
   looking for sources to do with the probabilities involved
   in scrabble racks.

   The purpose of this program is to generate study lists of
   most probable rack draws, but I expect I will be reusing some
   of the code in here for other nefarious purposes soon ;-)

   I am indebted to  "Harold Rennett" <haroldelaine@msn.com>
   (who goes by the handle of Tuchusfacius on the web -
    see his web page http://www.addicted2words.com/tf/7and8.htm)
   who explained the maths behind this for me, and whose
   manually crafted list was a superb help in verifying
   that this code is on the right track, despite it being
   only an approximation.

   This program has a few built-in safety checks, in case it is
   hacked on by someone other than me, who may accidentally break
   some of the assumptions that make it work.

 */

#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>

#define MaxN 7
#define MaxM 100

#define MAXRESULTS 300
#define BUCKETSIZE 400
typedef struct {
  int prob;
  char rack[MaxN+1];
} result;

static result junkpile[BUCKETSIZE*2];
static int nextfree = 0;
static int sorted_watermark = 0;
static int lowest_of_high_bucket = -1;
static char rack[MaxN+1];


static int tot[256];
static int itot[27] =
  /*  a b c d  e f g h i j k l m n o p  q r s t u v w x y  z */
   {  9,2,2,4,12,2,3,2,9,1,1,4,2,6,8,2, 1,6,4,6,4,2,2,1,2, 1};

static unsigned long combCount = 0UL;

/* Calculate Combinations(N, R) without using
   factorials; 100% accurate for normal numbers
   but suffers some minor rounding errors only
   in cases where the more obvious algorithm would
   have exceeded integer capacity anyway, or would
   have forced the use of floating point (which I like
   to avoid if possible).

   Yes, I did come up with this myself from
   first principles, though I have no doubt it
   has been thought of before.  It's probably a
   standard example in every book of integer
   numerical algorithms.  I was sort of thinking
   of Bresenhams algorithm for drawing lines
   in a raster when I invented it.

   It does fail silently in the case when the
   final answer still exceeds the largest
   unsigned long integer.  Tough.
 */

#define Combs(R, N) XCombs(R, N, __LINE__) 
static unsigned long XCombs(int R, int N, int L)
{
  unsigned long i = 1L;
  int r = R;
  if (R > N) {
    fprintf(stderr, "Who called Combs(%d, %d) at line %0d ?\n", R, N, L);
    exit(1);
  }
  while (((unsigned long)R > 1) || (r > 0)) {
    while (r-- > 0) {
      i *= (unsigned long)N--;
      /* fprintf(stderr, "* %ld => %lu\n", N+1, i); */
      if ((N > 0) && (i >= (0x7fffffffUL / (unsigned long)N))) break; /* no more room */
    }
    while ((unsigned long)R > 1) {
      i /= (unsigned long)R--;
      /* fprintf(stderr, "/ %ld => %lu\n", R+1, i); */
      if ((N > 0) && (i < (0x7fffffffUL / (unsigned long)N))) break; /* room for another */
    }
  }
  return(i);
}

int cmp(const void *a, const void *b)
{
  int i;
  result *ar = (result *)a;
  result *br = (result *)b;
  if ((long)ar == (long)br) return(0); /* shouldn't happen */
  i = ((br->prob)-(ar->prob));
  if (i == 0) i = strcmp(ar->rack, br->rack);
  return(i);
}

/* I know that normally people use a heap to keep the largest <N>
   objects in memory, but I've always had some success with this
   algorithm, which uses threshholds and double-buffering to take
   advantage of an efficient quicksort routine.  It's certainly
   a lot easier to write, and I don't think significantly less
   efficient.  (The only drawback is in this code we want to
   eliminate duplicates; without that requirement, this is *much*
   more efficient)
 */

void Output(int thisp, char *s, int N)
{
  int i, j, k;

  combCount += 1UL;
  if (combCount == 0x7fffffffUL) {
    fprintf(stderr, "Oops!  31bit wraparound.\n");
    /* Probably impossible to do 32-bits worth of
       anything useful (like board evals) during
       the lifetime of a computer */
    exit(1);
  }

  /* throw away any results worse than current best <N> */
  if ((sorted_watermark >= MAXRESULTS) && (thisp < lowest_of_high_bucket)) return; /* too low */

  junkpile[nextfree].prob = thisp;
  strncpy(junkpile[nextfree].rack, s, N); junkpile[nextfree].rack[N] = '\0';
  /*fprintf(stdout, "%09d %s\n", thisp, junkpile[nextfree].rack); fflush(stdout);*/
  nextfree += 1;

  if (nextfree == (BUCKETSIZE*2)) {
    qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp);
    /* Remove duplicates */
    i = 0; j = 0;
    while (i < BUCKETSIZE) {
      if (i != j) {
        strcpy(junkpile[i].rack,  junkpile[j].rack);
        junkpile[i].prob = junkpile[j].prob;
      }
      i += 1;
      k = j;
      while ((j < (BUCKETSIZE*2)) && (strcmp(junkpile[k].rack, junkpile[j].rack)) == 0) {
        j += 1;
      }
      if (j >= (BUCKETSIZE*2)) break;
    }
    sorted_watermark = nextfree = i; 
    lowest_of_high_bucket = junkpile[nextfree-1].prob;
/*
    fprintf(stderr, "The critical threshhold is now %d (watermark at %d)\n",
      lowest_of_high_bucket, sorted_watermark);
 */
    fprintf(stderr, "."); fflush(stderr);
  }
}

/* There is an obvious transliteration of this procedure possible
   that would make it shorter, more general - and recursive; however
   the non-recursive version runs like shit off a shovel and can be
   highly optimised by a good compiler. */

void EnumerateCombs(char *T, int N, int M)
{
int C[MaxN+1];
int P[MaxN+1]; /* Cumulative probability so far.  Stacked. */
char OT[MaxM+1];
int i;

  /* Seven nested loops because we only ever have 7 tiles */
#if MaxN > 7
  fprintf(stderr, "You don't know what you're doing, do you?\n");
  exit(1);
#endif

  /* Perhaps if I replaced C[0-7] with C0 - C7, the
     compiler would be able to allocate them to
     registers more easily? */

  /* We use our knowlege that the search space is restricted
     to having at nost two of the same letter, and those
     appearing consecutively, to perpetrate a gross hack
     that allows us to add up the probabilities on the fly.
     Easy enough to modify to be completely general, but
     I don't need that yet. */

  C[0] = -1;

  if (N>=1)  for (C[1] = C[0]+1; C[1] < M; C[1]++) {
    OT[0] = T[C[1]];
    P[0] = tot[OT[0]];
    if (N>=2)  for (C[2] = C[1]+1; C[2] < M; C[2]++) {
      OT[1] = T[C[2]];
      if (OT[1] == OT[0]) {
        P[1] = Combs(2, tot[OT[1]]);
      } else {
        P[1] = tot[OT[1]] * P[0];
      }
      if (N>=3)  for (C[3] = C[2]+1; C[3] < M; C[3]++) {
        OT[2] = T[C[3]];
        if (OT[2] == OT[1]) {
          P[2] = Combs(2, tot[OT[2]]) * P[0];
        } else {
          P[2] = tot[OT[2]] * P[1];
        }
        if (N>=4)  for (C[4] = C[3]+1; C[4] < M; C[4]++) {
          OT[3] = T[C[4]];
          if (OT[3] == OT[2]) {
            P[3] = Combs(2, tot[OT[3]]) * P[1];
          } else {
            P[3] = tot[OT[3]] * P[2];
          }
          if (N>=5)  for (C[5] = C[4]+1; C[5] < M; C[5]++) {
            OT[4] = T[C[5]];
            if (OT[4] == OT[3]) {
              P[4] = Combs(2, tot[OT[4]]) * P[2];
            } else {
              P[4] = tot[OT[4]] * P[3];
            }
            if (N>=6)  for (C[6] = C[5]+1; C[6] < M; C[6]++) {
              OT[5] = T[C[6]];
              if (OT[5] == OT[4]) {
                P[5] = Combs(2, tot[OT[5]]) * P[3];
              } else {
                P[5] = tot[OT[5]] * P[4];
              }
              if (N>=7)  for (C[7] = C[6]+1; C[7] < M; C[7]++) {
                OT[6] = T[C[7]];
                if (OT[6] == OT[5]) {
                  P[6] = Combs(2, tot[OT[6]]) * P[4];
                } else {
                  P[6] = tot[OT[6]] * P[5];
                }
                Output(P[6], OT, N);
              } else {
                Output(P[5], OT, N);
              }
            } else {
              Output(P[4], OT, N);
            }
          } else {
            Output(P[3], OT, N);
          }
        } else {
          Output(P[2], OT, N);
        }
      } else {
        Output(P[1], OT, N);
      }
    } else {
      Output(P[0], OT, N);
    }
  }
}

int main(int argc, char **argv)
{
/* We really would prefer to enumerate *ALL* possible racks,
   [Combs(7, 100)] which is beyond the bounds of current computers.

   However we already know that racks with three letters
   the same are low probability, so in our search for
   the high probability racks, we never generate those by virtue
   of only having two letters the same at most in our 'Alphabet'
   array. We also discard blanks (better handled in some other
   ad-hoc way).  That and throwing away the letters of which there
   are only 1 (jkqxz) and reducing all others to 2 letters, brings
   our alphabet space down to C(7, 42) which can be enumerated (including
   all the ancilliary junk of keeping the best <N> (sorted)) in 45 seconds
   on my old 486.  People with faster computers can put some tiles back... :-)

   The manual pruning that I did to generate this string should really
   be done automatically from the full 100 tile alphabet.  That
   would make the program more easily adapted to being called in
   the middle of a game, say, when the bag is down to 70 tiles
   and all the 'c's have been played, for instance.

   (Also it would allow easy application to other language versions
    of Scrabble)

 */
  char *alphabet = "aabbccddeeffgghhiillmmnnoopprrssttuuvvwwyy";
  int i, j;

  if (argc == 2) alphabet = argv[1];

  fprintf(stderr, "Alphabet: %s\n", alphabet);
  if (strlen(alphabet) <= MaxN) {
    fprintf(stderr, "The alphabet parameter must be larger than the\n");
    fprintf(stderr, " number of tiles you are using (%d)\n", MaxN);
    exit(1);
  } else if (strlen(alphabet) > 47) {
    fprintf(stderr, "You don't have much hope of this ever terminating\n");
    fprintf(stderr, " in the near future, do you?\n");
  } else {
    fprintf(stderr, "I will calculate the most likely racks using\n");
    fprintf(stderr, "the standard English Scrabble letters and frequencies\n");
    fprintf(stderr, "- remember, you will only get an approximation\n");
    fprintf(stderr, "to the most likely racks using this input\n");
  }

  for (i = 0; i < 256; i++) tot[i] = 0;
  for (i = 0; i < 26; i++) tot['a'+i] = itot[i];

  /* Some confidence tests of C(N, R) function. */
  if ((Combs(2, 12) != 66) || (Combs(3, 9) != 84)) {
    fprintf(stderr,
      "Coding error - someone has been tweaking the combination calculation\nfunction wrongly!\n");
    exit(1);
  }

  EnumerateCombs(alphabet, 7, strlen(alphabet));
  fprintf(stderr,
    "\nI generated %d racks of 7 letters from the set of %d letters\n",
    (int)combCount, strlen(alphabet));

  /* Sanity check - combs enumerated should match C(N, R) calculated */
  if (combCount != Combs(7, strlen(alphabet))) {
    fprintf(stderr,
      "Coding error - someone has been tweaking the combination generator\nloop wrongly!\n");
    exit(1);
  }

  if (sorted_watermark < MAXRESULTS) {
    fprintf(stderr, "Warning: there may be some duplication\n");
  }

  qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp);
  fprintf(stderr, "The best was %s, with a score of %d\n",
    junkpile[0].rack, junkpile[0].prob);
  fprintf(stderr, "The best %d are:\n", MAXRESULTS); fflush(stderr);
  for (j = 0; j < (MAXRESULTS/5); j++) {
    if (junkpile[j*5].prob == 0) break;
    for (i = 0; i < 5; i++) {
      if (junkpile[j*5+i].prob == 0) break;
      fprintf(stdout, "%7s %7d ", junkpile[j*5+i].rack,
                                  junkpile[j*5+i].prob);
    }
    fprintf(stdout, "\n");
  }
  fflush(stdout); fflush(stderr);

  return(0);
}
/* NOTES: the info below will be needed for the next part of this
   project.

QUESTION: The first question I have is very basic.  Let's say we've
made a play that puts down 3 specific letters.  What is the probability
of having picked a rack which contained those letters?  Is is just
Combs(3, 100), or is it something like Combs(3, Combs(7, 100))?  I suspect
it makes a difference if you say that the rest of the rack is different
letters, as opposed to you don't care what the rest of the rack is -
i.e. you could allow duplicated letters.  I think for this application
I don't care what the other letters are.

ANSWER: Well, it makes a heck of a lot of difference what the three letters
are.  The chances of having a rack with the three letters EIA is a whole lot
greater than having JZX.  What you'd do to figure this out is to first
calculate the number of racks that would meet the criteria.  First, you'd
calculate the number of different ways you could form the 3-letter
combination in question with the letters in the Scrabble set - in the case
of EIA, it would be 12 x 9 x 9.  (In the case of JZX, it would be 1.)  Then,
you'd multiply that number by the number of ways that you could put
together the other four tiles, which is "97 choose 4"  (i.e., the number of
ways you could choose 4 items out of 97 items), which is 97!/(93! x 4!).
Therefore, the numerator of your probability, in the case of EIA???? would
be (12 x 9 x 9 x 97!)/(93! x 4!).  And the denominator of the probability,
as always, is "100 choose 7", or 100!/(93! x 7!) - the number of ways you
could choose an opening rack from a full bag.

*/