/* This program is by Graham Toal. It's in the "jjchew"
directory, because I was inspired to write it while looking
at his "wordprob.c" program and I thought it would be
useful to keep the two pograms together, if anyone was
looking for sources to do with the probabilities involved
in scrabble racks.
The purpose of this program is to generate study lists of
most probable rack draws, but I expect I will be reusing some
of the code in here for other nefarious purposes soon ;-)
I am indebted to "Harold Rennett" <haroldelaine@msn.com>
(who goes by the handle of Tuchusfacius on the web -
see his web page http://www.addicted2words.com/tf/7and8.htm)
who explained the maths behind this for me, and whose
manually crafted list was a superb help in verifying
that this code is on the right track, despite it being
only an approximation.
This program has a few built-in safety checks, in case it is
hacked on by someone other than me, who may accidentally break
some of the assumptions that make it work.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#define MaxN 7
#define MaxM 100
#define MAXRESULTS 300
#define BUCKETSIZE 400
typedef struct {
int prob;
char rack[MaxN+1];
} result;
static result junkpile[BUCKETSIZE*2];
static int nextfree = 0;
static int sorted_watermark = 0;
static int lowest_of_high_bucket = -1;
static char rack[MaxN+1];
static int tot[256];
static int itot[27] =
/* a b c d e f g h i j k l m n o p q r s t u v w x y z */
{ 9,2,2,4,12,2,3,2,9,1,1,4,2,6,8,2, 1,6,4,6,4,2,2,1,2, 1};
static unsigned long combCount = 0UL;
/* Calculate Combinations(N, R) without using
factorials; 100% accurate for normal numbers
but suffers some minor rounding errors only
in cases where the more obvious algorithm would
have exceeded integer capacity anyway, or would
have forced the use of floating point (which I like
to avoid if possible).
Yes, I did come up with this myself from
first principles, though I have no doubt it
has been thought of before. It's probably a
standard example in every book of integer
numerical algorithms. I was sort of thinking
of Bresenhams algorithm for drawing lines
in a raster when I invented it.
It does fail silently in the case when the
final answer still exceeds the largest
unsigned long integer. Tough.
*/
#define Combs(R, N) XCombs(R, N, __LINE__)
static unsigned long XCombs(int R, int N, int L)
{
unsigned long i = 1L;
int r = R;
if (R > N) {
fprintf(stderr, "Who called Combs(%d, %d) at line %0d ?\n", R, N, L);
exit(1);
}
while (((unsigned long)R > 1) || (r > 0)) {
while (r-- > 0) {
i *= (unsigned long)N--;
/* fprintf(stderr, "* %ld => %lu\n", N+1, i); */
if ((N > 0) && (i >= (0x7fffffffUL / (unsigned long)N))) break; /* no more room */
}
while ((unsigned long)R > 1) {
i /= (unsigned long)R--;
/* fprintf(stderr, "/ %ld => %lu\n", R+1, i); */
if ((N > 0) && (i < (0x7fffffffUL / (unsigned long)N))) break; /* room for another */
}
}
return(i);
}
int cmp(const void *a, const void *b)
{
int i;
result *ar = (result *)a;
result *br = (result *)b;
if ((long)ar == (long)br) return(0); /* shouldn't happen */
i = ((br->prob)-(ar->prob));
if (i == 0) i = strcmp(ar->rack, br->rack);
return(i);
}
/* I know that normally people use a heap to keep the largest <N>
objects in memory, but I've always had some success with this
algorithm, which uses threshholds and double-buffering to take
advantage of an efficient quicksort routine. It's certainly
a lot easier to write, and I don't think significantly less
efficient. (The only drawback is in this code we want to
eliminate duplicates; without that requirement, this is *much*
more efficient)
*/
void Output(int thisp, char *s, int N)
{
int i, j, k;
combCount += 1UL;
if (combCount == 0x7fffffffUL) {
fprintf(stderr, "Oops! 31bit wraparound.\n");
/* Probably impossible to do 32-bits worth of
anything useful (like board evals) during
the lifetime of a computer */
exit(1);
}
/* throw away any results worse than current best <N> */
if ((sorted_watermark >= MAXRESULTS) && (thisp < lowest_of_high_bucket)) return; /* too low */
junkpile[nextfree].prob = thisp;
strncpy(junkpile[nextfree].rack, s, N); junkpile[nextfree].rack[N] = '\0';
/*fprintf(stdout, "%09d %s\n", thisp, junkpile[nextfree].rack); fflush(stdout);*/
nextfree += 1;
if (nextfree == (BUCKETSIZE*2)) {
qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp);
/* Remove duplicates */
i = 0; j = 0;
while (i < BUCKETSIZE) {
if (i != j) {
strcpy(junkpile[i].rack, junkpile[j].rack);
junkpile[i].prob = junkpile[j].prob;
}
i += 1;
k = j;
while ((j < (BUCKETSIZE*2)) && (strcmp(junkpile[k].rack, junkpile[j].rack)) == 0) {
j += 1;
}
if (j >= (BUCKETSIZE*2)) break;
}
sorted_watermark = nextfree = i;
lowest_of_high_bucket = junkpile[nextfree-1].prob;
/*
fprintf(stderr, "The critical threshhold is now %d (watermark at %d)\n",
lowest_of_high_bucket, sorted_watermark);
*/
fprintf(stderr, "."); fflush(stderr);
}
}
/* There is an obvious transliteration of this procedure possible
that would make it shorter, more general - and recursive; however
the non-recursive version runs like shit off a shovel and can be
highly optimised by a good compiler. */
void EnumerateCombs(char *T, int N, int M)
{
int C[MaxN+1];
int P[MaxN+1]; /* Cumulative probability so far. Stacked. */
char OT[MaxM+1];
int i;
/* Seven nested loops because we only ever have 7 tiles */
#if MaxN > 7
fprintf(stderr, "You don't know what you're doing, do you?\n");
exit(1);
#endif
/* Perhaps if I replaced C[0-7] with C0 - C7, the
compiler would be able to allocate them to
registers more easily? */
/* We use our knowlege that the search space is restricted
to having at nost two of the same letter, and those
appearing consecutively, to perpetrate a gross hack
that allows us to add up the probabilities on the fly.
Easy enough to modify to be completely general, but
I don't need that yet. */
C[0] = -1;
if (N>=1) for (C[1] = C[0]+1; C[1] < M; C[1]++) {
OT[0] = T[C[1]];
P[0] = tot[OT[0]];
if (N>=2) for (C[2] = C[1]+1; C[2] < M; C[2]++) {
OT[1] = T[C[2]];
if (OT[1] == OT[0]) {
P[1] = Combs(2, tot[OT[1]]);
} else {
P[1] = tot[OT[1]] * P[0];
}
if (N>=3) for (C[3] = C[2]+1; C[3] < M; C[3]++) {
OT[2] = T[C[3]];
if (OT[2] == OT[1]) {
P[2] = Combs(2, tot[OT[2]]) * P[0];
} else {
P[2] = tot[OT[2]] * P[1];
}
if (N>=4) for (C[4] = C[3]+1; C[4] < M; C[4]++) {
OT[3] = T[C[4]];
if (OT[3] == OT[2]) {
P[3] = Combs(2, tot[OT[3]]) * P[1];
} else {
P[3] = tot[OT[3]] * P[2];
}
if (N>=5) for (C[5] = C[4]+1; C[5] < M; C[5]++) {
OT[4] = T[C[5]];
if (OT[4] == OT[3]) {
P[4] = Combs(2, tot[OT[4]]) * P[2];
} else {
P[4] = tot[OT[4]] * P[3];
}
if (N>=6) for (C[6] = C[5]+1; C[6] < M; C[6]++) {
OT[5] = T[C[6]];
if (OT[5] == OT[4]) {
P[5] = Combs(2, tot[OT[5]]) * P[3];
} else {
P[5] = tot[OT[5]] * P[4];
}
if (N>=7) for (C[7] = C[6]+1; C[7] < M; C[7]++) {
OT[6] = T[C[7]];
if (OT[6] == OT[5]) {
P[6] = Combs(2, tot[OT[6]]) * P[4];
} else {
P[6] = tot[OT[6]] * P[5];
}
Output(P[6], OT, N);
} else {
Output(P[5], OT, N);
}
} else {
Output(P[4], OT, N);
}
} else {
Output(P[3], OT, N);
}
} else {
Output(P[2], OT, N);
}
} else {
Output(P[1], OT, N);
}
} else {
Output(P[0], OT, N);
}
}
}
int main(int argc, char **argv)
{
/* We really would prefer to enumerate *ALL* possible racks,
[Combs(7, 100)] which is beyond the bounds of current computers.
However we already know that racks with three letters
the same are low probability, so in our search for
the high probability racks, we never generate those by virtue
of only having two letters the same at most in our 'Alphabet'
array. We also discard blanks (better handled in some other
ad-hoc way). That and throwing away the letters of which there
are only 1 (jkqxz) and reducing all others to 2 letters, brings
our alphabet space down to C(7, 42) which can be enumerated (including
all the ancilliary junk of keeping the best <N> (sorted)) in 45 seconds
on my old 486. People with faster computers can put some tiles back... :-)
The manual pruning that I did to generate this string should really
be done automatically from the full 100 tile alphabet. That
would make the program more easily adapted to being called in
the middle of a game, say, when the bag is down to 70 tiles
and all the 'c's have been played, for instance.
(Also it would allow easy application to other language versions
of Scrabble)
*/
char *alphabet = "aabbccddeeffgghhiillmmnnoopprrssttuuvvwwyy";
int i, j;
if (argc == 2) alphabet = argv[1];
fprintf(stderr, "Alphabet: %s\n", alphabet);
if (strlen(alphabet) <= MaxN) {
fprintf(stderr, "The alphabet parameter must be larger than the\n");
fprintf(stderr, " number of tiles you are using (%d)\n", MaxN);
exit(1);
} else if (strlen(alphabet) > 47) {
fprintf(stderr, "You don't have much hope of this ever terminating\n");
fprintf(stderr, " in the near future, do you?\n");
} else {
fprintf(stderr, "I will calculate the most likely racks using\n");
fprintf(stderr, "the standard English Scrabble letters and frequencies\n");
fprintf(stderr, "- remember, you will only get an approximation\n");
fprintf(stderr, "to the most likely racks using this input\n");
}
for (i = 0; i < 256; i++) tot[i] = 0;
for (i = 0; i < 26; i++) tot['a'+i] = itot[i];
/* Some confidence tests of C(N, R) function. */
if ((Combs(2, 12) != 66) || (Combs(3, 9) != 84)) {
fprintf(stderr,
"Coding error - someone has been tweaking the combination calculation\nfunction wrongly!\n");
exit(1);
}
EnumerateCombs(alphabet, 7, strlen(alphabet));
fprintf(stderr,
"\nI generated %d racks of 7 letters from the set of %d letters\n",
(int)combCount, strlen(alphabet));
/* Sanity check - combs enumerated should match C(N, R) calculated */
if (combCount != Combs(7, strlen(alphabet))) {
fprintf(stderr,
"Coding error - someone has been tweaking the combination generator\nloop wrongly!\n");
exit(1);
}
if (sorted_watermark < MAXRESULTS) {
fprintf(stderr, "Warning: there may be some duplication\n");
}
qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp);
fprintf(stderr, "The best was %s, with a score of %d\n",
junkpile[0].rack, junkpile[0].prob);
fprintf(stderr, "The best %d are:\n", MAXRESULTS); fflush(stderr);
for (j = 0; j < (MAXRESULTS/5); j++) {
if (junkpile[j*5].prob == 0) break;
for (i = 0; i < 5; i++) {
if (junkpile[j*5+i].prob == 0) break;
fprintf(stdout, "%7s %7d ", junkpile[j*5+i].rack,
junkpile[j*5+i].prob);
}
fprintf(stdout, "\n");
}
fflush(stdout); fflush(stderr);
return(0);
}
/* NOTES: the info below will be needed for the next part of this
project.
QUESTION: The first question I have is very basic. Let's say we've
made a play that puts down 3 specific letters. What is the probability
of having picked a rack which contained those letters? Is is just
Combs(3, 100), or is it something like Combs(3, Combs(7, 100))? I suspect
it makes a difference if you say that the rest of the rack is different
letters, as opposed to you don't care what the rest of the rack is -
i.e. you could allow duplicated letters. I think for this application
I don't care what the other letters are.
ANSWER: Well, it makes a heck of a lot of difference what the three letters
are. The chances of having a rack with the three letters EIA is a whole lot
greater than having JZX. What you'd do to figure this out is to first
calculate the number of racks that would meet the criteria. First, you'd
calculate the number of different ways you could form the 3-letter
combination in question with the letters in the Scrabble set - in the case
of EIA, it would be 12 x 9 x 9. (In the case of JZX, it would be 1.) Then,
you'd multiply that number by the number of ways that you could put
together the other four tiles, which is "97 choose 4" (i.e., the number of
ways you could choose 4 items out of 97 items), which is 97!/(93! x 4!).
Therefore, the numerator of your probability, in the case of EIA???? would
be (12 x 9 x 9 x 97!)/(93! x 4!). And the denominator of the probability,
as always, is "100 choose 7", or 100!/(93! x 7!) - the number of ways you
could choose an opening rack from a full bag.
*/