/* This program is by Graham Toal. It's in the "jjchew" directory, because I was inspired to write it while looking at his "wordprob.c" program and I thought it would be useful to keep the two pograms together, if anyone was looking for sources to do with the probabilities involved in scrabble racks. The purpose of this program is to generate study lists of most probable rack draws, but I expect I will be reusing some of the code in here for other nefarious purposes soon ;-) I am indebted to "Harold Rennett" <haroldelaine@msn.com> (who goes by the handle of Tuchusfacius on the web - see his web page http://www.addicted2words.com/tf/7and8.htm) who explained the maths behind this for me, and whose manually crafted list was a superb help in verifying that this code is on the right track, despite it being only an approximation. This program has a few built-in safety checks, in case it is hacked on by someone other than me, who may accidentally break some of the assumptions that make it work. */ #include <stdio.h> #include <stdlib.h> #include <stdlib.h> #define MaxN 7 #define MaxM 100 #define MAXRESULTS 300 #define BUCKETSIZE 400 typedef struct { int prob; char rack[MaxN+1]; } result; static result junkpile[BUCKETSIZE*2]; static int nextfree = 0; static int sorted_watermark = 0; static int lowest_of_high_bucket = -1; static char rack[MaxN+1]; static int tot[256]; static int itot[27] = /* a b c d e f g h i j k l m n o p q r s t u v w x y z */ { 9,2,2,4,12,2,3,2,9,1,1,4,2,6,8,2, 1,6,4,6,4,2,2,1,2, 1}; static unsigned long combCount = 0UL; /* Calculate Combinations(N, R) without using factorials; 100% accurate for normal numbers but suffers some minor rounding errors only in cases where the more obvious algorithm would have exceeded integer capacity anyway, or would have forced the use of floating point (which I like to avoid if possible). Yes, I did come up with this myself from first principles, though I have no doubt it has been thought of before. It's probably a standard example in every book of integer numerical algorithms. I was sort of thinking of Bresenhams algorithm for drawing lines in a raster when I invented it. It does fail silently in the case when the final answer still exceeds the largest unsigned long integer. Tough. */ #define Combs(R, N) XCombs(R, N, __LINE__) static unsigned long XCombs(int R, int N, int L) { unsigned long i = 1L; int r = R; if (R > N) { fprintf(stderr, "Who called Combs(%d, %d) at line %0d ?\n", R, N, L); exit(1); } while (((unsigned long)R > 1) || (r > 0)) { while (r-- > 0) { i *= (unsigned long)N--; /* fprintf(stderr, "* %ld => %lu\n", N+1, i); */ if ((N > 0) && (i >= (0x7fffffffUL / (unsigned long)N))) break; /* no more room */ } while ((unsigned long)R > 1) { i /= (unsigned long)R--; /* fprintf(stderr, "/ %ld => %lu\n", R+1, i); */ if ((N > 0) && (i < (0x7fffffffUL / (unsigned long)N))) break; /* room for another */ } } return(i); } int cmp(const void *a, const void *b) { int i; result *ar = (result *)a; result *br = (result *)b; if ((long)ar == (long)br) return(0); /* shouldn't happen */ i = ((br->prob)-(ar->prob)); if (i == 0) i = strcmp(ar->rack, br->rack); return(i); } /* I know that normally people use a heap to keep the largest <N> objects in memory, but I've always had some success with this algorithm, which uses threshholds and double-buffering to take advantage of an efficient quicksort routine. It's certainly a lot easier to write, and I don't think significantly less efficient. (The only drawback is in this code we want to eliminate duplicates; without that requirement, this is *much* more efficient) */ void Output(int thisp, char *s, int N) { int i, j, k; combCount += 1UL; if (combCount == 0x7fffffffUL) { fprintf(stderr, "Oops! 31bit wraparound.\n"); /* Probably impossible to do 32-bits worth of anything useful (like board evals) during the lifetime of a computer */ exit(1); } /* throw away any results worse than current best <N> */ if ((sorted_watermark >= MAXRESULTS) && (thisp < lowest_of_high_bucket)) return; /* too low */ junkpile[nextfree].prob = thisp; strncpy(junkpile[nextfree].rack, s, N); junkpile[nextfree].rack[N] = '\0'; /*fprintf(stdout, "%09d %s\n", thisp, junkpile[nextfree].rack); fflush(stdout);*/ nextfree += 1; if (nextfree == (BUCKETSIZE*2)) { qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp); /* Remove duplicates */ i = 0; j = 0; while (i < BUCKETSIZE) { if (i != j) { strcpy(junkpile[i].rack, junkpile[j].rack); junkpile[i].prob = junkpile[j].prob; } i += 1; k = j; while ((j < (BUCKETSIZE*2)) && (strcmp(junkpile[k].rack, junkpile[j].rack)) == 0) { j += 1; } if (j >= (BUCKETSIZE*2)) break; } sorted_watermark = nextfree = i; lowest_of_high_bucket = junkpile[nextfree-1].prob; /* fprintf(stderr, "The critical threshhold is now %d (watermark at %d)\n", lowest_of_high_bucket, sorted_watermark); */ fprintf(stderr, "."); fflush(stderr); } } /* There is an obvious transliteration of this procedure possible that would make it shorter, more general - and recursive; however the non-recursive version runs like shit off a shovel and can be highly optimised by a good compiler. */ void EnumerateCombs(char *T, int N, int M) { int C[MaxN+1]; int P[MaxN+1]; /* Cumulative probability so far. Stacked. */ char OT[MaxM+1]; int i; /* Seven nested loops because we only ever have 7 tiles */ #if MaxN > 7 fprintf(stderr, "You don't know what you're doing, do you?\n"); exit(1); #endif /* Perhaps if I replaced C[0-7] with C0 - C7, the compiler would be able to allocate them to registers more easily? */ /* We use our knowlege that the search space is restricted to having at nost two of the same letter, and those appearing consecutively, to perpetrate a gross hack that allows us to add up the probabilities on the fly. Easy enough to modify to be completely general, but I don't need that yet. */ C[0] = -1; if (N>=1) for (C[1] = C[0]+1; C[1] < M; C[1]++) { OT[0] = T[C[1]]; P[0] = tot[OT[0]]; if (N>=2) for (C[2] = C[1]+1; C[2] < M; C[2]++) { OT[1] = T[C[2]]; if (OT[1] == OT[0]) { P[1] = Combs(2, tot[OT[1]]); } else { P[1] = tot[OT[1]] * P[0]; } if (N>=3) for (C[3] = C[2]+1; C[3] < M; C[3]++) { OT[2] = T[C[3]]; if (OT[2] == OT[1]) { P[2] = Combs(2, tot[OT[2]]) * P[0]; } else { P[2] = tot[OT[2]] * P[1]; } if (N>=4) for (C[4] = C[3]+1; C[4] < M; C[4]++) { OT[3] = T[C[4]]; if (OT[3] == OT[2]) { P[3] = Combs(2, tot[OT[3]]) * P[1]; } else { P[3] = tot[OT[3]] * P[2]; } if (N>=5) for (C[5] = C[4]+1; C[5] < M; C[5]++) { OT[4] = T[C[5]]; if (OT[4] == OT[3]) { P[4] = Combs(2, tot[OT[4]]) * P[2]; } else { P[4] = tot[OT[4]] * P[3]; } if (N>=6) for (C[6] = C[5]+1; C[6] < M; C[6]++) { OT[5] = T[C[6]]; if (OT[5] == OT[4]) { P[5] = Combs(2, tot[OT[5]]) * P[3]; } else { P[5] = tot[OT[5]] * P[4]; } if (N>=7) for (C[7] = C[6]+1; C[7] < M; C[7]++) { OT[6] = T[C[7]]; if (OT[6] == OT[5]) { P[6] = Combs(2, tot[OT[6]]) * P[4]; } else { P[6] = tot[OT[6]] * P[5]; } Output(P[6], OT, N); } else { Output(P[5], OT, N); } } else { Output(P[4], OT, N); } } else { Output(P[3], OT, N); } } else { Output(P[2], OT, N); } } else { Output(P[1], OT, N); } } else { Output(P[0], OT, N); } } } int main(int argc, char **argv) { /* We really would prefer to enumerate *ALL* possible racks, [Combs(7, 100)] which is beyond the bounds of current computers. However we already know that racks with three letters the same are low probability, so in our search for the high probability racks, we never generate those by virtue of only having two letters the same at most in our 'Alphabet' array. We also discard blanks (better handled in some other ad-hoc way). That and throwing away the letters of which there are only 1 (jkqxz) and reducing all others to 2 letters, brings our alphabet space down to C(7, 42) which can be enumerated (including all the ancilliary junk of keeping the best <N> (sorted)) in 45 seconds on my old 486. People with faster computers can put some tiles back... :-) The manual pruning that I did to generate this string should really be done automatically from the full 100 tile alphabet. That would make the program more easily adapted to being called in the middle of a game, say, when the bag is down to 70 tiles and all the 'c's have been played, for instance. (Also it would allow easy application to other language versions of Scrabble) */ char *alphabet = "aabbccddeeffgghhiillmmnnoopprrssttuuvvwwyy"; int i, j; if (argc == 2) alphabet = argv[1]; fprintf(stderr, "Alphabet: %s\n", alphabet); if (strlen(alphabet) <= MaxN) { fprintf(stderr, "The alphabet parameter must be larger than the\n"); fprintf(stderr, " number of tiles you are using (%d)\n", MaxN); exit(1); } else if (strlen(alphabet) > 47) { fprintf(stderr, "You don't have much hope of this ever terminating\n"); fprintf(stderr, " in the near future, do you?\n"); } else { fprintf(stderr, "I will calculate the most likely racks using\n"); fprintf(stderr, "the standard English Scrabble letters and frequencies\n"); fprintf(stderr, "- remember, you will only get an approximation\n"); fprintf(stderr, "to the most likely racks using this input\n"); } for (i = 0; i < 256; i++) tot[i] = 0; for (i = 0; i < 26; i++) tot['a'+i] = itot[i]; /* Some confidence tests of C(N, R) function. */ if ((Combs(2, 12) != 66) || (Combs(3, 9) != 84)) { fprintf(stderr, "Coding error - someone has been tweaking the combination calculation\nfunction wrongly!\n"); exit(1); } EnumerateCombs(alphabet, 7, strlen(alphabet)); fprintf(stderr, "\nI generated %d racks of 7 letters from the set of %d letters\n", (int)combCount, strlen(alphabet)); /* Sanity check - combs enumerated should match C(N, R) calculated */ if (combCount != Combs(7, strlen(alphabet))) { fprintf(stderr, "Coding error - someone has been tweaking the combination generator\nloop wrongly!\n"); exit(1); } if (sorted_watermark < MAXRESULTS) { fprintf(stderr, "Warning: there may be some duplication\n"); } qsort(&junkpile[0], (size_t)nextfree, (size_t)sizeof(junkpile[0]), cmp); fprintf(stderr, "The best was %s, with a score of %d\n", junkpile[0].rack, junkpile[0].prob); fprintf(stderr, "The best %d are:\n", MAXRESULTS); fflush(stderr); for (j = 0; j < (MAXRESULTS/5); j++) { if (junkpile[j*5].prob == 0) break; for (i = 0; i < 5; i++) { if (junkpile[j*5+i].prob == 0) break; fprintf(stdout, "%7s %7d ", junkpile[j*5+i].rack, junkpile[j*5+i].prob); } fprintf(stdout, "\n"); } fflush(stdout); fflush(stderr); return(0); } /* NOTES: the info below will be needed for the next part of this project. QUESTION: The first question I have is very basic. Let's say we've made a play that puts down 3 specific letters. What is the probability of having picked a rack which contained those letters? Is is just Combs(3, 100), or is it something like Combs(3, Combs(7, 100))? I suspect it makes a difference if you say that the rest of the rack is different letters, as opposed to you don't care what the rest of the rack is - i.e. you could allow duplicated letters. I think for this application I don't care what the other letters are. ANSWER: Well, it makes a heck of a lot of difference what the three letters are. The chances of having a rack with the three letters EIA is a whole lot greater than having JZX. What you'd do to figure this out is to first calculate the number of racks that would meet the criteria. First, you'd calculate the number of different ways you could form the 3-letter combination in question with the letters in the Scrabble set - in the case of EIA, it would be 12 x 9 x 9. (In the case of JZX, it would be 1.) Then, you'd multiply that number by the number of ways that you could put together the other four tiles, which is "97 choose 4" (i.e., the number of ways you could choose 4 items out of 97 items), which is 97!/(93! x 4!). Therefore, the numerator of your probability, in the case of EIA???? would be (12 x 9 x 9 x 97!)/(93! x 4!). And the denominator of the probability, as always, is "100 choose 7", or 100!/(93! x 7!) - the number of ways you could choose an opening rack from a full bag. */