#include <stdio.h> #include <stdlib.h> #include <string.h> #define BITS 10 /* Trivial hash with end-around carry. No thought went into this. You can probably do better. This worked well enough for me on the first try I didn't see a need to improve it. Return a hash of the string between 0 and max-1. A small degree of care was exerted so that a, aa, aaa and aaaa etc didn't all hash to 0. */ int hashfun(char *s, int max) { int c; int rem; int h = 0; for (;;) { c = *s++; if (c == '\0') break; if (isalpha(c)) c = c - (isupper(c) ? 'A' : 'a'); h = (h + (c&15) + 1) * 10; for (;;) { /* I suspect this can only be executed twice at most but I'm lazy and paranoid, hence a loop */ rem = h / max; h = (h % max) + rem; if (h < max) break; } } return(h); } int main(int argc, char **argv) { FILE *dict; FILE *data; int i; char *s; char line[128]; int N; /* Determine size of text (no of words) */ int dictsize; int hashsize, hashpos; /* Allocate hash table of 10 * N bits */ unsigned char *hashtable; /* Files are just words, one per line. Nothing fancy. Standard newlines. */ if (argc != 3) { fprintf(stderr, "syntax: %s dictfile testwords\n", argv[0]); exit(1); } /* First arg is the large dictionary file */ dict = fopen(argv[1], "r"); if (dict == NULL) { fprintf(stderr, "Cannot open %s (master dictionary)\n", argv[1]); exit(1); } /* Second arg is a short file of words to check - real words or bogus */ data = fopen(argv[2], "r"); if (data == NULL) { fprintf(stderr, "Cannot open %s (list of words to check)\n", argv[2]); exit(1); } /* First pass - count dict size. Saves so much hassle. */ N = 0; dictsize = 0; for (;;) { s = fgets(line, 127, dict); if (s == NULL) break; N += 1; dictsize += strlen(line); } fclose(dict); dict = fopen(argv[1], "r"); if (dict == NULL) { fprintf(stderr, "Cannot reopen %s (master dictionary)\n", argv[1]); exit(1); } /* Allocate hash table of 10 * N bits */ hashsize = (N * BITS) >> 3; hashtable = malloc(hashsize+1); if (hashtable == NULL) { fprintf(stderr, "Not enough ram for a %d byte hash table\?\?\?\n", hashsize+1); exit(1); } for (i = 0; i < hashsize+1; i++) hashtable[i] = 0; /* Add words to table. */ for (;;) { s = fgets(line, 127, dict); if (s == NULL) break; s = strchr(s, '\n'); *s = '\0'; hashpos = hashfun(line, N * BITS); hashtable[hashpos >> 3] |= (1 << (hashpos & 7)); } fclose(dict); #ifdef TEST /* Take test file, confirm: all real words pass, 90% of bogus words fail */ for (;;) { s = fgets(line, 127, data); if (s == NULL) break; s = strchr(s, '\n'); *s = '\0'; hashpos = hashfun(line, N * BITS); fprintf(stdout, "%s %s at %d [%02x/%02x]\n", line, (((hashtable[hashpos >> 3] & (1 << (hashpos & 7))) != 0) ? "OK" : "not found"), hashpos>>3, hashtable[hashpos >> 3], 1 << (hashpos & 7)); } fclose(data); #else fprintf(stdout, "#define N %d\n", N); fprintf(stdout, "#define HASHSIZE %d\n", hashsize); fprintf(stdout, "unsigned char hashtable[HASHSIZE+1] = {\n"); for (i = 0; i < hashsize+1; i++) { fprintf(stdout, "%4d, ", hashtable[i]); if ((i&7) == 7) fprintf(stdout, "\n"); } fprintf(stdout, "\n};\n"); #endif fflush(stdout); fprintf(stderr, "Dictionary size was %d bytes\n", dictsize); fprintf(stderr, "Table size was %d bytes\n", hashsize+1); exit(0); return(1); }