#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITS 10

/* Trivial hash with end-around carry.  No thought went into this.
   You can probably do better.  This worked well enough for me on
   the first try I didn't see a need to improve it.

   Return a hash of the string between 0 and max-1.  A small degree
   of care was exerted so that a, aa, aaa and aaaa etc didn't all hash to 0.
 */

int hashfun(char *s, int max)
{
  int c;
  int rem;
  int h = 0;

  for (;;) {
    c = *s++;
    if (c == '\0') break;
    if (isalpha(c)) c = c - (isupper(c) ? 'A' : 'a');
    h = (h + (c&15) + 1) * 10;
    for (;;) {
      /* I suspect this can only be executed twice at most but
         I'm lazy and paranoid, hence a loop */
      rem = h / max;
      h = (h % max) + rem;
      if (h < max) break;
    }
  }
  return(h);
}

int main(int argc, char **argv)
{
  FILE *dict;
  FILE *data;
  int i;
  char *s;
  char line[128];

  int N;                   /* Determine size of text (no of words) */
  int dictsize;

  int hashsize, hashpos;   /* Allocate hash table of 10 * N bits */
  unsigned char *hashtable;

  /* Files are just words, one per line.  Nothing fancy.  Standard newlines. */

  if (argc != 3) {
    fprintf(stderr, "syntax: %s dictfile testwords\n", argv[0]);
    exit(1);
  }

  /* First arg is the large dictionary file */
  dict = fopen(argv[1], "r");
  if (dict == NULL) {
    fprintf(stderr, "Cannot open %s (master dictionary)\n", argv[1]);
    exit(1);
  }

  /* Second arg is a short file of words to check - real words or bogus */
  data = fopen(argv[2], "r");
  if (data == NULL) {
    fprintf(stderr, "Cannot open %s (list of words to check)\n", argv[2]);
    exit(1);
  }

  /* First pass - count dict size.  Saves so much hassle. */
  N = 0; dictsize = 0;
  for (;;) {
    s = fgets(line, 127, dict);
    if (s == NULL) break;
    N += 1;
    dictsize += strlen(line);
  }
  fclose(dict);
  dict = fopen(argv[1], "r");
  if (dict == NULL) {
    fprintf(stderr, "Cannot reopen %s (master dictionary)\n", argv[1]);
    exit(1);
  }

  /* Allocate hash table of 10 * N bits */
  hashsize = (N * BITS) >> 3;
  hashtable = malloc(hashsize+1);
  if (hashtable == NULL) {
    fprintf(stderr, "Not enough ram for a %d byte hash table\?\?\?\n",
      hashsize+1);
    exit(1);
  }
  for (i = 0; i < hashsize+1; i++) hashtable[i] = 0;

  /* Add words to table. */
  for (;;) {
    s = fgets(line, 127, dict);
    if (s == NULL) break;
    s = strchr(s, '\n'); *s = '\0';
    hashpos = hashfun(line, N * BITS);
    hashtable[hashpos >> 3] |= (1 << (hashpos & 7));
  }
  fclose(dict);


#ifdef TEST
  /* Take test file, confirm: all real words pass, 90% of bogus words fail */
  for (;;) {
    s = fgets(line, 127, data);
    if (s == NULL) break;
    s = strchr(s, '\n'); *s = '\0';
    hashpos = hashfun(line, N * BITS);
    fprintf(stdout, "%s %s at %d [%02x/%02x]\n",
     line,
     (((hashtable[hashpos >> 3] & (1 << (hashpos & 7))) != 0)
       ? "OK" : "not found"),
     hashpos>>3,
     hashtable[hashpos >> 3], 1 << (hashpos & 7));
  }
  fclose(data);
#else
  fprintf(stdout, "#define N %d\n", N);
  fprintf(stdout, "#define HASHSIZE %d\n", hashsize);
  fprintf(stdout, "unsigned char hashtable[HASHSIZE+1] = {\n");
  for (i = 0; i < hashsize+1; i++) {
    fprintf(stdout, "%4d, ", hashtable[i]);
    if ((i&7) == 7) fprintf(stdout, "\n");
  }
  fprintf(stdout, "\n};\n");
#endif

  fflush(stdout);
  fprintf(stderr, "Dictionary size was %d bytes\n", dictsize);
  fprintf(stderr, "Table size was %d bytes\n", hashsize+1);
  exit(0);
  return(1);
}