dyntrie.c


/*

    File:    dyntrie.c
    Author:  Graham Toal
    Purpose: Check a word using DAWG or TRIE.
    Functions exported:  trie_new, trie_add, trie_dump
    Internal functions:  insert_simple recurse_add

Description:

   Builds DAWG-compatible trie in store, incrementally; words do not
   have to be presented in order.  (Note it is NOT a 'packed-trie';
   in our scheme it is closer to a dawg - but without the tail-compression
   precomputed dawgs have).

   Any tries built using this should be used by check_dawg, print_dawg etc.

   Has rather nice property of only claiming enough memory for the
   job; dynamically moves the data structure when it fills!

   I'm missing a header file for this code.  So until I write one,
   be very careful with the parameters which are mostly **'s

*/

  /* To avoid global state, I'm putting these useful bits of info
     in the two words before the dawg itself; I *HOPE* that all systems
     allow negative indexing off arrays; I'm not at all sure they do
     though.  However since I moved the base of the dict up by adding
     2 to it, I *should* be able to get the old address by by subtracting
     2 from it, so I'm using the first pair of macros below, not the
     more intuitive second pair.
   */

#include "splib.h"

#define EXTRAS 2
#define MAX_ENTRIES(dawg) (*(dawg-2))          /* Safe way of aliasing */
#define NEXT_FREE(dawg) (*(dawg-1))


  /* #define MAX_ENTRIES(dawg) dawg[-2] */     /* 'Obvious' alias      */
  /* #define NEXT_FREE(dawg) dawg[-1] */       /* may be buggy         */

#define EMPTY 0
#define INIT_MAX_ENTRIES 512

/* Slop is so that we don't need to be continually checking nextfree
   as being close to max_entries */

#define SLOP 12

/* A debugging macro which rangechecks arrays -- happens when
   utils.c is included with RCHECK defined. Otherwise no overhead. */

#define DICT(idx) dict[RANGECHECK(idx, MAX_ENTRIES(dict))]

  /*
      This quick hack job has special-case code in each function
      for the root node-set.  Its rather tacky; I could clean it
      up and make the root node like all other nodes, but why
      bother; this is only test code anyway...
   */

static INDEX insert_simple(NODE **dictp, char *word)
{
#define dict (*dictp)
NODE bugfix;
INDEX i; NODE n; int c;

  if (NEXT_FREE(dict) >= MAX_ENTRIES(dict)-SLOP) {
    NODE *moved;
    moved = calloc((size_t)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) {
        moved[i] = EMPTY;
      }
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      free(dict);
      dict = moved; MAX_ENTRIES(dict) *= 2;
    } else {
      fprintf(stderr, "Can\'t add any more words again\n");
      exit(1);
    }
  }

  c = (*word++)&255;
  if (c == '\0') return(TERMINAL_NODE);
  i = NEXT_FREE(dict)++;
  bugfix = insert_simple(&dict, word);
  n = ((NODE)c << (NODE)V_LETTER) | M_END_OF_NODE | bugfix;
  DICT(i) = n; if (*word == '\0') DICT(i) |= M_END_OF_WORD;
  return(i);
#undef dict
}

static void recurse_add(NODE **dictp, INDEX base, char *word)
{
#define dict (*dictp)
NODE bugfix;
INDEX i = base, newbase;
NODE unpicked[MAX_CHARS], n;
int c, ch;
int gap, nextslot = 0, j;
  /* First see if first char is already in trie */
  ch = (*word++)&255;
  for (;;) {
    c = (int)(((dict[i] >> V_LETTER) & M_LETTER)&255);
    if (ch == c) {
      newbase = dict[i]&M_NODE_POINTER;
      if (newbase == 0) {
        /* have abc (this is c), adding (abc)defg */
        dict[i] &= (~M_NODE_POINTER);
        bugfix = insert_simple(&dict, word);
        dict[i] |= bugfix;
      } else {
        if (*word == '\0') {
          dict[i] |= M_END_OF_WORD;
        } else {
          recurse_add(dictp, newbase, word);
        }
      }
      return;
    }
    if ((dict[i]&M_END_OF_NODE) != 0) break;
    i++;
  }

  /* unpick base */
  for (nextslot = 0; nextslot < MAX_CHARS; nextslot++) {
    unpicked[nextslot] = EMPTY;
  }
  nextslot = 0;

  i = base; j = 0;
  for (;;) {
    if ((j == 0) && (((dict[i] >> V_LETTER) & M_LETTER) > ch)) {
      j = 1; newbase = nextslot++;
    }
    n = dict[i]; dict[i] = EMPTY; i++;
    unpicked[nextslot++] = n & (~M_END_OF_NODE);
    if ((n & M_END_OF_NODE) != 0) break;
  }
  if (j == 0) newbase = nextslot++; /* Was it last alphabetically? */
  /* add this char to the node */
  if (*word == '\0') {
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | TERMINAL_NODE | M_END_OF_WORD;
  } else {
    /* and insert the rest of the
       string with insert_simple */
    bugfix = insert_simple(&dict, word);
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | bugfix;
      /* The insert_simple call has to come after above loop, because
         of freeing slots... */
  }
  unpicked[nextslot-1] |= M_END_OF_NODE;

  /* scan for suitably-sized gap */
  /* MAX_CHARS+1 should be first 'real' node base (0, 1..MAX_CHARS reserved) */

  /*
     The particular application this is wanted for doesn't have a large
     number of words to be added dynamically, so the BFI code below which
     finds free holes in the trie space works fine.  However if some bright
     spark cares to keep a freelist *in the holes* then this program
     effectively implements a linear-time sorting algorithm.

     (I know it's not *really*, but think about it before jumping
     down my throat; the N^2 case is very statistically unlikely)
   */

  for (i = (INDEX)MAX_CHARS+1; i < MAX_ENTRIES(dict)-nextslot-SLOP; i++) {
    /*
        Even without the freelist, the sneaky hack in pack.c for
        noting earliest bunch of free holes might well make a
        significant difference... (It was a lowest_search_start
        variable and used a bit of hysteresis)
     */
    gap = TRUE;
    for (j = 0; j < nextslot; j++) {
      if (dict[i+j] != EMPTY) {
        gap = FALSE;
        break;
      }
    }
    if (gap) break;
  }
  if (!gap) {
    NODE *moved;
    moved = calloc((size_t)(MAX_ENTRIES(dict)*2+EXTRAS), sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict);
           i < MAX_ENTRIES(dict)*2; i++) moved[i] = EMPTY;
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      free(dict);
      dict = moved; /* If your compiler has aliasing bugs they may hit here. */
      MAX_ENTRIES(dict) *= 2;
      /* scan for suitably-sized gap */
      for (i = ((MAX_ENTRIES(dict)/2)-(MAX_CHARS+1));
           i < MAX_ENTRIES(dict)-nextslot; i++) {
        gap = TRUE;
        for (j = 0; j < nextslot; j++) {
          if (dict[i+j] != EMPTY) {
            gap = FALSE;
            break;
          }
        }
        if (gap) break;
      }
    }
    if (!gap) {
      fprintf(stderr, "Can\'t add any more words\n");
      exit(1);
    }
  }
  newbase = i;
  /* reinsert */
  for (j = 0; j < nextslot; j++) {
    dict[newbase+j] = unpicked[j];
  }
  if (newbase+nextslot-1 >= NEXT_FREE(dict)) NEXT_FREE(dict) = newbase+nextslot;
  /* update pointer to the base of this node */
  for (i = 0; i < MAX_ENTRIES(dict); i++) {
    if ((dict[i] & M_NODE_POINTER) == base) {
      dict[i] &= (~M_NODE_POINTER);
      dict[i] |= newbase;
      break;            /* (should only be one since trie, not dawg) */
    }
  }
#undef dict
}

void trie_add(NODE **dictp, char *word)
{
#define dict (*dictp)
INDEX next;
INDEX bugfix;
int c;
  /* Root node is pre-reserved MAX_CHARS entries */
  c = (*word++)&255; /* bugfix - remember chars can be signed :-( */
  if (DICT((INDEX)ROOT_NODE+c) == EMPTY) {
    /* I'm allowing the root node to be sparse for speed; it *should*
       be dense like any other node.  May change this later */
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | M_END_OF_WORD;
    } else {
      bugfix = insert_simple(&dict, word);
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | bugfix;
    }
  } else {
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) |= M_END_OF_WORD;
    } else {
      next = DICT((INDEX)ROOT_NODE+c) & M_NODE_POINTER;
      if (next == 0) {
        /* have 'a', adding 'abcd' */
        /* (Should really check the letter for safety...) */
        bugfix = insert_simple(&dict, word);
        DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER)
          | bugfix | M_END_OF_WORD;
      } else {
        recurse_add(dictp, next, word);
      }
    }
  }
#undef dict
}

int trie_new(NODE **dawgp)
{
#define dawg (*dawgp)
INDEX i;

  dawg = calloc(INIT_MAX_ENTRIES+EXTRAS, sizeof(NODE));
  if (dawg == NULL) {
    fprintf(stderr, "Main malloc failed in dyntrie\n");
    exit(1);
  }
  dawg += EXTRAS;

  MAX_ENTRIES(dawg) = INIT_MAX_ENTRIES; NEXT_FREE(dawg) = MAX_CHARS+1;

  dawg[0] = 0;
  for (i = 1; i <= MAX_CHARS; i++) dawg[i] = 0;
  dawg[MAX_CHARS] |= M_END_OF_NODE;
  /* 0 base, MAX_CHARS chars */

  return(TRUE);
#undef dawg
}

int trie_free(NODE **dictp)
#define dict (*dictp)
{
  dict -= EXTRAS;
  free(dict);
  dict = NULL;
#undef dict
}


#ifdef NEVER
int trie_dump(NODE *dawg, char *filename)
{
INDEX cnt, max;
FILE *dumper;
  max = NEXT_FREE(dawg)*sizeof(NODE);
    /* I *knew* the next_free variable would come in handy :-) */
  dumper = fopen(filename, WRITE_BIN);
  if (dumper == NULL) {
    fprintf(stderr, "Failed to dump trie on file \'%s\'\n", filename);
    return(FALSE);
  }
  putword(max, dumper);
  if ((cnt = putwords(dawg, max, dumper)) != max) {
    fprintf(stderr, "Failed to dump: %ld bytes written instead of %ld\n",
      cnt, max);
    return(FALSE);
  }
  fclose(dumper);
  return(TRUE);
}
#endif NEVER

#undef MAX_ENTRIES
#undef NEXT_FREE
#undef DICT
#undef SLOP