/* File: dyntrie.c Author: Graham Toal Purpose: Check a word using DAWG or TRIE. Functions exported: trie_new, trie_add, trie_dump Internal functions: insert_simple recurse_add Description: Builds DAWG-compatible trie in store, incrementally; words do not have to be presented in order. (Note it is NOT a 'packed-trie'; in our scheme it is closer to a dawg - but without the tail-compression precomputed dawgs have). Any tries built using this should be used by check_dawg, print_dawg etc. Has rather nice property of only claiming enough memory for the job; dynamically moves the data structure when it fills! I'm missing a header file for this code. So until I write one, be very careful with the parameters which are mostly **'s */ /* To avoid global state, I'm putting these useful bits of info in the two words before the dawg itself; I *HOPE* that all systems allow negative indexing off arrays; I'm not at all sure they do though. However since I moved the base of the dict up by adding 2 to it, I *should* be able to get the old address by by subtracting 2 from it, so I'm using the first pair of macros below, not the more intuitive second pair. */ #include "splib.h" #define EXTRAS 2 #define MAX_ENTRIES(dawg) (*(dawg-2)) /* Safe way of aliasing */ #define NEXT_FREE(dawg) (*(dawg-1)) /* #define MAX_ENTRIES(dawg) dawg[-2] */ /* 'Obvious' alias */ /* #define NEXT_FREE(dawg) dawg[-1] */ /* may be buggy */ #define EMPTY 0 #define INIT_MAX_ENTRIES 512 /* Slop is so that we don't need to be continually checking nextfree as being close to max_entries */ #define SLOP 12 /* A debugging macro which rangechecks arrays -- happens when utils.c is included with RCHECK defined. Otherwise no overhead. */ #define DICT(idx) dict[RANGECHECK(idx, MAX_ENTRIES(dict))] /* This quick hack job has special-case code in each function for the root node-set. Its rather tacky; I could clean it up and make the root node like all other nodes, but why bother; this is only test code anyway... */ static INDEX insert_simple(NODE **dictp, char *word) { #define dict (*dictp) NODE bugfix; INDEX i; NODE n; int c; if (NEXT_FREE(dict) >= MAX_ENTRIES(dict)-SLOP) { NODE *moved; moved = calloc((size_t)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE)); if (moved != NULL) { moved += EXTRAS; MAX_ENTRIES(moved) = MAX_ENTRIES(dict); NEXT_FREE(moved) = NEXT_FREE(dict); /* Should use realloc but appears to be buggy :-( */ for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) { moved[i] = EMPTY; } for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i); dict -= EXTRAS; free(dict); dict = moved; MAX_ENTRIES(dict) *= 2; } else { fprintf(stderr, "Can\'t add any more words again\n"); exit(1); } } c = (*word++)&255; if (c == '\0') return(TERMINAL_NODE); i = NEXT_FREE(dict)++; bugfix = insert_simple(&dict, word); n = ((NODE)c << (NODE)V_LETTER) | M_END_OF_NODE | bugfix; DICT(i) = n; if (*word == '\0') DICT(i) |= M_END_OF_WORD; return(i); #undef dict } static void recurse_add(NODE **dictp, INDEX base, char *word) { #define dict (*dictp) NODE bugfix; INDEX i = base, newbase; NODE unpicked[MAX_CHARS], n; int c, ch; int gap, nextslot = 0, j; /* First see if first char is already in trie */ ch = (*word++)&255; for (;;) { c = (int)(((dict[i] >> V_LETTER) & M_LETTER)&255); if (ch == c) { newbase = dict[i]&M_NODE_POINTER; if (newbase == 0) { /* have abc (this is c), adding (abc)defg */ dict[i] &= (~M_NODE_POINTER); bugfix = insert_simple(&dict, word); dict[i] |= bugfix; } else { if (*word == '\0') { dict[i] |= M_END_OF_WORD; } else { recurse_add(dictp, newbase, word); } } return; } if ((dict[i]&M_END_OF_NODE) != 0) break; i++; } /* unpick base */ for (nextslot = 0; nextslot < MAX_CHARS; nextslot++) { unpicked[nextslot] = EMPTY; } nextslot = 0; i = base; j = 0; for (;;) { if ((j == 0) && (((dict[i] >> V_LETTER) & M_LETTER) > ch)) { j = 1; newbase = nextslot++; } n = dict[i]; dict[i] = EMPTY; i++; unpicked[nextslot++] = n & (~M_END_OF_NODE); if ((n & M_END_OF_NODE) != 0) break; } if (j == 0) newbase = nextslot++; /* Was it last alphabetically? */ /* add this char to the node */ if (*word == '\0') { unpicked[newbase] = ((NODE)ch << (NODE)V_LETTER) | TERMINAL_NODE | M_END_OF_WORD; } else { /* and insert the rest of the string with insert_simple */ bugfix = insert_simple(&dict, word); unpicked[newbase] = ((NODE)ch << (NODE)V_LETTER) | bugfix; /* The insert_simple call has to come after above loop, because of freeing slots... */ } unpicked[nextslot-1] |= M_END_OF_NODE; /* scan for suitably-sized gap */ /* MAX_CHARS+1 should be first 'real' node base (0, 1..MAX_CHARS reserved) */ /* The particular application this is wanted for doesn't have a large number of words to be added dynamically, so the BFI code below which finds free holes in the trie space works fine. However if some bright spark cares to keep a freelist *in the holes* then this program effectively implements a linear-time sorting algorithm. (I know it's not *really*, but think about it before jumping down my throat; the N^2 case is very statistically unlikely) */ for (i = (INDEX)MAX_CHARS+1; i < MAX_ENTRIES(dict)-nextslot-SLOP; i++) { /* Even without the freelist, the sneaky hack in pack.c for noting earliest bunch of free holes might well make a significant difference... (It was a lowest_search_start variable and used a bit of hysteresis) */ gap = TRUE; for (j = 0; j < nextslot; j++) { if (dict[i+j] != EMPTY) { gap = FALSE; break; } } if (gap) break; } if (!gap) { NODE *moved; moved = calloc((size_t)(MAX_ENTRIES(dict)*2+EXTRAS), sizeof(NODE)); if (moved != NULL) { moved += EXTRAS; MAX_ENTRIES(moved) = MAX_ENTRIES(dict); NEXT_FREE(moved) = NEXT_FREE(dict); /* Should use realloc but appears to be buggy :-( */ for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) moved[i] = EMPTY; for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i); dict -= EXTRAS; free(dict); dict = moved; /* If your compiler has aliasing bugs they may hit here. */ MAX_ENTRIES(dict) *= 2; /* scan for suitably-sized gap */ for (i = ((MAX_ENTRIES(dict)/2)-(MAX_CHARS+1)); i < MAX_ENTRIES(dict)-nextslot; i++) { gap = TRUE; for (j = 0; j < nextslot; j++) { if (dict[i+j] != EMPTY) { gap = FALSE; break; } } if (gap) break; } } if (!gap) { fprintf(stderr, "Can\'t add any more words\n"); exit(1); } } newbase = i; /* reinsert */ for (j = 0; j < nextslot; j++) { dict[newbase+j] = unpicked[j]; } if (newbase+nextslot-1 >= NEXT_FREE(dict)) NEXT_FREE(dict) = newbase+nextslot; /* update pointer to the base of this node */ for (i = 0; i < MAX_ENTRIES(dict); i++) { if ((dict[i] & M_NODE_POINTER) == base) { dict[i] &= (~M_NODE_POINTER); dict[i] |= newbase; break; /* (should only be one since trie, not dawg) */ } } #undef dict } void trie_add(NODE **dictp, char *word) { #define dict (*dictp) INDEX next; INDEX bugfix; int c; /* Root node is pre-reserved MAX_CHARS entries */ c = (*word++)&255; /* bugfix - remember chars can be signed :-( */ if (DICT((INDEX)ROOT_NODE+c) == EMPTY) { /* I'm allowing the root node to be sparse for speed; it *should* be dense like any other node. May change this later */ if (*word == '\0') { DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | M_END_OF_WORD; } else { bugfix = insert_simple(&dict, word); DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | bugfix; } } else { if (*word == '\0') { DICT((INDEX)ROOT_NODE+c) |= M_END_OF_WORD; } else { next = DICT((INDEX)ROOT_NODE+c) & M_NODE_POINTER; if (next == 0) { /* have 'a', adding 'abcd' */ /* (Should really check the letter for safety...) */ bugfix = insert_simple(&dict, word); DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | bugfix | M_END_OF_WORD; } else { recurse_add(dictp, next, word); } } } #undef dict } int trie_new(NODE **dawgp) { #define dawg (*dawgp) INDEX i; dawg = calloc(INIT_MAX_ENTRIES+EXTRAS, sizeof(NODE)); if (dawg == NULL) { fprintf(stderr, "Main malloc failed in dyntrie\n"); exit(1); } dawg += EXTRAS; MAX_ENTRIES(dawg) = INIT_MAX_ENTRIES; NEXT_FREE(dawg) = MAX_CHARS+1; dawg[0] = 0; for (i = 1; i <= MAX_CHARS; i++) dawg[i] = 0; dawg[MAX_CHARS] |= M_END_OF_NODE; /* 0 base, MAX_CHARS chars */ return(TRUE); #undef dawg } int trie_free(NODE **dictp) #define dict (*dictp) { dict -= EXTRAS; free(dict); dict = NULL; #undef dict } #ifdef NEVER int trie_dump(NODE *dawg, char *filename) { INDEX cnt, max; FILE *dumper; max = NEXT_FREE(dawg)*sizeof(NODE); /* I *knew* the next_free variable would come in handy :-) */ dumper = fopen(filename, WRITE_BIN); if (dumper == NULL) { fprintf(stderr, "Failed to dump trie on file \'%s\'\n", filename); return(FALSE); } putword(max, dumper); if ((cnt = putwords(dawg, max, dumper)) != max) { fprintf(stderr, "Failed to dump: %ld bytes written instead of %ld\n", cnt, max); return(FALSE); } fclose(dumper); return(TRUE); } #endif NEVER #undef MAX_ENTRIES #undef NEXT_FREE #undef DICT #undef SLOP