/* THIS IS UNDER DEVELOPMENT: the source file is most likely being edited even as you look at it. For the problem which this solves, see post: http://groups.yahoo.com/group/wordgame-programmers/message/495 This is a proof-of-concept prototype; there's still a bit of tidying up to be done before incorporating it into a program, but that's the easy part; the actual algorithm work is done. */ #include <stdio.h> #include <errno.h> #include "splib.h" #define DEFAULT_MAXLEN 80 #define NUM_RECOGNISERS (DEFAULT_MAXLEN+1) static int debug = FALSE; static int use_stdin = FALSE; /* for this quick hack its more convenient to use lots of globals than it is to pass everything around cleanly as parameters */ static FILE *infile; static NODE *dawg; static INDEX nedges; static int maxlen; /* replace these with allocations off the heap */ static NODE *edge[NUM_RECOGNISERS]; static int firsttime[NUM_RECOGNISERS]; /* all init to TRUE */ static char word[NUM_RECOGNISERS][DEFAULT_MAXLEN]; /* the old procedure to check a word in the dawg has been broken out so that it is fed one char at a time. It keeps its global state in the three arrays above. The parallel search consists of multiple invocations of this procedure, one starting at each position in the target text input stream. The dawg contains the list of words to be recognised, pre-calculated. */ static int process_one_char(char c, int id) { int lenp, this_char; /* Not preserved between calls */ if (firsttime[id]) { edge[id] = dawg+ROOT_NODE; *word[id] = '\0'; } firsttime[id] = FALSE; for (;;) { /* locate this letter at current point in trie */ if (debug) { fprintf(stderr, "Checking for %s+%c against %s+%c\n", word, c, word, this_char); } this_char = ((*edge[id] >> V_LETTER) & M_LETTER); if (c == this_char) { if ((*edge[id] & M_END_OF_WORD) != 0) { /* OUTPUT THIS WORD */ fprintf(stderr, "Recognised: %s%c\n", word[id], c); /* need to check if dest == dawg (end of trie) */ } /* prepare for next iteration. */ /* "edge" is the preserved global state */ edge[id] = dawg+(*edge[id] & M_NODE_POINTER); lenp = strlen(word[id]); word[id][lenp] = c; word[id][lenp+1] = '\0'; if (edge[id] == dawg){ /* end of dawg? */ /* return code is not whether a word was found, but whether we can progress */ firsttime[id] = TRUE; /* let the next one reset the variables */ return(FALSE); } else { return(TRUE); } } if (((*edge[id]++) & M_END_OF_NODE) != 0) break; /* checked all branches */ } firsttime[id] = TRUE; return(FALSE); /* No more choices are possible using this recogniser */ } static void filter(FILE *in) { int i, c, chars_processed, this_slot; if (debug) { fprintf(stderr, "This test is set up to detect 'scat' and 'cat' and 'catch'\n"); c = process_one_char('s',0); fprintf(stderr, "s0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('c',0); fprintf(stderr, "c0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('c',1); fprintf(stderr, "c1: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('a',0); fprintf(stderr, "a0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('a',1); fprintf(stderr, "a1: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('t',0); fprintf(stderr, "t0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('t',1); fprintf(stderr, "t1: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('c',0); fprintf(stderr, "c0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('c',1); fprintf(stderr, "c1: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('h',0); fprintf(stderr, "h0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('h',1); fprintf(stderr, "h1: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('!',0); fprintf(stderr, "!0: %s\n", (c ? "TRUE":"FALSE")); c = process_one_char('!',1); fprintf(stderr, "!1: %s\n", (c ? "TRUE":"FALSE")); exit(0); } chars_processed = 0; for (;;) { c = fgetc(in); if (c == EOF) break; putchar(c); chars_processed += 1; /* each slot starts off another recogniser */ this_slot = chars_processed % NUM_RECOGNISERS; firsttime[this_slot] = TRUE; (void)process_one_char(c, this_slot); /* kick off a new recogniser for this position */ /* and pass the character to any still active recognisers */ for (i = 0; i < NUM_RECOGNISERS; i++) { if ((i != this_slot) && !(firsttime[i])) (void)process_one_char(c, i); } } } int main(int argc, char **argv) { int i; /* useful little debugging tool in http://www.gtoal.com/devel/backtrace/auto_gdb.c */ /* restart_under_gdb(argc, argv); */ if (argc == 4 && strcmp(argv[1], "-d") == 0) { debug = TRUE; argc -= 1; argv += 1; } if (argc != 3) { fprintf(stderr, "syntax: multiscan dawgfile textfile\n"); exit(1); } if (!dawg_init((argc == 1 ? "" : argv[1]), &dawg, &nedges)) { exit(EXIT_FAILURE); } /* ideally we should at this point walk the dawg and find the longest string */ maxlen = DEFAULT_MAXLEN; for (i = 0; i < NUM_RECOGNISERS; i++) firsttime[i] = TRUE; if ((*argv[2] == '-') && (*(argv[2]+1) == '\0')) { use_stdin = TRUE; } else { infile = fopen(argv[2], "r"); if (infile == NULL) { fprintf(stderr, "multiscan: %s\n", strerror(errno)); exit(2); } } if (debug) { if (use_stdin) { fprintf(stderr, "Searching stdin for the following strings:\n", argv[2]); } else { fprintf(stderr, "Searching for the following strings in %s:\n", argv[2]); } /* not advisable if using a large word list! dawg_print(dawg, (INDEX) ROOT_NODE); */ } if (use_stdin) filter(stdin); else filter(infile); exit(0); return(0); }