#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>

#ifndef FALSE
#define FALSE (0!=0)
#define TRUE (0==0)
#endif

/*
     Takeon generates a grammar in gram[].  Phrases are numbered
     sequentially from 512 upwards, with a lookup table from this
     sequential numbering into gram indexes stored in phrase[].

     We *could* make the phrase numbers sparse, and have only the
     gram table without the phrase table - the only reason to do it
     this way is so that the phrasename[] table is compact.  If
     we didn't do the indirection (thus saving the phrase[] table)
     then the phrasename table would be as big as the gram table,
     which is much worse.  Of course, the downside is that we only
     really need the phrasename table for diagnostics, so in a
     production compiler it's true that we do have a slight
     unnecessary overhead.

     This implementation isn't too bad, but it could be improved
     by better support for BIPs (eg automatic numbering).  Need to
     find a way to do some of the tricks I used to do in TACC:
     guards, negated conditions, and parse-time code execution.
     The latter can be done with BIPS.

     The big thing to be done is to read the grammar that's
     embedded in the source file directly rather than requiring
     a pre-pass.

     Finally, there are way too many pre-defined ranges here which
     can all be done away with.  And 16 max BIPS is ridiculous
     if we start using BIPS for parse-time code.

     Also a major improvement would be to walk each phrase down the
     left hand path and detect any left-associative infinite loops
     before it gets as far as being used in a compiler :-)
 */

static FILE *grammar;

#define OPTIONAL_PHRASE (1<<14)
#define NEGATED_PHRASE (1<<15)

#define MAX_GRAMMAR (1024*16)
#define MAX_PHRASES 1024

static int gram[MAX_GRAMMAR];
static int phrase[MAX_PHRASES];

static int lineno;
void fatal_(int line) {
  fprintf(stderr, "* Syntax error at line %d (detected in %s, line %d)\n", lineno, __FILE__, line);
  exit(1);
}
#define fatal() fatal_(__LINE__)

char *minus_to_ul(char *s)
{
  static char temp[128];
  char *t = temp;
  for (;;) {
    int c = *s++;
    if (c == '-') c = '_';
    *t++ = c;
    if (c == '\0') return temp;
  }
}

char *upto(int ends) {
  char temp[128];
  char *s = temp;
  int count = 0;
  for (;;) {
    int c = fgetc(grammar);
    if ((c == EOF) || ferror(grammar) || (count == 127)) fatal();
    if (c == ends) return(strdup(temp));
    *s++ = c; *s = '\0'; count++;
  }
}

int nextch(void) {
  int c = fgetc(grammar);
  if ((c == EOF) || ferror(grammar)) fatal();
  return c;
}

int nonspace(void) {
  for (;;) {
    int c = fgetc(grammar);
    if ((c == EOF) || ferror(grammar)) fatal();
    if (c == '\n') lineno++;
    if (!isspace(c)) return(c);
  }
}

#define MAX_BIPS 16  /* Surely more than enough? [undoubtedly I'll curse that later ;-)] */
static int nextbip = 0;
static int BIP[MAX_BIPS];
static char *phrasename[MAX_PHRASES+512];
static char *key[128];
static int nextfreekey = 0;
static int next_gram = 0;
static int pnp = 512;

void dump_tables(void) {
  int i;
  int nums_per_line = 16;
  int keys_per_line = 4;
  fprintf(stdout, "#define MAX_GRAMMAR %d\n", next_gram);
  fprintf(stdout, "#define PHRASE_BASE %d\n", nextbip+512);
  fprintf(stdout, "int gram[MAX_GRAMMAR] = {\n");
  for (i = 0; i < next_gram; i++) {
    fprintf(stdout, "%5d, ", gram[i]);
    if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
  }
  if ((next_gram % nums_per_line) != 0) fprintf(stdout, "\n");
  fprintf(stdout, "};\n\n");

  fprintf(stdout, "#define MAX_KEYWORD %d\n", nextfreekey);
  fprintf(stdout, "char *keyword[MAX_KEYWORD] = { // Keywords are based at 256\n  ");
  for (i = 0; i < nextfreekey; i++) {
    fprintf(stdout, "\"%s\", ", key[i]);
    if ((i+1) % keys_per_line == 0) fprintf(stdout, "\n  ");
  }
  if ((nextfreekey % keys_per_line) != 0) fprintf(stdout, "\n");
  fprintf(stdout, "};\n\n");

  fprintf(stdout, "#define MAX_BIP %d\n", nextbip);
  fprintf(stdout, "int BIP[MAX_BIP] = { // BIPs precede PHRASEs at 512 upwards\n");
  for (i = 0; i < nextbip; i++) {
    fprintf(stdout, "%2d, ", BIP[i]);
    if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
  }
  if ((nextbip % nums_per_line) != 0) fprintf(stdout, "\n");
  fprintf(stdout, "};\n");

  fprintf(stdout, "\n#define MAX_PHRASE %d\n", pnp-512);
  fprintf(stdout, "#if defined(DEBUG_PARSER) || defined(EXTENDED_ERROR_REPORTING)\n// FOR DEBUGGING ONLY\n");
  fprintf(stdout, "char *phrasename[MAX_PHRASE] = { // Based at 512 upwards\n  ");
  for (i = 512; i < pnp; i++) {
    fprintf(stdout, "\"%s\", ", phrasename[i]);
    if ((i+1) % keys_per_line == 0) fprintf(stdout, "\n  ");
  }
  if ((pnp % keys_per_line) != 0) fprintf(stdout, "\n");
  fprintf(stdout, "};\n#endif /* DEBUG_PARSER */\n");

  fprintf(stdout, "\nint phrase_start[MAX_PHRASE-MAX_BIP] = {\n");
  for (i = 512+nextbip; i < pnp; i++) {
    fprintf(stdout, "%5d, ", phrase[i-512]);
    if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
  }
  if ((pnp % nums_per_line) != 0) fprintf(stdout, "\n");
  fprintf(stdout, "};\n\n");

  for (i = 512; i < pnp; i++) {
    fprintf(stdout, "#define P_%s %d\n", minus_to_ul(phrasename[i]), i);
    if (i < 512+nextbip) fprintf(stdout, "#define B_%s %d\n\n", minus_to_ul(phrasename[i]), BIP[i-512]);
  }

}

int keyword_code(char *keyword)
{
  int i;
  key[nextfreekey] = keyword;
  // POOR IMPLEMENTATION for now.  Quick & Dirty to get something working.
  for (i = 0; i <= nextfreekey; i++) {
    if (strcmp(keyword, key[i]) == 0) break;
  }
  if (i == nextfreekey) {
    key[i] = strdup(key[i]); nextfreekey++;
  }
  return i+256; // 256..511 are for keywords
}

void takeon(int finalpass) {
  int gp = 0; // grammar pointer
  int sym, lastsym = ';';
  char *def_name = NULL, *name = NULL, *string = NULL, *keyword = NULL;
  int def_bip = FALSE, def_phrase = FALSE, def_type = 0, specialbit = 0;
  int alt_count = 1, alt_count_index = 0;
  int phrase_count = 0, phrase_count_index = 0;
  int this_phrase_start = 0;
  int indent_len = 0;
  static int max_phrase = 0;

  lineno = 1; // (re)init globals too.
  next_gram = 0;
  pnp = 512; // phrase name pointer
  nextbip = pnp; // First BIPS, then phrases.

  for (;;) {
    switch (sym = nonspace()) {
    case 'B': // BIP DEFINITION
    case 'C': // PHRASE DEFINITION
    case 'P': // PHRASE DEFINITION
      def_type = sym;
      def_phrase = TRUE; break;

    case 'E': // END OF GRAMMAR
      if (finalpass) dump_tables();
      max_phrase = pnp;
      return;

    case '=': // start of a phrase alternative or a BIP defn
      // name should be valid at this point
      if (def_phrase && (def_type == 'B')) {
        // expect a number and a ';'.
        int digit = nonspace();
        if (!isdigit(digit)) fatal();
        // For now, single digit...
        // DO SOMETHING HERE WITH "def_name" and "digit" TO DEFINE BIP
        BIP[pnp-512] = digit-'0';
        nextbip = pnp-512+1;
        phrase[pnp-512] = 0;
        phrasename[pnp++] = strdup(def_name);
        if (nonspace() != ';') fatal();
        def_phrase = FALSE;
      } else if (def_phrase) { // MAY NEED ACTION FOR C vs P
        // expect a phrase definition.
        phrase[pnp-512] = next_gram;
        phrasename[pnp++] = strdup(def_name);
        def_phrase = FALSE;
        alt_count_index = this_phrase_start = next_gram++; // Hole for number of alternatives
        phrase_count_index = next_gram++; // hole for number of items in first alt.
        alt_count = 1; phrase_count = 0;
      } else {
        fatal(); // missing definition
      }
      break;

    case '<': // phrase def *or* instance within an alt.
      if (def_phrase) {
        name = upto('>');
        def_name = name;
      } else {
        int i = nextch();
        if (i == '?') {
          specialbit = OPTIONAL_PHRASE;
          // parse and assign but do not step the pointer past this location (NOTE/POP-POINTER)
	} else if (i == '!') { // or use '~' or '\' ???
          // parse & invert.  return NULL on a failed parse, don't step the pointer, and continue parsing this alt.
          specialbit = NEGATED_PHRASE;
	} else {
	  ungetc(i, grammar);
          specialbit = 0;
	}
        name = upto('>');
        for (i = 512; i < max_phrase; i++) {if (strcmp(phrasename[i], name)==0) break;}
        if (finalpass && (i >= max_phrase)) {fflush(stderr); fprintf(stderr, "\n* UNDEFINED P<%s>\n", name); fatal();}
        phrase_count++; gram[next_gram++] = i | specialbit;
      }
      break;

    case '\'': // string literal
      string = upto('\'');
      phrase_count += strlen(string); // Each char counts as a phrase, albeit a short one...
      { char *s = string; while (*s != '\0') gram[next_gram++] = *s++; }
      break;

    case '"': // keyword literal
      keyword = upto('"');
      phrase_count++;
      gram[next_gram++] = keyword_code(keyword);
      break;

    case '|': // next alternative
    case ',': // next alternative
      gram[phrase_count_index] = phrase_count; phrase_count = 0;
      phrase_count_index = next_gram++; // hole for number of items in first alt.
      alt_count++;
      break;

    case ';': // end of alternatives
      gram[phrase_count_index] = phrase_count;
      // Can tell is last alt was null by checking lastsym == ',', should we ever need to know
      gram[this_phrase_start] = alt_count;
      gram[phrase_count_index] = phrase_count; phrase_count = 0;
      def_phrase = FALSE;
      free(def_name); def_name = NULL;
      break;

    case '#': // comment to end of line
      {int c; for (;;) {c = fgetc(grammar); if ((c == EOF) || ferror(grammar)) fatal(); if (c == '\n') break;}}
      lineno++;
      break;

    default:
      fatal();
    }
    lastsym = sym;
  }
}

int main(int argc, char **argv) {
  int pass;
  if (argc != 2) {
    fprintf(stderr, "syntax: takeon compiler.g\n");
    exit(1);
  }
  for (pass = 0; pass <= 1; pass++) {
    grammar = fopen(argv[1], "r");
    if (grammar == NULL) {
      fprintf(stderr, "takeon: %s - %s\n", strerror(errno), argv[1]);
      exit(errno);
    }
    takeon(pass); // build tables
    fclose(grammar);
  }
  exit(0); return(1);
}