/*

# syntax for a very small subset of C as generated by the static binary translator
# after running through both 'dumpf' and 'peephole'...

# Note that expressions must be fully bracketed - no operator precedences allowed.
# If we hit constructs that are too awkward to parse, go back to the code generator
# and modify it so that they are never generated in the first place...
# For instance, "a + b + c" would need to be rewritten as "(a + b) + c"

*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <regex.h>

#ifndef FALSE
#define FALSE (0!=0)
#define TRUE (0==0)
#endif

char *emergency;
static int debug = 0;

#define MAX_LINES 100000
#define MAX_LINE  1024

// entire program is read into this array.  We could do it a line at a time but
// this leaves options open for later.
char *prog_line[MAX_LINES]; // max lines in a generated program
int end_of_file = 0; // exclusive upper bound - line after last used line

// for parsing:
int cur_line = 0;
int cur_col = 0;

// for syntax failure message:
int best_line = 0;
int best_col = 0;

void preload_entire_source(FILE *in) { // read entire program before starting.
  char s[1024]; // max length of a single line...
  int c;
  int cp; // char position
  for (;;) { // loop over lines
    if (end_of_file == MAX_LINES) {
      fprintf(stderr, "parse: too many input lines for program to store.  Ask the programmer to increase MAX_LINES\n");
      exit(EXIT_FAILURE);
    }
    cp = 0;
    for (;;) { // loop over chars in a line
      c = fgetc(in);
      if (c == EOF) {
        if (cp == 0) return;
        s[cp] = '\0';
        prog_line[end_of_file++] = strdup(s); // to do: check malloc ok
        return;
      }
      if (c == '\r') continue;
      if (c == '\n') {
        s[cp] = '\0';
        prog_line[end_of_file++] = strdup(s); // to do: check malloc ok
        break; // next line
      }
      s[cp++] = c;
      if (cp == MAX_LINE) {
        fprintf(stderr, "parse: A single input line (%d) was too long.  Ask the programmer to increase MAX_LINE\n", end_of_file);
        exit(EXIT_FAILURE);
      }
    }
  }
}

enum phrase {
  NONE,
  SS,
    SS__COMMENT,
    SS__BR_LIST,
    SS__CASELAB,
    SS__LAB,
    SS__ASSIG,
    SS__STRINGPROCCALL,
    SS__PROCCALL,
    SS__IF,
    SS__GOTO,
    SS__JUMP,
    SS__NULL, 
  SS_LIST,
  IDENT,
  LABEL,
  VAR,
  VALUE,
    VAL__COND_EXPR,
    VAL__BR_EXPR,
  OPT_INDEX,
  FNNAME,
  PROCNAME,
  PARAM_LIST,
  REST_OF_PARAM_LIST,
  EXPR,
  FNCALL,
  TYPED_VAR,
    TYPED_VAR__CAST,
    TYPED_VAR__DEFAULT,
  OPT_SECOND_OPERAND,
    OPT_SECOND_OPERAND__PRESENT,
    OPT_SECOND_OPERAND__ABSENT,
  OPT_MONOP,
  CASTNAME,
  BINOP,
  SELECTOR,
  HEX_CONSTANT,
  POSITIVE_INTEGER,
  STRING_LITERAL
};

// Tuples are the nodes of the AST (Abstract Syntax Tree).
typedef struct tuple {
  int phraseno;
  int subphrase; // might not need this. tuple format is under development.
  int line;
  int col;
  char *matched;
  struct tuple *phrase[10]; // max args to a proc, for safety.
} tuple;

tuple *new_tuple(int type) {
  // currently we never free anything. RAM is cheap.  If we really needed to save
  // space, a mark/release scheme would be more efficient than calloc/free,
  // though just handling a line at a time would be much better...
  tuple *t;
  t = calloc(1, sizeof(tuple)); // we rely on remaining fields being initialised to 0's.
  t->phraseno = type;
  return t;
}

// indentation for parser tracing.
static int indent = 0;
char *sp(void) {
  static char string[1024];
  char *p = string;
  int i, ind=indent;
  // this hack is because parsing an entire program would lead to huge indents.
  if (ind > 6000) {
    int hashes = (ind-40)/2000;
    for (i = 0; i < hashes; i++) *p++ = '#';
    ind -= hashes*2000;
  }
  if (ind > 600) {
    int ats = (ind-40)/200;
    for (i = 0; i < ats; i++) *p++ = '@';
    ind -= ats*200;
  }
  if (ind > 60) {
    int plusses = (ind-40)/20;
    for (i = 0; i < plusses; i++) *p++ = '+';
    ind -= plusses*20;
  }
  for (i = 0; i < ind; i++) *p++ = ' ';
  *p = '\0';

  return string;
}

void in(const char *s) { // called on entry to a parsing procedure if debugging
  int i;
  char *p;

  // skip spaces and newlines before printing text that follows cur pos.
  // We actually update the current position - it's not just for the diagnostic message.
  // - this may be a bad design decision... should try a version where cur_* is restored.
  p = prog_line[cur_line]+cur_col;
  for (;;) {
    if (cur_line >= end_of_file) break;
    while (*p == ' ') { p += 1; cur_col += 1; }
    // skip newlines as well as spaces
    if (*p == '\0') {
      cur_line += 1; cur_col = 0;
      if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
      if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

      p = prog_line[cur_line]+cur_col;
    } else break;
  }

  if (debug <= 0) return;
  //for (i = 0; i < indent; i++) fprintf(stderr, " ");
  fprintf(stderr, "%s>> %s at: \"%s\"\n", sp(), s, (cur_line >= end_of_file ? "<EOF>" : p));
  indent += 2;
}
#define IN in(__FUNCTION__)

int out(int success, const char *s) { // called on exit from a parsing procedure if debugging
  //int i;
  int line = cur_line, col = cur_col;
  char *p;
  
  p = prog_line[cur_line]+cur_col;
  indent -= 2;
  //for (i = 0; i < indent; i++) fprintf(stderr, " ");
  if (success) {
    for (;;) {
      if (cur_line >= end_of_file) break;
      while (*p == ' ') { p += 1; cur_col += 1; } // always skip white space on this line before an atom (OK if we leave it skipped)
      // skip newlines as well as spaces
      if (*p == '\0') {
        cur_line += 1; cur_col = 0;
        if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
        if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

        p = prog_line[cur_line]+cur_col;
      } else break;
    }
    if (debug >= 1) fprintf(stderr, "%s<< %s MATCHED leaving: \"%s\"\n", sp(), s, (cur_line == end_of_file ? "<EOF>" : p));
  } else {
    if (debug >= 1) fprintf(stderr, "%s<< %s\n", sp(), s);
  }
  cur_line = line; cur_col = col;
  return success;
}
#define OUT(cond) out(cond, __FUNCTION__)

int parse_ss_list(tuple *ss_list);
int parse_selector(tuple *sel);
int parse_rest_of_param_list(tuple *param_list);
int parse_expr(tuple *expr);
int parse_positive_integer(tuple *posint);
int parse_hex_constant(tuple *hex);
int parse_procname(tuple *fnname);
int parse_fnname(tuple *procname);
int parse_param_list(tuple *param_list);
int parse_cond(tuple *cond);

// match a literal string against source at current position.  increment
// the current position if a match.  Leave untouched otherwise.
int lex_lit(char *lit, char **matched) {
  char *p = prog_line[cur_line]+cur_col;
  IN;

  if (matched) *matched = NULL;
  if (debug >= 1) fprintf(stderr, "%sLex: Trying \"%s\"\n", sp(), lit);
  
  if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
  if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

  if (*lit == '\0') return OUT(TRUE); // empty string always matches

  for (;;) {
    if (cur_line >= end_of_file) return OUT(FALSE); // non-empty string can't match end of file
    p = prog_line[cur_line]+cur_col;
    while (*p == ' ') { // always skip white space on this line before an atom (OK if we leave it skipped)
      p += 1;
      cur_col += 1;
    }
    if (strcmp(lit, "\n") == 0) { // special case for end-of-line (should always be only char in string)
      if (*p == '\0') {
        cur_line += 1; cur_col = 0;
        if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
        if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

        if (matched) *matched = lit;
        return OUT(TRUE);
      }
      return OUT(FALSE);
    } else {
      // skip newlines as well as spaces
      if (*p == '\0') {
        cur_line += 1; cur_col = 0;
        if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
        if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

        p = prog_line[cur_line]+cur_col;
        if (debug >= 1) fprintf(stderr, "%s+Lex: Trying \"%s\"\n", sp(), lit);
      } else break;
    }
  }
  
  p = prog_line[cur_line]+cur_col;
  if (strncmp(lit, p, strlen(lit)) == 0) {
    cur_col += strlen(lit);
    if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
    if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

    if (matched) *matched = lit;
    return OUT(TRUE);
  }
  return OUT(FALSE);
}

// match regex against source at current position.  increment the current
// position if a match.  Leave untouched otherwise.
int lex_regex(char *regexs, char **matched) {
#define MAX_MATCHES 32 // or is this always 1 in this code?

  // when a regex matches something in the source, this extracts the matched text as a string
  auto char *extract(char *s, int start /* inclusive */, int end /* exclusive */) {
    char *t = malloc(end-start+1);
    int i;
    if (t == NULL) {
      free(emergency);
      fprintf(stderr, "parse: running out of memory. Programmer may need to restructure the code.\n");
      fflush(stderr); fflush(stdout);
      exit(EXIT_FAILURE);
    }
    for (i = 0; i < end-start; i++) t[i] = s[i+start];
    t[end-start] = '\0';
    return t;
  }

  regmatch_t matches[MAX_MATCHES];
  regex_t regex;
  int reti;
  char msgbuf[100];
  char *p = prog_line[cur_line]+cur_col;

  IN;
  if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
  if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

  if (debug >= 1) fprintf(stderr, "%sRegex 1: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
  if (debug >= 1) fprintf(stderr, "%sTrying \"%s\"\n", sp(), regexs);

  if (matched == NULL) {
    fprintf(stderr, "parse: programmer error - 'matched' parameter to lex_regex was passed a null pointer\n");
    exit(EXIT_FAILURE);
  }
  
  if (*regexs == '\0') return OUT(TRUE); // empty string always matches
  
  for (;;) {
    p = prog_line[cur_line]+cur_col;
    if (cur_line >= end_of_file) return OUT(FALSE); // non-empty string can't match end of file
    while (*p == ' ') {
      p += 1;
      cur_col += 1;
      if (debug >= 1) fprintf(stderr, "%sRegex 2: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
    } // always skip white space on this line before an atom (OK if we leave it skipped)
    if (strcmp(regexs, "^$") == 0) { // special case for end-of-line (should always be only char in string)
      if (*p == '\0') {
        cur_line += 1; cur_col = 0;
        if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
        if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

        if (debug >= 1) fprintf(stderr, "%sRegex 3: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
        return OUT(TRUE);
      }
      if (debug >= 1) fprintf(stderr, "%sRegex 4: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
      return OUT(FALSE);
    } else {
      // skip newlines as well as spaces
      if (*p == '\0') {
        cur_line += 1; cur_col = 0;
        if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
        if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;

        p = prog_line[cur_line]+cur_col;
        if (debug >= 1) fprintf(stderr, "%s+Trying \"%s\"\n", sp(), regexs);
        if (debug >= 1) fprintf(stderr, "%sRegex 5: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
      } else break;
    }
  }

  /* Compile regular expression */
  if (regcomp(&regex, regexs, REG_EXTENDED) != 0) {
    free(emergency);
    fprintf(stderr, "parse: programmer error - could not compile regex \"%s\" - we may have run out of malloc memory\n", regexs);
    fflush(stderr); fflush(stdout);
    exit(EXIT_FAILURE);
  }

  /* Execute regular expression */
  reti = regexec(&regex, p, MAX_MATCHES, matches, 0);

  if (debug >= 1) fprintf(stderr, "%sRegex 6: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);
  if ((!reti) && (matches[0].rm_so == 0)) { // 0 is in case user forgets the "^" anchor ...
    // Step over matched string.

    if (debug >= 1) fprintf(stderr, "%sRegex: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col);

    *matched = extract(p, matches[0].rm_so, matches[0].rm_eo);
    cur_col += strlen(*matched);
    //cur_col +=  matches[0].rm_eo-matches[0].rm_so-1;

    if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; }
    if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col;
    p = prog_line[cur_line]+cur_col;

    if (debug >= 1) fprintf(stderr, "%sRegex \"%s\" matched \"%s\" leaving \"%s\"\n", sp(), regexs, *matched, p);
    /* Free memory allocated to the pattern buffer by regcomp() */
//  regfree(&regex);
    return OUT(TRUE);
  } else if (reti == REG_NOMATCH) {
    // No match
    /* Free memory allocated to the pattern buffer by regcomp() */
//  regfree(&regex); // maybe? -  http://redhat.com/archives/libvir-list/2013-September/msg00276.html 
    return OUT(FALSE);
  }

  free(emergency);
  regerror(reti, &regex, msgbuf, sizeof(msgbuf));
  fprintf(stderr, "parse: Regex match internal error: %s\n", msgbuf);
  fflush(stderr); fflush(stdout);
  exit(EXIT_FAILURE);
  return OUT(FALSE);
}

// Parse source file using these phrases from the grammar.  Advance the current position
// if there is a match - don't move it if there is not.

//\\ stringlit := "\"(\\\\.|[^\"])*\"";
int parse_stringlit(tuple *stringlit) {
  return lex_regex("\"(\\\\.|[^\"])*\"", &stringlit->matched); // This regex is why we need regcomp to use REG_EXTENDED.
  // Simpler regex below does not support \-escaped characters in a string.
  // return lex_regex("^\".*\"", &stringlit->matched);
}

//\\ castname := "[A-Z][A-Z]*[0-9][0-9]*";
int parse_castname(tuple *castname) {
  // SBT casts include some semantic content: [US]INT<bitsize>
  return lex_regex("^[A-Z][A-Z]*[0-9][0-9]*", &castname->matched);
}

//\\ ident := "[A-Za-z_][A-Za-z_0-9]*";
int parse_ident(tuple *ident) {
  ident->phraseno = IDENT; // override label, etc
  return lex_regex("^[A-Za-z_][A-Za-z_0-9]*", &ident->matched);
}

//\\ label := <ident>;
int parse_label(tuple *labelname) {
  return parse_ident(labelname);
}

//\\ <opt-index> :=
//\\   '[' <expr> ']',
//\\   ;
int parse_opt_index(tuple *opt_index) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  IN;
  if (lex_lit("[", NULL) && parse_expr(opt_index) && lex_lit("]", NULL)) {
    opt_index->phraseno = OPT_INDEX;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  opt_index->phraseno = NONE;
  return OUT(TRUE);
}

int parse_fn_call(tuple *fncall) {
  // should create an opt-param-list phrase...
  //\\   <fnname> '(' ')',
  //\\   <fnname> '(' <param-list> ')',
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *fnname = new_tuple(FNNAME), *param_list = new_tuple(PARAM_LIST);
  IN;
  fncall->phraseno = FNCALL;
  param_list->phrase[0] = NULL;
  if (parse_fnname(fnname) && lex_lit("(", NULL) && ((lex_lit(")", NULL)) || (parse_param_list(param_list) && lex_lit(")", NULL)))) {
    // TO DO: plug in procname and param_list fields here...
    if (param_list->phrase[0] == NULL) {
      fncall->phrase[1] = NULL;
    } else {
      fncall->phrase[1] = param_list;
    }
    fncall->phrase[0] = fnname;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  return OUT(FALSE);
}

// var can be a LVALUE (as opposed to 'val' which is an RVALUE, but can include a var)

//\\ var := 
//\\   <ident> <opt-index>;
int parse_var(tuple *var) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *ident = new_tuple(IDENT), *opt_index = new_tuple(OPT_INDEX);
  IN;
  if (parse_ident(ident) && parse_opt_index(opt_index)) {
    var->phraseno = VAR;
    var->phrase[0] = ident;
    var->phrase[1] = opt_index; // NONE if this is a simple variable, OPT_INDEX if an array element
    return OUT(TRUE);           // Simple variables are things like registers, CC etc - the only
  }                             // indexed element at the moment is memory[] ...
  cur_line = line; cur_col = col;
  return OUT(FALSE);
}

//     Could factor the ( ... ? ... : ... ) vs ( ... ) more efficiently...

// val is an RVALUE

//\\ val := 
//\\   '(' <expr> '?' <expr> : <expr> ')',
//\\   '(' <expr> ')',
//\\   <fncall>,
//\\   <var>,
//\\   <hex-constant>,
//\\   <positive-integer>;
int parse_val(tuple *val) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *fncall = new_tuple(FNCALL), *expr = new_tuple(EXPR),*expr1 = new_tuple(EXPR),*expr2 = new_tuple(EXPR);
  // to do: assign subtypes
  IN;

  if (lex_lit("(", NULL) && parse_expr(expr) && lex_lit("?", NULL) && parse_expr(expr1) && lex_lit(":", NULL) && parse_expr(expr2) && lex_lit(")", NULL)) {
    val->subphrase = VAL__COND_EXPR;
    val->phrase[0] = expr;
    val->phrase[1] = expr1;
    val->phrase[2] = expr2;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }

  if (lex_lit("(", NULL) && parse_expr(expr) && lex_lit(")", NULL)) {
    val->subphrase = VAL__BR_EXPR;
    val->phrase[0] = expr;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }

  if (parse_fn_call(val)
   || parse_var(val)
   || parse_hex_constant(val)
   || parse_positive_integer(val)) {
    return OUT(TRUE);
  }

  return OUT(FALSE);
}

//\\ opt-monop :=
//\\   '-', '~', '!'
//\\   ;
int parse_opt_monop(tuple *opt_monop) {
  IN;
  if (lex_lit("-", &opt_monop->matched)
   || lex_lit("~", &opt_monop->matched)
   || lex_lit("!", &opt_monop->matched)) {
    return OUT(TRUE);
  }
  opt_monop->matched = "";
  return OUT(TRUE);
}

// could modify grammar to handle multiple casts here.
// Currently redundant cast removal is a nasty hack in peephole() in the SBT.

//\\ typed-val :=
//\\   '(' <castname> ')' <opt-monop> <val>,
//\\   <opt-monop> <val>;
int parse_typed_var(tuple *typed_var) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *castname = new_tuple(CASTNAME), *opt_monop = new_tuple(OPT_MONOP), *val = new_tuple(VALUE);
  // could shorten this...
  IN;
  if (lex_lit("(", NULL) && parse_castname(castname) && lex_lit(")", NULL) && parse_opt_monop(opt_monop) && parse_val(val)) {
    typed_var->subphrase = TYPED_VAR__CAST;
    typed_var->phrase[0] = castname;
    typed_var->phrase[1] = opt_monop;
    typed_var->phrase[2] = val;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }

  if (parse_opt_monop(opt_monop) && parse_val(val)) {
    typed_var->subphrase = TYPED_VAR__DEFAULT;
    typed_var->phrase[0] = NULL;
    typed_var->phrase[1] = opt_monop;
    typed_var->phrase[2] = val;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }

  return OUT(FALSE);
}

//\\ binop :=
//\\   '>>', '<<', '+', '*', '/', '-', '&&', '||', '&', '|', '^', '<=', '<', '>=', '>', '!=', '==';

int parse_binop(tuple *binop) {
  IN;
  if (lex_lit(">>", &binop->matched)
   || lex_lit("<<", &binop->matched)
   || lex_lit("+", &binop->matched)
   || lex_lit("*", &binop->matched)
   || lex_lit("/", &binop->matched)
   || lex_lit("-", &binop->matched)
   || lex_lit("&&", &binop->matched)
   || lex_lit("||", &binop->matched)
   || lex_lit("&", &binop->matched)
   || lex_lit("|", &binop->matched)
   || lex_lit("^", &binop->matched)
   || lex_lit("<=", &binop->matched)
   || lex_lit("<", &binop->matched)
   || lex_lit(">=", &binop->matched)
   || lex_lit(">", &binop->matched)
   || lex_lit("!=", &binop->matched)
   || lex_lit("==", &binop->matched)) {
    return OUT(TRUE);
  }
  return OUT(FALSE);
}

//\\ opt-second_operand :=
//\\   <binop> <typed-val>,
//\\   ;
int parse_opt_second_operand(tuple *opt_second_operand) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *binop = new_tuple(BINOP), *typed_var = new_tuple(TYPED_VAR);
  IN;
  if (parse_binop(binop) && parse_typed_var(typed_var)) {
    opt_second_operand->subphrase = OPT_SECOND_OPERAND__PRESENT;
    opt_second_operand->phrase[0] = binop;
    opt_second_operand->phrase[1] = typed_var;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  opt_second_operand->subphrase = OPT_SECOND_OPERAND__ABSENT;
  opt_second_operand->phrase[0] = NULL;
  opt_second_operand->phrase[1] = NULL;
  return OUT(TRUE);
}

//\\ expr :=
//\\   <typed-val> <opt-second_operand>;
int parse_expr(tuple *expr) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *typed_var = new_tuple(TYPED_VAR), *opt_second_operand = new_tuple(OPT_SECOND_OPERAND);
  IN;
  if (parse_typed_var(typed_var) && parse_opt_second_operand(opt_second_operand)) {
    expr->phrase[0] = typed_var;
    expr->phrase[1] = opt_second_operand;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  return OUT(FALSE);
}

//\\ fnname := <ident>;
int parse_fnname(tuple *fnname) {
  return parse_ident(fnname);
}

//\\ procname := <ident>;
int parse_procname(tuple *procname) {
  return parse_ident(procname);
}

//\\ param-list := <expr> <rest-of-param-list>;
int parse_param_list(tuple *param_list) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *expr = new_tuple(EXPR), *rest_of_param_list = new_tuple(REST_OF_PARAM_LIST);
  IN;
  if (parse_expr(expr) && parse_rest_of_param_list(rest_of_param_list)) {
    param_list->phrase[0] = expr;
    param_list->phrase[1] = rest_of_param_list;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  return OUT(FALSE);
}

//\\ rest-of-param-list :=
//\\   ',' <expr> <rest-of-param-list>,
//\\   ;
int parse_rest_of_param_list(tuple *param_list) {
  int line = cur_line, col = cur_col; // save current parse position for backtracking...
  tuple *expr = new_tuple(EXPR), *rest_of_param_list = new_tuple(REST_OF_PARAM_LIST);
  IN;
  if (lex_lit(",", NULL) && parse_expr(expr) && parse_rest_of_param_list(rest_of_param_list)) {
    param_list->phrase[0] = expr;
    param_list->phrase[1] = rest_of_param_list;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }
  param_list->phrase[0] = NULL;
  param_list->phrase[1] = NULL;
  return OUT(TRUE);
}

//\\ ss := 
int parse_ss(tuple *ss) {

  auto int parse_comment(tuple *comment) {
    // Grammar should really have a \n after the comment, but due to the way \n is
    // handled, it works without one.
//\\   "//.*", '/**/';
    int line = cur_line, col = cur_col;
    IN;
    comment->subphrase = SS__COMMENT;
    comment->matched = "/**/";
    if (lex_regex("^//.*", &comment->matched) || lex_lit("/**/", NULL)) {
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
  auto int parse_bracketed_ss_list(tuple *bss) {
//\\   '{' <ss-list> '}',
    tuple *ssl = new_tuple(SS__BR_LIST); // INVALID! should create a top-level BR_LIST
    int line = cur_line, col = cur_col;
    IN;
    bss->subphrase = SS__BR_LIST;
    if (lex_lit("{", NULL) && parse_ss_list(ssl) && lex_lit("}", NULL)) {
      bss->phrase[0] = ssl;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
  auto int parse_case_label(tuple *caselab) {
//\\   "case" <selector> ':',
    tuple *sel = new_tuple(SELECTOR);
    int line = cur_line, col = cur_col;
    IN;
    caselab->subphrase = SS__CASELAB;
    if (lex_lit("case", NULL) && parse_selector(sel) && lex_lit(":", NULL)) {
      caselab->phrase[0] = sel;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }

  auto int parse_label_statement(tuple *lab_statement) {
//\\   <label> ':',
    tuple *lab = new_tuple(LABEL);
    int line = cur_line, col = cur_col;
    IN;
    lab_statement->subphrase = SS__LAB;
    if (parse_label(lab) && lex_lit(":", NULL)) {
      lab_statement->phrase[0] = lab;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
  auto int parse_assignment(tuple *assig) {
    // now handling &= in SBT code generator instead, so removing from here...
//\\   <var> '=' <expr> ';',
    tuple *var = new_tuple(VAR), *expr = new_tuple(EXPR);
    int line = cur_line, col = cur_col;
    IN;
    assig->subphrase = SS__ASSIG;
    if (parse_var(var) && lex_lit("=", NULL) && parse_expr(expr) && lex_lit(";", NULL)) {
      assig->phrase[0] = var;
      assig->phrase[1] = expr;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }

  auto int parse_procedure_call(tuple *proccall) {
//\\   <procname> '(' ')' ';',
//\\   <procname> '(' <param-list> ')' ';',
    tuple *procname = new_tuple(PROCNAME), *param_list = new_tuple(PARAM_LIST);
    int line = cur_line, col = cur_col;
    IN;
    param_list->phrase[0] = NULL;
    proccall->subphrase = SS__PROCCALL;
    if (parse_procname(procname) && lex_lit("(", NULL) && ((lex_lit(")", NULL)) || (parse_param_list(param_list) && lex_lit(")", NULL))) && lex_lit(";", NULL)) {
      proccall->phrase[0] = procname;
      if (param_list->phrase[0] != NULL) {
        proccall->phrase[1] = param_list;
      } else {
        proccall->phrase[1] = NULL;
      }
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }

  // this is really just a special case for mon(" ... "); - strings not used anywhere else in SBT output.
  auto int parse_string_procedure_call(tuple *proccall) {
//\\   <procname> '(' <stringlit> ')' ';',
    tuple *procname = new_tuple(PROCNAME), *stringlit = new_tuple(STRING_LITERAL);
    int line = cur_line, col = cur_col;
    IN;
    proccall->subphrase = SS__STRINGPROCCALL;
    if (parse_procname(procname) && lex_lit("(", NULL) && parse_stringlit(stringlit) && lex_lit(")", NULL) && lex_lit(";", NULL)) {
      // TO DO: plug in procname and param_list fields here...
      proccall->phrase[0] = procname;
      proccall->phrase[1] = stringlit;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
  auto int parse_if_statement(tuple *ifstat) {
    // changing SS to <bracketed-ss-list> as an experiment
//\\   "if" '(' <expr> ')' <SS>,
    tuple *expr = new_tuple(EXPR), *ss = new_tuple(SS);
    int line = cur_line, col = cur_col;
    IN;
    ifstat->subphrase = SS__IF;
    //if (lex_lit("if", NULL) && lex_lit("(", NULL) && parse_cond(cond) && lex_lit(")", NULL) && parse_ss(ss)) {
    if (lex_lit("if", NULL) && lex_lit("(", NULL) && parse_expr(expr) && lex_lit(")", NULL) && parse_bracketed_ss_list(ss)) {
      // TO DO: plug in cond and ss fields here...
      ifstat->phrase[0] = expr;
      ifstat->phrase[1] = ss;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
//\\   "goto" <label> ';',
  auto int parse_goto_statement(tuple *g) {
    tuple *lab = new_tuple(LABEL);
    int line = cur_line, col = cur_col;
    IN;
    g->subphrase = SS__GOTO;
    if (lex_lit("goto", NULL) && parse_label(lab) && lex_lit(";", NULL)) {
      g->phrase[0] = lab;
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }

//\\   "JUMP" ';',
  auto int parse_jump_statement(tuple *g) {
    int line = cur_line, col = cur_col;
    IN;
    g->subphrase = SS__JUMP;
    if (lex_lit("JUMP", NULL) && lex_lit(";", NULL)) {
      return OUT(TRUE);
    } else { cur_line = line; cur_col = col; }
    return OUT(FALSE);
  }
  
//\\   ';';
  auto int parse_null_statement(tuple *ns) {
    ns->subphrase = SS__NULL;
    return lex_lit(";", NULL);
  }
  
  IN;
  
  if (parse_comment(ss)
   || parse_bracketed_ss_list(ss)
   || parse_case_label(ss)
   || parse_label_statement(ss)
   || parse_if_statement(ss)
   || parse_goto_statement(ss)
   || parse_procedure_call(ss)
   || parse_string_procedure_call(ss)
   || parse_jump_statement(ss)
   || parse_assignment(ss)
   || parse_null_statement(ss)) {
    return OUT(TRUE);
  }

  return OUT(FALSE);
}


//\\ ss-list :=
//\\   <ss> <ss-list>,
//\\   ;

int parse_ss_list(tuple *ss_list) {
  tuple *ss = new_tuple(SS), *more_ss = new_tuple(SS_LIST);
  int line = cur_line, col = cur_col;
  IN;

  ss_list->phrase[0] = NULL;
  ss_list->phrase[1] = NULL;

  // a guard... these are the only ways an SS list ends other than in a syntax error...
  if (lex_lit("}", NULL) || (cur_line >= end_of_file)) {
    cur_line = line; cur_col = col;
    return OUT(TRUE); // null option
  }
  
  if (parse_ss(ss) && parse_ss_list(more_ss)) { // the 'more ss' may be empty
    ss_list->phrase[0] = ss;
    ss_list->phrase[1] = more_ss;
    return OUT(TRUE);
  } else { cur_line = line; cur_col = col; }

  // a syntax error. 'best_pos' will allow us to show the exact point of failure.
  return OUT(FALSE);
}

//\\ hex-constant := "0x[0-9A-Fa-f][0-9A-Fa-f]*[Uu]?";
int parse_hex_constant(tuple *hex) {
  hex->phraseno = HEX_CONSTANT;
  return lex_regex("^0x[0-9A-Fa-f][0-9A-Fa-f]*[Uu]?", &hex->matched); // regcomp needs REG_EXTENDED format option
  // ideally we need to store this as an integer as well, plus cast type information
}

//\\ positive-integer := "[0-9][0-9]*";
int parse_positive_integer(tuple *posint) {
  posint->phraseno = POSITIVE_INTEGER;
  return lex_regex("^[0-9][0-9]*", &posint->matched); // didn't add U or L as we never use it in SBT-generated code
  // negative numbers are built as an expression using monop '-'
}

//\\ integer :=
int parse_integer(tuple *num) {
  tuple *hex = new_tuple(HEX_CONSTANT), *posint = new_tuple(POSITIVE_INTEGER);
  IN;
  
//\\   <hex-constant>,
  if (parse_hex_constant(hex)) {
    num->subphrase = HEX_CONSTANT;
    num->phrase[0] = hex;
    return OUT(TRUE);
  }

//\\   <positive-integer>;
  if (parse_positive_integer(posint)) {
    num->subphrase = POSITIVE_INTEGER;
    num->phrase[0] = posint;
    return OUT(TRUE);
  }

  return OUT(FALSE);
}

//\\ selector := <integer>;
int parse_selector(tuple *sel) {
  return parse_integer(sel);
}

void p(tuple *t) {
  if (t == NULL) return;
  
  switch (t->phraseno) {
  case NONE: break;
  case SS__BR_LIST:
    p(t->phrase[0]);
    break;
  case SS:
    switch (t->subphrase) {
    case SS__COMMENT:
      if (strcmp(t->matched, "/**/") == 0) {
        fprintf(stdout, "  /**/");
      } else {
        fprintf(stdout, "  %s\n", t->matched);
      }
      break;
    case SS__BR_LIST:
      fprintf(stdout, "{\n");
      p(t->phrase[0]);
      fprintf(stdout, "  }\n");
      break;
    case SS__CASELAB:
      fprintf(stdout, "  case ");
      p(t->phrase[0]);
      fprintf(stdout, ":\n");
      break;
    case SS__LAB:
      p(t->phrase[0]);
      fprintf(stdout, ":\n");
      break;
    case SS__ASSIG:
      fprintf(stdout, "  ");
      p(t->phrase[0]); // var
      fprintf(stdout, " = ");
      p(t->phrase[1]); // expr
      fprintf(stdout, ";\n");
      break;
    case SS__STRINGPROCCALL:
      fprintf(stdout, "  ");
      p(t->phrase[0]);
      fprintf(stdout, "(");
      p(t->phrase[1]);
      fprintf(stdout, ");\n");
      break;
    case SS__PROCCALL:
      fprintf(stdout, "  ");
      p(t->phrase[0]);
      fprintf(stdout, "(");
      p(t->phrase[1]);
      fprintf(stdout, ");\n");
      break;
    case SS__IF:
      fprintf(stdout, "  if (");
      p(t->phrase[0]);
      fprintf(stdout, ") ");
      p(t->phrase[1]);
      break;
    case SS__GOTO:
      fprintf(stdout, "  goto ");
      p(t->phrase[0]);
      fprintf(stdout, ";\n");
      break;
    case SS__JUMP:
      fprintf(stdout, "  JUMP;\n");
      break;
    case SS__NULL:
      fprintf(stdout, "  ;\n");
      break;
    default:;
    }
    break;
  case SS_LIST:
    if (t->phrase[0]) {
      p(t->phrase[0]);
      p(t->phrase[1]);
    }
    break;
  case IDENT:
    fprintf(stdout, "%s", t->matched);
    break;
  case LABEL:
    p(t->phrase[0]);
    break;
  case VAR:
    p(t->phrase[0]); // ident
    p(t->phrase[1]); // opt-index
    break;
  case VALUE:
    switch (t->subphrase) {
    case VAL__COND_EXPR:
      fprintf(stdout, "(");
      p(t->phrase[0]);
      fprintf(stdout, " ? ");
      p(t->phrase[1]);
      fprintf(stdout, " : ");
      p(t->phrase[2]);
      fprintf(stdout, ")");
      break;
    case VAL__BR_EXPR:
      fprintf(stdout, "(");
      p(t->phrase[0]);
      fprintf(stdout, ")");
      break;
    default: ;
    }
    break;
  case OPT_INDEX:
    fprintf(stdout, "[");
    t->phraseno = EXPR;
    p(t); // shennanigans!
    fprintf(stdout, "]");
    break;
  case FNNAME: // never happens, intercepted by 'ident'
    fprintf(stdout, "<FNNAME>");
    break;
  case PROCNAME: // never happens, intercepted by 'ident'
    fprintf(stdout, "<PROCNAME>");
    break;
  case REST_OF_PARAM_LIST:
    if (t->phrase[0]) fprintf(stdout, ", ");
  case PARAM_LIST:
    p(t->phrase[0]); // EXPR
    if (t->phrase[1]) p(t->phrase[1]); // REST_OF_PARAM_LIST
    break;
  case EXPR:
    p(t->phrase[0]); // val
    p(t->phrase[1]); // opt-second_operand
    break;
  case FNCALL:
    p(t->phrase[0]); // fnname
    fprintf(stdout, "(");
    p(t->phrase[1]); // opt-param-list
    fprintf(stdout, ")");
    break;
  case TYPED_VAR:
    switch (t->subphrase) {
    case TYPED_VAR__CAST:
      fprintf(stdout, "(");
      p(t->phrase[0]);
      fprintf(stdout, ")");
      p(t->phrase[1]); // monop
      p(t->phrase[2]); // var
      break;
    case TYPED_VAR__DEFAULT:
      // phrase[0] is NULL
      p(t->phrase[1]); // monop
      p(t->phrase[2]); // var
      break;
    default:;
    }
    break;
  case OPT_SECOND_OPERAND:
    switch (t->subphrase) {
    case OPT_SECOND_OPERAND__PRESENT:
      p(t->phrase[0]); // binop
      p(t->phrase[1]); // rest of expr
      break;
    case OPT_SECOND_OPERAND__ABSENT:
      break;
    default:;
    }
    break;
  case OPT_MONOP:
    fprintf(stdout, "%s", t->matched);
    //p(t->phrase[0]); // monop. may be empty string.
    break;
  case CASTNAME:
    fprintf(stdout, "%s", t->matched);
    break;
  case BINOP:
    fprintf(stdout, "%s", t->matched);
    //p(t->phrase[0]); // binop string
    break;
  case SELECTOR:
    p(t->phrase[0]);
    break;
  case HEX_CONSTANT:
    fprintf(stdout, "%s", t->matched);
    break;
  case POSITIVE_INTEGER:
    fprintf(stdout, "%s", t->matched);
    break;
  case STRING_LITERAL:
    fprintf(stdout, "%s", t->matched);
    break;
  default:
    fprintf(stderr, "Unhandled switch parameter %d\n", t->phraseno);
  }
}

int main(int argc, char **argv) {
  tuple *program = new_tuple(SS_LIST);
  
  // If we run out of mallockable memory, error reporting can fail.
  // So free this on severe error to ensure enough memory to print
  // the error string and flush the I/O buffers.
  emergency = malloc(10*1024);
  
  if ((argc == 2) && (strcmp(argv[1], "-h") == 0)) {
    fprintf(stderr, "syntax: parse [-d] < file.c\n");
    exit(0);
  }
  
  if ((argc == 2) && (strcmp(argv[1], "-d") == 0)) debug = 1;
  
  preload_entire_source(stdin);
  
  if (parse_ss_list(program) && (cur_line+1 == end_of_file)) {
    fprintf(stderr, "%d lines of %d compiled.\n", cur_line+1, end_of_file);
    // output optimised program:
    p(program);
    exit(EXIT_SUCCESS);
  } else {
    int i;
    fprintf(stderr, "* Syntax error at line %d, col %d:\n\n", best_line+1, best_col+1); // people tend to number from 1 up
    fprintf(stderr, "%s\n", prog_line[best_line]);
    for (i = 0; i < best_col; i++) fprintf(stderr, " ");
    fprintf(stderr, "^\n");
    exit(EXIT_FAILURE);
  }

  return(1);
}