#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#ifndef FALSE
#define FALSE (0!=0)
#define TRUE (0==0)
#endif
/*
Takeon generates a grammar in gram[]. Phrases are numbered
sequentially from 512 upwards, with a lookup table from this
sequential numbering into gram indexes stored in phrase[].
We *could* make the phrase numbers sparse, and have only the
gram table without the phrase table - the only reason to do it
this way is so that the phrasename[] table is compact. If
we didn't do the indirection (thus saving the phrase[] table)
then the phrasename table would be as big as the gram table,
which is much worse. Of course, the downside is that we only
really need the phrasename table for diagnostics, so in a
production compiler it's true that we do have a slight
unnecessary overhead.
This implementation isn't too bad, but it could be improved
by better support for BIPs (eg automatic numbering). Need to
find a way to do some of the tricks I used to do in TACC:
guards, negated conditions, and parse-time code execution.
The latter can be done with BIPS.
The big thing to be done is to read the grammar that's
embedded in the source file directly rather than requiring
a pre-pass.
Finally, there are way too many pre-defined ranges here which
can all be done away with. And 16 max BIPS is ridiculous
if we start using BIPS for parse-time code.
Also a major improvement would be to walk each phrase down the
left hand path and detect any left-associative infinite loops
before it gets as far as being used in a compiler :-)
*/
static FILE *grammar;
#define OPTIONAL_PHRASE (1<<14)
#define NEGATED_PHRASE (1<<15)
#define MAX_GRAMMAR (1024*16)
#define MAX_PHRASES 1024
static int gram[MAX_GRAMMAR];
static int phrase[MAX_PHRASES];
static int lineno;
void fatal_(int line) {
fprintf(stderr, "* Syntax error at line %d (detected in %s, line %d)\n", lineno, __FILE__, line);
exit(1);
}
#define fatal() fatal_(__LINE__)
char *minus_to_ul(char *s)
{
static char temp[128];
char *t = temp;
for (;;) {
int c = *s++;
if (c == '-') c = '_';
*t++ = c;
if (c == '\0') return temp;
}
}
char *upto(int ends) {
char temp[128];
char *s = temp;
int count = 0;
for (;;) {
int c = fgetc(grammar);
if ((c == EOF) || ferror(grammar) || (count == 127)) fatal();
if (c == ends) return(strdup(temp));
*s++ = c; *s = '\0'; count++;
}
}
int nextch(void) {
int c = fgetc(grammar);
if ((c == EOF) || ferror(grammar)) fatal();
return c;
}
int nonspace(void) {
for (;;) {
int c = fgetc(grammar);
if ((c == EOF) || ferror(grammar)) fatal();
if (c == '\n') lineno++;
if (!isspace(c)) return(c);
}
}
#define MAX_BIPS 16 /* Surely more than enough? [undoubtedly I'll curse that later ;-)] */
static int nextbip = 0;
static int BIP[MAX_BIPS];
static char *phrasename[MAX_PHRASES+512];
static char *key[128];
static int nextfreekey = 0;
static int next_gram = 0;
static int pnp = 512;
void dump_tables(void) {
int i;
int nums_per_line = 16;
int keys_per_line = 4;
fprintf(stdout, "#define MAX_GRAMMAR %d\n", next_gram);
fprintf(stdout, "#define PHRASE_BASE %d\n", nextbip+512);
fprintf(stdout, "int gram[MAX_GRAMMAR] = {\n");
for (i = 0; i < next_gram; i++) {
fprintf(stdout, "%5d, ", gram[i]);
if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
}
if ((next_gram % nums_per_line) != 0) fprintf(stdout, "\n");
fprintf(stdout, "};\n\n");
fprintf(stdout, "#define MAX_KEYWORD %d\n", nextfreekey);
fprintf(stdout, "char *keyword[MAX_KEYWORD] = { // Keywords are based at 256\n ");
for (i = 0; i < nextfreekey; i++) {
fprintf(stdout, "\"%s\", ", key[i]);
if ((i+1) % keys_per_line == 0) fprintf(stdout, "\n ");
}
if ((nextfreekey % keys_per_line) != 0) fprintf(stdout, "\n");
fprintf(stdout, "};\n\n");
fprintf(stdout, "#define MAX_BIP %d\n", nextbip);
fprintf(stdout, "int BIP[MAX_BIP] = { // BIPs precede PHRASEs at 512 upwards\n");
for (i = 0; i < nextbip; i++) {
fprintf(stdout, "%2d, ", BIP[i]);
if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
}
if ((nextbip % nums_per_line) != 0) fprintf(stdout, "\n");
fprintf(stdout, "};\n");
fprintf(stdout, "\n#define MAX_PHRASE %d\n", pnp-512);
fprintf(stdout, "#if defined(DEBUG_PARSER) || defined(EXTENDED_ERROR_REPORTING)\n// FOR DEBUGGING ONLY\n");
fprintf(stdout, "char *phrasename[MAX_PHRASE] = { // Based at 512 upwards\n ");
for (i = 512; i < pnp; i++) {
fprintf(stdout, "\"%s\", ", phrasename[i]);
if ((i+1) % keys_per_line == 0) fprintf(stdout, "\n ");
}
if ((pnp % keys_per_line) != 0) fprintf(stdout, "\n");
fprintf(stdout, "};\n#endif /* DEBUG_PARSER */\n");
fprintf(stdout, "\nint phrase_start[MAX_PHRASE-MAX_BIP] = {\n");
for (i = 512+nextbip; i < pnp; i++) {
fprintf(stdout, "%5d, ", phrase[i-512]);
if ((i+1) % nums_per_line == 0) fprintf(stdout, "\n");
}
if ((pnp % nums_per_line) != 0) fprintf(stdout, "\n");
fprintf(stdout, "};\n\n");
for (i = 512; i < pnp; i++) {
fprintf(stdout, "#define P_%s %d\n", minus_to_ul(phrasename[i]), i);
if (i < 512+nextbip) fprintf(stdout, "#define B_%s %d\n\n", minus_to_ul(phrasename[i]), BIP[i-512]);
}
}
int keyword_code(char *keyword)
{
int i;
key[nextfreekey] = keyword;
// POOR IMPLEMENTATION for now. Quick & Dirty to get something working.
for (i = 0; i <= nextfreekey; i++) {
if (strcmp(keyword, key[i]) == 0) break;
}
if (i == nextfreekey) {
key[i] = strdup(key[i]); nextfreekey++;
}
return i+256; // 256..511 are for keywords
}
void takeon(int finalpass) {
int gp = 0; // grammar pointer
int sym, lastsym = ';';
char *def_name = NULL, *name = NULL, *string = NULL, *keyword = NULL;
int def_bip = FALSE, def_phrase = FALSE, def_type = 0, specialbit = 0;
int alt_count = 1, alt_count_index = 0;
int phrase_count = 0, phrase_count_index = 0;
int this_phrase_start = 0;
int indent_len = 0;
static int max_phrase = 0;
lineno = 1; // (re)init globals too.
next_gram = 0;
pnp = 512; // phrase name pointer
nextbip = pnp; // First BIPS, then phrases.
for (;;) {
switch (sym = nonspace()) {
case 'B': // BIP DEFINITION
case 'C': // PHRASE DEFINITION
case 'P': // PHRASE DEFINITION
def_type = sym;
def_phrase = TRUE; break;
case 'E': // END OF GRAMMAR
if (finalpass) dump_tables();
max_phrase = pnp;
return;
case '=': // start of a phrase alternative or a BIP defn
// name should be valid at this point
if (def_phrase && (def_type == 'B')) {
// expect a number and a ';'.
int digit = nonspace();
if (!isdigit(digit)) fatal();
// For now, single digit...
// DO SOMETHING HERE WITH "def_name" and "digit" TO DEFINE BIP
BIP[pnp-512] = digit-'0';
nextbip = pnp-512+1;
phrase[pnp-512] = 0;
phrasename[pnp++] = strdup(def_name);
if (nonspace() != ';') fatal();
def_phrase = FALSE;
} else if (def_phrase) { // MAY NEED ACTION FOR C vs P
// expect a phrase definition.
phrase[pnp-512] = next_gram;
phrasename[pnp++] = strdup(def_name);
def_phrase = FALSE;
alt_count_index = this_phrase_start = next_gram++; // Hole for number of alternatives
phrase_count_index = next_gram++; // hole for number of items in first alt.
alt_count = 1; phrase_count = 0;
} else {
fatal(); // missing definition
}
break;
case '<': // phrase def *or* instance within an alt.
if (def_phrase) {
name = upto('>');
def_name = name;
} else {
int i = nextch();
if (i == '?') {
specialbit = OPTIONAL_PHRASE;
// parse and assign but do not step the pointer past this location (NOTE/POP-POINTER)
} else if (i == '!') { // or use '~' or '\' ???
// parse & invert. return NULL on a failed parse, don't step the pointer, and continue parsing this alt.
specialbit = NEGATED_PHRASE;
} else {
ungetc(i, grammar);
specialbit = 0;
}
name = upto('>');
for (i = 512; i < max_phrase; i++) {if (strcmp(phrasename[i], name)==0) break;}
if (finalpass && (i >= max_phrase)) {fflush(stderr); fprintf(stderr, "\n* UNDEFINED P<%s>\n", name); fatal();}
phrase_count++; gram[next_gram++] = i | specialbit;
}
break;
case '\'': // string literal
string = upto('\'');
phrase_count += strlen(string); // Each char counts as a phrase, albeit a short one...
{ char *s = string; while (*s != '\0') gram[next_gram++] = *s++; }
break;
case '"': // keyword literal
keyword = upto('"');
phrase_count++;
gram[next_gram++] = keyword_code(keyword);
break;
case '|': // next alternative
case ',': // next alternative
gram[phrase_count_index] = phrase_count; phrase_count = 0;
phrase_count_index = next_gram++; // hole for number of items in first alt.
alt_count++;
break;
case ';': // end of alternatives
gram[phrase_count_index] = phrase_count;
// Can tell is last alt was null by checking lastsym == ',', should we ever need to know
gram[this_phrase_start] = alt_count;
gram[phrase_count_index] = phrase_count; phrase_count = 0;
def_phrase = FALSE;
free(def_name); def_name = NULL;
break;
case '#': // comment to end of line
{int c; for (;;) {c = fgetc(grammar); if ((c == EOF) || ferror(grammar)) fatal(); if (c == '\n') break;}}
lineno++;
break;
default:
fatal();
}
lastsym = sym;
}
}
int main(int argc, char **argv) {
int pass;
if (argc != 2) {
fprintf(stderr, "syntax: takeon compiler.g\n");
exit(1);
}
for (pass = 0; pass <= 1; pass++) {
grammar = fopen(argv[1], "r");
if (grammar == NULL) {
fprintf(stderr, "takeon: %s - %s\n", strerror(errno), argv[1]);
exit(errno);
}
takeon(pass); // build tables
fclose(grammar);
}
exit(0); return(1);
}