#include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <stdarg.h> #include <ctype.h> #include <assert.h> #include <time.h> // for profiling parser time used with clock() #ifndef FALSE #define FALSE (0!=0) #define TRUE (0==0) #endif #include "compile.h" int listing = 0; // no longer used, but keep because referred to in generated code by update_astcode.c #define NEGATED_PHRASE (1<<29) #define OPTIONAL_PHRASE (1<<30) #define DROP_SPECIAL_BITS (~(NEGATED_PHRASE|OPTIONAL_PHRASE)) // The style inherited from the Edinburgh parsers packs different styles of // object into sequential ranges in the grammar items. It might be a better // organisation to make those all into separate spaces all based at 0, with // the space being defined by a (high position) tag bit. For example keywords // might be under 1<<28, literals under 1<<27, and regexps under 1<<26 etc. // This wasn't feasible in the old compilers where ints had to fit in 16 bits // but with 32 bit ints we would have plenty of space for flag bits. #define MAX_PHRASES 1024 #define WHITESPACE (MAX_PHRASES-1) char *progname; // executable of compiler char *cur_source; // file being compiled //static char outname[256]; //FILE *outfile; #define EXTENDED_ERROR_REPORTING 1 // applies to parser.h ... #define WANT_COMMENT 1 // this area needs review: extern int *A; extern int cp; // code pointer. Has to be global state. extern int ap; // Analysis record pointer. // These turn on extra features in the grammar .h file if present. // Unfortunately if not present, turning these on here will cause errors. #define WANT_SEMANTIC_CODE 1 #define WANT_SEMANTIC_TABLE 1 #ifndef NDEBUG int CHECKIDX(int x, int arraysize, char *name, int line) { if ((x < 0) || (x >= arraysize)) { fprintf(stderr, "* Array bounds exceeded at line %d: %s[%d] is outside %s[%d]\n", line, name, x, name, arraysize); exit(4); } return x; } #else #define CHECKIDX(x, arraysize, name, line) (x) #endif #define FLEX(array,x,l) array[CHECKIDX(x,array##_arraysize,#array,l)] #define BOUNDS(array,x,lim,l) array[CHECKIDX(x,lim,#array,l)] #define gram(x) BOUNDS(gram, x, MAX_GRAMMAR, __LINE__) #define keyword(x) BOUNDS(keyword, x, MAX_KEYWORD, __LINE__) #define BIP(x) BOUNDS(BIP, x, MAX_BIP, __LINE__) #define phrasename(x) ((x) == AST__KW \ ? "AST__KW" \ : ( (x) == AST__LIT \ ? "AST__LIT" \ : BOUNDS(phrasename, (x), MAX_PHRASE, __LINE__) \ ) \ ) #define phrase_start(x) BOUNDS(phrase_start, x, MAX_PHRASE-MAX_BIP, __LINE__) int mktuple(int op, int alt, int count, int T[]); #include "parser.h" // GENERATED GRAMMAR FROM takeon #ifndef SEMANTIC_NAME #define SEMANTIC MAX_PHRASES //#define SEMANTIC (MAX_PHRASE+PHRASES_START+MAX_BIP+1) char *semantic_code[1]; int semantic(int phrase) { return FALSE; } #endif void indent (int n, FILE * f) { n = n*3; while (n > 1000) { fprintf (f, "@"); n -= 1000; } while (n > 100) { fprintf (f, "+"); n -= 100; } while (n > 50) { fprintf (f, "."); n -= 10; } fprintf (f, "%-*s", n, " "); } int bestparse = -1; // for error reporting. char **parse_stack = NULL; int parse_stack_arraysize = 0; #define parse_stack(x) FLEX(parse_stack, x, __LINE__) char **bestparse_stack = NULL; int bestparse_stack_arraysize = 0; #define bestparse_stack(x) FLEX(bestparse_stack, x, __LINE__) int bestparse_depth = -1; char looking_for[1024] = { '\0' }; // 'while looking for <PHRASENAME>' (or literal) ... // On error reporting: compilers built with this parser must report the first // error that occurs, but need not try to report subsequent failures. // Errors will be reported in a form that Emacs supports so that you can // compile from within Emacs with a single keystroke, and have emacs immediately // take you to the line and column of the error. // for examples such as Imp variables with spaces in them - the stored token in the stringpool // is the canonical form, but the text between token_start and token_end markers is the literal // form with embedded whitespace (not to be confused with leading whitespace) // Note that Imp80 explicitly forbids {} comments within a token (though Imp77 doesn't // seem to mind) - this may become an issue when/if we move to using regular expressions // to match source text against grammar rules. // We use a stringpool, and strings are indexes into this pool. This // is useful for the same reasons that the AST is an array indexed // by integers rather than a struct with pointers. It may also // save space by reusing common strings. And we get a free tag // to describe strings that would let us compare strings just with // an integer tag comparison, if we ever wanted to. static void *makespace_ (void *c, int nextfree, int *arraysize, int objsize, char *arrayname, char *file, int line) { #define MINSIZE 1 // *arraysize is the upper bound index of the array, i.e. 1 less than the actual number of elements. if (c == NULL) { if ((c = calloc (MINSIZE, objsize)) == NULL) { fprintf (stderr, "makespace: calloc(%d, %d)=NULL - %s\n", MINSIZE, objsize, strerror (errno)); exit (errno); } *arraysize = MINSIZE - 1; } if ((*arraysize) > nextfree) return c; // *arraysize is the space allocated, nextfree is the space that will used soon. // so we want to make sure that *arraysize >= nextfree *arraysize += 1; // convert from high index to actual size while ((*arraysize) <= nextfree) { // probably should be '<' but doing so allows an array size of 0 which crashes. come back to this. *arraysize = ((*arraysize) * 2); } *arraysize = ((*arraysize) * 2); // Well... this tweak seems to have fixed my problems with SIGSEGV which I think was a boundary condition. // we do seem to have a problem where we sometimes realloc the exact same size as it was before! c = (void *) realloc (c, (*arraysize) * objsize); // 0:arraysize, inclusive. eg 0:15, 0:127 etc if (c == NULL) { fprintf (stderr, "makespace: realloc(%d * %d)=NULL - %s\n", (*arraysize), objsize, strerror (errno)); exit (errno); } else { //fprintf (stderr, "makespace: realloc(%d * %d) = %d @ %p from \"%s\", line %d\n", (*arraysize), objsize, objsize*(*arraysize), c, file, line); } *arraysize -= 1; // restore size back down to upper bound index return c; } #define makespace(c, nextfree) c = (typeof(c))makespace_(c, nextfree, &c##_arraysize, sizeof(c[0]), #c, __FILE__, __LINE__) char *stringpool; int stringpool_arraysize = 0; int stringpool_nextfree = 0; char *BADSTR="BAD_STR_IDX"; #define stringpool(x) ((x)<0?BADSTR[0]:FLEX(stringpool, x, __LINE__)) #define rawstringpool(x) FLEX(stringpool, x, __LINE__) #define String(i) &rawstringpool(c(i).sIndex) #ifdef NEVER static char *debug_escape(char *s, int ch); #endif int str_to_pool (char *s) { int tag; // assert(stringpool_nextfree <= stringpool_arraysize); makespace(stringpool, stringpool_nextfree+strlen(s)+2); strcpy (&rawstringpool(stringpool_nextfree), s); /* Create a backstop for when not found */ for (tag = 0; tag <= stringpool_nextfree; tag++) { if (strcmp (&rawstringpool(tag), s) == 0) { if (tag == stringpool_nextfree) { stringpool_nextfree += strlen (s) + 1; /* Not found, add it */ } return tag; /* found, one way or another */ } } /* NOT REACHED */ } #ifndef sourceinfo_defined #define sourceinfo_defined 1 /* C[] is the source character token stream of sourceinfo items*/ typedef struct sourceinfo { // ATOMS for processed input stream //char *s; // The canonical representation of this source object // is a string, which has now been moved to the stringpool: int sIndex; // string contents int start_sp, lineno_sp, col_sp; int start_tok, lineno_tok, col_tok; int end_tok, endcol_tok; // trailing white space must be viewed as leading white space before EOF token. // starting 2 ptrs are inclusive, end ptr is exclusive. // The obscure case where there is whitespace at the end of an include file followed // by whitespace before a token in the file above will be handled by a sourceinfo // record where start_sp != start_tok, but start_tok == endcol_tok, ie "" // If tok is multi-line eg a string with embedded newlines, lineno_tok is the line // number of the start of the token. // Single tokens formed by token pasting are problematic and I think for now // will be dealt with by the grammar explicitly constructing a string from its parts // Pre-processor issues are a whole 'nother kettle of fish... int t; // type - tag, "string", 'charconst', or char, so far char *f; // source or includefile name } sourceinfo; #endif sourceinfo *c = NULL; int c_nextfree = 0, c_arraysize = 0; #include "lex.h" int ONDEMAND(int x, char *name, int line) { // combination of 'makespace' and line_reconstruction! if (x < 0) { fprintf(stderr, "* Negative array index at line %d: %s[%d]\n", line, name, x); exit(4); } while ((x >= c_nextfree) && ((c_nextfree == 0) || c[c_nextfree-1].t != B_EOF)) { if (x >= c_arraysize) { fprintf(stderr, "* Array bounds exceeded at line %d: %s[%d] is outside %s[%d] *AND MAKESPACE SHOULD HAVE MADE ROOM ALREADY*\n", line, name, x, name, c_arraysize); exit(4); } line_reconstruction(); // SHOULD update c_nextfree ... if (x < c_nextfree) break; } if (x >= c_arraysize) { fprintf(stderr, "* Array bounds exceeded at line %d: %s[%d] is outside %s[%d] *AND MAKESPACE SHOULD HAVE MADE ROOM ALREADY*\n", line, name, x, name, c_arraysize); exit(4); } return x; } #define CFLEX(x,l) c[ONDEMAND(x,"c",l)] // ONDEMAND must *NOT* call Makespace. base of c has already been evaluated. // Could recode this so that ONDEMAND is outside c[] but will only work on RHS // Unless we return the (possibly updated) *address* of the c element, and // have the c() macro perform the indirection. Which, having spelled it out // does seem like a rather attractive solution! Would simplify line_reconstruction() #define c(x) CFLEX(x, __LINE__) #include "regexp-lexer.c" /* A() is the Analysis record. Contents point to a C[] structure element when the item is a BIP, otherwise the format is [phraseno] [altno] [no-of-subphrases] [subphrases and/or BIPs...] for example, if A(25) contained 10, that would mean that the token for A(25) was stored in C[10].] (And the C string would be at &stringpool[c(A(25))]?) */ int *A = NULL; /* Flex array, expanded using 'makespace' */ int next_free_a = 0, A_arraysize = 0; #define A(x) FLEX(A, x, __LINE__) int *AST; int AST_arraysize; int AST_nextfree = 0; #define AST(x) FLEX(AST, x, __LINE__) //////////////////////////////////////////////////////////////////////////// int mktuple(int op, int alt, int count, int T[]) { /* Create a tuple for an n-ary operator. Pre-generated code currently just mirrors the Analysis record. */ /* Hidden fields that belong to *every* tuple can be added here. For example a link to the start/end of the correponding part of the analysis record, to recover {...} comments, or line numbers, or type information for expression evaluation. Too many of these will of course make excessively heavy demands on memory space - a better solution would be to add the extra fields only to those types that require them, but this is likely to be an extremely difficult restructuring at this late stage in the program design, so for now I'll just live with the memory overhead. Specifically, what prompted these thoughts was the likelihood that I need to add type information to every object in an expression or procedure call. So I'll add the fields but not modify the code, and run my regression test suite to see if everything still compiles. If not too much harm has been done, then I'll go on to add the type information to the new fields. */ int trip = -1; int parm; // OK, this is bad :-( If I allocate one extra int to a mktuple() call, even if I don't write to it, three // of the regression suite programs crash. Adding the extra field should not affect any of the code. // Unless the issue is running out of absolute ram, I can't think of any mechanism that would affect // a legal program this way. So it's ether that, or there is some other illegal behaviour already in // the code that is being brought out by this change. // It looks like 2 or more extra words are OK but 1 extra word causes these crashes. // But adding an extra field (before the data area) *and* a extra word (at the end) worked ok too // so although it may be a problem with some code accessing outside its boundaries, it's not // obvious how. #define EXTRA_UNUSED_SPACE 1 makespace (AST, AST_nextfree+AST_child1_offset+count+1 +EXTRA_UNUSED_SPACE /* TESTING ALLOCATION OF EXTRA SPACE WITHOUT ADJUSTING OFFSETS */); trip = AST_nextfree; AST(AST_nextfree++) = 0; // reserved field AST(AST_nextfree++) = op; // AST_op_offset AST(AST_nextfree++) = alt; // AST_alt_offset AST(AST_nextfree++) = count; // AST_count_offset // short term hack if any new fields are not needed here: // while (AST_nextfree - trip != AST_child1_offset) AST(AST_nextfree++) = 0; AST(AST_nextfree++) = 0; // source line where tuple was created AST(AST_nextfree++) = 0; // newly-added type information field if (AST_nextfree - trip != AST_child1_offset) { fprintf(stderr, "Did you add new fields to the AST structures and forget to update parser.c?\n"); // (also check the calls to AST_nextfree++ above!) exit(1); } for (parm = 0; parm < count; parm++) { AST(AST_nextfree++) = T[parm+1]; } AST_nextfree = AST_nextfree +EXTRA_UNUSED_SPACE /* TESTING ALLOCATION OF EXTRA SPACE WITHOUT ADJUSTING OFFSETS */; return trip; } #ifdef NEVER // replace NEVER with whatever is tested at the point this is used, next time // something is turned on and it is needed again. int debug_escapech(char *dest, int ch, int quotetype) { const char *hexdigit = "0123456789ABCDEF"; // This is *only* for display in debugging output. // Although it may look a bit like C's escapes, it is *not* suitable for use with any specific language. /* TO DO: Character Meaning '\a' alert (BEL) '\b' backspace '\f' form feed '\n' newline '\t' horizontal tab '\r' carriage return '\v' vertical tab '\\' backslash '\'' single quote '\"' double quote '\?' Question mark Also ??? sequences in strings. */ char *t = dest; if (ch == '\n') { *t++ = '\\'; *t++ = 'n'; } else if (ch == '\t') { *t++ = '\\'; *t++ = 't'; } else if (ch == '\r') { *t++ = '\\'; *t++ = 'r'; } else if (ch == '\\') { *t++ = '\\'; *t++ = '\\'; } else if (ch == quotetype) { *t++ = '\\'; *t++ = quotetype; } else if ( ((int)(ch&255) < 32) || ((int)(ch&255) > 126) ) { *t++ = '\\'; *t++ = 'x'; // Note. \xFF, *not* \0xFF ! *t++ = hexdigit[(ch>>4)&15]; *t++ = hexdigit[ch&15]; } else { *t++ = ch; } *t = '\0'; return t-dest; } static char *debug_escape(char *s, int quotetype) { // assert we NEVER call escape on a genuinely empty string - if s[0] == '\0' then it is a '\0' character // \0 is only supported in the single character constant '\0', not in strings. static char escaped[512]; char *t = escaped; *t = '\0'; if (s == NULL) return escaped; if ((*s == '\0') && (quotetype == '\'')) { *t++ = '\\'; *t++ = '0'; s++; *t = '\0'; } else { while (*s != '\0') { t += debug_escapech(t, *s++, quotetype); if (t-escaped >= 510) { break; // avoid code duplication } } } if (t-escaped >= 510) { fprintf(stderr, "WARNING! Escaped string is too large! (over %ld chars): \"%s\"...\n", (long)(t-escaped), escaped); // ideally we should truncate at the first \n (if there is one), issue an error message // for that line, and continue parsing from there after backing up to an appropriate value of 'cp'... exit(1); } return escaped; } #endif // THIS DOES *NOT* PARSE A BIP! It converts a BIP from the A-record to the AST. // This assumes that the next item in the c[] array is the BIP data and makes it into an AST entry. // I could have called this mkbip or bip2ast perhaps... int bip(int B, int aptr) { int data[3]; data[1] = B; data[2] = aptr; return mktuple(B, 1, 2, data); } int lit(int AST_op, int ch, int ap) { int data[3]; data[1] = ch; // actual character data[2] = ap; return mktuple(AST_op, 1, 2, data); } // User's grammar file has to supply a kw() appropriate to the // keyword style - whether reserved words or some style of stropping //////////////////////////////////////////////////////////////////////////// /* variables used by line-reconstruction (lexer) */ FILE *sourcefile; char *curfile; // This is not a flex array but it *does* need array bounds checking // Reason being that eventually the source file may be store-mapped // rather than loaded using C's I/O. char *source_address = NULL; int source_length = 0; #define source_address(x) BOUNDS(source_address, x, source_length+1, __LINE__) // Creating pointers to each line can help with attaching error messages // to the source, especially if the error is discovered in the AST after // the whole program has been parsed. static int *source_line; static int source_line_arraysize = 0; #define source_line(x) FLEX(source_line, x, __LINE__) static int source_lines = 0; // After finding the line count, I'm considering adding a parallel array of char // to be used as markers in the list file for continued lines and strings that // extend to a second line. (I think the continuation characters that Imp used // to use were '"' and '&'...) void find_lines(void) { int line = 0; int i, p = 0; int lastch = '\n'; if (debug_phases) fprintf(stderr, "Locate line starts:\n"); makespace (source_line, line+1); source_line(line++) = -1; // lines start at 1. for (;;) { if (lastch == '\n') { makespace (source_line, line+1); source_line(line++) = p; } if (p == source_length) break; lastch = source_address(p++); } source_lines = line-1; } // sets globals 'source_address', 'source_length', 'source_lines' and pointer array 'source_line[]' void memory_map_file(char *filename) { // for now, just read the file in. // However option remains to connect the file in virtual memory... FILE *f = fopen(filename, "rb"); int i; long length; char *buffer; if (debug_phases) fprintf(stderr, "Load source:\n"); i = fseek(f, 0L, SEEK_END); length = ftell(f); i = fseek(f, 0L, SEEK_SET); buffer = malloc(length+2); if (buffer == NULL) { fprintf(stderr, "* Failure: buffer = malloc(%ld)=NULL\n", length+1); exit(EXIT_FAILURE); } for (i = 0; i < length; i++) buffer[i] = fgetc(f); buffer[length] = '\0'; buffer[length+1] = '\0'; source_address = buffer; source_length = length; // an extra one for 'peek' lookahead makespace (c, source_length+2); // overkill, because more chars than tokens. find_lines(); } int startline = TRUE; // startline is true before hitting the first non-whitespace character in a line. // In C used mainly for # commands immediately after a newline. // need to distinguish 'start statement' which can follow a ';' as well as the newline // which will be needed for comments in Imp... int whitespace_lineno = 1; int whitespace_col = 0; int whitespace_pos = 0; int token_lineno = 1; int token_col = 0; int token_pos = 0; int nextch_lineno = 1; // under development. sorting out the problem with xfgetc/xungetc int nextch_col = 0; int nextch_pos = 0; int ch, peek; // peek is always the *immediate* next char, no skipping of spaces void stores (char *s, // STORE STRING int toklineno, int tokcol, int tokpos, int type, char *fname, int line) { int i, tag; tag = str_to_pool (s); // VERY BAD if c is extended on the fly by makespace if stores() is being called // from inside line_reconstruction() which in turn has been called on the fly // from ONDEMAND in order to pull in more source code! // But see the note next to declaration of 'c' for a possible fix. // Otherwise it would be easy to call "makespace (c, c_nextfree+1);" here. c[c_nextfree].sIndex = tag; // index of string in stringpool - NOT a pointer to a string because stringpool can be reallocked. c[c_nextfree].lineno_sp = whitespace_lineno; c[c_nextfree].col_sp = whitespace_col; c[c_nextfree].start_sp = whitespace_pos; c[c_nextfree].lineno_tok = toklineno; c[c_nextfree].col_tok = tokcol; c[c_nextfree].start_tok = tokpos; c[c_nextfree].endcol_tok = nextch_col; c[c_nextfree].end_tok = nextch_pos; // may need to pass these in if already stepped past the end of the token // but for now we'll rely on xungetc to have stepped back appropriately so that the next character to be read // is the first character immediately following this token c[c_nextfree].f = fname; c[c_nextfree].t = type; whitespace_lineno = nextch_lineno; whitespace_col = nextch_col; // ready for next token. whitespace_pos = nextch_pos; // NOT SAFE TO CALL HERE ANY MORE! "makespace(c, c_nextfree+2);" c_nextfree++; } void storec (int ch, int toklineno, int tokcol, int tokpos, int type, char *fname, int line) { // STORE CHARACTER char str[2]; str[0] = ch&255; str[1] = '\0'; // convert char to 1-char string before saving. stores (str, toklineno, tokcol, tokpos, type, fname, line); } #define stores(s, toklineno, tokcol, tokpos, type, fname) stores(s, toklineno, tokcol, tokpos, type, fname, __LINE__) #define storec(ch, toklineno, tokcol, tokpos, type, fname) storec(ch, toklineno, tokcol, tokpos, type, fname, __LINE__) #include "lex.c" /* Parsing. (Table driven top-down recursive-descent parser) */ /* The parser used here is based on a design by Tony Brooker which was originally used in Atlas Autocode and the "Compiler Compiler". It generates a concrete syntax tree (also known as an "analysis record" or "AREC") rather than the abstract syntax tree more popular in modern compilers. A later phase converts from concrete to abstract. Note that the parsing procedure here is just a piece of code to walk a pre-built table. There is nothing in this section which reflects the grammar, if that is what you are looking for. You'll find the grammar embedded in both the <lang>.c source file and in the 'compile()' procedure in the compile-<lang>.c file. */ int cp = 0; // code pointer. Has to be global state. int ap = 0; // Analysis record pointer. // Parsing inputs line-reconstructed text from the c[] array rather // than reading text directly. // 'cp' is the pointer to the current position in the c[] array. // cp used to be pre-built by running line_reconstruct() on the entire // source file. Now it reads the source file on demand as it is parsed // and converts text to tokens only when needed. This allows the // line reconstruction to be modified by the parser - perhaps an // unclean mixing of levels, but as any compiler writer will know, // it can also be damned helpful! // for profiling how much of the cpu time spent parsing was wasted // due to backtracking. Experimental. static long long int goodtime = 0; static long long int badtime = 0; int parse (int pp, int cpx, int linenox, int depth) // depth is only for indentation in diags { /* Main parsing procedure. This is a table-driven parser, with the tables being generated from the grammar rules embedded in <lang>.c */ int saved_cp, saved_ap, i, gp, alts, match, where=0; int optional = pp&OPTIONAL_PHRASE, negated = pp&NEGATED_PHRASE; // WARNING: large stack objects in this procedure may cause the program to run out of RAM // when given a large source file to compile. Unfortunately this is not easily detectable // like a malloc failure would be, and all we see is a SIGSEGV signal being thrown. I have // already restructured the data here to avoid or at least minimise this problem in future. // this could be converted to a flex array: //#define MAX_DESC 1024 // static char saved_desc[(MAX_DESC*3)/2]; // had better not be on the stack! // There is some code in here to handle three styles of parsing; one where whitespace is removed // during line reconstruction, and another where it is parsed explicity - with a third option // being that <sp> phrases are inserted automatically by the 'takeon' program before each // token. (<sp> being a rule to skip spaces) The choice of method depends entirely on // the language being parsed and whether there are places where spaces are not allowed between // specific tokens - for instance a label "fred:" which is an identifier followed by ':' with // no spaces allowed between them. The two main test languages - Imp and C - both seem to // be working OK with the same whitespace option. if (pp == WHITESPACE) { fprintf(stderr, "Should never get here. Trying to parse <%ssp> as an actual phrase rule.\n", optional?"?":negated?"!":""); exit(1); } if (pp & (OPTIONAL_PHRASE|NEGATED_PHRASE)) { fprintf(stderr, "Should never get here. Trying to parse <%s%s> as an actual phrase rule.\n", optional?"?":negated?"!":"", phrasename((pp&DROP_SPECIAL_BITS) - PHRASES_START)); exit(4); } makespace (parse_stack, depth+1); parse_stack(depth) = phrasename(pp - PHRASES_START); /* Initialisation */ gp = phrase_start(pp - PHRASES_START - MAX_BIP); alts = gram(gp); gp++; // gp now points to first element (length) of first alt saved_cp = cp; // save current input pointer for backtracking if an alt fails saved_ap = ap; for (i = 0; i < alts; i++) { /* Starting with the root phrase, recursively examine each alternative */ int each, phrases = gram(gp++), phrase_count, gap = 0; cp = saved_cp; // backtrack to where input pointer was when we started parsing the previous alt. ap = saved_ap; if (ap + 3 > next_free_a) next_free_a = ap + 3; makespace (A, next_free_a+32); // grab plenty of extra space rather than make lots of small calls. A(ap++) = pp; // record which phrase (could be done outside loop) A(ap++) = i; // and which alt. // Count slots needed. *Could* be precalculated and stored // in the grammar, either embedded (after the ALT) or as a // separate table: for (each = 0; each < phrases; each++) if (((gram(gp + each)&DROP_SPECIAL_BITS) >= PHRASES_START) && ((gram(gp + each)&DROP_SPECIAL_BITS) < SEMANTIC)) gap++; A(ap++) = gap; // Count of alts (gap) // ap+gap now points to the slot after the space required, which // is where the first subphrase will be stored: ap = ap + gap; // recursive subphrases are stored after this phrase. // ap is left updated if successful. // successfully parsed phrases are stored in A(saved_ap+3+n) if (saved_ap + 3 + gap > next_free_a) next_free_a = saved_ap + 3 + gap; makespace (A, next_free_a+32); match = TRUE; // stays true if all subphrases match phrase_count = 0; // only phrases which make it into the A record, // i.e. excluding literals and keywords // (but does include guards and negated phrases) for (each = 0; each < phrases; each++) { /* Within a single grammar rule (alternative), ensure that each subphrase is present */ int phrase = gram(gp + each)&DROP_SPECIAL_BITS, negated = gram(gp + each)&NEGATED_PHRASE, optional = gram(gp + each)&OPTIONAL_PHRASE; if (cp >= bestparse) { static char s[256]; int i; makespace (bestparse_stack, depth+2); for (i = 0; i <= depth; i++) bestparse_stack(i) = parse_stack(i); if (phrase < 256) { sprintf (s, "<%s%s> '%c'", optional?"?":negated?"!":"", phrasename(pp - PHRASES_START), phrase); } else if (phrase < PHRASES_START) { sprintf (s, "<%s%s> \"%s\"", optional?"?":negated?"!":"", phrasename(pp - PHRASES_START), keyword(phrase - 256)); } else if (phrase < PHRASES_START+MAX_BIP) { char *s1 = optional?"?":negated?"!":""; char *s2 = phrasename(pp - PHRASES_START); char *s3 = phrasename(phrase - PHRASES_START); if (strlen(s1)+strlen(s2)+strlen(s3)+6 >= 255) { fprintf(stderr, "* String 's' too long in parse()\n"); exit(EXIT_FAILURE); } sprintf (s, "<%s%s> {%s}", s1, s2, s3);// <------------------ } else if ((phrase) < SEMANTIC) { sprintf (s, "<%s%s> <%s>", optional?"?":negated?"!":"", phrasename(pp - PHRASES_START), phrase==WHITESPACE ? "sp" : phrasename(phrase - PHRASES_START)); } else { // SEMANTIC PHRASE sprintf (s, "<%s%s> {%s}", optional?"?":negated?"!":"", phrasename(pp - PHRASES_START), semantic_code[phrase-SEMANTIC]); } if (strlen(s) >= 255) { fprintf(stderr, "* String 's' too long in parse()\n"); exit(EXIT_FAILURE); } if (cp > bestparse) strcpy(looking_for, s); bestparse = cp; bestparse_depth = depth; #undef s } if (phrase < 256) { where = ap; // next phrase to be parsed will be stored at current 'ap'. /* Literal */ makespace(c, cp+1); if ((c(cp).t != B_char) || (*String(cp) != phrase)) { match = FALSE; } else { // an earlier version did not enter literals into the A record. A(ap++) = phrase; A(ap++) = 1; A(ap++) = 1; A(ap++) = cp++; A(saved_ap + 3 + phrase_count++) = where; // Record literal } } else if (phrase < PHRASES_START) { /* Keyword */ where = ap; // next phrase to be parsed will be stored at current 'ap'. #ifdef STROPPED_KEYWORDS // instead of saving whole keywords, the imp line reconstruction has // saved the individual characters of keywords as type B_stropped. { int saved_cp = cp; char *key = keyword(phrase - 256); match = TRUE; while (*key != '\0') { //fprintf(stderr, "\nKeyword: %c vs %c @ %d (t=%d == %d)\n", *key, stringpool[c[cp].sIndex], c[cp].sIndex, c[cp].t, B_stropped); makespace(c, cp+1); if (!((c(cp).t == B_stropped) && (c(cp).sIndex >= 0) && (stringpool(c(cp).sIndex) == *key))) { match = FALSE; break; } cp += 1; key += 1; } if (match) { A(ap++) = phrase; A(ap++) = 1; A(ap++) = 1; A(ap++) = saved_cp; // char, not string, so this will need tweaking... A(saved_ap + 3 + phrase_count++) = where; // Record keyword } else { cp = saved_cp; } } #else // whole keywords. makespace(c, cp+1); if ((c(cp).sIndex == -1) || (strcmp (keyword(phrase - 256), String(cp)) != 0)) { match = FALSE; } else { // enter keywords into the A record for source reconstruction A(ap++) = phrase; A(ap++) = 1; A(ap++) = 1; A(ap++) = cp++; A(saved_ap + 3 + phrase_count++) = where; // Record keyword } #endif } else if (phrase < (PHRASES_START+MAX_BIP)) { /* Built-in phrase */ where = ap; // next phrase to be parsed will be stored at current 'ap'. makespace(c, cp+1); if (c(cp).t != BIP(phrase - PHRASES_START)) { match = FALSE; } else { A(ap++) = phrase; A(ap++) = 1; A(ap++) = 1; A(ap++) = cp++; A(saved_ap + 3 + phrase_count++) = where; // Record BIP } } else if (phrase==WHITESPACE) { makespace(c, cp+1); while ((c(cp).t == B_char) && (String(cp) != NULL) && (isspace(*String(cp)))) { cp += 1; makespace(c, cp+1); } match = TRUE; } else if (phrase < SEMANTIC) { // REGULAR PHRASE /* Recursive call to parser for a subphrase */ int here = cp; // we don't advance cp if an optional alt. where = ap; // next phrase to be parsed will be stored at current 'ap'. // for now we only apply ? or ! to full P<phrases> ... in principle keywords and BIPs could also // take ? or !, but if we add that, need to find *all* the places the change would affect. makespace(c, cp+1); // I'm not entirely sure why gprof only sees 1 call to 'parse()' :-/ { long long int timebefore, timeafter; timebefore = (long long int) clock(); match = parse (phrase, cp, c(cp).lineno_sp, depth + 1); // options have been removed from recursive call. COULD USE 'match' HERE! <-------------------------- timeafter = (long long int) clock(); if (negated) match = !match; if (match) { goodtime += (timeafter-timebefore); if (negated) { // there was no subparse data ap = where; // backtrack over anything that might have been added by negated phrase // let's create a dummy analysis record entry for the unsuccessful parse A(ap++) = phrase|negated; A(ap++) = 0; // selected alternative - there was none A(ap++) = 0; // number of terms in successful alternative // A(ap++) = 0; // this is usually cp++ for the matched text. There can be no matched text so pointer is meaningless. // but should the slot be filled in with a dummy anyway, just to make things appear at compatible offsets? A(saved_ap + 3 + phrase_count++) = where; // Record BIP (not sure if phrase_count is correct here?) } else { if (optional) A(where) |= OPTIONAL_PHRASE; // should there be an assignment here of phrase/altno/#terms/cp ??? A(saved_ap + 3 + phrase_count++) = where; // we want the result even if it is a guard phrase. } if (negated || optional) cp = here; // don't advance text pointer for guard phrase. However we do store the parse data. match = TRUE; } else { badtime += (timeafter-timebefore); match = FALSE; } } } else { // SEMANTIC PHRASE // we shouldn't need to store anything for a successful parse, // and just use it to accept or reject an alternative, *BUT* we // *do* need to store an empty match because the source reconstruction // got all messed up otherwise. It may be fixable there but at the // cost of more complex code. Doing it here once is simpler. // I'll initialise these before we know if it succeeded or not, // because that way the semantic code could poke values in here // if it wanted to and not have them immediately overwritten! A(ap ) = phrase; A(ap+1) = 0; // alt A(ap+2) = 0; // count A(ap+3) = 0; // unused (this is where matched text would go) A(saved_ap + 3 + phrase_count++) = where; // Record BIP if (semantic(phrase-SEMANTIC)) { match = TRUE; ap += 4; } else { match = FALSE; } } if (!match) break; } gp += phrases; // move over all phrases, to next alt // gp now points to first element (length) of next alt, or start of next phrase if (match) break; } return (match); } int build_ast_inner(int ap, int depth, int caller) { #define build_ast(ap, depth) build_ast_inner(ap, depth, __LINE__) int T[100]; int saved_ap; int phrase; // A(ap) is the phrase number. A(ap+1) is the alt. int alt; // For consistency, in BIPs, the Alt should always be 1 // although that may not be the case at the moment :-( int phrases; // defined subphrases saved_ap = ap; phrase = A(ap++) & DROP_SPECIAL_BITS; alt = A(ap++); phrases = A(ap++); if ((phrase < PHRASES_START) || (phrase > MAX_PHRASE+PHRASES_START)) { fprintf(stderr, "* Internal error: build_ast(%d) called from line %d - phrase = %d\n", ap, caller, phrase); //exit(1); saved_ap = saved_ap/0; // force backtrace ( this is why the Makefile says -Wno-div-by-zero ) } switch (phrase) { #include "lang.c" default: fprintf(stderr, "*** UNHANDLED GRAMMAR PHRASE %d\n", phrase); exit(EXIT_FAILURE); } } // *ALL* stdout printing happens here. This is what outputs the terminals in the default compile() // Need to be careful about things like '%c' in white space. // Name-clash handling might be done here. static void RECONSTITUTE(int idx, FILE *out) { int i; makespace(c, idx+1); for (i = c(idx).start_sp; i < c(idx).end_tok; i++) fputc(source_address(i), out); fflush(out); } // Current version of compile just regurgitates the source file. // You could however output a translated version of the program // here, in a different language. Or clean up the indentation // in the style of SOAP... Or actually generate code. // The ast-building code that is created by default as a template // using update_astcode just creates an AST that looks exceedingly // like the concrete syntax tree in A[]... however the expectation // is that for any real application, the Abstract Syntax Tree will // be a simplification of the actual parse tree in a form that // is more useful to compiler writers... void default_compile_init(void) { } void default_compile_terminate(void) { } int default_compile(int astp, int PRINT) { if (astp <= 0) return -1; int i; #define dA_literal_offset A_literal_offset // 3 #define dAST_op_offset AST_op_offset // 1 #define dAST_alt_offset AST_alt_offset // 2 #define dAST_count_offset AST_count_offset // 3 // has to be kept in synch with compile.c etc: #define dAST_child1_offset AST_child1_offset // 6 // Interestingly enough, if you forget to update that define, // only tests/rebuild/test023.imp appears to be affected! // But see the comments in compile.c re crashes int astop = AST[astp+dAST_op_offset]; int count = AST[astp+dAST_count_offset]; int *A_child = &AST[astp+dAST_child1_offset]; // An index into a child of the AREC, *not* a child of the AST. #define defaultchild(p,i) AST[(p)+dAST_child1_offset+((i)-1)] if ((astop < AST_SS) || (astop == AST__LIT)) { RECONSTITUTE(A[A_child[1]+dA_literal_offset], stdout); } else if (astop == AST__KW) { for (i = A_child[1]; i < A_child[1]+A_child[2]; i++) RECONSTITUTE(i, stdout); } else { for (i = 1; i <= count; i++) default_compile(defaultchild(astp,i), PRINT); } } //#ifdef INCLUDE_AST_DEBUG // Can be defined in <lang>.c //#include INCLUDE_AST_DEBUG //#endif #include "show_ast.c" #include "compile.c" //#define compile_init() default_compile_init() //#define compile_terminate() default_compile_terminate() //#define compile(astp, p) default_compile(astp, p) int main (int argc, char **argv) { int errors, lines; int program; int do_compile = TRUE; char *s; int SS; /* Get clean version of executable name. This worked on the variety of systems we used to use many years ago. Now it's just M$ and Linux :-( */ progname = strdup (argv[0]); if ((s = strrchr (progname, '/')) != NULL) progname = s + 1; // Unix if ((s = strrchr (progname, '\\')) != NULL) progname = s + 1; // M$ if ((s = strrchr (progname, ']')) != NULL) progname = s + 1; // Dec if ((s = strrchr (progname, ';')) != NULL) *s = '\0'; // VMS Version no's if (((s = strrchr (progname, '.')) != NULL) && (strcasecmp (s, ".exe") == 0)) *s = '\0'; if (((s = strrchr (progname, '.')) != NULL) && (strcasecmp (s, ".com") == 0)) *s = '\0'; /* Handle program arguments */ while (argc >= 2) { if (strcmp(argv[1], "--trace") == 0) { imp_option_trace = 3; argc -= 1; argv += 1; // shift } else if (strcmp(argv[1], "-d") == 0) { argc -= 1; argv += 1; // shift if ((argc > 2) && (argv[1][0] != '-')) { char *debug_type = argv[1]; argc -= 1; argv += 1; // shift // Add -d parse and -d lex later. // Earlier code for those was removed - if adding it back // in, use an include file and do it more neatly than before. if (strcmp(debug_type, "phases") == 0) { debug_phases=TRUE; } else if (strcmp(debug_type, "ast") == 0) { debug_ast=TRUE; } else if (strcmp(debug_type, "scope") == 0) { debug_scope=TRUE; } else if (strcmp(debug_type, "no-compile") == 0) { do_compile=FALSE; } else if (strcmp(debug_type, "errors") == 0) { debug_errors=TRUE; } else if (strcmp(debug_type, "cast") == 0) { debug_ctuples=TRUE; } else if (strcmp(debug_type, "progress") == 0) { debug_prefetch=TRUE; } else { fprintf(stderr, "%s: debug option '%s' not recognised\n", progname, debug_type); fprintf(stderr, " currently supported -d types are: phases ast no-compile errors cast progress\n"); exit(1); } } else { // plain -d gets everything. May add a '-d all' later. debug_phases=TRUE; debug_errors=TRUE; debug_ast=TRUE; } } else if (*argv[1] == '-') { fprintf(stderr, "%s: unknown option %s\n", progname, argv[1]); exit(EXIT_FAILURE); } else { // Last argvs are files. if (argc > 2) { fprintf(stderr, "%s: extra parameter %s\n", progname, argv[2]); exit(EXIT_FAILURE); } else break; } } if (argc != 2) { fprintf (stderr, "syntax: %s [-d {type}] [--trace] filename\n", progname); exit (1); } // TO DO: compile "perms.imp" first, if it exists. And we are compiling Imp. if ((sourcefile = fopen (argv[1], "r")) == NULL) { fprintf (stderr, "%s: %s - %s\n", progname, strerror (errno), argv[1]); exit (errno); } else fclose(sourcefile); // quick existence check. stringpool = NULL; stringpool_arraysize = 0; stringpool_nextfree = 0; AST = NULL; AST_arraysize = 0; AST_nextfree = 0; parse_stack = NULL; parse_stack_arraysize = 0; bestparse_stack = NULL; bestparse_stack_arraysize = 0; curfile = argv[1]; startline = TRUE; memory_map_file(argv[1]); cur_source = argv[1]; // this needs better management for when we handle include files... nextch_pos = 0; errors = 0; // for (;;) { // It's possible to parse a single statement at a time in a loop rather than parsing the // entire program. This saves memory and a lot of slow backtracking when there's a failure // at the end of a large block. However since even the largest program parses in less // than a minute, and the simplification that comes from parsing whole blocks is huge, // I'm going to stick with whole-program parsing for now. //if (cp == c_nextfree) break; // End of file? ap = 0; // reset analysis record on each line. We're not parsing the entire program here. if (debug_phases) fprintf(stderr, "Parse:\n"); if ((SS = parse (P_SS, 0, 1, 0)) == 0) errors += 1; else { int i; if (debug_phases) fprintf(stderr, "Build AST:\n"); program = build_ast(/*SS*/0, 0); //#ifdef INCLUDE_AST_DEBUG if (debug_ast) print_ast(program, 0); //#endif // At this point the A[] array could be freed and the space reclaimed for use in compiling from the AST // As long as the flex bounds were adjusted accordingly it would be safe. if (debug_phases) fprintf(stderr, "%d Lines parsed.\n", source_lines-1); // Line numbering starts at 1. // now do something with it :-) if (debug_phases) fprintf(stderr, "Compile: (Walk AST)\n"); if (do_compile) { int compile_inner(int astp, int x, int line); void out_inner(int astp, int line); compile_init(); int SS = compile_inner(program,0,__LINE__); out_inner(SS,__LINE__); if (OUTFILE) fprintf(OUTFILE, "\n"); compile_terminate(); fprintf(stderr, "Backtracking took %lld%% of the total parsing time\n", (badtime * 100LL) / (badtime+goodtime)); } else { default_compile_init(); default_compile(program, TRUE); default_compile_terminate(); } exit (0); } // } end of 'statement at a time' for-loop, if we're parsing that way. /* Attempt to print a sensible error if the parse failed */ if (bestparse == c_nextfree) { makespace(c, bestparse+1); fprintf (stderr, "\"%s\", Line %d, Col %d: Premature end of file while looking for %s\n", argv[1], c(bestparse).lineno_tok, c(bestparse).col_tok + 1, looking_for); } else { int i, ic = 0; // Scan down the file looking for the current line. // We could just note the C[] index at the start of each line in the parser, // or we could use the array of line starts that I think I added at some point. makespace(c, bestparse+1); for (i = 0; i < bestparse; i++) { if (c(i).lineno_tok == c(bestparse).lineno_tok) break; } while ((c(i).lineno_tok == c(bestparse).lineno_tok) && (c(i).t != B_NL)) RECONSTITUTE(i++, stderr); fputc('\n', stderr); for (ic = 0; ic < c(bestparse).col_tok; ic++) fputc(' ', stderr); fputc('^', stderr); fprintf (stderr, "\n\"%s\", Line %d, Col %d: Syntax error while looking for %s\n", c(bestparse).f, c(bestparse).lineno_tok, c(bestparse).col_tok, looking_for); if (debug_errors) { makespace (bestparse_stack, bestparse_depth+1); for (i = 0; i <= bestparse_depth; i++) { if (bestparse_stack(i)) { fprintf(stderr, " %s\n", bestparse_stack(i)); } } } } exit (1); return (1); }