C = { #ifdef IN_PARSER while (source(TP).ch==' ' || source(TP).ch=='\t' || source(TP).ch=='\f') { TP += 1; } #endif return TRUE; }; C = { #ifdef IN_PARSER int debug_stropping = 0; // The source file has already been read trivially into source(). // We will copy from source() into temp(), then perform line reconstruction // on temp(), writing back to source(). The parser will then parse source() // into atoms according to the grammar. Initially it will only store the // reconstructed characters into the atoms, but once it is working, I will // modify it to also store the unreconstructed source for use in source-to-source // translations, where whitespace, embedded comments, and indentation is // desired in the translation, in order to mirror the original file. // Because unfortunately underlining in Unicode is done by a *following* // underline joiner character (818) rather than being a single unicode // code point, it is difficult to use a single-character encoding of a // stropped keyword letter - what the old Imp compilers would represent // by adding 128 to the character. However there *is* an alternive // source of upper case and lower case letters in the mathematics area! // A:Z could be encoded as 1D400:1D419 and a:z as 1D41A:1D433 :-) // but for now I'm encoding keywords in lower case and variables in // upper case. // The 1D400+ encoding looks more or less like ordinary text if it happens // to be displayed (e.g. during debugging) although there should never be // any need to display internally-coded keywords to users of the // compilers built with this parser. // All arrays are flex and the upper bound is a limit, not a minimum. DECLARE(SYM, reconstructed, 128000000/*600000*/); #define _SYM(x) WRITE(x,SYM,reconstructed) #define SYM(x) READ(x,SYM,reconstructed) int LASTP, P = 0; while (source(P).ch != 0 /* WEOF */) { _SYM(P).ch = source(P).ch; _SYM(P).start = P; _SYM(P).end = P+1; P += 1; } _SYM(P).ch = 0 /* WEOF */; _SYM(P).start = P; _SYM(P).end = P; // no chars for EOF LASTP = P; if (debug_stropping) { int I; fprintf(stderr, "source() moved to SYM(0:%d) = \"", LASTP); for (I = 0; I < LASTP; I++) { fprintf(stderr, "%lc", SYM(I).ch); } if (SYM(LASTP).ch != 0) fprintf(stderr, "[%d]", SYM(LASTP).ch); fprintf(stderr, "\";\n"); }; int FP = 0, PP = 0; // Fetch Pointer, Put Pointer. #define DONE() \ do { \ FP -= 1; /* the terminating 0*/ \ _source(PP).ch = 0; \ _source(PP).end = SYM(FP).end; \ if (debug_stropping) { \ int I; \ fprintf(stderr, "SYM(0:%d) moved back to source(0:%d) = \"", FP, PP); \ for (I = 0; I < PP; I++) { \ fprintf(stderr, "%lc", source(I).ch); \ } \ if (source(PP).ch != 0) fprintf(stderr, "[%d]", source(PP).ch); \ fprintf(stderr, "\";\n"); \ } \ return TRUE; \ } while (0) wint_t WC; // NOTE THAT WITH THIS IMP77 GRAMMAR, '\n' IS NOT WHITESPACE. LINE ENDINGS ARE EXPLICITLY // ENTERED IN THE GRAMMAR. (See the phrases , and . // uparse.c has been modified so that its implicit whitespace skipping no longer skips '\n'. // (The algol60 parser in contrast treats all \n's the same as spaces) // HOW TO HANDLE ' IN A PARSED COMMENT? // // %COMMENT A ' MESSES UP! // // because it keeps scanning until a closing quote. However if you don't scan between quotes, // line reconstruction will lose spaces within strings! // // You can't just end a quoted string at a newline because embedded newlines are allowed. // And I checked Imp77 - it allows a single quote ch in a comment. // If line reconstruction were being done on the fly then it could be modified if we knew we were // in a comment, but since we're doing it all in advance, the only option to handle this appears // to be that whenever we're in a comment, we throw away all the following line reconstruction and // re-do it, with that comment handled differently. // Or bite the bullet and work out how to do line reconstruction on the fly (which my previous // imptoc did eventually manage using the 'demandload' call. So *every* fetch via TP would have // to be recoded as a procedure call, with on-the-fly line reconstruction, and either a way to // undo it if backtracking or simply never doing it any farther past TP and undoing it on backtracking. // What a can of worms just to handle badly designed comments. TO DO. #define CHECK_EOF(x) do if ((x) == 0) DONE(); else { _source(PP).end = SYM(FP-1).end; } while (0) // PP is the 'current' slot we are writing into. _source(PP).start = SYM(FP).start; for (;;) { _source(PP).end = SYM(FP).end; // Keep updated. WC = SYM(FP++).ch; CHECK_EOF(WC); Peek = SYM(FP).ch; CHECK_EOF(Peek); else if ((WC == '/') && (?? == '*')) { for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); Peek = SYM(FP).ch; CHECK_EOF(Peek); if ((WC == '*') && (?? == '/')) { break; // but still looking. } } continue; } else if ((WC == '/') && (?? == '/')) { for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); Peek = SYM(FP).ch; CHECK_EOF(Peek); if (WC == '\n')) { break; // but still looking. } } continue; } else if (WC == '\'') { _source(PP++).ch = WC; for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); Peek = SYM(FP).ch; CHECK_EOF(Peek); if (WC == '\'') { _source(PP).ch = WC; _source(PP).end = SYM(FP-1).end; // Leave Peek for later. PP++; break; } else if (WC == '\\') { _source(PP++).ch = WC; _source(PP++).ch = Peek; FP++; } else { _source(PP++).ch = WC; } } continue; } else if (WC == '"') { _source(PP++).ch = WC; for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); Peek = SYM(FP).ch; CHECK_EOF(Peek); if (WC == '"') { _source(PP).ch = WC; _source(PP).end = SYM(FP-1).end; // Leave Peek for later. PP++; break; } else if (WC == '\\') { _source(PP++).ch = WC; _source(PP++).ch = Peek; FP++; } else { _source(PP++).ch = WC; } } continue; } else if (WC == ' ' || WC == '\n' || WC == '\t' || WC == '\f') { // use iswblank(WC) instead? continue; } else { // everything else just returns one significant non-space character. if (iswalpha(WC) && iswlower(WC)) { WC = towupper(WC); // ALSO TEMPORARY } _source(PP++).ch = WC; continue; } // Still skipping whitespace ... } DONE(); P = 0; while (source(P).ch != 0) { if (debug_stropping) fprintf(stderr, "%d: ch='%lc' start=%d:end=%d\n", P, source(P).ch, source(P).start, source(P).end); P++; } #undef DONE #endif return TRUE; };