# I'm now re-doing line reconstruction and whitespace so that it is done # on the fly while parsing, to accomodate some of Imp's less well-designed # features w.r.t. comment handling. This requires a new facility, which # is that uparse.c specifically understands keywords in the same way that # it understands white space. Note that white space should handle " % " # as all spaces and not stop at the '%'. Whereas handling keywords should # convert everything that starts with '%' and a letter, up to something that # isn't a letter. Remembering that spaces and '%'s can occur within a # keyword sequence... "%c%y%%%%c %l%e% I=" is equivalent to "cycle" and # should leave the input pointer at "I=" # # rather than have stropping done in the portable part of the parser, # we'll have the parser call C before looking for a keyword. # We do need a new concept of a queue. Or maybe just a source() flag to # show that it has not yet been filled on demand. # # (Btw. "%BE %GIN" is a valid statement in Imp77!) # This should also handle %C continuations. # To do: name tables # mktuple() (note skeleton code is pretty broken) { // This anonymous block is where the global declarations go, which come // before the procedures, parse rules, etc. } C = { #ifdef IN_PARSER while (source(TP).ch==' ' || source(TP).ch=='\t' || source(TP).ch=='\f') { TP += 1; } #endif return TRUE; }; C = { #ifdef IN_PARSER int debug_stropping = 0; // The source file has already been read trivially into source(). // We will copy from source() into temp(), then perform line reconstruction // on temp(), writing back to source(). The parser will then parse source() // into atoms according to the grammar. Initially it will only store the // reconstructed characters into the atoms, but once it is working, I will // modify it to also store the unreconsructed source for use in source-to-source // translations, where whitespace, embedded comments, and indentation is // desired in the translation, in order to mirror the original file. // Because unfortunately underlining in Unicode is done by a *following* // underline joiner character (818) rather than being a single unicode // code point, it is difficult to use a single-character encoding of a // stropped keyword letter - what the old Imp compilers would represent // by adding 128 to the character. However there *is* an alternive // source of upper case and lower case letters in the mathematics area! // A:Z could be encoded as 1D400:1D419 and a:z as 1D41A:1D433 :-) // but for now I'm encoding keywords in lower case and variables in // upper case. // The 1D400+ encoding looks more or less like ordinary text if it happens // to be displayed (e.g. during debugging) although there should never be // any need to display internally-coded keywords to users of the // compilers built with this parser. // All arrays are flex and the upper bound is a limit, not a minimum. DECLARE(SYM, reconstructed, 128000000/*600000*/); #define _SYM(x) WRITE(x,SYM,reconstructed) #define SYM(x) READ(x,SYM,reconstructed) int LASTP, P = 0; while (source(P).ch != 0 /* WEOF */) { _SYM(P).ch = source(P).ch; _SYM(P).start = P; _SYM(P).end = P+1; P += 1; } _SYM(P).ch = 0 /* WEOF */; _SYM(P).start = P; _SYM(P).end = P; // no chars for EOF LASTP = P; if (debug_stropping) { int I; fprintf(stderr, "source() moved to SYM(0:%d) = \"", LASTP); for (I = 0; I < LASTP; I++) { fprintf(stderr, "%lc", SYM(I).ch); } if (SYM(LASTP).ch != 0) fprintf(stderr, "[%d]", SYM(LASTP).ch); fprintf(stderr, "\";\n"); }; int FP = 0, PP = 0; // Fetch Pointer, Put Pointer. #define DONE() \ do { \ FP -= 1; /* the terminating 0*/ \ _source(PP).ch = 0; \ _source(PP).end = SYM(FP).end; \ if (debug_stropping) { \ int I; \ fprintf(stderr, "SYM(0:%d) moved back to source(0:%d) = \"", FP, PP); \ for (I = 0; I < PP; I++) { \ fprintf(stderr, "%lc", source(I).ch); \ } \ if (source(PP).ch != 0) fprintf(stderr, "[%d]", source(PP).ch); \ fprintf(stderr, "\";\n"); \ } \ return TRUE; \ } while (0) wint_t WC; // NOTE THAT WITH THIS IMP77 GRAMMAR, '\n' IS NOT WHITESPACE. LINE ENDINGS ARE EXPLICITLY // ENTERED IN THE GRAMMAR. (See the phrases , and . // uparse.c has been modified so that its implicit whitespace skipping no longer skips '\n'. // (The algol60 parser in contrast treats all \n's the same as spaces) // HOW TO HANDLE ' IN A PARSED COMMENT? // // %COMMENT A ' MESSES UP! // // because it keeps scanning until a closing quote. However if you don't scan between quotes, // line reconstruction will lose spaces within strings! // // You can't just end a quoted string at a newline because embedded newlines are allowed. // And I checked Imp77 - it allows a single quote ch in a comment. // If line reconstruction were being done on the fly then it could be modified if we knew we were // in a comment, but since we're doing it all in advance, the only option to handle this appears // to be that whenever we're in a comment, we throw away all the following line reconstruction and // re-do it, with that comment handled differently. // Or bite the bullet and work out how to do line reconstruction on the fly (which my previous // imptoc did eventually manage using the 'demandload' call. So *every* fetch via TP would have // to be recoded as a procedure call, with on-the-fly line reconstruction, and either a way to // undo it if backtracking or simply never doing it any farther past TP and undoing it on backtracking. // What a can of worms just to handle badly designed comments. TO DO. #define CHECK_EOF(x) do if ((x) == 0) DONE(); else { _source(PP).end = SYM(FP-1).end; } while (0) // PP is the 'current' slot we are writing into. _source(PP).start = SYM(FP).start; for (;;) { _source(PP).end = SYM(FP).end; // Keep updated. WC = SYM(FP++).ch; CHECK_EOF(WC); if (WC == '%') { // We found a keyword. It will always be read up to the last character of the keyword. for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); if (WC == '%') { } else if (!isalpha(WC)) { // It's possible to have a bunch of '%' signs and *no* keyword characters. --FP; // point FP back to the non-keyword character, not as currently, the one past that. break; } else { // isalpha(WC) if (isupper(WC)) WC = tolower(WC); _source(PP).end = SYM(FP-1).end; // | 128 _source(PP++).ch = WC; // | 128 _source(PP).start = SYM(FP).start; // | 128 } } continue; } else if (WC == '{') { // TO DO: testing seems to suggest that comments starting with '{' which extend to the // end of the line with no terminating '}' are causing a syntax error. Check & fix if // necessary. for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); if (WC == '\n') { --FP; /* re-read the \n as a significant character */ // _source(PP).end = SYM(FP-1).end; // point FP back to the newline break; } if (WC == '}') { // Not sure if \n should be gobbled for {this style break; // but still looking. } } continue; } // ***PROBLEM*** line reconstruction removes spaces *but not in strings and chars*, // however it doesn't know if we are in an Imp comment, so it will // still be doing line reconstruction including string preservation // which traverses newlines and '}' end of comment symbols. // I do remember adding a mechanism to the parser at some point that // allowed me to postpone the line reconstruction until just before // parsing the content, but it was very complicated and had a lot of // overhead in terms of needing to write extra code, and to ne honest // I have completely forgotten the details of how it worked :-( // If I do have to resurrect that mechanism, I had better document it // first! // Note that although '{' comments can be safely skipped at the line reconstruction // stage, "!" comments and "%comment" comments can't, because of the other uses of // '!' in Imp (OR, EXOR, and Modulus). I suppose a much more complex line reconstruction // that uses code like in the 'uncomment-imp' utility might work, though I'm still not // sure that that code handles %comment... // Worst-case example for handling during line reconstruction is where there are multiple // switch labels before a comment! Maybe even mixed with {} comments!: // // lab(12): lab{yuck!}(x): ! comment with odd ' in it // lab(12): lab{double yuck!}(x): %comment with odd ' in it %c // lab(12): lab{double yuck!}(x): %comment with odd ' in it %c ; // lab(12): lab{double yuck!}(x): %comment with odd ' in it { %c else if (WC == '\'') { _source(PP++).ch = WC; for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); if (WC == '\'') { // peek ahead: int Peek = SYM(FP).ch; CHECK_EOF(Peek); if (Peek == '\'') { // doubled 's _source(PP++).ch = WC; _source(PP++).ch = Peek; FP++; } else { _source(PP).ch = WC; _source(PP).end = SYM(FP-1).end; // Leave Peek for later. PP++; break; } } else { _source(PP++).ch = WC; } } continue; } else if (WC == '"') { // TO DO: Update ' and " items in imp77 as well _source(PP++).ch = WC; for (;;) { WC = SYM(FP++).ch; CHECK_EOF(WC); if (WC == '"') { // peek ahead: int Peek = SYM(FP).ch; CHECK_EOF(Peek); if (Peek == '"') { // doubled "s _source(PP++).ch = WC; _source(PP++).ch = Peek; FP++; } else { _source(PP).ch = WC; _source(PP).end = SYM(FP-1).end; // Leave Peek for later. PP++; break; } } else { _source(PP++).ch = WC; } } continue; } else if (WC == ' ' || WC == '\t' || WC == '\f') { // use iswblank(WC) instead? continue; } else { // everything else just returns one significant non-space character. This includes '\n'. if ((WC == '\n') && ((PP>0) && (source(PP-1).ch == 'c'))) { // BEWARE WHEN CHANGING STROPPING ENCODING: Looking for a preceding '%C' ... if (PP>0) _source(PP-1).ch = ' '; // remove the '%c' _source(PP++).ch = ' '; // remove the newline // This is the only place where we gobble spaces *after* a token rather than before. // It may be cleaner to set a 'continuation' flag and gobble them before the next // symbol fetch rather than do it here in a lookahead. Esp. wrt to reconstituting source // from the array for the listing file etc etc. // BUT FOR NOW, %C IS HANDLED BY THS HACK: int Lookahead = FP; while (SYM(Lookahead).ch == '\n' || SYM(Lookahead).ch == ' ' || SYM(Lookahead).ch == '\t' || SYM(Lookahead).ch == '\f') { // Use iswblank()? // No worries about '{...}' - this behaviour seems to be identical to Imp77's _SYM(Lookahead).ch = ' '; // gobble following newlines and whitespace before next significant character. Lookahead++; } continue; } if (iswalpha(WC) && iswlower(WC)) { WC = towupper(WC); // ALSO TEMPORARY } _source(PP++).ch = WC; continue; } // Still skipping whitespace ... } DONE(); P = 0; while (source(P).ch != 0) { if (debug_stropping) fprintf(stderr, "%d: ch='%lc' start=%d:end=%d\n", P, source(P).ch, source(P).start, source(P).end); P++; } #undef DONE #endif return TRUE; }; # I think these are all variable names but with added semantic checks. # Some of the variable types *may* exist in separate namespaces? #B = 1; # simple variable #B = 2; # pointer variable #B = 4; # recordformat #B = 6; # routine call #B = 7; # function call #B = 8; # map call #B

= 9; # predicate call #B = 10; # routine parameter #B = 11; # function parameter #B = 12; # map parameter #B = 13; # predicate parameter #B = 14; # label #B ~~= 15; # switch #B = 16; # array #B = 17; # arrayname #B = 18; # name array #B = 19; # namearrayname~~