// Line reconstruction (lexing) for C #define CString(i) &stringpool(c[i].sIndex) int is_binary(int ch) { return ((ch|1) == '1'); } int is_octal(int ch) { return ('0' <= ch) && (ch <= '7'); // or... (ch|7) =='7' } int is_decimal(int ch) { return ('0' <= ch) && (ch <= '9'); } int is_hex(int ch) { return (('0' <= ch) && (ch <= '9')) || (('a' <= ch) && (ch <= 'f')) || (('A' <= ch) && (ch <= 'F')); } static int lastch = EOF, lastch_pos = 0, lastch_lineno = 0, lastch_col = 0, lastch_peek = 0; static int xfgetc (void) { static int ch; lastch = ch; lastch_pos = nextch_pos; lastch_lineno = nextch_lineno; lastch_col = nextch_col; lastch_peek = peek; if (nextch_pos >= source_length) return EOF; ch = source_address(nextch_pos); if (ch == '\n') { nextch_lineno += 1; nextch_col = 0; } else nextch_col += 1; //if (debug_line_reconstruction) fprintf(stderr, "Ch %d of %d: {%c}\n", nextch_pos, source_length, ch); peek = source_address(++nextch_pos); return ch&255; } static void xungetc (int c, FILE * f) { // for a xfgetc char, not for peek. // rather than subtracting 1 from postions it may be better to save all // the positions in xfgetc and unwind them, thus avoiding the issue of // whether we're ungetting a token or whitespace. Or supply two xungetc's... if (nextch_pos > 0) { nextch_pos -= 1; if (nextch_col == 0) { int p; nextch_lineno -= 1; p = source_line(nextch_lineno); while (source_address(p) != '\n') { if (p == source_length) break; nextch_col += 1; p += 1; } } else nextch_col -= 1; } else { // invalid ungetc. nothing to unget at start of file. } ch = source_address(nextch_pos-1);//+1 // if (c == '\n') { // nextch_lineno += 1; nextch_col = 0; // } else nextch_col += 1; if (nextch_pos == source_length) peek = EOF; else peek = source_address(nextch_pos);//+1 } void em_char_in_stmnt(int line) { // (a little nod to the past...) fprintf(stderr, "EM CHAR IN STMNT - DISASTER (detected at parser line %d)\n", line); // stop-gap measure - needs a longjump to end of compile code to clean up exit(EXIT_FAILURE); } #define em_char_in_stmnt() em_char_in_stmnt(__LINE__) void line_reconstruction (void) { int (*acceptable_range)(int ch); // adds one more item to c[] array. // It is up to the caller to keep calling until sufficient data is loaded. static int token_nextfree, startcol; static int token_arraysize = 0; static char *token = NULL; #define token(x) FLEX(token, x, __LINE__) static int init=0; // Recognise single and double-quoted strings // while removing comments and handling line continuation. // The c[] array gets the canonicalised version of the text, BUT retains // pointers to the original text including whitespace, which comes in // handy for any source-to-source transformations such as indent (pretty printing) // or transpiling to another language (eg C to Javascript) if (init == 0) { at_startofline = TRUE; makespace(token,1024); // quick hack only for test.imp fix as soon as lexer is written. init = 1; } /* Pre-process input ready for parsing. Tokens are stored in array c[] */ // NOTE: we are not yet handling include files... for (;;) { startcol = nextch_col; token_nextfree = 0; token_lineno = nextch_lineno; token_col = nextch_col; token_pos = nextch_pos; // actual text of token may start here but whitespace goes back to end of previous token. if ((ch = xfgetc ()) == EOF) { // set up a dummy at the end because we sometimes look ahead by 1 // in the parsing code and don't want to hit uninitialised data. c[c_nextfree].f = curfile; c[c_nextfree].sIndex = str_to_pool("<EOF>"); // however we want to be able to print B_EOF and have any blank lines c[c_nextfree].t = B_EOF; // or comments before the end-of-file be output... so maybe "" would be better? c[c_nextfree].lineno_sp = whitespace_lineno; c[c_nextfree].col_sp = whitespace_col; c[c_nextfree].start_sp = whitespace_pos; c[c_nextfree].lineno_tok = nextch_lineno; c[c_nextfree].col_tok = nextch_col; c[c_nextfree].start_tok = nextch_pos; c[c_nextfree].end_tok = nextch_pos; c_nextfree++; #ifdef NEVER if (debug_tokens || debug_line_reconstruction) { // print tokens once parser hits end of file on input stream int i; fprintf (stderr, "\nLexical token stream:\n\n"); for (i = 0; i < c_nextfree; i++) { fprintf (stderr, "C[%d].f=\"%s\", .lineno=%d, .col=%d, .type=%0d, .s=\"%s\" (at %p)\n", i, c[i].f, c[i].lineno_tok, c[i].col_tok, c[i].t, escape(CString(i),'"'), CString(i)); } } #endif return; } if ((ch == '+' || ch == '-' || ch == '&' || ch == '|' || ch == '=') && (peek == ch)) { startline = FALSE; /* ++, --, &&, || */ // only a very small number of compound graphemes need to be converted to lexical tokens (BIPs) to avoid parsing ambiguities makespace (token, token_nextfree+4); token(token_nextfree++) = ch; token(token_nextfree++) = ch; token(token_nextfree++) = '\0'; ch = xfgetc (); // eat 'peek' ch. stores (token, token_lineno, startcol, token_pos, (ch=='+'?B_pp:ch=='-'?B_mm:ch=='&'?B_andand:ch=='='?B_eqeq:B_oror), curfile); } else if (isalpha (ch) || (ch == '_')) { startline = FALSE; /* token or keyword */ for (;;) { makespace (token, token_nextfree+1); if (isalpha (ch) || isdigit (ch) || (ch == '_')) { // digits allowed after 1st char. token(token_nextfree++) = ch; } else break; ch = xfgetc (); } xungetc (ch, sourcefile); token(token_nextfree++) = '\0'; stores (token, token_lineno, startcol, token_pos, B_internal_identifier, curfile); } else if (((ch == '.') && isdigit(peek)) || (isdigit (ch))) { // -->0B.110101100P12L startline = FALSE; /* Number */ acceptable_range = &is_decimal; if (ch == '0') { // must be binary, octal, or hex. // Store as a string... makespace (token, token_nextfree+3); if (peek == 'b' || peek == 'B' || (peek == 'x') || (peek == 'X')) { token(token_nextfree++) = ch; // '0' acceptable_range = (peek == 'b' || peek == 'B' ? &is_binary : &is_hex); ch = xfgetc (); // get the B token(token_nextfree++) = ch; // b if (peek == '.') { goto READ_DECIMAL; } } else if (peek == '.') { token(token_nextfree++) = ch; // '0' // started with a 0 so octal unless it is 0.19 etc... acceptable_range = &is_decimal; goto READ_DECIMAL; } else { acceptable_range = &is_octal; // the 0 has been read. If nothing acceptable follows, don't read any more chars token(token_nextfree++) = ch; // digit } } else if (ch == '.') { // .nnn or 0.nnn acceptable_range = &is_decimal; xungetc (ch, sourcefile); // peek is now '.' goto READ_DECIMAL; } else if (isdigit(ch)) { acceptable_range = &is_decimal; token(token_nextfree++) = ch; // digit } // rest of number comes through here for all formats while (acceptable_range(peek)) { // 0B-->.110101100P12L ch = xfgetc (); // what was peek is now ch makespace (token, token_nextfree+2); token(token_nextfree++) = ch; // 0 } READ_DECIMAL: if (peek == 'p' || peek == 'P') { goto BINARY_POWER; } else if (peek == '.' || peek == 'e' || peek == 'E') { if (peek == '.') { ch = xfgetc (); token(token_nextfree++) = ch; while (acceptable_range(peek)) { ch = xfgetc (); // what was peek is now ch makespace (token, token_nextfree+2); token(token_nextfree++) = ch; // 0 } } if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) { ch = xfgetc (); token(token_nextfree++) = ch; } else if (((acceptable_range == is_decimal) && (peek == 'e' || peek == 'E')) || ((acceptable_range != is_decimal) && (peek == 'p' || peek == 'P'))) { BINARY_POWER: ch = xfgetc (); token(token_nextfree++) = ch; // P acceptable_range = &is_decimal; // P+nnn nnn is decimal if ((peek == '+') || (peek == '-')) { ch = xfgetc (); token(token_nextfree++) = ch; } if (!acceptable_range(peek)) { token(token_nextfree) = '\0'; fprintf(stderr, "Warning: bad floating point literal '%s'\n", token); } while (acceptable_range(peek)) { // SHOULD pick up P12L etc makespace (token, token_nextfree+2); ch = xfgetc (); // what was peek is now ch token(token_nextfree++) = ch; // 0 } if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) { ch = xfgetc (); token(token_nextfree++) = ch; } } } else if ((peek == 'u') || (peek == 'U')) { ch = xfgetc (); token(token_nextfree++) = ch; } else if ((peek == 'l') || (peek == 'L')) { ch = xfgetc (); token(token_nextfree++) = ch; if (ch == peek) { // 'll' or 'LL' allowed but not 'lL' or 'Ll' ch = xfgetc (); token(token_nextfree++) = ch; } } makespace (token, token_nextfree+1); token(token_nextfree++) = '\0'; if (strchr(token, '.') == NULL) { stores (token, token_lineno, startcol, token_pos, B_integer_constant, curfile); // changed from col to startcol } else { stores (token, token_lineno, startcol, token_pos, B_floating_constant, curfile); // changed from col to startcol } } else { switch (ch) { case '\'': // Handle 'c' char const case '"': // Handle "string" /* literals */ startline = FALSE; { int string_nextfree = 0, string_arraysize = 0, quotech = ch; char *string = NULL; for (;;) { ch = xfgetc (); // Newlines are allowed makespace (string, string_nextfree+3); if (ch == '\\') { ch = xfgetc (); if (ch == '\\') { string[string_nextfree++] = ch; } else if (ch == '\'') { string[string_nextfree++] = '\''; } else if (ch == '"') { string[string_nextfree++] = '"'; } else if (ch == 'n') { string[string_nextfree++] = '\n'; } else if (ch == 'r') { string[string_nextfree++] = '\r'; } else if (ch == 't') { string[string_nextfree++] = '\t'; } else if (ch == '0') { string[string_nextfree++] = '\0'; } else { // Warn of unknown (to me) \x escape. Probably an error. string[string_nextfree++] = '\\'; string[string_nextfree++] = ch; fprintf(stderr, "Warning: un-handled escape '\\%c'\n", ch); } } else if (ch != quotech) { string[string_nextfree++] = ch; } else { string[string_nextfree] = '\0'; break; } } if (quotech == '\'') { int stringlen = 0; char *s; stores (string, token_lineno, token_col, token_pos, B_sqstring, curfile); s = string; while (*s != '\0') { if (*s == '\\') s += 2; else s += 1; stringlen += 1; } if (stringlen == 1) { // good char const } else if (stringlen <= 4) { // Warn that 'xx' as a 32-bit int is a non-standard extension fprintf(stderr, "Warning: multi-byte characters constants such as \'%s\' are not well supported.\n", string); // don't escape - already escaped } else { // Warn that this is probably a string with the wrong type of quote. fprintf(stderr, "Warning: wrong kind of string quote used? - \'%s\'.\n", string); // don't escape - already escaped } } else stores (string, token_lineno, token_col, token_pos, B_dqstring, curfile); free (string); } break; case '/': /* COMMENTS (or just a divide symbol) */ startline = FALSE; if (peek == '/') { // comment to end of line // '//' comments are part of whitespace - skip over them, but whitespace_start must not be updated do { ch = xfgetc (); } while (ch != '\n'); } else if (peek == '*') { /* Handle potential multi-line comment */ ch = xfgetc (); // '/* ... */' comments are part of whitespace - skip over them, but whitespace_start must not be updated for (;;) { ch = xfgetc (); if ((ch == '*') && (peek == '/')) { ch = xfgetc (); // get the '/' too. break; } } } else { // a divide symbol, not a comment storec (ch, token_lineno, token_col-1, token_pos, B_char, curfile); } break; // WHITESPACE case '\n': case '\r': startline = TRUE; break; case '\t': case ' ': // white space does not affect startline. break; // DIRECTIVES case '#': // should allow '#' as a directive if startline is true. (and not in a comment of course) // for now we'll just copy # directives straight through to the output if (startline) { int i = 0, lastch = -1; static char CPP_directive[256]; // small for now. Will make flex once tested. for (;;) { ch = xfgetc (); if (ch == EOF) exit(1);// fix ch &= 255; if ((ch == '\n') && (lastch != '\\')) { // e.g. #define X some multiline text with '\' at the end of all lines except the last break; } CPP_directive[i++] = ch; lastch = ch; if (i == 255) { // skip any more text for now. for (;;) { ch = xfgetc (); if (ch == EOF) exit(1);// fix ch &= 255; if (ch == '\n' && lastch != '\\') { break; } lastch = ch; } break; } } CPP_directive[i] = '\0'; fprintf(stderr, "Warning: pre-processor directive skipped: #%s\n", CPP_directive); ch = '\n'; startcol = nextch_col; startline = TRUE; break; } // Drop through default: storec (ch, token_lineno, token_col, token_pos, B_char, curfile); startline = FALSE; } } return; } } // For reasons to be explained later, the code in the on-demand source-fetching // must not use the flex mechanism if there is any possibility of 'c' itself // being reallocated while evaluating the index of a c(x) reference. Space for // tokens in the c[] array must have been pre-claimed by makespace before a // non-existent c(x) entry is accessed. Hence c() was not defined by the // usual FLEX macro and the use of String() within the line reconstruction // and any code that it calls (eg stores()) must use c[] rather than c(), and // the alternative CString() ...