// lex.c

// Line reconstruction (lexing) for C

// CString(i): address of the string-pool entry selected by c[i].sIndex.
// The whole expansion is parenthesised so that the leading '&' cannot bind
// to anything the caller writes after the macro (e.g. a subscript or '+').
#define CString(i) (&stringpool(c[i].sIndex))

// Returns 1 when ch is a binary digit ('0' or '1'), otherwise 0.
int is_binary(int ch) {
  if (ch == '0') return 1;
  return ch == '1';
}

// Returns 1 when ch is an octal digit ('0'..'7'), otherwise 0.
int is_octal(int ch) {
  if (ch < '0') return 0;
  return ch <= '7';
}

// Returns 1 when ch is a decimal digit ('0'..'9'), otherwise 0.
int is_decimal(int ch) {
  if (ch > '9') return 0;
  return ch >= '0';
}

// Returns 1 when ch is a hexadecimal digit (0-9, a-f or A-F), otherwise 0.
int is_hex(int ch) {
  if (ch >= '0' && ch <= '9') return 1;   // decimal digits
  if (ch >= 'a' && ch <= 'f') return 1;   // lower-case letters
  return ch >= 'A' && ch <= 'F';          // upper-case letters
}

// Snapshot of the reader state taken at the start of the most recent xfgetc() call.
static int lastch = EOF, lastch_pos = 0, lastch_lineno = 0, lastch_col = 0, lastch_peek = 0;

// Fetch the next character of the source text.
//
// Before reading, the current reader state (previous character, position,
// line, column and peek) is copied into the lastch* variables.  Then the
// character at nextch_pos is read, nextch_lineno/nextch_col/nextch_pos are
// advanced, and 'peek' is set to the character that the next call will return.
//
// Returns the character masked to 0..255, or EOF once nextch_pos has reached
// source_length.  At end of input 'peek' is EOF, which is the same convention
// xungetc() uses, so nothing is read from beyond source_length.
static int xfgetc (void) {
  // Previous character fetched.  This is a private static, distinct from the
  // 'ch' that callers assign the result to.  It starts as EOF so that the first
  // call leaves lastch at its "nothing read yet" value.
  static int ch = EOF;
  lastch = ch; lastch_pos = nextch_pos; lastch_lineno = nextch_lineno; lastch_col = nextch_col; lastch_peek = peek;
  if (nextch_pos >= source_length) {
    peek = EOF;  // nothing follows end of input
    return EOF;
  }
  ch = source_address(nextch_pos);
  if (ch == '\n') {
    nextch_lineno += 1; nextch_col = 0;
  } else nextch_col += 1;
  //if (debug_line_reconstruction) fprintf(stderr, "Ch %d of %d: {%c}\n", nextch_pos, source_length, ch);
  nextch_pos += 1;
  if (nextch_pos < source_length) peek = source_address(nextch_pos); else peek = EOF;

  return ch&255;
}

// Push back the character most recently returned by xfgetc() (not 'peek').
//
// The parameters are accepted for call-site symmetry with ungetc() but are
// not used: the reader simply steps nextch_pos back by one and recomputes
// nextch_lineno, nextch_col, 'ch' and 'peek' from the source buffer.
// Calling it at position 0 is an invalid unget and leaves the position alone.
static void xungetc (int c, FILE * f) { // for a xfgetc char, not for peek.
  // rather than subtracting 1 from positions it may be better to save all
  // the positions in xfgetc and unwind them, thus avoiding the issue of
  // whether we're ungetting a token or whitespace.  Or supply two xungetc's...
  (void)c; (void)f;
  if (nextch_pos > 0) {
    nextch_pos -= 1;
    if (nextch_col == 0) {
      // Column 0 with something already read means the character being ungot
      // was a newline: go back to the previous line and recount its length
      // to recover the column.
      int p;
      nextch_lineno -= 1;
      p = source_line(nextch_lineno);
      // Check the bound before touching the buffer.
      while ((p < source_length) && (source_address(p) != '\n')) {
        nextch_col += 1; p += 1;
      }
    } else nextch_col -= 1;
  } else {
    // invalid ungetc. nothing to unget at start of file.
  }
  // 'ch' becomes the character before the new position.  When the new
  // position is the very start of the source there is no such character
  // (index -1 would be outside the buffer), so use EOF instead.
  if (nextch_pos > 0) ch = source_address(nextch_pos-1); else ch = EOF;
  //  if (c == '\n') {
  //    nextch_lineno += 1; nextch_col = 0;
  //  } else nextch_col += 1;
  if (nextch_pos >= source_length) peek = EOF; else peek = source_address(nextch_pos);
}

// Report an unrecoverable lexing/parsing problem on stderr and terminate the
// process with EXIT_FAILURE.  'line' is printed as the parser source line at
// which the problem was detected.
void em_char_in_stmnt(int line) {
  // (a little nod to the past...)
  // stop-gap measure - needs a longjump to end of compile code to clean up
  fprintf(stderr,
          "EM CHAR IN STMNT - DISASTER (detected at parser line %d)\n",
          line);
  exit(EXIT_FAILURE);
}
// Zero-argument form: supplies the call site's __LINE__ automatically.
#define em_char_in_stmnt() em_char_in_stmnt(__LINE__)

void line_reconstruction (void) {
  int (*acceptable_range)(int ch);
  // adds one more item to c[] array.
  // It is up to the caller to keep calling until sufficient data is loaded.
  static int token_nextfree, startcol;
  static int token_arraysize = 0;
  static char *token = NULL;
#define token(x) FLEX(token, x, __LINE__)
  static int init=0;
  
  // Recognise single and double-quoted strings
  // while removing comments and handling line continuation.
  // The c[] array gets the canonicalised version of the text, BUT retains
  // pointers to the original text including whitespace, which comes in
  // handy for any source-to-source transformations such as indent (pretty printing)
  // or transpiling to another language (eg C to Javascript)
  if (init == 0) {
    at_startofline = TRUE;
    makespace(token,1024); // quick hack only for test.imp fix as soon as lexer is written.
    init = 1;
  }
  
  /* Pre-process input ready for parsing.  Tokens are stored in array c[] */
  // NOTE: we are not yet handling include files...

  for (;;) {
    startcol = nextch_col;
    token_nextfree = 0;
    token_lineno = nextch_lineno;
    token_col = nextch_col;
    token_pos = nextch_pos; // actual text of token may start here but whitespace goes back to end of previous token.
 
    if ((ch = xfgetc ()) == EOF) {
      // set up a dummy at the end because we sometimes look ahead by 1
      // in the parsing code and don't want to hit uninitialised data.
      c[c_nextfree].f = curfile;

      c[c_nextfree].sIndex = str_to_pool("<EOF>"); // however we want to be able to print B_EOF and have any blank lines
      c[c_nextfree].t = B_EOF;                     // or comments before the end-of-file be output... so maybe "" would be better?

      c[c_nextfree].lineno_sp = whitespace_lineno;
      c[c_nextfree].col_sp = whitespace_col;
      c[c_nextfree].start_sp = whitespace_pos;

      c[c_nextfree].lineno_tok = nextch_lineno;
      c[c_nextfree].col_tok = nextch_col;
      c[c_nextfree].start_tok = nextch_pos;

      c[c_nextfree].end_tok = nextch_pos;
      c_nextfree++;

      #ifdef NEVER
      if (debug_tokens || debug_line_reconstruction) { // print tokens once parser hits end of file on input stream
        int i;
        fprintf (stderr, "\nLexical token stream:\n\n");
        for (i = 0; i < c_nextfree; i++) {
          fprintf (stderr, "C[%d].f=\"%s\", .lineno=%d, .col=%d, .type=%0d, .s=\"%s\" (at %p)\n",
                           i, c[i].f, c[i].lineno_tok, c[i].col_tok, c[i].t, escape(CString(i),'"'), CString(i));
        }
      }
      #endif
      return;
    }
    

    if ((ch == '+' || ch == '-' || ch == '&' || ch == '|' || ch == '=') && (peek == ch)) {
      startline = FALSE;
      /* ++, --, &&, || */
      // only a very small number of compound graphemes need to be converted to lexical tokens (BIPs) to avoid parsing ambiguities
      makespace (token, token_nextfree+4);
      token(token_nextfree++) = ch;
      token(token_nextfree++) = ch;
      token(token_nextfree++) = '\0';
      ch = xfgetc (); // eat 'peek' ch.
      stores (token,
              token_lineno, startcol, token_pos,
              (ch=='+'?B_pp:ch=='-'?B_mm:ch=='&'?B_andand:ch=='='?B_eqeq:B_oror), curfile);
    } else if (isalpha (ch) || (ch == '_')) {
      startline = FALSE;
      /* token or keyword */

      for (;;) {
        makespace (token, token_nextfree+1);
        if (isalpha (ch) || isdigit (ch) || (ch == '_')) {
          // digits allowed after 1st char.
          token(token_nextfree++) = ch;
        } else break;
        ch = xfgetc ();
      }
      xungetc (ch, sourcefile);
      token(token_nextfree++) = '\0';
      stores (token,
              token_lineno, startcol, token_pos,
              B_internal_identifier, curfile);

    } else if (((ch == '.') && isdigit(peek)) || (isdigit (ch))) {  // -->0B.110101100P12L
      startline = FALSE;
      /* Number */
      acceptable_range = &is_decimal;
      if (ch == '0') { // must be binary, octal, or hex.
        // Store as a string...

        makespace (token, token_nextfree+3);
        if (peek == 'b' || peek == 'B' || (peek == 'x') || (peek == 'X')) {
          token(token_nextfree++) = ch; // '0'
          acceptable_range = (peek == 'b' || peek == 'B' ? &is_binary : &is_hex);
          ch = xfgetc (); // get the B
          token(token_nextfree++) = ch; // b
          if (peek == '.') {
            goto READ_DECIMAL;
          }
        } else if (peek == '.') {
          token(token_nextfree++) = ch; // '0'
          // started with a 0 so octal unless it is 0.19 etc...
          acceptable_range = &is_decimal;
          goto READ_DECIMAL;
        } else {
          acceptable_range = &is_octal; // the 0 has been read.  If nothing acceptable follows, don't read any more chars
          token(token_nextfree++) = ch; // digit
        }
      } else if (ch == '.') {  //   .nnn or 0.nnn
        acceptable_range = &is_decimal;
        xungetc (ch, sourcefile); // peek is now '.'
        goto READ_DECIMAL;
      } else if (isdigit(ch)) {
        acceptable_range = &is_decimal;
        token(token_nextfree++) = ch; // digit
      }

      // rest of number comes through here for all formats
      while (acceptable_range(peek)) {  // 0B-->.110101100P12L
        ch = xfgetc (); // what was peek is now ch
        makespace (token, token_nextfree+2);
        token(token_nextfree++) = ch; // 0
      }
    READ_DECIMAL:
      if (peek == 'p' || peek == 'P') {
        goto BINARY_POWER;
      } else if (peek == '.' || peek == 'e' || peek == 'E') { 
        if (peek == '.') {
          ch = xfgetc ();
          token(token_nextfree++) = ch;
          while (acceptable_range(peek)) {
            ch = xfgetc (); // what was peek is now ch
            makespace (token, token_nextfree+2);
            token(token_nextfree++) = ch; // 0
          }
        }
        if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) {
          ch = xfgetc ();
          token(token_nextfree++) = ch;
        } else if (((acceptable_range == is_decimal) && (peek == 'e' || peek == 'E'))
            || ((acceptable_range != is_decimal) && (peek == 'p' || peek == 'P'))) {
        BINARY_POWER:
          ch = xfgetc ();
          token(token_nextfree++) = ch; // P
          acceptable_range = &is_decimal; // P+nnn nnn is decimal
          if ((peek == '+') || (peek == '-')) {
            ch = xfgetc ();
            token(token_nextfree++) = ch;
          }
          if (!acceptable_range(peek)) {
            token(token_nextfree) = '\0';
            fprintf(stderr, "Warning: bad floating point literal '%s'\n", token);
          }
          while (acceptable_range(peek)) { // SHOULD pick up P12L etc
            makespace (token, token_nextfree+2);
            ch = xfgetc (); // what was peek is now ch
            token(token_nextfree++) = ch; // 0
          }
          if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) {
            ch = xfgetc ();
            token(token_nextfree++) = ch;
          }
        }
      } else if ((peek == 'u') || (peek == 'U')) {
        ch = xfgetc ();
        token(token_nextfree++) = ch;
      } else if ((peek == 'l') || (peek == 'L')) {
        ch = xfgetc ();
        token(token_nextfree++) = ch;
        if (ch == peek) { // 'll' or 'LL' allowed but not 'lL' or 'Ll'
          ch = xfgetc ();
          token(token_nextfree++) = ch;
        }
      }
      makespace (token, token_nextfree+1);
      token(token_nextfree++) = '\0';
      if (strchr(token, '.') == NULL) {
        stores (token,
                token_lineno, startcol, token_pos,
                B_integer_constant, curfile); // changed from col to startcol
      } else {
        stores (token,
                token_lineno, startcol, token_pos,
                B_floating_constant, curfile); // changed from col to startcol
      }
    } else {
      switch (ch) {

      case '\'':               // Handle 'c' char const
      case '"':                // Handle "string"
        /* literals */
        startline = FALSE;
        {
          int string_nextfree = 0, string_arraysize = 0, quotech = ch;
          char *string = NULL;

          for (;;) {
            ch = xfgetc ();   // Newlines are allowed
            makespace (string, string_nextfree+3);
            if (ch == '\\') {
              ch = xfgetc ();
              if (ch == '\\') {
                string[string_nextfree++] = ch;
              } else if (ch == '\'') {
                string[string_nextfree++] = '\'';
              } else if (ch == '"') {
                string[string_nextfree++] = '"';
              } else if (ch == 'n') {
                string[string_nextfree++] = '\n';
              } else if (ch == 'r') {
                string[string_nextfree++] = '\r';
              } else if (ch == 't') {
                string[string_nextfree++] = '\t';
              } else if (ch == '0') {
                string[string_nextfree++] = '\0';
              } else {
                // Warn of unknown (to me) \x escape.  Probably an error.
                string[string_nextfree++] = '\\';
                string[string_nextfree++] = ch;
                fprintf(stderr, "Warning: un-handled escape '\\%c'\n", ch);
              }
            } else if (ch != quotech) {
              string[string_nextfree++] = ch;
            } else {
              string[string_nextfree] = '\0';
              break;
            }
          }

          if (quotech == '\'') {
            int stringlen = 0;
            char *s;
            stores (string,
                    token_lineno, token_col, token_pos,
                    B_sqstring, curfile);
            s = string;
            while (*s != '\0') {
              if (*s == '\\') s += 2; else s += 1;
              stringlen += 1;
            }
            if (stringlen == 1) {
              // good char const
            } else if (stringlen <= 4) {
              // Warn that 'xx' as a 32-bit int is a non-standard extension
              fprintf(stderr, "Warning: multi-byte characters constants such as \'%s\' are not well supported.\n", string); // don't escape - already escaped
            } else {
              // Warn that this is probably a string with the wrong type of quote.
              fprintf(stderr, "Warning: wrong kind of string quote used? - \'%s\'.\n", string); // don't escape - already escaped
            }
          } else stores (string,
                         token_lineno, token_col, token_pos,
                         B_dqstring, curfile);
          free (string);
        }
        break;

      case '/':
        /* COMMENTS (or just a divide symbol) */

        startline = FALSE;
        if (peek == '/') { // comment to end of line
          //  '//' comments are part of whitespace - skip over them, but whitespace_start must not be updated
          do {
            ch = xfgetc ();
          } while (ch != '\n');

        } else if (peek == '*') { /* Handle potential multi-line comment */
          ch = xfgetc ();     //  '/* ... */' comments are part of whitespace - skip over them, but whitespace_start must not be updated
          for (;;) {
            ch = xfgetc ();
            if ((ch == '*') && (peek == '/')) {
              ch = xfgetc (); // get the '/' too.
              break;
            }
          }
        } else {
          // a divide symbol, not a comment
          storec (ch,
                  token_lineno, token_col-1, token_pos,
                  B_char, curfile); 
        }
        break;

        // WHITESPACE
      case '\n':
      case '\r':
        startline = TRUE;
        break;

      case '\t':
      case ' ':
        // white space does not affect startline.
        break;

        // DIRECTIVES
      case '#':
        // should allow '#' as a directive if startline is true. (and not in a comment of course)
        // for now we'll just copy # directives straight through to the output
        if (startline) {
          int i = 0, lastch = -1;
          static char CPP_directive[256]; // small for now. Will make flex once tested.
          for (;;) {
            ch = xfgetc ();
            if (ch == EOF) exit(1);// fix
            ch &= 255;
            if ((ch == '\n') && (lastch != '\\')) { // e.g. #define X  some multiline text with '\' at the end of all lines except the last
              break;
            }
            CPP_directive[i++] = ch;
            lastch = ch;
            if (i == 255) {
              // skip any more text for now.
              for (;;) {
                ch = xfgetc ();
                if (ch == EOF) exit(1);// fix
                ch &= 255;
                if (ch == '\n' && lastch != '\\') {
                  break;
                }
                lastch = ch;
              }
              break;
            }
          }
          CPP_directive[i] = '\0';
          fprintf(stderr, "Warning: pre-processor directive skipped: #%s\n", CPP_directive);
          ch = '\n';
          startcol = nextch_col;
          startline = TRUE;
          break;
        }
        // Drop through

      default:
        storec (ch,
                token_lineno, token_col, token_pos,
                B_char, curfile);
        startline = FALSE;
      }
    }
    return;
  }
}

// For reasons to be explained later, the code in the on-demand source-fetching
// must not use the flex mechanism if there is any possibility of 'c' itself
// being reallocated while evaluating the index of a c(x) reference.  Space for
// tokens in the c[] array must have been pre-claimed by makespace before a
// non-existent c(x) entry is accessed.  Hence c() was not defined by the
// usual FLEX macro and the use of String() within the line reconstruction
// and any code that it calls (eg stores()) must use c[] rather than c(), and
// the alternative CString() ...