// Line reconstruction (lexing) for C
// CString(i): address of the string-pool entry named by token c[i].
// The whole expansion is parenthesised so that postfix operators applied to
// the result (e.g. CString(i)[0]) bind to the address, not to the pool element.
#define CString(i) (&stringpool(c[(i)].sIndex))
// Returns 1 when ch is a binary digit ('0' or '1'), otherwise 0.
int is_binary(int ch) {
  if (ch == '0') return 1;
  return ch == '1';
}
// Returns 1 when ch is an octal digit ('0'..'7'), otherwise 0.
int is_octal(int ch) {
  if (ch < '0') return 0;
  return ch <= '7';
}
// Returns 1 when ch is a decimal digit ('0'..'9'), otherwise 0.
int is_decimal(int ch) {
  if (ch < '0' || ch > '9') return 0;
  return 1;
}
// Returns 1 when ch is a hexadecimal digit ('0'..'9', 'a'..'f' or 'A'..'F'),
// otherwise 0.
int is_hex(int ch) {
  if (ch >= '0' && ch <= '9') return 1;
  if (ch >= 'a' && ch <= 'f') return 1;
  return ch >= 'A' && ch <= 'F';
}
// Snapshot of the reader state, refreshed by xfgetc() at the start of every call.
static int lastch = EOF;      // character held over from the previous xfgetc() call
static int lastch_pos = 0;    // nextch_pos before the fetch
static int lastch_lineno = 0; // nextch_lineno before the fetch
static int lastch_col = 0;    // nextch_col before the fetch
static int lastch_peek = 0;   // peek before the fetch
// Fetch the next source character.
// First copies the current reader state into the lastch* snapshot variables,
// then returns EOF if nextch_pos has reached source_length.  Otherwise it reads
// the character at nextch_pos, updates the line/column counters, advances
// nextch_pos, loads 'peek' with the character now at nextch_pos, and returns
// the fetched character masked to 0..255.
static int xfgetc (void) {
  // Raw value of the most recently fetched character; persists across calls.
  // This is local to xfgetc and is not the 'ch' that callers assign the result to.
  static int fetched;

  // snapshot the state as it was before this fetch
  lastch_peek = peek;
  lastch_col = nextch_col;
  lastch_lineno = nextch_lineno;
  lastch_pos = nextch_pos;
  lastch = fetched;

  if (nextch_pos >= source_length) return EOF;

  fetched = source_address(nextch_pos);
  if (fetched != '\n') {
    nextch_col += 1;
  } else {
    nextch_lineno += 1;
    nextch_col = 0;
  }
  //if (debug_line_reconstruction) fprintf(stderr, "Ch %d of %d: {%c}\n", nextch_pos, source_length, ch);
  peek = source_address(++nextch_pos);
  return fetched & 255;
}
// Undo the most recent xfgetc(): step nextch_pos back one character, restore
// nextch_lineno/nextch_col, and recompute 'ch' and 'peek'.
// For a xfgetc char, not for peek.
//   c - the value the preceding xfgetc() returned.  xfgetc() returns EOF
//       without advancing, so ungetting EOF must leave the position alone;
//       stepping back there would make the caller re-read the last character
//       of the file for ever.
//   f - unused.
static void xungetc (int c, FILE * f) {
  // rather than subtracting 1 from positions it may be better to save all
  // the positions in xfgetc and unwind them, thus avoiding the issue of
  // whether we're ungetting a token or whitespace. Or supply two xungetc's...
  (void) f;
  if (c == EOF) return; // nothing was consumed, so there is nothing to undo
  if (nextch_pos > 0) {
    nextch_pos -= 1;
    if (nextch_col == 0) {
      // Column 0 means the character being put back was a newline: go back to
      // the previous line and recount its length to recover the column.
      int p;
      nextch_lineno -= 1;
      p = source_line(nextch_lineno);
      // test the bound before reading so we never look at or beyond source_length
      while ((p < source_length) && (source_address(p) != '\n')) {
        nextch_col += 1; p += 1;
      }
    } else nextch_col -= 1;
  } else {
    // invalid ungetc. nothing to unget at start of file.
  }
  // 'ch' becomes the character before the new read position.  At position 0
  // there is no such character, so use EOF rather than reading index -1.
  if (nextch_pos > 0) ch = source_address(nextch_pos-1); else ch = EOF;
  if (nextch_pos >= source_length) peek = EOF; else peek = source_address(nextch_pos);
}
// Fatal-error exit: writes a diagnostic naming the given line number to stderr
// and terminates the process with EXIT_FAILURE.
// (The message is a little nod to the past...)
void em_char_in_stmnt(int line) {
  // stop-gap measure - needs a longjump to end of compile code to clean up
  fprintf(stderr,
          "EM CHAR IN STMNT - DISASTER (detected at parser line %d)\n",
          line);
  exit(EXIT_FAILURE);
}
// From here on, em_char_in_stmnt() with no arguments passes the caller's own
// source line number.
#define em_char_in_stmnt() em_char_in_stmnt(__LINE__)
// line_reconstruction: lex the next item from the source text.
//
// Each call reads characters through xfgetc() and deals with exactly one of:
//   - end of input: a B_EOF entry is written directly into c[];
//   - a doubled operator (++ -- && || ==), an identifier/keyword, a numeric
//     literal, a quoted literal, or any other single character: handed to
//     stores()/storec();
//   - whitespace, a comment, or a '#' directive at the start of a line:
//     consumed without calling stores()/storec().
// It then returns; the for(;;) body never runs a second pass.  It is up to the
// caller to keep calling until sufficient data is loaded.
void line_reconstruction (void) {
  int (*acceptable_range)(int ch); // digit test for the radix of the number being read
  int floating_literal;            // TRUE once a '.' or an exponent has been consumed
  static int token_nextfree, startcol;
  static int token_arraysize = 0;
  static char *token = NULL;
#define token(x) FLEX(token, x, __LINE__)
  static int init=0;
  // Recognise single and double-quoted strings
  // while removing comments and handling line continuation.
  // The c[] array gets the canonicalised version of the text, BUT retains
  // pointers to the original text including whitespace, which comes in
  // handy for any source-to-source transformations such as indent (pretty printing)
  // or transpiling to another language (eg C to Javascript)
  if (init == 0) {
    // NOTE(review): this sets at_startofline, but the '#' handling below tests
    // 'startline' - confirm the two are meant to be different variables.
    at_startofline = TRUE;
    makespace(token,1024); // quick hack only for test.imp fix as soon as lexer is written.
    init = 1;
  }
  /* Pre-process input ready for parsing. Tokens are stored in array c[] */
  // NOTE: we are not yet handling include files...
  for (;;) {
    startcol = nextch_col;
    token_nextfree = 0;
    token_lineno = nextch_lineno;
    token_col = nextch_col;
    token_pos = nextch_pos; // actual text of token may start here but whitespace goes back to end of previous token.
    if ((ch = xfgetc ()) == EOF) {
      // set up a dummy at the end because we sometimes look ahead by 1
      // in the parsing code and don't want to hit uninitialised data.
      c[c_nextfree].f = curfile;
      c[c_nextfree].sIndex = str_to_pool("<EOF>"); // however we want to be able to print B_EOF and have any blank lines
      c[c_nextfree].t = B_EOF; // or comments before the end-of-file be output... so maybe "" would be better?
      c[c_nextfree].lineno_sp = whitespace_lineno;
      c[c_nextfree].col_sp = whitespace_col;
      c[c_nextfree].start_sp = whitespace_pos;
      c[c_nextfree].lineno_tok = nextch_lineno;
      c[c_nextfree].col_tok = nextch_col;
      c[c_nextfree].start_tok = nextch_pos;
      c[c_nextfree].end_tok = nextch_pos;
      c_nextfree++;
#ifdef NEVER
      if (debug_tokens || debug_line_reconstruction) { // print tokens once parser hits end of file on input stream
        int i;
        fprintf (stderr, "\nLexical token stream:\n\n");
        for (i = 0; i < c_nextfree; i++) {
          fprintf (stderr, "C[%d].f=\"%s\", .lineno=%d, .col=%d, .type=%0d, .s=\"%s\" (at %p)\n",
                   i, c[i].f, c[i].lineno_tok, c[i].col_tok, c[i].t, escape(CString(i),'"'), CString(i));
        }
      }
#endif
      return;
    }
    if ((ch == '+' || ch == '-' || ch == '&' || ch == '|' || ch == '=') && (peek == ch)) {
      startline = FALSE;
      /* ++, --, &&, ||, == */
      // only a very small number of compound graphemes need to be converted to lexical tokens (BIPs) to avoid parsing ambiguities
      makespace (token, token_nextfree+4);
      token(token_nextfree++) = ch;
      token(token_nextfree++) = ch;
      token(token_nextfree++) = '\0';
      ch = xfgetc (); // eat 'peek' ch.  (same character as before, so the test below still works)
      stores (token,
              token_lineno, startcol, token_pos,
              (ch=='+'?B_pp:ch=='-'?B_mm:ch=='&'?B_andand:ch=='='?B_eqeq:B_oror), curfile);
    } else if (isalpha (ch) || (ch == '_')) {
      startline = FALSE;
      /* token or keyword */
      for (;;) {
        makespace (token, token_nextfree+1);
        if (isalpha (ch) || isdigit (ch) || (ch == '_')) {
          // digits allowed after 1st char.
          token(token_nextfree++) = ch;
        } else break;
        ch = xfgetc ();
      }
      xungetc (ch, sourcefile); // put back the character that ended the identifier
      token(token_nextfree++) = '\0';
      stores (token,
              token_lineno, startcol, token_pos,
              B_internal_identifier, curfile);
    } else if (((ch == '.') && isdigit(peek)) || (isdigit (ch))) { // -->0B.110101100P12L
      startline = FALSE;
      /* Number */
      floating_literal = FALSE;
      acceptable_range = &is_decimal;
      if (ch == '0') { // must be binary, octal, or hex.
        // Store as a string...
        makespace (token, token_nextfree+3);
        if (peek == 'b' || peek == 'B' || (peek == 'x') || (peek == 'X')) {
          token(token_nextfree++) = ch; // '0'
          acceptable_range = (peek == 'b' || peek == 'B' ? &is_binary : &is_hex);
          ch = xfgetc (); // get the B
          token(token_nextfree++) = ch; // b
          if (peek == '.') {
            goto READ_DECIMAL;
          }
        } else if (peek == '.') {
          token(token_nextfree++) = ch; // '0'
          // started with a 0 so octal unless it is 0.19 etc...
          acceptable_range = &is_decimal;
          goto READ_DECIMAL;
        } else {
          acceptable_range = &is_octal; // the 0 has been read. If nothing acceptable follows, don't read any more chars
          token(token_nextfree++) = ch; // digit
        }
      } else if (ch == '.') { // .nnn or 0.nnn
        acceptable_range = &is_decimal;
        xungetc (ch, sourcefile); // peek is now '.'
        goto READ_DECIMAL;
      } else if (isdigit(ch)) {
        acceptable_range = &is_decimal;
        token(token_nextfree++) = ch; // digit
      }
      // rest of number comes through here for all formats
      while (acceptable_range(peek)) { // 0B-->.110101100P12L
        ch = xfgetc (); // what was peek is now ch
        makespace (token, token_nextfree+2);
        token(token_nextfree++) = ch; // 0
      }
    READ_DECIMAL:
      if (peek == 'p' || peek == 'P') {
        goto BINARY_POWER;
      } else if (peek == '.' || peek == 'e' || peek == 'E') {
        if (peek == '.') {
          ch = xfgetc ();
          token(token_nextfree++) = ch;
          floating_literal = TRUE; // this is the only place a '.' enters the token
          while (acceptable_range(peek)) {
            ch = xfgetc (); // what was peek is now ch
            makespace (token, token_nextfree+2);
            token(token_nextfree++) = ch; // 0
          }
        }
        if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) {
          ch = xfgetc ();
          token(token_nextfree++) = ch;
        } else if (((acceptable_range == is_decimal) && (peek == 'e' || peek == 'E'))
                   || ((acceptable_range != is_decimal) && (peek == 'p' || peek == 'P'))) {
        BINARY_POWER:
          // An exponent makes the literal floating even without a '.', e.g. 1e5 or 0x1p3.
          floating_literal = TRUE;
          ch = xfgetc ();
          token(token_nextfree++) = ch; // P
          acceptable_range = &is_decimal; // P+nnn nnn is decimal
          if ((peek == '+') || (peek == '-')) {
            ch = xfgetc ();
            token(token_nextfree++) = ch;
          }
          if (!acceptable_range(peek)) {
            token(token_nextfree) = '\0';
            fprintf(stderr, "Warning: bad floating point literal '%s'\n", token);
          }
          while (acceptable_range(peek)) { // SHOULD pick up P12L etc
            makespace (token, token_nextfree+2);
            ch = xfgetc (); // what was peek is now ch
            token(token_nextfree++) = ch; // 0
          }
          if ((peek == 'f') || (peek == 'F') || (peek == 'l') || (peek == 'L')) {
            ch = xfgetc ();
            token(token_nextfree++) = ch;
          }
        }
      } else if ((peek == 'u') || (peek == 'U')) {
        makespace (token, token_nextfree+4); // room for up to three suffix letters
        ch = xfgetc ();
        token(token_nextfree++) = ch;
        if ((peek == 'l') || (peek == 'L')) { // UL, ULL
          ch = xfgetc ();
          token(token_nextfree++) = ch;
          if (ch == peek) { // 'll' or 'LL' allowed but not 'lL' or 'Ll'
            ch = xfgetc ();
            token(token_nextfree++) = ch;
          }
        }
      } else if ((peek == 'l') || (peek == 'L')) {
        makespace (token, token_nextfree+4); // room for up to three suffix letters
        ch = xfgetc ();
        token(token_nextfree++) = ch;
        if (ch == peek) { // 'll' or 'LL' allowed but not 'lL' or 'Ll'
          ch = xfgetc ();
          token(token_nextfree++) = ch;
        }
        if ((peek == 'u') || (peek == 'U')) { // LU, LLU
          ch = xfgetc ();
          token(token_nextfree++) = ch;
        }
      }
      makespace (token, token_nextfree+1);
      token(token_nextfree++) = '\0';
      if (!floating_literal) {
        stores (token,
                token_lineno, startcol, token_pos,
                B_integer_constant, curfile); // changed from col to startcol
      } else {
        stores (token,
                token_lineno, startcol, token_pos,
                B_floating_constant, curfile); // changed from col to startcol
      }
    } else {
      switch (ch) {
      case '\'': // Handle 'c' char const
      case '"': // Handle "string"
        /* literals */
        startline = FALSE;
        {
          int string_nextfree = 0, string_arraysize = 0, quotech = ch;
          char *string = NULL;
          for (;;) {
            ch = xfgetc (); // Newlines are allowed
            makespace (string, string_nextfree+3);
            if (ch == EOF) {
              // xfgetc keeps returning EOF, so without this test an unterminated
              // literal would loop (and grow the buffer) for ever.
              fprintf(stderr, "Warning: unterminated string or character literal at end of file\n");
              string[string_nextfree] = '\0';
              break;
            }
            if (ch == '\\') {
              ch = xfgetc ();
              if (ch == EOF) {
                // dangling backslash at end of file: the EOF test at the top of
                // the next pass closes the literal.
              } else if (ch == '\\') {
                string[string_nextfree++] = ch;
              } else if (ch == '\'') {
                string[string_nextfree++] = '\'';
              } else if (ch == '"') {
                string[string_nextfree++] = '"';
              } else if (ch == 'n') {
                string[string_nextfree++] = '\n';
              } else if (ch == 'r') {
                string[string_nextfree++] = '\r';
              } else if (ch == 't') {
                string[string_nextfree++] = '\t';
              } else if (ch == '0') {
                // NOTE(review): this stores a NUL, which ends the string for the
                // '\0'-terminated scan below - confirm stores() copes with that.
                string[string_nextfree++] = '\0';
              } else {
                // Warn of unknown (to me) \x escape. Probably an error.
                string[string_nextfree++] = '\\';
                string[string_nextfree++] = ch;
                fprintf(stderr, "Warning: un-handled escape '\\%c'\n", ch);
              }
            } else if (ch != quotech) {
              string[string_nextfree++] = ch;
            } else {
              string[string_nextfree] = '\0';
              break;
            }
          }
          if (quotech == '\'') {
            int stringlen = 0;
            char *s;
            stores (string,
                    token_lineno, token_col, token_pos,
                    B_sqstring, curfile);
            s = string;
            while (*s != '\0') {
              // A backslash that is the last byte (as produced by '\\') counts as
              // one character; skipping two would step over the terminator.
              if ((*s == '\\') && (s[1] != '\0')) s += 2; else s += 1;
              stringlen += 1;
            }
            if (stringlen == 1) {
              // good char const
            } else if (stringlen <= 4) {
              // Warn that 'xx' as a 32-bit int is a non-standard extension
              fprintf(stderr, "Warning: multi-byte characters constants such as \'%s\' are not well supported.\n", string); // don't escape - already escaped
            } else {
              // Warn that this is probably a string with the wrong type of quote.
              fprintf(stderr, "Warning: wrong kind of string quote used? - \'%s\'.\n", string); // don't escape - already escaped
            }
          } else stores (string,
                         token_lineno, token_col, token_pos,
                         B_dqstring, curfile);
          free (string);
        }
        break;
      case '/':
        /* COMMENTS (or just a divide symbol) */
        if (peek == '/') { // comment to end of line
          // '//' comments are part of whitespace - skip over them, but whitespace_start must not be updated
          do {
            ch = xfgetc ();
          } while ((ch != '\n') && (ch != EOF)); // stop at EOF too: the last line may lack a newline
          startline = TRUE; // the comment swallowed its newline, so the next character starts a line
        } else if (peek == '*') { /* Handle potential multi-line comment */
          // A block comment counts as whitespace, so like ' ' it leaves startline alone.
          ch = xfgetc (); // '/* ... */' comments are part of whitespace - skip over them, but whitespace_start must not be updated
          for (;;) {
            ch = xfgetc ();
            if (ch == EOF) {
              // unterminated comment: xfgetc would return EOF for ever
              fprintf(stderr, "Warning: unterminated comment at end of file\n");
              break;
            }
            if ((ch == '*') && (peek == '/')) {
              ch = xfgetc (); // get the '/' too.
              break;
            }
          }
        } else {
          // a divide symbol, not a comment
          startline = FALSE;
          // NOTE(review): token_col-1 here differs from token_col in the default case - confirm intended.
          storec (ch,
                  token_lineno, token_col-1, token_pos,
                  B_char, curfile);
        }
        break;
      // WHITESPACE
      case '\n':
      case '\r':
        startline = TRUE;
        break;
      case '\t':
      case ' ':
        // white space does not affect startline.
        break;
      // DIRECTIVES
      case '#':
        // should allow '#' as a directive if startline is true. (and not in a comment of course)
        // for now we'll just copy # directives straight through to the output
        if (startline) {
          int i = 0, lastch = -1; // this lastch is local and hides the file-level one
          static char CPP_directive[256]; // small for now. Will make flex once tested.
          for (;;) {
            ch = xfgetc ();
            if (ch == EOF) exit(1);// fix
            ch &= 255;
            if ((ch == '\n') && (lastch != '\\')) { // e.g. #define X some multiline text with '\' at the end of all lines except the last
              break;
            }
            CPP_directive[i++] = ch;
            lastch = ch;
            if (i == 255) {
              // buffer is full: skip any more text for now.
              for (;;) {
                ch = xfgetc ();
                if (ch == EOF) exit(1);// fix
                ch &= 255;
                if (ch == '\n' && lastch != '\\') {
                  break;
                }
                lastch = ch;
              }
              break;
            }
          }
          CPP_directive[i] = '\0';
          fprintf(stderr, "Warning: pre-processor directive skipped: #%s\n", CPP_directive);
          ch = '\n';
          startcol = nextch_col;
          startline = TRUE;
          break;
        }
        // Drop through: a '#' that is not at the start of a line is an ordinary character
      default:
        storec (ch,
                token_lineno, token_col, token_pos,
                B_char, curfile);
        startline = FALSE;
      }
    }
    return;
  }
}
// For reasons to be explained later, the code in the on-demand source-fetching
// must not use the flex mechanism if there is any possibility of 'c' itself
// being reallocated while evaluating the index of a c(x) reference. Space for
// tokens in the c[] array must have been pre-claimed by makespace before a
// non-existent c(x) entry is accessed. Hence c() was not defined by the
// usual FLEX macro and the use of String() within the line reconstruction
// and any code that it calls (eg stores()) must use c[] rather than c(), and
// the alternative CString() ...