/* # syntax for a very small subset of C as generated by the static binary translator # after running through both 'dumpf' and 'peephole'... # Note that expressions must be fully bracketed - no operator precedences allowed. # If we hit constructs that are too awkward to parse, go back to the code generator # and modify it so that they are never generated in the first place... # For instance, "a + b + c" would need to be rewritten as "(a + b) + c" */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include <ctype.h> #include <regex.h> #ifndef FALSE #define FALSE (0!=0) #define TRUE (0==0) #endif char *emergency; static int debug = 0; #define MAX_LINES 100000 #define MAX_LINE 1024 // entire program is read into this array. We could do it a line at a time but // this leaves options open for later. char *prog_line[MAX_LINES]; // max lines in a generated program int end_of_file = 0; // exclusive upper bound - line after last used line // for parsing: int cur_line = 0; int cur_col = 0; // for syntax failure message: int best_line = 0; int best_col = 0; void preload_entire_source(FILE *in) { // read entire program before starting. char s[1024]; // max length of a single line... int c; int cp; // char position for (;;) { // loop over lines if (end_of_file == MAX_LINES) { fprintf(stderr, "parse: too many input lines for program to store. Ask the programmer to increase MAX_LINES\n"); exit(EXIT_FAILURE); } cp = 0; for (;;) { // loop over chars in a line c = fgetc(in); if (c == EOF) { if (cp == 0) return; s[cp] = '\0'; prog_line[end_of_file++] = strdup(s); // to do: check malloc ok return; } if (c == '\r') continue; if (c == '\n') { s[cp] = '\0'; prog_line[end_of_file++] = strdup(s); // to do: check malloc ok break; // next line } s[cp++] = c; if (cp == MAX_LINE) { fprintf(stderr, "parse: A single input line (%d) was too long. Ask the programmer to increase MAX_LINE\n", end_of_file); exit(EXIT_FAILURE); } } } } enum phrase { NONE, SS, SS__COMMENT, SS__BR_LIST, SS__CASELAB, SS__LAB, SS__ASSIG, SS__STRINGPROCCALL, SS__PROCCALL, SS__IF, SS__GOTO, SS__JUMP, SS__NULL, SS_LIST, IDENT, LABEL, VAR, VALUE, VAL__COND_EXPR, VAL__BR_EXPR, OPT_INDEX, FNNAME, PROCNAME, PARAM_LIST, REST_OF_PARAM_LIST, EXPR, FNCALL, TYPED_VAR, TYPED_VAR__CAST, TYPED_VAR__DEFAULT, OPT_SECOND_OPERAND, OPT_SECOND_OPERAND__PRESENT, OPT_SECOND_OPERAND__ABSENT, OPT_MONOP, CASTNAME, BINOP, SELECTOR, HEX_CONSTANT, POSITIVE_INTEGER, STRING_LITERAL }; // Tuples are the nodes of the AST (Abstract Syntax Tree). typedef struct tuple { int phraseno; int subphrase; // might not need this. tuple format is under development. int line; int col; char *matched; struct tuple *phrase[10]; // max args to a proc, for safety. } tuple; tuple *new_tuple(int type) { // currently we never free anything. RAM is cheap. If we really needed to save // space, a mark/release scheme would be more efficient than calloc/free, // though just handling a line at a time would be much better... tuple *t; t = calloc(1, sizeof(tuple)); // we rely on remaining fields being initialised to 0's. t->phraseno = type; return t; } // indentation for parser tracing. static int indent = 0; char *sp(void) { static char string[1024]; char *p = string; int i, ind=indent; // this hack is because parsing an entire program would lead to huge indents. if (ind > 6000) { int hashes = (ind-40)/2000; for (i = 0; i < hashes; i++) *p++ = '#'; ind -= hashes*2000; } if (ind > 600) { int ats = (ind-40)/200; for (i = 0; i < ats; i++) *p++ = '@'; ind -= ats*200; } if (ind > 60) { int plusses = (ind-40)/20; for (i = 0; i < plusses; i++) *p++ = '+'; ind -= plusses*20; } for (i = 0; i < ind; i++) *p++ = ' '; *p = '\0'; return string; } void in(const char *s) { // called on entry to a parsing procedure if debugging int i; char *p; // skip spaces and newlines before printing text that follows cur pos. // We actually update the current position - it's not just for the diagnostic message. // - this may be a bad design decision... should try a version where cur_* is restored. p = prog_line[cur_line]+cur_col; for (;;) { if (cur_line >= end_of_file) break; while (*p == ' ') { p += 1; cur_col += 1; } // skip newlines as well as spaces if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; p = prog_line[cur_line]+cur_col; } else break; } if (debug <= 0) return; //for (i = 0; i < indent; i++) fprintf(stderr, " "); fprintf(stderr, "%s>> %s at: \"%s\"\n", sp(), s, (cur_line >= end_of_file ? "<EOF>" : p)); indent += 2; } #define IN in(__FUNCTION__) int out(int success, const char *s) { // called on exit from a parsing procedure if debugging //int i; int line = cur_line, col = cur_col; char *p; p = prog_line[cur_line]+cur_col; indent -= 2; //for (i = 0; i < indent; i++) fprintf(stderr, " "); if (success) { for (;;) { if (cur_line >= end_of_file) break; while (*p == ' ') { p += 1; cur_col += 1; } // always skip white space on this line before an atom (OK if we leave it skipped) // skip newlines as well as spaces if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; p = prog_line[cur_line]+cur_col; } else break; } if (debug >= 1) fprintf(stderr, "%s<< %s MATCHED leaving: \"%s\"\n", sp(), s, (cur_line == end_of_file ? "<EOF>" : p)); } else { if (debug >= 1) fprintf(stderr, "%s<< %s\n", sp(), s); } cur_line = line; cur_col = col; return success; } #define OUT(cond) out(cond, __FUNCTION__) int parse_ss_list(tuple *ss_list); int parse_selector(tuple *sel); int parse_rest_of_param_list(tuple *param_list); int parse_expr(tuple *expr); int parse_positive_integer(tuple *posint); int parse_hex_constant(tuple *hex); int parse_procname(tuple *fnname); int parse_fnname(tuple *procname); int parse_param_list(tuple *param_list); int parse_cond(tuple *cond); // match a literal string against source at current position. increment // the current position if a match. Leave untouched otherwise. int lex_lit(char *lit, char **matched) { char *p = prog_line[cur_line]+cur_col; IN; if (matched) *matched = NULL; if (debug >= 1) fprintf(stderr, "%sLex: Trying \"%s\"\n", sp(), lit); if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; if (*lit == '\0') return OUT(TRUE); // empty string always matches for (;;) { if (cur_line >= end_of_file) return OUT(FALSE); // non-empty string can't match end of file p = prog_line[cur_line]+cur_col; while (*p == ' ') { // always skip white space on this line before an atom (OK if we leave it skipped) p += 1; cur_col += 1; } if (strcmp(lit, "\n") == 0) { // special case for end-of-line (should always be only char in string) if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; if (matched) *matched = lit; return OUT(TRUE); } return OUT(FALSE); } else { // skip newlines as well as spaces if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; p = prog_line[cur_line]+cur_col; if (debug >= 1) fprintf(stderr, "%s+Lex: Trying \"%s\"\n", sp(), lit); } else break; } } p = prog_line[cur_line]+cur_col; if (strncmp(lit, p, strlen(lit)) == 0) { cur_col += strlen(lit); if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; if (matched) *matched = lit; return OUT(TRUE); } return OUT(FALSE); } // match regex against source at current position. increment the current // position if a match. Leave untouched otherwise. int lex_regex(char *regexs, char **matched) { #define MAX_MATCHES 32 // or is this always 1 in this code? // when a regex matches something in the source, this extracts the matched text as a string auto char *extract(char *s, int start /* inclusive */, int end /* exclusive */) { char *t = malloc(end-start+1); int i; if (t == NULL) { free(emergency); fprintf(stderr, "parse: running out of memory. Programmer may need to restructure the code.\n"); fflush(stderr); fflush(stdout); exit(EXIT_FAILURE); } for (i = 0; i < end-start; i++) t[i] = s[i+start]; t[end-start] = '\0'; return t; } regmatch_t matches[MAX_MATCHES]; regex_t regex; int reti; char msgbuf[100]; char *p = prog_line[cur_line]+cur_col; IN; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; if (debug >= 1) fprintf(stderr, "%sRegex 1: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); if (debug >= 1) fprintf(stderr, "%sTrying \"%s\"\n", sp(), regexs); if (matched == NULL) { fprintf(stderr, "parse: programmer error - 'matched' parameter to lex_regex was passed a null pointer\n"); exit(EXIT_FAILURE); } if (*regexs == '\0') return OUT(TRUE); // empty string always matches for (;;) { p = prog_line[cur_line]+cur_col; if (cur_line >= end_of_file) return OUT(FALSE); // non-empty string can't match end of file while (*p == ' ') { p += 1; cur_col += 1; if (debug >= 1) fprintf(stderr, "%sRegex 2: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); } // always skip white space on this line before an atom (OK if we leave it skipped) if (strcmp(regexs, "^$") == 0) { // special case for end-of-line (should always be only char in string) if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; if (debug >= 1) fprintf(stderr, "%sRegex 3: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); return OUT(TRUE); } if (debug >= 1) fprintf(stderr, "%sRegex 4: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); return OUT(FALSE); } else { // skip newlines as well as spaces if (*p == '\0') { cur_line += 1; cur_col = 0; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; p = prog_line[cur_line]+cur_col; if (debug >= 1) fprintf(stderr, "%s+Trying \"%s\"\n", sp(), regexs); if (debug >= 1) fprintf(stderr, "%sRegex 5: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); } else break; } } /* Compile regular expression */ if (regcomp(®ex, regexs, REG_EXTENDED) != 0) { free(emergency); fprintf(stderr, "parse: programmer error - could not compile regex \"%s\" - we may have run out of malloc memory\n", regexs); fflush(stderr); fflush(stdout); exit(EXIT_FAILURE); } /* Execute regular expression */ reti = regexec(®ex, p, MAX_MATCHES, matches, 0); if (debug >= 1) fprintf(stderr, "%sRegex 6: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); if ((!reti) && (matches[0].rm_so == 0)) { // 0 is in case user forgets the "^" anchor ... // Step over matched string. if (debug >= 1) fprintf(stderr, "%sRegex: pending data is: \"%s\"\n", sp(), prog_line[cur_line]+cur_col); *matched = extract(p, matches[0].rm_so, matches[0].rm_eo); cur_col += strlen(*matched); //cur_col += matches[0].rm_eo-matches[0].rm_so-1; if (cur_line > best_line) { best_line = cur_line; best_col = cur_col; } if ((cur_line == best_line) && (cur_col > best_col)) best_col = cur_col; p = prog_line[cur_line]+cur_col; if (debug >= 1) fprintf(stderr, "%sRegex \"%s\" matched \"%s\" leaving \"%s\"\n", sp(), regexs, *matched, p); /* Free memory allocated to the pattern buffer by regcomp() */ // regfree(®ex); return OUT(TRUE); } else if (reti == REG_NOMATCH) { // No match /* Free memory allocated to the pattern buffer by regcomp() */ // regfree(®ex); // maybe? - http://redhat.com/archives/libvir-list/2013-September/msg00276.html return OUT(FALSE); } free(emergency); regerror(reti, ®ex, msgbuf, sizeof(msgbuf)); fprintf(stderr, "parse: Regex match internal error: %s\n", msgbuf); fflush(stderr); fflush(stdout); exit(EXIT_FAILURE); return OUT(FALSE); } // Parse source file using these phrases from the grammar. Advance the current position // if there is a match - don't move it if there is not. //\\ stringlit := "\"(\\\\.|[^\"])*\""; int parse_stringlit(tuple *stringlit) { return lex_regex("\"(\\\\.|[^\"])*\"", &stringlit->matched); // This regex is why we need regcomp to use REG_EXTENDED. // Simpler regex below does not support \-escaped characters in a string. // return lex_regex("^\".*\"", &stringlit->matched); } //\\ castname := "[A-Z][A-Z]*[0-9][0-9]*"; int parse_castname(tuple *castname) { // SBT casts include some semantic content: [US]INT<bitsize> return lex_regex("^[A-Z][A-Z]*[0-9][0-9]*", &castname->matched); } //\\ ident := "[A-Za-z_][A-Za-z_0-9]*"; int parse_ident(tuple *ident) { ident->phraseno = IDENT; // override label, etc return lex_regex("^[A-Za-z_][A-Za-z_0-9]*", &ident->matched); } //\\ label := <ident>; int parse_label(tuple *labelname) { return parse_ident(labelname); } //\\ <opt-index> := //\\ '[' <expr> ']', //\\ ; int parse_opt_index(tuple *opt_index) { int line = cur_line, col = cur_col; // save current parse position for backtracking... IN; if (lex_lit("[", NULL) && parse_expr(opt_index) && lex_lit("]", NULL)) { opt_index->phraseno = OPT_INDEX; return OUT(TRUE); } else { cur_line = line; cur_col = col; } opt_index->phraseno = NONE; return OUT(TRUE); } int parse_fn_call(tuple *fncall) { // should create an opt-param-list phrase... //\\ <fnname> '(' ')', //\\ <fnname> '(' <param-list> ')', int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *fnname = new_tuple(FNNAME), *param_list = new_tuple(PARAM_LIST); IN; fncall->phraseno = FNCALL; param_list->phrase[0] = NULL; if (parse_fnname(fnname) && lex_lit("(", NULL) && ((lex_lit(")", NULL)) || (parse_param_list(param_list) && lex_lit(")", NULL)))) { // TO DO: plug in procname and param_list fields here... if (param_list->phrase[0] == NULL) { fncall->phrase[1] = NULL; } else { fncall->phrase[1] = param_list; } fncall->phrase[0] = fnname; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } // var can be a LVALUE (as opposed to 'val' which is an RVALUE, but can include a var) //\\ var := //\\ <ident> <opt-index>; int parse_var(tuple *var) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *ident = new_tuple(IDENT), *opt_index = new_tuple(OPT_INDEX); IN; if (parse_ident(ident) && parse_opt_index(opt_index)) { var->phraseno = VAR; var->phrase[0] = ident; var->phrase[1] = opt_index; // NONE if this is a simple variable, OPT_INDEX if an array element return OUT(TRUE); // Simple variables are things like registers, CC etc - the only } // indexed element at the moment is memory[] ... cur_line = line; cur_col = col; return OUT(FALSE); } // Could factor the ( ... ? ... : ... ) vs ( ... ) more efficiently... // val is an RVALUE //\\ val := //\\ '(' <expr> '?' <expr> : <expr> ')', //\\ '(' <expr> ')', //\\ <fncall>, //\\ <var>, //\\ <hex-constant>, //\\ <positive-integer>; int parse_val(tuple *val) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *fncall = new_tuple(FNCALL), *expr = new_tuple(EXPR),*expr1 = new_tuple(EXPR),*expr2 = new_tuple(EXPR); // to do: assign subtypes IN; if (lex_lit("(", NULL) && parse_expr(expr) && lex_lit("?", NULL) && parse_expr(expr1) && lex_lit(":", NULL) && parse_expr(expr2) && lex_lit(")", NULL)) { val->subphrase = VAL__COND_EXPR; val->phrase[0] = expr; val->phrase[1] = expr1; val->phrase[2] = expr2; return OUT(TRUE); } else { cur_line = line; cur_col = col; } if (lex_lit("(", NULL) && parse_expr(expr) && lex_lit(")", NULL)) { val->subphrase = VAL__BR_EXPR; val->phrase[0] = expr; return OUT(TRUE); } else { cur_line = line; cur_col = col; } if (parse_fn_call(val) || parse_var(val) || parse_hex_constant(val) || parse_positive_integer(val)) { return OUT(TRUE); } return OUT(FALSE); } //\\ opt-monop := //\\ '-', '~', '!' //\\ ; int parse_opt_monop(tuple *opt_monop) { IN; if (lex_lit("-", &opt_monop->matched) || lex_lit("~", &opt_monop->matched) || lex_lit("!", &opt_monop->matched)) { return OUT(TRUE); } opt_monop->matched = ""; return OUT(TRUE); } // could modify grammar to handle multiple casts here. // Currently redundant cast removal is a nasty hack in peephole() in the SBT. //\\ typed-val := //\\ '(' <castname> ')' <opt-monop> <val>, //\\ <opt-monop> <val>; int parse_typed_var(tuple *typed_var) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *castname = new_tuple(CASTNAME), *opt_monop = new_tuple(OPT_MONOP), *val = new_tuple(VALUE); // could shorten this... IN; if (lex_lit("(", NULL) && parse_castname(castname) && lex_lit(")", NULL) && parse_opt_monop(opt_monop) && parse_val(val)) { typed_var->subphrase = TYPED_VAR__CAST; typed_var->phrase[0] = castname; typed_var->phrase[1] = opt_monop; typed_var->phrase[2] = val; return OUT(TRUE); } else { cur_line = line; cur_col = col; } if (parse_opt_monop(opt_monop) && parse_val(val)) { typed_var->subphrase = TYPED_VAR__DEFAULT; typed_var->phrase[0] = NULL; typed_var->phrase[1] = opt_monop; typed_var->phrase[2] = val; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ binop := //\\ '>>', '<<', '+', '*', '/', '-', '&&', '||', '&', '|', '^', '<=', '<', '>=', '>', '!=', '=='; int parse_binop(tuple *binop) { IN; if (lex_lit(">>", &binop->matched) || lex_lit("<<", &binop->matched) || lex_lit("+", &binop->matched) || lex_lit("*", &binop->matched) || lex_lit("/", &binop->matched) || lex_lit("-", &binop->matched) || lex_lit("&&", &binop->matched) || lex_lit("||", &binop->matched) || lex_lit("&", &binop->matched) || lex_lit("|", &binop->matched) || lex_lit("^", &binop->matched) || lex_lit("<=", &binop->matched) || lex_lit("<", &binop->matched) || lex_lit(">=", &binop->matched) || lex_lit(">", &binop->matched) || lex_lit("!=", &binop->matched) || lex_lit("==", &binop->matched)) { return OUT(TRUE); } return OUT(FALSE); } //\\ opt-second_operand := //\\ <binop> <typed-val>, //\\ ; int parse_opt_second_operand(tuple *opt_second_operand) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *binop = new_tuple(BINOP), *typed_var = new_tuple(TYPED_VAR); IN; if (parse_binop(binop) && parse_typed_var(typed_var)) { opt_second_operand->subphrase = OPT_SECOND_OPERAND__PRESENT; opt_second_operand->phrase[0] = binop; opt_second_operand->phrase[1] = typed_var; return OUT(TRUE); } else { cur_line = line; cur_col = col; } opt_second_operand->subphrase = OPT_SECOND_OPERAND__ABSENT; opt_second_operand->phrase[0] = NULL; opt_second_operand->phrase[1] = NULL; return OUT(TRUE); } //\\ expr := //\\ <typed-val> <opt-second_operand>; int parse_expr(tuple *expr) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *typed_var = new_tuple(TYPED_VAR), *opt_second_operand = new_tuple(OPT_SECOND_OPERAND); IN; if (parse_typed_var(typed_var) && parse_opt_second_operand(opt_second_operand)) { expr->phrase[0] = typed_var; expr->phrase[1] = opt_second_operand; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ fnname := <ident>; int parse_fnname(tuple *fnname) { return parse_ident(fnname); } //\\ procname := <ident>; int parse_procname(tuple *procname) { return parse_ident(procname); } //\\ param-list := <expr> <rest-of-param-list>; int parse_param_list(tuple *param_list) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *expr = new_tuple(EXPR), *rest_of_param_list = new_tuple(REST_OF_PARAM_LIST); IN; if (parse_expr(expr) && parse_rest_of_param_list(rest_of_param_list)) { param_list->phrase[0] = expr; param_list->phrase[1] = rest_of_param_list; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ rest-of-param-list := //\\ ',' <expr> <rest-of-param-list>, //\\ ; int parse_rest_of_param_list(tuple *param_list) { int line = cur_line, col = cur_col; // save current parse position for backtracking... tuple *expr = new_tuple(EXPR), *rest_of_param_list = new_tuple(REST_OF_PARAM_LIST); IN; if (lex_lit(",", NULL) && parse_expr(expr) && parse_rest_of_param_list(rest_of_param_list)) { param_list->phrase[0] = expr; param_list->phrase[1] = rest_of_param_list; return OUT(TRUE); } else { cur_line = line; cur_col = col; } param_list->phrase[0] = NULL; param_list->phrase[1] = NULL; return OUT(TRUE); } //\\ ss := int parse_ss(tuple *ss) { auto int parse_comment(tuple *comment) { // Grammar should really have a \n after the comment, but due to the way \n is // handled, it works without one. //\\ "//.*", '/**/'; int line = cur_line, col = cur_col; IN; comment->subphrase = SS__COMMENT; comment->matched = "/**/"; if (lex_regex("^//.*", &comment->matched) || lex_lit("/**/", NULL)) { return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_bracketed_ss_list(tuple *bss) { //\\ '{' <ss-list> '}', tuple *ssl = new_tuple(SS__BR_LIST); // INVALID! should create a top-level BR_LIST int line = cur_line, col = cur_col; IN; bss->subphrase = SS__BR_LIST; if (lex_lit("{", NULL) && parse_ss_list(ssl) && lex_lit("}", NULL)) { bss->phrase[0] = ssl; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_case_label(tuple *caselab) { //\\ "case" <selector> ':', tuple *sel = new_tuple(SELECTOR); int line = cur_line, col = cur_col; IN; caselab->subphrase = SS__CASELAB; if (lex_lit("case", NULL) && parse_selector(sel) && lex_lit(":", NULL)) { caselab->phrase[0] = sel; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_label_statement(tuple *lab_statement) { //\\ <label> ':', tuple *lab = new_tuple(LABEL); int line = cur_line, col = cur_col; IN; lab_statement->subphrase = SS__LAB; if (parse_label(lab) && lex_lit(":", NULL)) { lab_statement->phrase[0] = lab; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_assignment(tuple *assig) { // now handling &= in SBT code generator instead, so removing from here... //\\ <var> '=' <expr> ';', tuple *var = new_tuple(VAR), *expr = new_tuple(EXPR); int line = cur_line, col = cur_col; IN; assig->subphrase = SS__ASSIG; if (parse_var(var) && lex_lit("=", NULL) && parse_expr(expr) && lex_lit(";", NULL)) { assig->phrase[0] = var; assig->phrase[1] = expr; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_procedure_call(tuple *proccall) { //\\ <procname> '(' ')' ';', //\\ <procname> '(' <param-list> ')' ';', tuple *procname = new_tuple(PROCNAME), *param_list = new_tuple(PARAM_LIST); int line = cur_line, col = cur_col; IN; param_list->phrase[0] = NULL; proccall->subphrase = SS__PROCCALL; if (parse_procname(procname) && lex_lit("(", NULL) && ((lex_lit(")", NULL)) || (parse_param_list(param_list) && lex_lit(")", NULL))) && lex_lit(";", NULL)) { proccall->phrase[0] = procname; if (param_list->phrase[0] != NULL) { proccall->phrase[1] = param_list; } else { proccall->phrase[1] = NULL; } return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } // this is really just a special case for mon(" ... "); - strings not used anywhere else in SBT output. auto int parse_string_procedure_call(tuple *proccall) { //\\ <procname> '(' <stringlit> ')' ';', tuple *procname = new_tuple(PROCNAME), *stringlit = new_tuple(STRING_LITERAL); int line = cur_line, col = cur_col; IN; proccall->subphrase = SS__STRINGPROCCALL; if (parse_procname(procname) && lex_lit("(", NULL) && parse_stringlit(stringlit) && lex_lit(")", NULL) && lex_lit(";", NULL)) { // TO DO: plug in procname and param_list fields here... proccall->phrase[0] = procname; proccall->phrase[1] = stringlit; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } auto int parse_if_statement(tuple *ifstat) { // changing SS to <bracketed-ss-list> as an experiment //\\ "if" '(' <expr> ')' <SS>, tuple *expr = new_tuple(EXPR), *ss = new_tuple(SS); int line = cur_line, col = cur_col; IN; ifstat->subphrase = SS__IF; //if (lex_lit("if", NULL) && lex_lit("(", NULL) && parse_cond(cond) && lex_lit(")", NULL) && parse_ss(ss)) { if (lex_lit("if", NULL) && lex_lit("(", NULL) && parse_expr(expr) && lex_lit(")", NULL) && parse_bracketed_ss_list(ss)) { // TO DO: plug in cond and ss fields here... ifstat->phrase[0] = expr; ifstat->phrase[1] = ss; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ "goto" <label> ';', auto int parse_goto_statement(tuple *g) { tuple *lab = new_tuple(LABEL); int line = cur_line, col = cur_col; IN; g->subphrase = SS__GOTO; if (lex_lit("goto", NULL) && parse_label(lab) && lex_lit(";", NULL)) { g->phrase[0] = lab; return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ "JUMP" ';', auto int parse_jump_statement(tuple *g) { int line = cur_line, col = cur_col; IN; g->subphrase = SS__JUMP; if (lex_lit("JUMP", NULL) && lex_lit(";", NULL)) { return OUT(TRUE); } else { cur_line = line; cur_col = col; } return OUT(FALSE); } //\\ ';'; auto int parse_null_statement(tuple *ns) { ns->subphrase = SS__NULL; return lex_lit(";", NULL); } IN; if (parse_comment(ss) || parse_bracketed_ss_list(ss) || parse_case_label(ss) || parse_label_statement(ss) || parse_if_statement(ss) || parse_goto_statement(ss) || parse_procedure_call(ss) || parse_string_procedure_call(ss) || parse_jump_statement(ss) || parse_assignment(ss) || parse_null_statement(ss)) { return OUT(TRUE); } return OUT(FALSE); } //\\ ss-list := //\\ <ss> <ss-list>, //\\ ; int parse_ss_list(tuple *ss_list) { tuple *ss = new_tuple(SS), *more_ss = new_tuple(SS_LIST); int line = cur_line, col = cur_col; IN; ss_list->phrase[0] = NULL; ss_list->phrase[1] = NULL; // a guard... these are the only ways an SS list ends other than in a syntax error... if (lex_lit("}", NULL) || (cur_line >= end_of_file)) { cur_line = line; cur_col = col; return OUT(TRUE); // null option } if (parse_ss(ss) && parse_ss_list(more_ss)) { // the 'more ss' may be empty ss_list->phrase[0] = ss; ss_list->phrase[1] = more_ss; return OUT(TRUE); } else { cur_line = line; cur_col = col; } // a syntax error. 'best_pos' will allow us to show the exact point of failure. return OUT(FALSE); } //\\ hex-constant := "0x[0-9A-Fa-f][0-9A-Fa-f]*[Uu]?"; int parse_hex_constant(tuple *hex) { hex->phraseno = HEX_CONSTANT; return lex_regex("^0x[0-9A-Fa-f][0-9A-Fa-f]*[Uu]?", &hex->matched); // regcomp needs REG_EXTENDED format option // ideally we need to store this as an integer as well, plus cast type information } //\\ positive-integer := "[0-9][0-9]*"; int parse_positive_integer(tuple *posint) { posint->phraseno = POSITIVE_INTEGER; return lex_regex("^[0-9][0-9]*", &posint->matched); // didn't add U or L as we never use it in SBT-generated code // negative numbers are built as an expression using monop '-' } //\\ integer := int parse_integer(tuple *num) { tuple *hex = new_tuple(HEX_CONSTANT), *posint = new_tuple(POSITIVE_INTEGER); IN; //\\ <hex-constant>, if (parse_hex_constant(hex)) { num->subphrase = HEX_CONSTANT; num->phrase[0] = hex; return OUT(TRUE); } //\\ <positive-integer>; if (parse_positive_integer(posint)) { num->subphrase = POSITIVE_INTEGER; num->phrase[0] = posint; return OUT(TRUE); } return OUT(FALSE); } //\\ selector := <integer>; int parse_selector(tuple *sel) { return parse_integer(sel); } void p(tuple *t) { if (t == NULL) return; switch (t->phraseno) { case NONE: break; case SS__BR_LIST: p(t->phrase[0]); break; case SS: switch (t->subphrase) { case SS__COMMENT: if (strcmp(t->matched, "/**/") == 0) { fprintf(stdout, " /**/"); } else { fprintf(stdout, " %s\n", t->matched); } break; case SS__BR_LIST: fprintf(stdout, "{\n"); p(t->phrase[0]); fprintf(stdout, " }\n"); break; case SS__CASELAB: fprintf(stdout, " case "); p(t->phrase[0]); fprintf(stdout, ":\n"); break; case SS__LAB: p(t->phrase[0]); fprintf(stdout, ":\n"); break; case SS__ASSIG: fprintf(stdout, " "); p(t->phrase[0]); // var fprintf(stdout, " = "); p(t->phrase[1]); // expr fprintf(stdout, ";\n"); break; case SS__STRINGPROCCALL: fprintf(stdout, " "); p(t->phrase[0]); fprintf(stdout, "("); p(t->phrase[1]); fprintf(stdout, ");\n"); break; case SS__PROCCALL: fprintf(stdout, " "); p(t->phrase[0]); fprintf(stdout, "("); p(t->phrase[1]); fprintf(stdout, ");\n"); break; case SS__IF: fprintf(stdout, " if ("); p(t->phrase[0]); fprintf(stdout, ") "); p(t->phrase[1]); break; case SS__GOTO: fprintf(stdout, " goto "); p(t->phrase[0]); fprintf(stdout, ";\n"); break; case SS__JUMP: fprintf(stdout, " JUMP;\n"); break; case SS__NULL: fprintf(stdout, " ;\n"); break; default:; } break; case SS_LIST: if (t->phrase[0]) { p(t->phrase[0]); p(t->phrase[1]); } break; case IDENT: fprintf(stdout, "%s", t->matched); break; case LABEL: p(t->phrase[0]); break; case VAR: p(t->phrase[0]); // ident p(t->phrase[1]); // opt-index break; case VALUE: switch (t->subphrase) { case VAL__COND_EXPR: fprintf(stdout, "("); p(t->phrase[0]); fprintf(stdout, " ? "); p(t->phrase[1]); fprintf(stdout, " : "); p(t->phrase[2]); fprintf(stdout, ")"); break; case VAL__BR_EXPR: fprintf(stdout, "("); p(t->phrase[0]); fprintf(stdout, ")"); break; default: ; } break; case OPT_INDEX: fprintf(stdout, "["); t->phraseno = EXPR; p(t); // shennanigans! fprintf(stdout, "]"); break; case FNNAME: // never happens, intercepted by 'ident' fprintf(stdout, "<FNNAME>"); break; case PROCNAME: // never happens, intercepted by 'ident' fprintf(stdout, "<PROCNAME>"); break; case REST_OF_PARAM_LIST: if (t->phrase[0]) fprintf(stdout, ", "); case PARAM_LIST: p(t->phrase[0]); // EXPR if (t->phrase[1]) p(t->phrase[1]); // REST_OF_PARAM_LIST break; case EXPR: p(t->phrase[0]); // val p(t->phrase[1]); // opt-second_operand break; case FNCALL: p(t->phrase[0]); // fnname fprintf(stdout, "("); p(t->phrase[1]); // opt-param-list fprintf(stdout, ")"); break; case TYPED_VAR: switch (t->subphrase) { case TYPED_VAR__CAST: fprintf(stdout, "("); p(t->phrase[0]); fprintf(stdout, ")"); p(t->phrase[1]); // monop p(t->phrase[2]); // var break; case TYPED_VAR__DEFAULT: // phrase[0] is NULL p(t->phrase[1]); // monop p(t->phrase[2]); // var break; default:; } break; case OPT_SECOND_OPERAND: switch (t->subphrase) { case OPT_SECOND_OPERAND__PRESENT: p(t->phrase[0]); // binop p(t->phrase[1]); // rest of expr break; case OPT_SECOND_OPERAND__ABSENT: break; default:; } break; case OPT_MONOP: fprintf(stdout, "%s", t->matched); //p(t->phrase[0]); // monop. may be empty string. break; case CASTNAME: fprintf(stdout, "%s", t->matched); break; case BINOP: fprintf(stdout, "%s", t->matched); //p(t->phrase[0]); // binop string break; case SELECTOR: p(t->phrase[0]); break; case HEX_CONSTANT: fprintf(stdout, "%s", t->matched); break; case POSITIVE_INTEGER: fprintf(stdout, "%s", t->matched); break; case STRING_LITERAL: fprintf(stdout, "%s", t->matched); break; default: fprintf(stderr, "Unhandled switch parameter %d\n", t->phraseno); } } int main(int argc, char **argv) { tuple *program = new_tuple(SS_LIST); // If we run out of mallockable memory, error reporting can fail. // So free this on severe error to ensure enough memory to print // the error string and flush the I/O buffers. emergency = malloc(10*1024); if ((argc == 2) && (strcmp(argv[1], "-h") == 0)) { fprintf(stderr, "syntax: parse [-d] < file.c\n"); exit(0); } if ((argc == 2) && (strcmp(argv[1], "-d") == 0)) debug = 1; preload_entire_source(stdin); if (parse_ss_list(program) && (cur_line+1 == end_of_file)) { fprintf(stderr, "%d lines of %d compiled.\n", cur_line+1, end_of_file); // output optimised program: p(program); exit(EXIT_SUCCESS); } else { int i; fprintf(stderr, "* Syntax error at line %d, col %d:\n\n", best_line+1, best_col+1); // people tend to number from 1 up fprintf(stderr, "%s\n", prog_line[best_line]); for (i = 0; i < best_col; i++) fprintf(stderr, " "); fprintf(stderr, "^\n"); exit(EXIT_FAILURE); } return(1); }