Over the last few years, I've developed an interest in compilers and their design and implementation. This project is a step towards realising the desire to create my own compiler from scratch, whilst learning the computer science behind them.
The parser currently handles the standard arithmetic operators, boolean operators, prefix unary operators and assignments; a short usage sketch follows the module list below.
Wed, 02 Apr 2008 15:08:08
• ky_compiler • ky_data_structures • ky_error • ky_lexical_scanner • ky_operator • ky_parser • ky_parser_gen2 • ky_token
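For a rough idea of how the pieces below fit together, here is a minimal driving sketch. It assumes the modules are importable as listed above and reuses one of the test expressions found in ky_parser_gen2 further down; the output is a stream of stack-machine style opcodes.

import sys

from ky_lexical_scanner import LexicalScanner
from ky_parser_gen2 import RecursiveDescentExpressionParser_GEN2

# one of the test expressions from ky_parser_gen2: an assignment mixing
# arithmetic with a boolean comparison
lexer = LexicalScanner('abs = x*(1+2*(x<0))')
parser = RecursiveDescentExpressionParser_GEN2(sys.stdout, lexer, debug=True)
parser()    # writes numbered PUSH/MUL/ADD/LT/POP style opcodes to stdout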
Implementation of a generalised recursive descent parser
<identifier> = Parser_GEN2( ... )
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 02-04-2008 @ 01:37:00
from ky_token import Token
from ky_operator import Operator
from ky_data_structures import Node
from ky_lexical_scanner import LexicalScanner
from ky_parser_gen2 import RecursiveDescentExpressionParser_GEN2

class TopDownParser:
    def __init__(self):
        pass
Ky data structure classes. Currently contains only the Node structure used for syntax trees.
Various
Ky data structures.
<identifier> = Node( ... )
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 30-03-2008 @ 11:13:48
# new-style class so the 'child' property setter is actually invoked on assignment
class Node(object):
    def __init__(self, parent, label, token, *children):
        self.parent = parent
        self.label = label
        self.token = token
        self.children = list(children)

    # reading 'child' returns the most recently added child,
    # assigning to 'child' appends a new child node
    def get_child(self):
        return self.children[-1]

    def set_child(self, value):
        self.children.append(value)

    child = property(get_child, set_child)

    def __getitem__(self, key):
        return self.children[key]

    def __setitem__(self, key, val):
        self.children[key] = val
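A quick hedged sketch of how the Node structure hangs together; the labels are made up purely for illustration.

# build a tiny tree for the expression 1 + 2 by hand (illustrative only)
root = Node(None, 'ROOT', None)

plus = Node(root, '+', None)
root.child = plus                     # assigning to 'child' appends a child

plus.child = Node(plus, '1', None)
plus.child = Node(plus, '2', None)

print root.child.label                # '+'  - the last child appended to ROOT
print plus[0].label, plus[1].label    # '1' '2' via indexed access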
Ky error handling routines. Provides a unified interface for raising exceptions during lexing, parsing and code generation.
Error( caller, exception, [state], [debug] )
caller - eg LexicalScanner, Parser, etc.
exception - eg ValueError, SyntaxError, etc.
state - state related information for logging
debug - any further information
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 28-03-2008 @ 20:18:19
import logging

def Error(caller, exception, state=None, debug=None):
    print '--Error-------------------------------' + '-' * len(caller.ID)
    print '%s encountered an error during execution' % caller.ID
    print 'State conditions: %s' % state
    print 'Debug info: %s' % debug
    raise exception, 'See above'
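A hedged usage sketch: anything with an ID attribute can act as the caller, so a stand-in object (made up here) is enough to show the flow.

# illustrative only: a stand-in caller exposing the ID attribute Error() expects
class FakeScanner:
    ID = 'KY_LEXICAL_SCANNER'

try:
    Error(FakeScanner(), SyntaxError, state='line 3, col 7', debug="unexpected '$'")
except SyntaxError:
    print 'error details were printed, then the exception was re-raised'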
Ky lexical scanner class. A general-purpose lexical scanner providing the standard tokenising functions.
<identifier> = LexicalScanner( input string )
input string - eg. '3+56/89^2', or any arbitrary code fragment.
returns a Token object describing the current token and the
surrounding context tokens.
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 28-03-2008 @ 22:02:13
from ky_token import Token
from ky_error import Error
from ky_operator import Operator, UnaryOperator

class LexicalScanner:
    ID = 'KY_LEXICAL_SCANNER'

    NUMBER = '0123456789'
    ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    ALPHANUM = ALPHA + NUMBER
    OPERATOR = '+-/*^=<>&|~'
    UNARYOPS = '-!'
    NEWLINE = '\n'
    WHITESPACE = '\t '
    EOF = 'EOF'
    KEYWORDS = ['IF', 'THEN', 'ELSE']

    def __init__(self, code):
        self.code = code
        self.code_length = len(code)
        self.idx = 0
        self.line = 1
        self.line_buffer = ''
        self.last_idx = 0
        self.last_token = None
        self.next_token = None

        for operator in Operator.operators:
            LexicalScanner.OPERATOR += operator
        for operator in UnaryOperator.unary_operators:
            LexicalScanner.UNARYOPS += operator

    def match_char(self, char, match_str):
        if char in match_str:
            return True
        return False

    def match_symbol(self, symbol, match_str):
        if sum([1 for i in symbol if self.match_char(i, match_str)]) == len(symbol):
            return True
        return False

    def get_character(self):
        self.idx += 1
        if self.idx > self.code_length:
            return LexicalScanner.EOF
        return self.code[self.idx-1]

    def unget_character(self):
        self.idx -= 1
        return

    def unget_symbol(self, idx):
        self.idx = idx
        return

    def get_symbol(self, next=True):
        while(1):
            char = self.get_character()

            # End-Of-File
            if char == LexicalScanner.EOF:
                token = Token(Token.EOF, None, self.last_idx, self.line, self.last_token, self.next_token)
                self.last_idx = self.idx
                return token

            # Newline
            if self.match_char(char, LexicalScanner.NEWLINE):
                self.line += 1
                self.line_buffer = ''
                continue

            # Whitespace
            elif self.match_char(char, LexicalScanner.WHITESPACE):
                continue

            # Number
            elif self.match_char(char, LexicalScanner.NUMBER):
                number = ''
                while(self.match_char(char, LexicalScanner.NUMBER)):
                    number += char
                    char = self.get_character()
                self.unget_character()  # push input back because we don't need this character

                token = Token(Token.NUMBER, number, self.last_idx, self.line, self.last_token, self.next_token)
                self.last_idx = self.idx
                index = self.idx
                self.last_token = token
                if next:
                    token.next = self.get_symbol(next=False)
                    self.unget_symbol(index)
                return token

            # Operator
            elif self.match_char(char, LexicalScanner.OPERATOR):
                operator = ''
                while(self.match_char(char, LexicalScanner.OPERATOR)):
                    operator += char
                    char = self.get_character()
                self.unget_character()  # push input back because we don't need this character

                # check operator is valid
                if not Operator.operators.has_key(operator):
                    for idx in range(len(operator)):
                        operator = operator[:-1]
                        self.unget_character()
                        if Operator.operators.has_key(operator):
                            break

                token = Token(Token.OPERATOR, Operator.operators[operator], self.last_idx, self.line, self.last_token, self.next_token)
                self.last_idx = self.idx
                index = self.idx
                self.last_token = token
                if self.last_token.prev is None or (self.last_token.prev.prev and self.last_token.prev.prev.token_type == Token.OPERATOR):
                    token.token_type = Token.UNOP if token.token_value not in [Operator.operators['('], Operator.operators[')']] else Token.OPERATOR
                if next:
                    token.next = self.get_symbol(next=False)
                    self.unget_symbol(index)
                return token

            # Identifier
            elif self.match_char(char, LexicalScanner.ALPHA):
                identifier = ''
                while(self.match_char(char, LexicalScanner.ALPHANUM)):
                    identifier += char
                    char = self.get_character()
                self.unget_character()  # push input back because we don't need this character

                token_type = Token.KEYWORD if identifier in LexicalScanner.KEYWORDS else Token.IDENT
                token = Token(token_type, identifier, self.last_idx, self.line, self.last_token, self.next_token)
                self.last_idx = self.idx
                index = self.idx
                self.last_token = token
                if next:
                    token.next = self.get_symbol(next=False)
                    self.unget_symbol(index)
                return token

            else:
                token = Token(Token.UNKNOWN, char, self.last_idx, self.line, self.last_token, self.next_token)
                self.last_idx = self.idx
                index = self.idx
                self.last_token = token
                if next:
                    token.next = self.get_symbol(next=False)
                    self.unget_symbol(index)
                if token.token_value == '(':
                    token.token_type = Token.OPERATOR
                    token.token_value = Operator.operators['(']
                if token.token_value == ')':
                    token.token_type = Token.OPERATOR
                    token.token_value = Operator.operators[')']
                return token


if __name__ == '__main__':
    ass = Operator('=', 'POP')
    add = Operator('+', 'ADD')
    sub = Operator('-', 'SUB')
    mul = Operator('*', 'MUL')
    div = Operator('++', 'DIV')
    exp = Operator('^', 'POW')

    lexer = LexicalScanner('345+++98-67+8^77')
    while(1):
        token = lexer.get_symbol()
        print token
        if token.prev:
            print token.prev.prev
        if token.token_type == Token.EOF:
            break
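Besides the __main__ test above, the part callers lean on most is the one-token lookahead carried on each token; a small hedged sketch (the operators registered and the input are illustrative).

from ky_token import Token
from ky_operator import Operator
from ky_lexical_scanner import LexicalScanner

# operators have to be registered before scanning, as in the test above
add = Operator('+', 'ADD')
mul = Operator('*', 'MUL')

lexer = LexicalScanner('x + 42*y')
token = lexer.get_symbol()
while token.token_type <> Token.EOF:
    # every token returned at the top level carries a one-token lookahead
    print token, '-> lookahead:', token.next
    token = lexer.get_symbol()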
This class stores data for operator tokens, and more specifically operator-operator precedence relationships. It allows operator associativity to be set by the coder, giving the level of control required to support compile-time operator definitions in code.
<identifier> = Operator( operator symbol, operator opcode, [precedence], [associativity] )
operator symbol - eg. +, -, *, /, etc.
operator opcode - eg. ADD, SUB, MUL, DIV, etc.
precedence - eg. {'+':Operator.PREC_YIELD_LEFT,
'-':Operator.PREC_YIELD_RIGHT, ...} etc.
associativity - eg. Operator.ASSOC_LEFT, Operator.ASSOC_RIGHT
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 28-03-2008 @ 13:00:55
class Operator:
    operators = {}

    ASSOC_LEFT = 0
    ASSOC_RIGHT = 1

    PREC_YIELD_LEFT = 0
    PREC_YIELD_RIGHT = 1
    PREC_EQUAL = 2

    YIELD = 0
    PREC = 1
    EQUAL = 2

    def __init__(self, operator, opcode, precedence=None, associativity=0, unary=False):
        self.operator = operator
        self.op = opcode
        # use a fresh dict per operator unless one is supplied (avoids a shared mutable default)
        self.precedence = precedence if precedence is not None else {}
        self.assign = None

        # set self precedence as determined by associativity
        if associativity == Operator.ASSOC_LEFT:
            self.precedence[operator] = Operator.PREC_YIELD_RIGHT
        else:
            self.precedence[operator] = Operator.PREC_YIELD_LEFT

        Operator.operators[operator] = self

    def get_opcode(self):
        if self.operator == '=' and self.assign:
            return 'POP %s' % self.assign
        else:
            return self.op

    opcode = property(get_opcode)

    def __str__(self):
        return '<Operator %s>' % self.operator

    # check if this operator yields to the other
    def __lt__(self, other):
        if not isinstance(other, Operator):
            raise TypeError, 'Operator precedence testing requires two(2) operators'
        if self.precedence.has_key(other.operator):
            if self.precedence[other.operator] == Operator.PREC_YIELD_LEFT:
                return True
            else:
                return False
        else:
            raise AttributeError, 'Operator %s has no precedence relationship with: %s' % (self.operator, other.operator)

    # check if this operator has precedence over the other
    def __gt__(self, other):
        if not isinstance(other, Operator):
            raise TypeError, 'Operator precedence testing requires two(2) operators'
        if self.precedence.has_key(other.operator):
            if self.precedence[other.operator] == Operator.PREC_YIELD_RIGHT:
                return True
            else:
                return False
        else:
            raise AttributeError, 'Operator %s has no precedence relationship with: %s' % (self.operator, other.operator)

    # check if this operator yields or has equal precedence to the other
    def __le__(self, other):
        if not isinstance(other, Operator):
            raise TypeError, 'Operator precedence testing requires two(2) operators'
        if self.precedence.has_key(other.operator):
            if self.precedence[other.operator] == Operator.PREC_YIELD_LEFT or self.precedence[other.operator] == Operator.PREC_EQUAL:
                return True
            else:
                return False
        else:
            raise AttributeError, 'Operator %s has no precedence relationship with: %s' % (self.operator, other.operator)

    # check if this operator has precedence or equal precedence to the other
    def __ge__(self, other):
        if not isinstance(other, Operator):
            raise TypeError, 'Operator precedence testing requires two(2) operators'
        if self.precedence.has_key(other.operator):
            if self.precedence[other.operator] == Operator.PREC_YIELD_RIGHT or self.precedence[other.operator] == Operator.PREC_EQUAL:
                return True
            else:
                return False
        else:
            raise AttributeError, 'Operator %s has no precedence relationship with: %s' % (self.operator, other.operator)


class UnaryOperator(Operator):
    PREFIX = 0
    POSTFIX = 1

    unary_operators = {}

    def __init__(self, operator, opcode, operator_type=0):
        self.operator = operator
        self.opcode = opcode
        self.op_type = operator_type
        self.precedence = {operator: Operator.PREC_EQUAL}

        for op, op_obj in Operator.operators.iteritems():
            if operator_type == UnaryOperator.PREFIX:
                op_obj.precedence[operator] = Operator.PREC_YIELD_RIGHT
                self.precedence[op] = Operator.PREC_YIELD_LEFT
            else:
                op_obj.precedence[operator] = Operator.PREC_YIELD_LEFT
                self.precedence[op] = Operator.PREC_YIELD_RIGHT

        UnaryOperator.unary_operators[operator] = self
        Operator.operators[operator] = self
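A small sketch of the precedence relations in action. The dicts here mirror the relevant entries of the tables set up in ky_parser below ('+' yields to '*').

from ky_operator import Operator

ay = Operator.PREC_YIELD_LEFT
by = Operator.PREC_YIELD_RIGHT

add = Operator('+', 'ADD', {'*': ay}, Operator.ASSOC_LEFT)
mul = Operator('*', 'MUL', {'+': by}, Operator.ASSOC_LEFT)

print add < mul    # True  - '+' yields to '*'
print mul > add    # True  - '*' takes precedence over '+'
print add > mul    # False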
Ky parser classes.
<identifier> = OperatorPrecedenceExpressionParser( ... ) or RecursiveDescentExpressionParser( ... )
Ky parser implementations: an operator precedence parser and a recursive descent parser.
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 28-03-2008 @ 22:19:05
from ky_operator import Operator
from ky_token import Token
from ky_lexical_scanner import LexicalScanner

# initialise operators
ay = Operator.PREC_YIELD_LEFT
by = Operator.PREC_YIELD_RIGHT
ep = Operator.PREC_EQUAL

add_prec = {'-':by, '*':ay, '/':by, '^':by, '(':ay, ')':by}
sub_prec = {'+':by, '*':ay, '/':by, '^':by, '(':ay, ')':by}
mul_prec = {'+':by, '-':ay, '/':by, '^':by, '(':ay, ')':by}
div_prec = {'+':by, '-':ay, '*':by, '^':by, '(':ay, ')':by}
exp_prec = {'+':by, '-':ay, '*':by, '/':by, '(':ay, ')':by}
rbr_prec = {'+':ay, '-':ay, '*':ay, '/':ay, '^':ay, ')':ep, '(':ay}
lbr_prec = {'+':by, '-':by, '*':by, '/':by, '^':by, '(':ep, ')':by}
ass_prec = {'+':by, '-':by, '*':by, '/':by, '^':by, '(':by, ')':by}

ass = Operator('=', 'POP', ass_prec)
add = Operator('+', 'ADD', add_prec, Operator.ASSOC_LEFT)
sub = Operator('-', 'SUB', sub_prec, Operator.ASSOC_LEFT)
mul = Operator('*', 'MUL', mul_prec, Operator.ASSOC_LEFT)
div = Operator('/', 'DIV', div_prec, Operator.ASSOC_LEFT)
exp = Operator('^', 'POW', exp_prec, Operator.ASSOC_LEFT)
lbr = Operator('(', None, lbr_prec, Operator.ASSOC_RIGHT)
rbr = Operator(')', None, rbr_prec, Operator.ASSOC_LEFT)


class OperatorPrecedenceExpressionParser:
    ID = 'KY_PRECEDENCE_PARSER'

    def __init__(self, output_stream, lexical_scanner, debug=False):
        self.outstrm = output_stream
        self.lexical_scanner = lexical_scanner
        self.debug = debug
        self.outidx = 1

    def output(self, out):
        if not out:
            return
        self.outstrm.write(out + '\n')

    def expression(self):
        if self.debug:
            print 'Expression: %s\n' % self.lexical_scanner.code

        stack = []
        while(1):
            symbol = self.lexical_scanner.get_symbol()
            if symbol.token_type == Token.EOF:
                break
            elif symbol.token_type == Token.IDENT:
                if symbol.next.token_type == Token.OPERATOR and symbol.next.token_value <> Operator.operators['=']:
                    self.output('PUSH %s' % symbol.token_value)
                else:
                    symbol.next.token_value.assign = symbol.token_value
            elif symbol.token_type == Token.NUMBER:
                self.output('PUSH #%s' % symbol.token_value)
            elif symbol.token_type == Token.OPERATOR:
                if not len(stack):
                    stack.append(symbol)
                else:
                    # ay - a yields to b
                    if stack[-1].token_value >= symbol.token_value:
                        stack.append(symbol)
                    else:
                        # by - b yields to a
                        while(len(stack) and stack[-1].token_value < symbol.token_value):
                            self.output(stack.pop().token_value.opcode)
                        stack.append(symbol)
            else:
                #print symbol
                break

        while(len(stack)):
            self.output(stack.pop().token_value.opcode)


class RecursiveDescentExpressionParser:
    def __init__(self, output_stream, lexical_scanner, debug=False):
        self.outstrm = output_stream
        self.lexical_scanner = lexical_scanner
        self.debug = debug
        self.outidx = 1

    def __call__(self):
        self.assignment()

    def output(self, out):
        if not out:
            return
        self.outstrm.write('%04d\t\t%s\n' % (self.outidx, out))
        self.outidx += 1

    def match(self, ttype, tval):
        if self.look.token_type == ttype and self.look.token_value == tval:
            self.look = self.lexical_scanner.get_symbol()
            return
        raise SyntaxError, 'Expected %s, got %s' % (tval, self.look.token_value)

    def match_type(self, ttype):
        if self.look.token_type == ttype:
            self.look = self.lexical_scanner.get_symbol()
            return
        raise SyntaxError, 'Expected token of type %s, got %s' % (Token.MAPPING[ttype], Token.MAPPING[self.look.token_type])

    def assignment(self):
        self.look = self.lexical_scanner.get_symbol()
        self.expression()
        while(self.look.token_type <> Token.EOF and self.look.token_value.operator == '='):
            var = self.look.prev.prev
            self.look = self.lexical_scanner.get_symbol()
            self.expression()
            if var.prev:
                self.output('DUP')
            self.output('POP %s' % var.token_value)

    def expression(self):
        self.term()
        while(self.look.token_type <> Token.EOF and (self.look.token_value.operator == '+' or self.look.token_value.operator == '-')):
            opcode = 'ADD' if self.look.token_value.operator == '+' else 'SUB'
            self.look = self.lexical_scanner.get_symbol()
            self.term()
            self.output(opcode)

    def term(self):
        self.factor()
        while(self.look.token_type <> Token.EOF and (self.look.token_value.operator == '*' or self.look.token_value.operator == '/')):
            opcode = 'MUL' if self.look.token_value.operator == '*' else 'DIV'
            self.look = self.lexical_scanner.get_symbol()
            self.factor()
            self.output(opcode)

    def factor(self):
        self.exponent()
        #print self.look
        while(self.look.token_type <> Token.EOF and self.look.token_value.operator == '^'):
            opcode = 'POW'
            self.look = self.lexical_scanner.get_symbol()
            self.exponent()
            self.output(opcode)

    def exponent(self):
        if self.look.token_type == Token.OPERATOR and self.look.token_value.operator == '(':
            self.assignment()
            self.match(Token.OPERATOR, Operator.operators[')'])
        elif self.look.token_type == Token.NUMBER:
            self.output('PUSH #%s' % self.look.token_value)
            self.match_type(Token.NUMBER)
        elif self.look.token_type == Token.IDENT:
            if self.look.next.token_type == Token.OPERATOR and self.look.next.token_value.operator == '=':
                self.look = self.lexical_scanner.get_symbol()
                return
            self.output('PUSH %s' % self.look.token_value)
            self.match_type(Token.IDENT)
        else:
            raise SyntaxError, 'Unexpected token: %s' % self.look


if __name__ == '__main__':
    import sys
    lexer = LexicalScanner('a = 5 * (b =4 * (c = 7 * d) )')
    parser = RecursiveDescentExpressionParser(sys.stdout, lexer, debug=True)
    parser()
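Since the output stream only needs to be file-like, the emitted opcodes can just as easily be collected in memory; a hedged sketch (the expression is illustrative).

import StringIO

from ky_lexical_scanner import LexicalScanner
from ky_parser import RecursiveDescentExpressionParser

# collect the generated opcodes in a buffer instead of writing to stdout
out = StringIO.StringIO()
lexer = LexicalScanner('a = 1 + 2 * 3')
parser = RecursiveDescentExpressionParser(out, lexer)
parser()

print out.getvalue()    # numbered lines: PUSH #1, PUSH #2, PUSH #3, MUL, ADD, POP a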
Implementation of a generalised recursive descent expression parser which allows new operators to be added on the fly with definable precedence levels. It also supports unary prefix operators such as unary minus and logical not.
<identifier> = RecursiveDescentExpressionParser_GEN2( ... )
outstream - output stream, must be a file-like object
lexer - the tokeniser
[precedence] - operator precedence table
[debug] - print debug information
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 02-04-2008 @ 01:29:59
from ky_operator import Operator, UnaryOperator
from ky_token import Token
from ky_data_structures import Node
from ky_lexical_scanner import LexicalScanner

ass = Operator('=', 'POP')
add = Operator('+', 'ADD')
sub = Operator('-', 'SUB')
mul = Operator('*', 'MUL')
div = Operator('/', 'DIV')
exp = Operator('^', 'POW')
lt = Operator('<', 'LT')
gt = Operator('>', 'GT')
equ = Operator('==', 'EQU')
neq = Operator('<>', 'NEQ')
_or = Operator('|', 'OR')
_and = Operator('&', 'AND')
_xor = Operator('~', 'XOR')

_not = UnaryOperator('!', 'NOT')
inc = UnaryOperator('++', 'INC')
dec = UnaryOperator('--', 'DEC')
uminus = UnaryOperator('-', 'NEG')

lbr = Operator('(', None)
rbr = Operator(')', None)

operator_precedence = [
    [ass, equ, lt, gt, neq],
    [add, sub, _or, _xor],
    [mul, div, _and],
    [exp,],
    [None]
]


class RecursiveDescentExpressionParser_GEN2:
    def __init__(self, output_stream, lexical_scanner, prec=operator_precedence, debug=False):
        self.outstrm = output_stream
        self.lexical_scanner = lexical_scanner
        self.debug = debug
        self.outidx = 1

        self.lvls = len(prec) - 1
        self.prec = prec

        self.ast = Node(None, 'ROOT', None)

    def output(self, out):
        if not out:
            return
        if type(out) == type(''):
            out = [out]
        for line in out:
            self.outstrm.write('%04d\t\t%s\n' % (self.outidx, line))
            self.outidx += 1

    def __call__(self):
        if self.debug:
            print 'Expression: %s\n' % self.lexical_scanner.code
        self.look = self.lexical_scanner.get_symbol()
        self.expression()

    def match(self, ttype, tval):
        if self.look.token_type == ttype and self.look.token_value == tval:
            self.look = self.lexical_scanner.get_symbol()
            return
        raise SyntaxError, 'Expected %s, got %s' % (tval, self.look.token_value)

    def match_type(self, ttype):
        if self.look.token_type == ttype:
            self.look = self.lexical_scanner.get_symbol()
            return
        raise SyntaxError, 'Expected token of type %s, got %s' % (Token.MAPPING[ttype], Token.MAPPING[self.look.token_type])

    def expression(self, lvl=0):
        if lvl <> self.lvls:
            self.expression(lvl+1)
        else:
            if self.look.token_type == Token.OPERATOR and self.look.token_value.operator == '(':
                self.look = self.lexical_scanner.get_symbol()
                self.expression()
                self.match(Token.OPERATOR, Operator.operators[')'])
            elif self.look.token_type == Token.NUMBER:
                self.output('PUSH #%s' % self.look.token_value)
                self.ast.child = Node(self.ast, self.look.token_value, self.look, None)
                self.match_type(Token.NUMBER)
            elif self.look.token_type == Token.UNOP:
                out = self.look.token_value.opcode
                self.ast = Node(self.ast, self.look.token_value.operator, self.look, None)
                self.ast.parent.child = self.ast
                self.look = self.lexical_scanner.get_symbol()
                self.expression(self.lvls)
                self.output(out)
            elif self.look.token_type == Token.IDENT:
                if self.look.next.token_type == Token.OPERATOR and self.look.next.token_value.operator == '=':
                    self.look = self.lexical_scanner.get_symbol()
                    return
                self.output('PUSH %s' % self.look.token_value)
                self.ast.child = Node(self.ast, self.look.token_value, self.look, None)
                self.match_type(Token.IDENT)
            else:
                raise SyntaxError, 'Unexpected token: %s' % self.look

        while(self.look.token_type <> Token.EOF and self.look.token_value in self.prec[lvl]):
            opcode = self.look.token_value.opcode
            self.ast = Node(self.ast, self.look.token_value.operator, self.look, None)
            self.ast.parent.child = self.ast

            # assignment special case
            if self.look.token_value == Operator.operators['=']:
                opcode = 'POP %s' % self.look.prev.prev.token_value
                if self.look.prev.prev.prev:
                    opcode = ['DUP', opcode]

            self.look = self.lexical_scanner.get_symbol()
            self.expression(lvl+1)
            self.output(opcode)


if __name__ == '__main__':
    import sys
    #lexer = LexicalScanner('abs = x*(1+2*(x<0))')
    #lexer = LexicalScanner('c=-a*-(b* ++d)')
    lexer = LexicalScanner('1*2*3*(4+8)')
    parser = RecursiveDescentExpressionParser_GEN2(sys.stdout, lexer, debug=True)
    parser()

##    tok = lexer.get_symbol()
##    while(tok.token_type <> Token.EOF):
##        print tok
##        tok = lexer.get_symbol()
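Because the precedence table is just a list of levels handed to the constructor, a new operator can be slotted in without touching the parser itself. A hedged sketch, where the '%' operator, its MOD opcode and the expression are all made up for illustration:

import sys

from ky_operator import Operator
from ky_lexical_scanner import LexicalScanner
from ky_parser_gen2 import RecursiveDescentExpressionParser_GEN2, operator_precedence

# register a new operator; the scanner picks it up because it copies every
# registered operator symbol into its OPERATOR character set on construction
mod = Operator('%', 'MOD')

# give '%' the same precedence level as * and / in a copy of the default table
custom_precedence = [level[:] for level in operator_precedence]
custom_precedence[2].append(mod)

lexer = LexicalScanner('a = b % 4 + 1')
parser = RecursiveDescentExpressionParser_GEN2(sys.stdout, lexer, prec=custom_precedence)
parser()    # should emit PUSH b, PUSH #4, MOD, PUSH #1, ADD, POP a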
This class is the basic token class returned by the lexical scanner. It may later become a base class with all token kinds as subclasses. Basically it is just a data storage class.
<identifier> = Token( token type, token value, column index, line number, previous token, next token )
token type - eg. token.NUMBER, token.IDENT, etc.
token val - eg. 12, 'test', ie. any arbitrary value
column idx - eg. 3 ie. lexical position of token
line num - eg. 2 ie. line # of token
previous - previous token in stream
next - next token in stream
Timothy Wakeham (timmeh) - timmeh@hiddenworlds.org on 28-03-2008 @ 14:57:40
class Token:
    ID = 'KY_TOKEN'

    NUMBER = 1
    LBRACKET = 2
    RBRACKET = 3
    OPERATOR = 4
    IDENT = 5
    KEYWORD = 6
    UNKNOWN = 7
    EOF = 8
    UNOP = 9
    END = 0

    MAPPING = ['End', 'Number', 'Left Bracket', 'Right Bracket', 'Operator',
               'Identifier', 'Keyword', 'Unknown', 'EOF', 'Unary Op']

    def __init__(self, tok_type, tok_val, idx=None, line=None, prev=None, next=None):
        # token details
        self.token_type = tok_type
        self.token_value = tok_val

        # lexer state
        self.idx = idx
        self.line = line

        # pointers to context tokens
        self.prev = prev
        self.next = next

    # accessor methods renamed so they don't collide with the
    # 'next'/'prev' instance attributes set in __init__
    def get_next(self):
        return self.next

    def get_previous(self):
        return self.prev

    def __str__(self):
        return '<Token type:%s value:%s>' % (Token.MAPPING[self.token_type], self.token_value)
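A tiny hedged sketch of the class as a plain data holder; the values are made up, and in practice the scanner stores an Operator instance as the token_value of operator tokens.

# illustrative only: link two tokens together by hand
lhs = Token(Token.IDENT, 'x', 0, 1)
plus = Token(Token.OPERATOR, '+', 2, 1, prev=lhs)
lhs.next = plus

print lhs                    # <Token type:Identifier value:x>
print lhs.get_next()         # <Token type:Operator value:+>
print plus.get_previous()    # <Token type:Identifier value:x>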