/*
 * cse.y
 *
 * Example of a simple expression parser with common sub-expression
 * elimination (CSE) which builds DAGs with n-ary trees and generates simple
 * code.
 *
 * The primary problem with building n-ary trees lies with compound
 * statements, where the number of children cannot be determined until all of
 * them have been processed.
 *
 * For nodes which have a fixed number of children (such as arithmetic operat-
 * ors with 2 children), there is no problem, but for compound statements, the
 * solution is to have a tree built for each statement and then appended to
 * a list.
 *
 * Each list element is a struct which contains 2 fields: the tree, and a
 * "next" pointer. When a node is to be created, it counts the number of
 * elements in the CHILD_LIST, allocates that many DAG * pointers, and sets
 * each pointer to a member of the list.
 */

%{

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>


/*****************************************************************************
 Data Structures

 Definitions for data structures related to the DAGs.
*****************************************************************************/

/*
 * DAG
 *
 * A dag node.
 */

typedef struct dag_struct   DAG;
struct dag_struct
{
    int     op;             // '+', '-', '*', '/', '=', 's', 'n', or 'v'

    /*
     * Data:
     *
     * This union serves two purposes:
     *
     *      1. For 'n' (number) and 'v' (variable) nodes, the appropriate
     *         field is set to the data that the lexer provided.
     *      2. During code generation, the "num" member is set to a scratch
     *         variable name (0 means T0, 1 is T1, etc.) if the node's op is
     *         NOT 'n' or 'v', otherwise, "num" is a constant number if op
     *         == 'n' and "var" is a variable if op == 'v'.
     *
     * For nodes other than 'n' and 'v', data.num is set to -1 by default in
     * order to signify to the code generator that no scratch register has
     * yet been assigned (-1 is invalid for scratch registers.)
     */

    union
    {
        int     num;        // number (if op == 'n')
        char    var;        // variable (if op == 'v')
    } data;

    /*
     * Children:
     *
     * Nodes can have an arbitrary number of children. The child[] array is an
     * array of pointers to child DAG nodes and only has as many elements as
     * the num_children member specifies.
     */

    int     num_children;   // number of children
    DAG     **child;        // up to 3 children

    /*
     * Global list:
     *
     * A linear list of all the nodes is maintained by chaining them all
     * together with this pointer. This helps with CSE.
     */

    DAG     *next;          // next node in the global list of nodes
};

/*
 * CHILD_LIST
 *
 * Sometimes, the number of children for a given DAG node cannot be
 * determined until after all of the children have been created. In this
 * grammar, this situation only occurs with lists of statements.
 *
 * The problem is resolved by creating a list of CHILD_LIST structures, each
 * of which contains a DAG and a pointer to the next structure. As the parser
 * parses a series of statements in a statement list, a new DAG is added to
 * the list for each statement encountered.
 *
 * When all is done, the parent node of all of these children is created, the
 * list is counted, and enough pointers are allocated to hold all of the
 * children. Then, each DAG in the list is plucked one at a time and attached
 * to the parent node's child[] array.
 */

typedef struct child_list_struct CHILD_LIST;
struct child_list_struct
{
    DAG         *node;  // the child node itself

    CHILD_LIST  *next;
};
    

/*****************************************************************************
 Global Data

 The DAG and global list of nodes are defined here.
*****************************************************************************/

/*
 * DAG
 */

static DAG *dag;        // DAG (NULL if no non-null statements in program)
static DAG *node_list;  // global list of nodes (newest first)


/*****************************************************************************
 Function Prototypes

 These functions are used by the parser.
*****************************************************************************/

static DAG  *MakeArbitraryNode(int, CHILD_LIST *);
static DAG  *MakeNode(int, int, DAG *, DAG *);
static DAG  *MakeDataNode(int, int);
static int  yylex();
static void yyerror(const char *);


%}


/*****************************************************************************
 Token Definitions

 YYSTYPE and the tokens are defined here. Tokens are only defined for numbers
 and variable names. The lexer will return ASCII characters for everything
 else which helps make the grammar a little more readable.
*****************************************************************************/

%union
{
    int     num;        // number
    char    var;        // variable

    DAG    *node;

    /*
     * A list of child DAGs. The 2 pointer system is for fixed-time insertion
     * of new elements to the end of the list (each time one is inserted, the
     * "last" pointer is updated, but "first" remains unchanged.)
     */

    struct
    {
        CHILD_LIST  *first, *last;  // pointers to first and last children
    } children;
}

%token  <num>       NUMBER
%token  <var>       VARIABLE

%type   <children>  stmt_list
%type   <node>      stmt
%type   <node>      expr
%type   <node>      assgn_expr
%type   <node>      factor
%type   <node>      element


%%


/*****************************************************************************
 Grammar

 Ye olde grammar.

 The language is really quite simple. It's just a series of statements
 terminated by semicolons. Multiple statements can appear between brackets,
 but there are no scope rules, so these brackets do nothing more than serve
 as a demonstration for how to implement compound statements using n-ary
 tree structures.

 There are 52 possible variable names: a-z and A-Z. They are always available
 and cannot be declared or undeclared.

 A single statement may be an assignment statement:

    a = 3;
    b = a;
    c = d = e = 4 * 3;

 Or an arithmetic expression using +, -, *, and / operators:

    1 + 2 * 3 / a;

 The above example clearly does nothing since there is no assignment, but it's
 perfectly valid.
*****************************************************************************/

/*
 * file
 *
 * A file is simply a list of statements. stmt_list is set to a list of
 * DAGs. These are attached to a statement list node.
 */

file:   stmt_list
            {
                if ($1.first != NULL)   // if there were any non-null statements
                    dag = MakeArbitraryNode('s', $1.first);
                else
                    dag = NULL;
            }
    |   // empty
    ;

/*
 * stmt_list
 *
 * Takes advantage of left recursion in bottom-up parsing to allow an
 * arbitrary number of statements to be repeated in a list. Consult Holub's
 * book to learn about how this sort of left recursion works.
 *
 * When a stmt is on the stack, the first production is reduced to stmt_list.
 * Then, another stmt will make "stmt_list stmt" which again reduces to
 * stmt_list, etc. So, for each stmt_list, the "stmt_list->stmt"
 * production gets reduced once (and it happens first.)
 *
 * A CHILD_LIST is created and each time the second production is reduced, the
 * action executed adds the DAG associated with stmt to stmt_list and
 * attaches it to $$.
 */

stmt_list:
            stmt    // this is the first kind of stmt_list production to be
                    // reduced because of how left recursion works

                {
                    $$.first = $$.last = NULL;
                    if ($1 != NULL) // don't corrupt child list w/ NULLs
                    {
                        $$.first = malloc(sizeof(CHILD_LIST));
                        $$.last = $$.first;
                        $$.first->node = $1;
                        $$.first->next = NULL;
                    }
                    else
                        $$.first = $$.last = NULL;
                 }   
    |
            stmt_list stmt

            {
                $$ = $1;        // keep the list alive by passing it to $$
                if ($2 != NULL) // don't corrupt child list with NULLs
                {
                    if ($$.last == NULL)    // nothing has been allocated!
                    {
                        $$.first = malloc(sizeof(CHILD_LIST));
                        $$.last = $$.first;
                        $$.first->node = $2;
                        $$.first->next = NULL;
                    }
                    else                    // allocate to end of list
                    {
                        ($$.last)->next = malloc(sizeof(CHILD_LIST));
                        ($$.last)->next->node = $2;
                        ($$.last)->next->next = NULL;
                        $$.last = ($$.last)->next;
                    }
                }
            }    
    ;

/*
 * stmt
 *
 * A single statement, which can be an expression, an assignment, or a
 * compound statement (just another statement list.) C-style null-statements
 * are perfectly acceptable and don't generate any DAG nodes.
 */

stmt:   expr ';'        { $$ = $1; }
    |   ';'             { $$ = NULL; }
    |   '{' stmt_list '}'
            {
                if ($2.first == NULL)   // no actual statements
                    $$ = NULL;
                else
                    $$ = MakeArbitraryNode('s', $2.first);  // make a 's' (statement list) node and attach all the children
            }
    |   '{' '}'         { $$ = NULL; }
    |   assgn_expr ';'  { $$ = $1; }
    ;

/*
 * assgn_expr
 *
 * Assignment expression. The entire expression itself takes the value of the
 * newly-modified variable, thus chaining is possible:
 *
 *      a = b = c = 3 * 4;
 *
 * Note that the variable node is created first, and then the operator node
 * is created and overwrites $$.
 */

assgn_expr: VARIABLE '=' assgn_expr
                {
                    $$ = MakeDataNode('v', $1);
                    $$ = MakeNode(2, '=', $$, $3);
                }
    |       VARIABLE '=' expr
                {
                    $$ = MakeDataNode('v', $1);
                    $$ = MakeNode(2, '=', $$, $3);
                }
    ;

/*
 * expr
 *
 * The lowest precedence expression rules (+ and -) are handled here.
 */

expr:   expr '+' factor     { $$ = MakeNode(2, '+', $1, $3); }
    |   expr '-' factor     { $$ = MakeNode(2, '-', $1, $3); }
    |   factor              { $$ = $1; }
    ;

/*
 * factor
 *
 * Multiplication and division are higher precedence than +/- (expr.)
 */

factor: factor '*' element  { $$ = MakeNode(2, '*', $1, $3); }
    |   factor '/' element  { $$ = MakeNode(2, '/', $1, $3); }
    |   element             { $$ = $1; }
    ;

/*
 * element
 *
 * A basic element -- a number, variable, or parenthetical expression.
 */

element:    '(' expr ')'        { $$ = $2; }
    |       '(' assgn_expr ')'  { $$ = $2; }
    |       NUMBER              { $$ = MakeDataNode('n', $1); }
    |       VARIABLE            { $$ = MakeDataNode('v', $1); }
    ;


%%


/*****************************************************************************
 DAG Functions

 Functions for making DAG nodes and attaching children to them.

 Nodes other than 'n' and 'v' have data.num set to -1 to indicate that no
 scratch register has been assigned to the node by the code generator yet
 (see the code generator code and the DAG structure definition for more info.)
*****************************************************************************/

/*
 * MakeArbitraryNode():
 *
 * Creates a node which can have an arbitrary amount of children.
 *
 * Attaches all of the DAGs in the CHILD_LIST as children to the node (first
 * argument.) This is done by counting the number of DAGs in the list,
 * allocating that many child pointers, and then taking each element in the
 * list and putting it in the newly-allocated child[] array.
 *
 * Next, the result is checked against all of the other DAG nodes to see if
 * there is already one like this, and if so, this one is discarded and the
 * existant one is returned.
 *
 * NOTE: When searching for a matching node in the global node list, the
 * "data" member is not checked because it is assumed that it is unused in
 * this sort of node.
 */

static DAG *MakeArbitraryNode(int op, CHILD_LIST *cl)
{
    CHILD_LIST  *l;
    DAG         *node, *n;
    int         num_children, i, all_equal;

    /*
     * Count number of children by iterating through list
     */

    for (l = cl, num_children = 0; l != NULL; l = l->next, num_children++)
        ;

    /*
     * Make the node
     */

    if ((node = calloc(1, sizeof(DAG))) == NULL)
        return NULL;

    node->op = op;

    node->data.num = -1;

    /*
     * Allocate space for their pointers and then copy them in
     */

    node->num_children = num_children;
    node->child = malloc(sizeof(DAG *) * num_children);
    for (i = 0, l = cl; i < num_children; i++, l = l->next)
        node->child[i] = l->node;

    /*
     * Try to see if a matching node already exists
     */

    for (n = node_list; n != NULL; n = n->next)
    {
        if (n->op == node->op && n->num_children == node->num_children)
        {
            all_equal = 1;  // will be cleared if not all children equal
            for (i = 0; i < n->num_children; i++)
            {
                if (n->child[i] != node->child[i])
                {
                    all_equal = 0;
                    break;
                }
            }

            if (all_equal)
            {
                free(node); // destroy current node
                return n;
            }
        }
    }

    /*
     * No match found, insert this node in the global list and return it
     */

    node->next = node_list;
    node_list = node;

    return node;
}

/*
 * MakeNode():
 *
 * Makes a DAG node with the specified number of children (0 is acceptable.)
 * The number of children determines how big child[] will be. If the number
 * of children is 1 or 2, they are passed as parameters a and b.
 *
 * NOTE: When searching for a matching node in the global node list, the
 * "data" member is not checked because it is assumed that it is unused in
 * this sort of node.
 *
 * MakeNode() is not intended for number ('n') and variable ('v') nodes, use
 * MakeDataNode() for that purpose.
 */

static DAG *MakeNode(int num_children, int op, DAG *a, DAG *b)
{
    DAG *node;

    /*
     * Try to see if a matching node exists
     */

    for (node = node_list; node != NULL; node = node->next)
    {
        if (node->op != op)                     // can't be the same node!
            continue;
        if (node->num_children != num_children)
            continue;
        
        /*
         * Ugly and not optimal... I know, but it's easy to understand :)
         */

        if (num_children == 0)
            return node;                        // must be a match!
        else if (num_children == 1)
        {
            if (node->child[0] == a)
                return node;
        }
        else if (num_children == 2)
        {
            if (node->child[0] == a && node->child[1] == b)
                return node;
        }
    }

    /*
     * Otherwise, allocate a new one
     */

    if ((node = calloc(1, sizeof(DAG))) == NULL)
        return NULL;

    node->next = node_list;
    node_list = node;

    node->op = op;

    node->data.num = -1;

    node->num_children = num_children;
    if (num_children)
    {
        node->child = malloc(sizeof(DAG *) * num_children);

        node->child[0] = a;
        if (num_children == 2)
            node->child[1] = b;
    }

    return node;
}

/*
 * MakeDataNode():
 *
 * Virtually the same as the above MakeNode() function, except it is intended
 * for use with 'n' and 'v' nodes only.
 */

static DAG *MakeDataNode(int op, int data)
{
    DAG *node;

    for (node = node_list; node != NULL; node = node->next)
    {
        /*
         * Only check op and fields... Children are irrelevant for 'n' and 'v'
         */

        if (node->op == op)
        {
            if (op == 'n')
            {
                if (node->data.num == data)
                    return node;
            }
            else    // 'v'
            {
                if (node->data.var == (char) data)
                    return node;
            }
        }
    }

    if ((node = calloc(1, sizeof(DAG))) == NULL)
        return NULL;

    node->next = node_list;
    node_list = node;

    node->op = op;
    if (op == 'n')
        node->data.num = data;
    else if (op == 'v')
        node->data.var = data;
    else
        node->data.num = -1;    // to indicate this isn't being used (codegen)

    node->num_children = 0;

    return node;
}


/*****************************************************************************
 Code Generator

 Code is output in a simplistic quadruple format (op, dest, src, src) with a
 scratch variable assigned for each operation. This means that a lot of
 scratch variables are generated that could be eliminated, but that would be
 fairly trivial as a post-step.

 CSE (common sub-expression elimination) is performed during code generation.
 It's really quite simple and most of the work was done in constructing the
 DAGs themselves. When a scratch variable is found to be assigned to a given
 node (which happens once the code generator touches it the first time), it is
 re-used.

 If a sub-expression is no longer valid (when another statement changes the
 contents of a variable used as an operand, for instance), the scratch
 variable is disassociated with the node (data.num is set to -1.)

 The code generator assumes (especially in InvalidateDependencies()) that
 all nodes other than 'n', 'v', and '=' can only be represented by a temporary
 variable (whose number is in data.num), Tx.
*****************************************************************************/

static int  current_scratch = 0;

/*
 * NewScratch():
 *
 * Returns a new scratch variable.
 */

static int NewScratch()
{
    return current_scratch++;
}

/*
 * PrintOperand():
 *
 * Prints the current node as an operand. It assumes that it has already been
 * consumed by GenerateCode(). The rules for operand printing are:
 *
 *      - If op == 'n', print data.num.
 *      - If op == 'v', print data.var.
 *      - If op == '=', print data.var.
 *      - Otherwise, data.num contains the number of a scratch variable (Tx)
 *        that must be printed.
 */

static void PrintOperand(DAG *node)
{
    switch (node->op)
    {
    case 'n':
        printf("%d", node->data.num);
        break;
    case 'v':
    case '=':
        putchar(node->data.var);
        break;
    default:
        printf("T%d", node->data.num);
        break;
    }
}

/*
 * InvalidateDependencies():
 *
 * Invalidates all dependencies on the variable which holds this node. Scans
 * the global node list to find nodes that have a child which is the current
 * node. Recursively eliminates dependencies on dependencies, etc.
 */

static void InvalidateDependencies(DAG *node)
{
    DAG *n;
    int i;

    for (n = node_list; n != NULL; n = n->next)
    {
        for (i = 0; i < n->num_children; i++)
        {
            if (n->child[i] == node)
            {
                InvalidateDependencies(n);

                /*
                 * '=' and 'v' both use data.var to store an actual (a-z,A-Z)
                 * user-defined variable which must never be invalidated.
                 */

                if (n->op != '=' && n->op != 'v')   // these 2 will always have an actual (a-z,A-Z) variable                                                          
                    n->data.num = -1;
                break;
            }
        }
    }
}

/*
 * GenerateCode():
 *
 * Generates code from a DAG and prints it to stdout.
 */

static void GenerateCode(DAG *node)
{
    int     i;

    if (node == NULL)
        return;

    switch (node->op)
    {
    case 's':

        /*
         * List of statements
         */

        for (i = 0; i < node->num_children; i++)
            GenerateCode(node->child[i]);

        break;
   
    case '=':

        /*
         * Assignment
         */

        GenerateCode(node->child[1]);   // child[0] guaranteed to be a var
        node->data.var = node->child[0]->data.var;  // this node becomes
                                                    // equivalent to the var

        printf("%c = ", node->child[0]->data.var);
        PrintOperand(node->child[1]);
        printf("\n");

        InvalidateDependencies(node->child[0]);

        break;

    case '+':
    case '-':
    case '*':
    case '/':

        /*
         * Binary arithmetic operators
         */


        if (node->data.num != -1)   // scratch register already assigned?
            break;                  // do nothing (parent node will treat this
                                    // as an operand by re-using the scratch
                                    // register)

        GenerateCode(node->child[0]);
        GenerateCode(node->child[1]);
        node->data.num = NewScratch();

        printf("T%d = ", node->data.num);
        PrintOperand(node->child[0]);
        printf(" %c ", node->op);
        PrintOperand(node->child[1]);
        printf("\n");
        
        break;

    case 'n':
    case 'v':
        break;  // do nothing
    }
}


/*****************************************************************************
 Lexical Analyzer

 A simple lexical analyzer which recognizes tokens from file input.
*****************************************************************************/

static FILE *fp;

static int yylex()
{
    int i, num;

    if (feof(fp))
        return 0;

    i = fgetc(fp);
    while (i == ' ' || i == '\t' || i == '\r' || i == '\n')
        i = fgetc(fp);

    if (isdigit(i))
    {
        num = i - '0';
        while (1)
        {
            i = fgetc(fp);
            if (isdigit(i))
            {
                num *= 10;
                num += (i - '0');
            }
            else
            {
                ungetc(i, fp);
                yylval.num = num;
                return NUMBER;
            }
        }
    }
    else if (isalpha(i))
    {
        yylval.var = i;
        return VARIABLE;
    }

    return i;
}


/*****************************************************************************
 Main

 main(), yyerror(), and friends.
*****************************************************************************/

static void PrintDAG(DAG *node, int indent)
{
    int     i;

    if (node == NULL)
        return;

    for (i = 0; i < indent * 2; i++)
        printf(" ");

    if (node->op == 'n')
        printf("%d\n", node->data.num);
    else if (node->op == 'v')
        printf("%c\n", node->data.var);
    else
    {
        printf("%c\n", node->op);
        for (i = 0; i < node->num_children; i++)
            PrintDAG(node->child[i], indent + 1);
    }
}

static void yyerror(const char *msg)
{
    fprintf(stderr, "%s\n", msg);
}

int main(int argc, char **argv)
{
    if (argc <= 1)
    {
        puts("usage: cse <file>");
        exit(0);
    }
    else
    {
        if ((fp = fopen(argv[1], "r")) == NULL)
        {
            fprintf(stderr, "error: unable to open %s\n", argv[1]);
            exit(1);
        }
    }

    node_list = NULL;
    if (yyparse())
        exit(0);

    puts("DAG:\n----\n");
    PrintDAG(dag, 0);

    puts("\nCode:\n-----\n");
    GenerateCode(dag);

    return 0;
}