COP 3402 meeting -*- Outline -*-

* Using the bison parser generator

------------------------------------------
     BISON: A LALR(1) PARSER GENERATOR

Input:
    spl.y
     |
     |  bison spl.y
     v

Output:
    spl.tab.c   and   spl.tab.h

    yyparse (def.)    token decls.
    parse tables      extern decls.
    yylval               yyparse(),
    (+ user code)        yylval
------------------------------------------
    The output is a C program
      (by default, although bison can generate parsers in other languages,
       and there are other LALR(1) parser generators for other languages...)

    There are several LALR(1) parser generators: YACC, Bison, JavaCC, ...

** big picture
------------------------------------------
          THE BIG PICTURE

                       tokens
 - source --> [ Lexer] ------> [ Parser]
    code                          /
                                 /
                                / ASTs
                               /
                              |
                              v
               symbol <---- [ static
               table  ---->   analysis ]
                             /
                            /
                           v
                        [ code generator ]

------------------------------------------

** using bison and flex
------------------------------------------
 BISON AND FLEX, GENERATING A PARSER

idea:

                  ast.h (AST types)
                     |
             bison   v
             -----> spl.tab.c
            /        ^  yyparse (def.)
           / bison   |
  spl.y ----------> spl.tab.h
                     |  token enum.
                     |  (decl.)
              flex   v
  spl_lexer.l-----> spl_lexer.c
                        yylex (def.)


------------------------------------------
   ...
     explain all of this:

      The context-free grammar
           (the spl.y file) is central to this,

      The spl.y file records:
          - grammar and
          - names/types of the tokens
             (that the parser needs)

      Bison is a parser generator
         it generates the function yyparse
         in file spl.tab.c

      The ASTs record the structure of the program
         (and the parse)

      Flex is a lexical analyzer generator
         it generates the function yylex

      the spl_lexer.l file records
         - the lexical grammar (as REs)
         - how tokens produce ASTs (in yylval)

*** file connections for hw3
------------------------------------------
     HOW IT ALL FITS TOGETHER IN HW3

// in file machine_types.h: ======

// ... 
typedef unsigned int address_type;
typedef unsigned char byte_type;
typedef int word_type;

// in file file_location.h: ======

// location in a source file
typedef struct {
    const char *filename;
    unsigned int line; // of first token
} file_location;

// in file ast.h: ================
// ...
#include "machine_types.h"
#include "file_location.h"

// types of ASTs (type tags)
typedef enum { block_ast, /* ... */
    token_ast
} AST_type;

// typedefs for the types N_t follow,
// where N is a nonterminal

typedef struct {
    file_location *file_loc;
    AST_type type_tag;
    void *next; // for lists
} generic_t;

typedef struct ident_s {
    file_location *file_loc;
    AST_type type_tag;
    struct ident_s *next; // for lists
    const char *name;
} ident_t;

typedef struct {
    file_location *file_loc;
    AST_type type_tag;
    const char *text;
    word_type value;
} number_t;

typedef struct {
    file_location *file_loc;
    AST_type type_tag;
    const char *text;
    int code;
} token_t;

// ...

typedef struct block_s {
    file_location *file_loc;
    AST_type type_tag;
    const_decls_t const_decls;
    var_decls_t var_decls;
    proc_decls_t proc_decls;
    stmts_t stmts;
} block_t;

// ...
typedef union AST_u {
    generic_t generic;
    block_t block;
    const_decls_t const_decls;
    const_decl_t const_decl;
    const_def_list_t const_def_list;
    const_def_t const_def;
    var_decls_t var_decls;
    var_decl_t var_decl;
    ident_list_t ident_list;
    // ...
    expr_t expr;
    binary_op_expr_t binary_op_expr;
    token_t token;
    number_t number;
    ident_t ident;
    empty_t empty;
} AST;

// Return the file location from an AST
extern file_location *ast_file_loc(AST t);

// ...

// Return a pointer to a fresh copy of t
// that has been allocated on the heap
extern AST *ast_heap_copy(AST t);

// ...

extern block_t ast_block(
  token_t begin_tok,
  const_decls_t const_decls,
  var_decls_t var_decls,
  proc_decls_t proc_decls,
  stmts_t stmts);

// ...
extern ident_t ast_ident(
  file_location *file_loc,
  const char *name);

extern expr_t ast_expr_number(
  number_t e);

extern empty_t ast_empty(
  file_location *file_loc);
// ...

parser_types.h:

#include "ast.h"
typedef AST YYSTYPE;

spl.y (also spl.tab.h):

#include "ast.h"
#include "machine_types.h"
#include "parser_types.h"
#include "lexer.h"

// more below
------------------------------------------

    Explain all of these provided files.

    The ASTs are structs that are typedef'd in ast.h
    (more on them below)

    The type AST is stored in the parse stack during parsing,
       (so that is why it must be a union) and
    the field names are used in the grammar's actions;
    they also appear in spl.y as names in %type declarations
       e.g., %type <const_def> constDef
    where const_def is a field name from the AST union type.
    
*** abstract syntax trees and their use in parsing
**** background: how the parser works
------------------------------------------
    CONNECTING THE PARSER AND THE ASTs

Parser model

   A stack of (terminals + nonterminals)
   A parallel stack of ASTs
   1 token of lookahead

   Steps in parsing (DFA decides to):

   - shift:
     1. push lookahead on parse stack
     2. push its yylval on AST stack

OR

   - reduce using a rule written:

        nt : a b c { $$ = f($1,$2,$3); };

     1. take a,b,c off parse stack
     2. take their AST values,
          aval,bval,cval, off the AST stack
        and compute
          ntval = f(aval,bval,cval)
     3. push nt on parse stack
     4. push result (ntval) on AST stack

------------------------------------------

     An LR (or LALR) parser maintains a parse state in a DFA,
     which is determined by the contents of the parse stack;
     the parse state and the lookahead (token) are used to decide
     what step to take next.
     
     See the *.output file (e.g., spl.output) produced by bison
     to understand what the parser's algorithm will do
     and to debug ambiguities.

**** use of ASTs in the grammar (.y) file
------------------------------------------
  CONNECTION WITH ASTs IN GRAMMAR FILE

 /* $Id: spl.y ... */

%code requires {
#include "ast.h"
#include "machine_types.h"
#include "parser_types.h"
#include "lexer.h"
  /* ... */
}
 /* ... */
%token <ident> identsym
%token <number> numbersym
%token <token> plussym    "+"
%token <token> minussym   "-"
%token <token> multsym    "*"
%token <token> divsym     "/"
%token <token> periodsym  "."
%token <token> semisym    ";"
%token <token> eqsym      "="
%token <token> commasym   ","
%token <token> becomessym ":="
%token <token> lparensym  "("
%token <token> rparensym  ")"
%token <token> constsym   "const"
%token <token> varsym     "var"
 /* ... */

%type <block> program
%type <block> block
%type <const_decls> constDecls
%type <const_def> constDef
%type <var_decls> varDecls
%type <var_decl> varDecl
%type <idents> idents
%type <proc_decls> procDecls
%type <empty> empty
 /* ... */
%type <expr> expr
%type <expr> term
%type <expr> factor

%start program
/* ... */
------------------------------------------

        Explain that a %token definition in bison
        of the form
          %token <astField> somesym "some"
        declares (to bison) that:
          - somesym is a kind of token
                (so it goes into the enumeration in spl.tab.h)
          - it has the text "some", which can be used in the rules
                (and stands for the token there)
          - its attribute value (in the AST union)
               has the type of the field named "astField"

------------------------------------------
  PUTTING A TOKEN VALUE ON THE AST STACK
               (DETAILS)
               
To put yylval on the AST Stack
  when the field name is "token"

in the .y file have:
  %token <token> somesym "some"    

the generated parser has:

#include "ast.h"
#include "parser_types.h" 
   // typedef AST YYSTYPE;
   
  YYSTYPE yyvsa[];  // the AST stack

  yyvsa[yyi].token = yylval;

------------------------------------------

     ... Bison knows the field name in the AST union type,
         so it uses that field name in the generated C code, as in the code shown above

------------------------------------------
PUSHING AST FOR A NONTERMINAL ON AST STACK
             (DETAILS)

To put the AST computed by a rule's
  action (the value of $$) on the AST Stack
  when the field name is "const_def"

in the .y file have:
  %type <const_def> constDef

the generated parser has:

#include "ast.h"
#include "parser_types.h" 
   // in file parser_types.h: =====
   // typedef AST YYSTYPE;

// in the generated parser file: ==

  YYSTYPE yyvsa[];  // the AST stack
  YYSTYPE yyval;    // holds $$

  // $$ in the action is yyval.const_def
  yyvsa[yyi].const_def = yyval.const_def;

------------------------------------------

   The name in the angle brackets gives the name of the union's field.

**** Example, the const language
    This example is in the example-code page, see
    https://www.cs.ucf.edu/~leavens/COP3402/example-code/index.html#ConstLang

------------------------------------------
    THE CONST LANGUAGE

programs all look like:

    const ident = 3402
------------------------------------------

    It's super simple... we are using it to see how flex and bison
    work together

------------------------------------------
      ASTs FOR THE CONST LANGUAGE

/* $Id: ast.h ... */
/* ... */

typedef struct {
    file_location *file_loc;
} generic_t;

typedef struct ident_s {
    file_location *file_loc;
    const char *name;
} ident_t;

typedef struct {
    file_location *file_loc;
    const char *text;
    word_type value;
} number_t;

typedef struct {
    file_location *file_loc;
    const char *text;
    int code;
} token_t;

typedef struct const_def_s {
    file_location *file_loc;
    ident_t ident;
    number_t number;
} const_def_t;

typedef union AST_u {
    generic_t generic;
    const_def_t const_def;
    token_t token;
    number_t number;
    ident_t ident;
} AST;

extern const_def_t ast_const_def(
  ident_t ident, number_t number);

// ...

------------------------------------------
        
------------------------------------------
    THE CONST.Y FILE FOR CONST LANGUAGE

 /* ... */
%code requires {
#include "ast.h"
#include "machine_types.h"
#include "parser_types.h"
#include "lexer.h"
 /* ...*/
}
 /* ...*/
%token <token> constsym   "const"
%token <ident> identsym
%token <token> eqsym      "="
%token <number> numbersym

/* Both nonterminals of this grammar carry a const_def_t AST,
   i.e., they use the const_def field of the AST union. */
%type <const_def> program
%type <const_def> constDef

%start program

%code {
extern int yylex(void);
const_def_t progast;
extern void setProgAST(const_def_t t);
}

%%

program : constDef { setProgAST($1); } ;

constDef : "const" identsym "=" numbersym
           { $$ = ast_const_def($2,$4); };

%%

/* Record t as the AST of the whole program
   by storing it in the variable progast. */
void setProgAST(const_def_t t)
{
    progast = t;
}
------------------------------------------
        Explain all this:

        - there are 4 declared tokens,
          and three AST field names for them (ident, token, and number)
        - there are 2 nonterminals, both using the AST field name const_def

        - program is the start symbol (nonterminal) of
          the grammar

        - there are 2 productions in the grammar, with no alternatives.

          Q: Do we really need 2?
          No, we could just use constDef as the start symbol and have 1.

        - both rules have actions,
          which is C code executed when the rule is reduced.

          Q: When are line numbers recorded?
          When the token ASTs are formed, so they should be accurate.

**** example of changing the grammar
        Q: How would we make the grammar be <program> ::= <constDefs>
            with <constDefs> ::= { <constDef> } ?

           add productions for

           constDefs : empty { $$ = ast_const_defs_empty($1); }
                     | constDefs constDef
                       { $$ = ast_const_defs($1, $2); }
                     ;

           empty : %empty { $$ = ast_empty(floc); } ;

           constDef : "const" identsym "=" numbersym
                      { $$ = ast_const_def($2,$4); }
                    ;

Then we would need to do the following:
     (However, I usually think about the ASTs earlier than indicated here)
***** Change the grammar
      (in const.y)
- add productions for the new grammar with actions that build ASTs:

           program : constDefs { setProgAST($1); } ;

           constDefs : empty { $$ = ast_const_defs_empty($1); }
                     | constDefs constDef
                       { $$ = ast_const_defs($1, $2); }
                     ;

           /* ast_empty takes a file_location * argument;
              floc stands for the current file location */
           empty : %empty { $$ = ast_empty(floc); } ;

    Note that left recursion, as in constDefs, is preferred for LALR
    parsers like Bison.
    Note that %empty tells Bison that the production really is empty.
    
- compensate for the new grammar (in const.y) by:
   - declaring types for the new nonterminals
           %type <const_defs> program
           %type <const_defs> constDefs
           %type <empty> empty
   - changing the type declared for progast:
               /* The AST for the program, set by the semantic action 
               for the nonterminal program. */
           const_defs_t progast;
   - changing the parameter type for setProgAST:
            /* Set the program's ast to be t */
           extern void setProgAST(const_defs_t t);

           // Set the program's ast to be t
           void setProgAST(const_defs_t t) { progast = t; }

   - adding test cases to use and test the new grammar
   
***** Add to the ast module
- facilities for lists of constDefs:
   - a generic struct type
       (for lists, that includes a next pointer)
     and a next pointer in const_def_t
       (as there will be lists of those ASTs)
   - an AST (struct) type for empty lists
       (declared before const_defs_t)
   - an AST (struct) type for lists of constDefs
       (which is const_defs_t)
- compensating in the ast module for those changes:
   - adding the new AST types (const_defs_t and empty_t)
       to the declaration of the union type AST.
   - adding functions to the ast module to create the new AST types
       (say ast_const_defs_empty, ast_const_defs, and ast_empty)
       note, in ast.h, the differences between the functions
       ast_const_defs_empty and ast_empty
   - adding a function to the ast module to find the last element
       in a linked list (say ast_last_list_elem)

add the following to the ast.h file

// empty ::=
typedef struct {
    file_location *file_loc;
    AST_type type_tag;
} empty_t;

// const-defs ::= empty | constDefs constDef
// AST for a (possibly empty) list of constDefs.
typedef struct {
    file_location *file_loc;
    AST_type type_tag;
    // the first const_def_t in the list; NULL when the list is empty
    const_def_t *const_defs; // for lists
} const_defs_t;

// program ::= const-defs

// The generic struct type (generic_t) has the fields that
// should be in all alternatives for ASTs.
typedef struct {
    file_location *file_loc;
    AST_type type_tag; // says what field of the union is active
    void *next; // for lists
} generic_t;

and add the following to the ast.c file (used in the .y file)

// Build and return the AST for an empty production,
// recording file_loc as its file location
empty_t ast_empty(file_location *file_loc)
{
    empty_t result = {
        .file_loc = file_loc,
        .type_tag = empty_ast
    };
    return result;
}

// Return an AST for a const-defs that is empty.
// The result takes its file location from emp
// and has a NULL (i.e., empty) list of const_defs.
const_defs_t ast_const_defs_empty(empty_t emp)
{
    // the local must have the declared return type, const_defs_t
    const_defs_t ret;
    // emp is the name of the parameter holding the empty AST
    ret.file_loc = emp.file_loc;
    // NOTE(review): tag kept from the original; confirm that program_ast
    // is the AST_type tag intended for values of type const_defs_t
    ret.type_tag = program_ast;
    ret.const_defs = NULL;
    return ret;
}

// Return an AST for a const-defs that is not empty:
// the result is cdfs with a heap-allocated copy of cdf
// linked in as the last element of its list.
// Calls bail_with_error if the copy cannot be allocated.
const_defs_t ast_const_defs(const_defs_t cdfs,
				      const_def_t cdf)
{
    const_defs_t ret = cdfs;
    // copy cdf to the heap so that it can be linked into the list
    const_def_t *p = malloc(sizeof(*p));
    if (p == NULL) {
        bail_with_error("Cannot allocate space for a const_def_t!");
    }
    *p = cdf;
    p->next = NULL;
    // the head of the list is the const_defs field of const_defs_t
    const_def_t *last = ast_last_list_elem(ret.const_defs);
    if (last == NULL) {
        // the list was empty, so p becomes its first element
        ret.const_defs = p;
    } else {
        last->next = p;
    }
    return ret;
}

// Requires: lst points to a non-circular linked list
//           whose elements have next pointers
//           as in generic_t
// Return a pointer to the final element of lst;
// the result is NULL only when lst == NULL.
void *ast_last_list_elem(void *lst)
{
    void *last = NULL;
    // walk the list, remembering the most recent non-NULL element
    for (void *cur = lst; cur != NULL;
         cur = ((generic_t *)cur)->next) {
        last = cur;
    }
    return last;
}

If there is more time, we can add lexical changes, such as adding commas
between constDefs,

  or add syntax changes, such as adding print statements