COP 3402 meeting -*- Outline -*- * Using the bison parser generator ------------------------------------------ BISON: A LALR(1) PARSER GENERATOR Input: spl.y | | bison spl.y v Output: spl.tab.c and spl.tab.h yyparse{def.} token decls. parse tables extern decls. yylval yyparse(), (+ user code) yylval ------------------------------------------ The output is a C program (by default, although bison can generate other languages, and there are other LALR(1) parser generators for other languages...) There are several parser generators: YACC and Bison (both LALR(1)), JavaCC (LL(k)), ... ** big picture ------------------------------------------ THE BIG PICTURE tokens - source --> [ Lexer] ------> [ Parser] code / / / ASTs / | v symbol <---- [ static table ----> analysis ] / / v [ code generator ] ------------------------------------------ ** using bison and flex ------------------------------------------ BISON AND FLEX, GENERATING A PARSER idea: ast.h (AST types) | bison v -----> spl.tab.c / ^ yyparse (def.) / bison | spl.y ----------> spl.tab.h | token enum. | (decl.) flex v spl_lexer.l-----> spl_lexer.c yylex (def.) ------------------------------------------ ... explain all of this: The context-free grammar (the spl.y file) is central to this. The spl.y file records: - grammar and - names/types of the tokens (that the parser needs) Bison is a parser generator; it generates the function yyparse in file spl.tab.c. The ASTs record the structure of the program (and the parse). Flex is a lexical analyzer generator; it generates the function yylex. The spl_lexer.l file records - the lexical grammar (as REs) - how tokens produce ASTs (in yylval) *** file connections for hw3 ------------------------------------------ HOW IT ALL FITS TOGETHER IN HW3 // in file machine_types.h: ====== // ... 
typedef unsigned int address_type; typedef unsigned char byte_type; typedef int word_type; // in file file_location.h: ====== // location in a source file typedef struct { const char *filename; unsigned int line; // of first token } file_location; // in file ast.h: ================ // ... #include "machine_types.h" #include "file_location.h" // types of ASTs (type tags) typedef enum { block_ast, /* ... */ token_ast } AST_type; // typedefs for types N_t, follow, // where N is a nonterminal typedef struct { file_location *file_loc; AST_type type_tag; void *next; // for lists } generic_t; typedef struct ident_s { file_location *file_loc; AST_type type_tag; struct ident_s *next; // for lists const char *name; } ident_t; typedef struct { file_location *file_loc; AST_type type_tag; const char *text; word_type value; } number_t; typedef struct { file_location *file_loc; AST_type type_tag; const char *text; int code; } token_t; // ... typedef struct block_s { file_location *file_loc; AST_type type_tag; const_decls_t const_decls; var_decls_t var_decls; proc_decls_t proc_decls; stmts_t stmts; } block_t; // ... typedef union AST_u { generic_t generic; block_t block; const_decls_t const_decls; const_decl_t const_decl; const_def_list_t const_def_list; const_def_t const_def; var_decls_t var_decls; var_decl_t var_decl; ident_list_t ident_list; // ... expr_t expr; binary_op_expr_t binary_op_expr; token_t token; number_t number; ident_t ident; empty_t empty; } AST; // Return the file location from an AST extern file_location *ast_file_loc(AST t); // ... // Return a pointer to a fresh copy of t // that has been allocated on the heap extern AST *ast_heap_copy(AST t); // ... extern block_t ast_block( token_t begin_tok, const_decls_t const_decls, var_decls_t var_decls, proc_decls_t proc_decls, stmts_t stmts); // ... extern ident_t ast_ident( file_location *file_loc, const char *name); extern expr_t ast_expr_number( number_t e); extern empty_t ast_empty( file_location *file_loc); // ... 
parser_types.h: #include "ast.h" typedef AST YYSTYPE; spl.y (also spl.tab.h): #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" // more below ------------------------------------------ Explain all of these provided files. The ASTs are structs that are typedef'd in ast.h (more on them below). Values of type AST are stored on the parse stack during parsing (that is why AST must be a union), and the union's field names are used in the grammar's actions; they also appear in spl.y as names in %type declarations, e.g., %type <const_def> constDef where const_def is a field name from the AST union type. *** abstract syntax trees and their use in parsing **** background: how the parser works ------------------------------------------ CONNECTING THE PARSER AND THE ASTs Parser model A stack of (terminals + nonterminals) A parallel stack of ASTs 1 token of lookahead Steps in parsing (DFA decides to): - shift: 1. push lookahead on parse stack 2. push its yylval on AST stack OR - reduce using a rule written: nt : a b c { $$ = f($1,$2,$3); }; 1. take a,b,c off parse stack 2. take their AST values, aval,bval,cval, off the AST stack and compute ntval = f(aval,bval,cval) 3. push nt on parse stack 4. push result (ntval) on AST stack ------------------------------------------ An LR (or LALR) parser maintains a parse state in a DFA, which is determined by the contents of the parse stack; the parse state and the lookahead (token) are used to decide what step to take next. See the *.output file (e.g., spl.output) produced by bison to understand what the parser's algorithm will do and to debug ambiguities. **** use of ASTs in the grammar (.y) file ------------------------------------------ CONNECTION WITH ASTs IN GRAMMAR FILE /* $Id: spl.y ... */ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" /* ... */ } /* ... 
*/ %token <ident> identsym %token <number> numbersym %token <token> plussym "+" %token <token> minussym "-" %token <token> multsym "*" %token <token> divsym "/" %token <token> periodsym "." %token <token> semisym ";" %token <token> eqsym "=" %token <token> commasym "," %token <token> becomessym ":=" %token <token> lparensym "(" %token <token> rparensym ")" %token <token> constsym "const" %token <token> varsym "var" /* ... */ %type <block> program %type <block> block %type <const_decls> constDecls %type <const_def> constDef %type <var_decls> varDecls %type <var_decl> varDecl %type <ident_list> idents %type <proc_decls> procDecls %type <empty> empty /* ... */ %type <expr> expr %type <expr> term %type <expr> factor %start program /* ... */ ------------------------------------------ Explain that a %token definition in bison of the form %token <astField> somesym "some" declares (to bison) that: - somesym is a kind of token (so it goes into the enumeration in spl.tab.h) - it has the text "some", which can be used in the rules (and stands for the token there) - its attribute (semantic value) in the AST union has the type of the field named "astField" ------------------------------------------ PUTTING A TOKEN VALUE ON THE AST STACK (DETAILS) To put yylval on the AST Stack when the field name is "token" in the .y file have: %token <token> somesym "some" the generated parser has: #include "ast.h" #include "parser_types.h" // typedef AST YYSTYPE; YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].token = yylval; ------------------------------------------ ... Bison knows the field name in the AST union type, so it uses that field name in the generated C code, as in the code shown above. ------------------------------------------ PUSHING AST FOR A NONTERMINAL ON AST STACK (DETAILS) To put yylval on the AST Stack when the field name is "const_def" in the .y file have: %type <const_def> constDef the generated parser has: #include "ast.h" #include "parser_types.h" // in file parser_types.h: ===== // typedef AST YYSTYPE; // in the generated parser file: == YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].const_def = yylval; ------------------------------------------ The name in the angle brackets gives the name of the union's field. 
**** Example, the const language This example is in the example-code page, see https://www.cs.ucf.edu/~leavens/COP3402/example-code/index.html#ConstLang ------------------------------------------ THE CONST LANGUAGE programs all look like: const ident = 3402 ------------------------------------------ It's super simple... we are using it to see how flex and bison work together ------------------------------------------ ASTs FOR THE CONST LANGUAGE /* $Id: ast.h ... */ /* ... */ typedef struct { file_location *file_loc; } generic_t; typedef struct ident_s { file_location *file_loc; const char *name; } ident_t; typedef struct { file_location *file_loc; const char *text; word_type value; } number_t; typedef struct { file_location *file_loc; const char *text; int code; } token_t; typedef struct const_def_s { file_location *file_loc; ident_t ident; number_t number; } const_def_t; typedef union AST_u { generic_t generic; const_def_t const_def; token_t token; number_t number; ident_t ident; } AST; extern const_def_t ast_const_def( ident_t ident, number_t number); // ... ------------------------------------------ ------------------------------------------ THE CONST.Y FILE FOR CONST LANGUAGE /* ... 
*/ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" /* ...*/ } /* ...*/ %token <token> constsym "const" %token <ident> identsym %token <token> eqsym "=" %token <number> numbersym %type <const_def> program %type <const_def> constDef %type <const_defs> constDefs %type <empty> empty %start program %code { extern int yylex(void); const_def_t progast; extern void setProgAST(const_def_t t); } %% program : constDef { setProgAST($1); } ; constDef : "const" identsym "=" numbersym { $$ = ast_const_def($2,$4); }; %% // Set the program's ast to be t void setProgAST(const_def_t t) { progast = t; } ------------------------------------------ Explain all this: - there are 4 declared tokens, and three AST field names for them (ident, token, and number) - there are 2 nonterminals, both using the AST field name const_def - program is the start symbol (nonterminal) of the grammar - there are 2 productions in the grammar, with no alternatives. Q: Do we really need 2? No, we could just use constDef as the start symbol and have 1. - both rules have actions, which are C code executed when the rule is reduced. Q: When are line numbers recorded? When the token ASTs are formed, so they should be accurate. **** example of changing the grammar Q: How would we make the grammar be <program> ::= <const-defs> with <const-defs> ::= { <const-def> } ? 
add productions for constDefs : empty { $$ = ast_const_defs_empty($1); } | constDefs constDef { $$ = ast_const_defs($1, $2); } ; empty : %empty { $$ = ast_empty(floc); } ; constDef : "const" identsym "=" numbersym { $$ = ast_const_def($2,$4); } ; Then we would need to do the following: (However, I usually think about the ASTs earlier than indicated here) ***** Change the grammar (in const.y) - add productions for the new grammar with actions that build ASTs: program : constDefs { setProgAST($1); } ; constDefs : empty { $$ = ast_const_defs_empty($1); } | constDefs constDef { $$ = ast_const_defs($1, $2); } ; empty : %empty { $$ = ast_empty(floc); } ; Note that left recursion, as in constDefs, is preferred for LALR parsers like Bison. Note that %empty tells Bison that the production really is empty. - compensate for the new grammar (in const.y) by: - declaring types for the new nonterminals %type <const_defs> program %type <const_defs> constDefs %type <empty> empty - changing the type declared for progast: /* The AST for the program, set by the semantic action for the nonterminal program. */ const_defs_t progast; - changing the parameter type for setProgAST: /* Set the program's ast to be t */ extern void setProgAST(const_defs_t t); // Set the program's ast to be t void setProgAST(const_defs_t t) { progast = t; } - adding test cases to use and test the new grammar ***** Add to the ast module - facilities for lists of constDefs: - a generic struct type (for lists, that includes a next pointer) and a next pointer in const_def_t (as there will be lists of those ASTs) - an AST (struct) type for empty lists (declared before const_defs_t) - an AST (struct) type for lists of constDefs (which is const_defs_t) - compensating in the ast module for those changes: - adding the new AST types (const_defs_t and empty_t) to the declaration of the union type AST. 
- adding functions to the ast module to create the new AST types (say ast_const_defs_empty, ast_const_defs, and ast_empty) note, in ast.h, the differences between the functions ast_const_defs_empty and ast_empty - adding a function to the ast module to find the last element in a linked list (say ast_last_list_elem) add the following to the ast.h file // empty ::= typedef struct { file_location *file_loc; AST_type type_tag; } empty_t; // const-defs ::= empty | constDefs constDef typedef struct { file_location *file_loc; AST_type type_tag; const_def_t *const_defs; // for lists } const_defs_t; // program ::= const-defs // The generic struct type (generic_t) has the fields that // should be in all alternatives for ASTs. typedef struct { file_location *file_loc; AST_type type_tag; // says what field of the union is active void *next; // for lists } generic_t; and add the following to the ast.c file (used in the .y file) // Return an AST for empty found in the given file location empty_t ast_empty(file_location *file_loc) { empty_t ret; ret.file_loc = file_loc; ret.type_tag = empty_ast; return ret; } // Return an AST for a const-defs that is empty const_defs_t ast_const_defs_empty(empty_t emp) { program_t ret; ret.file_loc = emp_AST.file_loc; ret.type_tag = program_ast; ret.const_defs = NULL; return ret; } // Return an AST for a const-defs that is not empty const_defs_t ast_const_defs(const_defs_t cdfs, const_def_t cdf) { const_defs_t ret = cdfs; const_def_t *p = (const_def_t *) malloc(sizeof(const_def_t)); if (p == NULL) { bail_with_error("Cannot allocate space for a const_def_t!"); } *p = cdf; p->next = NULL; const_def_t *last = ast_last_list_elem(ret.start); if (last == NULL) { ret.start = p; } else { last->next = p; } return ret; } // Requires: lst is a pointer to a non-circular // linked list with next pointers // as in generic_t // Return a pointer to the last element in lst. // This only returns NULL if lst == NULL. 
void *ast_last_list_elem(void *lst) { // debug_print("Entering ast_last_list_elem\n"); if (lst == NULL) { return lst; } // assert(lst != NULL); void *prev = NULL; while (lst != NULL) { prev = lst; lst = ((generic_t *)lst)->next; } // here lst == NULL; return prev; } If more time, can add lexical changes, such as adding commas between constDefs, or adding syntax changes, such as adding print statements