COP 3402 meeting -*- Outline -*- * Using the bison parser generator ------------------------------------------ BISON: AN LALR(1) PARSER GENERATOR Input: pl0.y | | bison pl0.y v Output: pl0.tab.c and pl0.tab.h yyparse (def.) token decls. parse tables extern decls. yylval (user code) ------------------------------------------ The output is a C program (although bison can generate other languages) There are several parser generators: YACC and Bison (both LALR(1)), JavaCC (which is LL(k)), ... ** big picture ------------------------------------------ THE BIG PICTURE tokens - source --> [ Lexer] ------> [ Parser] code / / / ASTs / | v symbol <---- [ static table ----> analysis ] / / v [ code generator ] ------------------------------------------ ** using bison and flex ------------------------------------------ BISON AND FLEX, GENERATING A PARSER idea: ast.h (AST types) | bison v -----> pl0.tab.c / ^ yyparse (def.) / bison | pl0.y ----------> pl0.tab.h | token enum. | (decl.) flex v pl0_lexer.l-----> pl0_lexer.c yylex (def.) ------------------------------------------ ... explain all of this: The context-free grammar (the .y file) is central to this. The .y file records: - grammar and - names/types of the tokens (that the parser needs) Bison is a parser generator; it generates the function yyparse in file pl0.tab.c The ASTs record the structure of the program (and the parse) Flex is a lexical analyzer generator; it generates the function yylex the .l file records - the lexical grammar (as REs) - how tokens produce ASTs (in yylval) *** file connections for hw3 ------------------------------------------ HOW IT ALL FITS TOGETHER IN HW3 machine_types.h: // ... 
typedef unsigned int address_type; typedef unsigned char byte_type; typedef int word_type; #define BYTES_PER_WORD 4 file_location.h: // location in a source file typedef struct { const char *filename; unsigned int line; // of first token } file_location; ast.h: #include "machine_types.h" #include "file_location.h" // types of ASTs (type tags) typedef enum { /* ... */ } AST_type; // typedefs for types N_t, // where N is a nonterminal // ... typedef struct ident_s { file_location *file_loc; AST_type type_tag; struct ident_s *next; // for lists const char *name; } ident_t; typedef struct { file_location *file_loc; AST_type type_tag; const char *text; word_type value; } number_t; // ... typedef struct block_s { file_location *file_loc; AST_type type_tag; const_decls_t const_decls; var_decls_t var_decls; proc_decls_t proc_decls; stmt_t stmt; } block_t; // ... typedef union AST_u { generic_t generic; block_t block; const_decls_t const_decls; const_decl_t const_decl; const_defs_t const_defs; const_def_t const_def; var_decls_t var_decls; var_decl_t var_decl; idents_t idents; // ... expr_t expr; binary_op_expr_t binary_op_expr; token_t token; number_t number; ident_t ident; empty_t empty; } AST; // ... extern block_t ast_block( const_decls_t const_decls, var_decls_t var_decls, proc_decls_t proc_decls, stmt_t stmt); // ... extern ident_t ast_ident( file_location *file_loc, const char *name); extern number_t ast_number( token_t sgn, word_type value); extern empty_t ast_empty( file_location *file_loc); // ... parser_types.h: #include "ast.h" typedef AST YYSTYPE; pl0.y (also pl0.tab.h): #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" // more below ------------------------------------------ Explain all of these. 
The ASTs are structs that are typedef'd in ast.h (more on them below). The type AST is stored in the parse stack during parsing; the field names are used in the grammar's actions; they also appear in pl0.y as names in angle brackets (such as <block>) in the %token and %type declarations *** abstract syntax trees and their use in parsing **** background: how the parser works ------------------------------------------ CONNECTING THE PARSER AND THE ASTs Parser model A stack of (terminals + nonterminals) A parallel stack of ASTs 1 token of lookahead Steps in parsing: - shift: 1. push lookahead on parse stack 2. push yylval on AST stack - reduce using a rule nt : a b c { $$ = f($1,$2,$3); }; 1. take a,b,c off parse stack 2. take aval,bval,cval off AST stack ntval = f(aval,bval,cval) 3. push nt on parse stack 4. push result (ntval) on AST stack ------------------------------------------ An LR (or LALR) parser maintains a parse state, which is determined by the contents of the parse stack; the parse state and the lookahead (token) are used to decide what step to take next. See the *.output file (e.g., pl0.output) produced by bison to understand what the parser's algorithm will do and to debug ambiguities. **** use of ASTs in the grammar (.y) file ------------------------------------------ CONNECTION WITH ASTs IN GRAMMAR FILE /* $Id: pl0.y ... */ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" /* ... */ } /* ... */ %token <ident> identsym %token <number> numbersym %token <token> plussym "+" %token <token> minussym "-" %token <token> multsym "*" %token <token> divsym "/" %token <token> periodsym "." %token <token> semisym ";" %token <token> eqsym "=" %token <token> commasym "," %token <token> becomessym ":=" %token <token> constsym "const" %token <token> varsym "var" /* ... */ %token <token> lparensym "(" %token <token> rparensym ")" %type <block> program %type <block> block %type <const_decls> constDecls %type <const_def> constDef %type <var_decls> varDecls %type <var_decl> varDecl %type <idents> idents %type <proc_decls> procDecls %type <empty> empty /* ... 
*/ %type <expr> expr %type <token> relOp %type <expr> term %type <expr> factor %type <token> posSign %start program ------------------------------------------ Explain that a %token definition in bison of the form %token <astField> somesym "some" declares (to bison) that: - somesym is a kind of token (so it goes into the enumeration in pl0.tab.h) - it has the text "some" - its attributes are stored in the AST union, in the field named "astField" ------------------------------------------ PUTTING A TOKEN VALUE ON THE AST STACK To put yylval on the AST Stack when the field name is "token" in the .y file have: %token <token> somesym "some" the generated parser has: #include "ast.h" #include "parser_types.h" // typedef AST YYSTYPE; YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].token = yylval; ------------------------------------------ ... Bison knows the field name in the AST union type, so it uses that field name in the generated C code, as in the code shown above ------------------------------------------ PUSHING AST FOR A NONTERMINAL ON AST STACK To put yylval on the AST Stack when the field name is "const_def" in the .y file have: %type <const_def> constDef the generated parser has: #include "ast.h" #include "parser_types.h" // typedef AST YYSTYPE; YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].const_def = yylval; ------------------------------------------ The name in the angle brackets gives the name of the union's field. **** Example, the const language ------------------------------------------ THE CONST LANGUAGE programs all look like: const ident = 3402 ------------------------------------------ It's super simple... can be used to see how flex and bison work ------------------------------------------ ASTs FOR THE CONST LANGUAGE /* $Id: ast.h ... */ /* ... 
*/ // types of ASTs (type tags) typedef enum { const_def_ast, token_ast, number_ast, ident_ast } AST_type; typedef struct ident_s { file_location *file_loc; const char *name; } ident_t; typedef struct { file_location *file_loc; const char *text; word_type value; } number_t; typedef struct { file_location *file_loc; const char *text; int code; } token_t; typedef struct const_def_s { file_location *file_loc; ident_t ident; number_t number; } const_def_t; typedef union AST_u { generic_t generic; const_def_t const_def; token_t token; number_t number; ident_t ident; } AST; extern const_def_t ast_const_def( ident_t ident, number_t number); // ... ------------------------------------------ ------------------------------------------ THE CONST.Y FILE FOR CONST LANGUAGE /* ... */ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" /* ...*/ } /* ...*/ %token <token> constsym "const" %token <ident> identsym %token <token> eqsym "=" %token <number> numbersym %type <const_def> program %type <const_def> constDef %start program %% program : constDef { setProgAST($1); } ; constDef : "const" identsym "=" numbersym { $$ = ast_const_def($2,$4); }; %% // Set the program's ast to be t void setProgAST(const_def_t t) { progast = t; } ------------------------------------------ Explain all this: - there are 4 declared tokens, and three AST field names for them (ident, token, and number) - there are 2 nonterminals, both using the AST field name const_def - program is the start symbol (nonterminal) of the grammar - there are 2 productions in the grammar, with no alternatives. Q: Do we really need 2? No, we could just use constDef as the start symbol. - both rules have actions, which are C code executed when the rule is reduced. Q: When are line numbers recorded? When the token ASTs are formed, so they should be accurate. **** example of changing the grammar Q: How would we make the grammar be <program> ::= { <constDef> } ? 
add productions for program : empty { $$ = ast_program_empty($1); } | program constDef { $$ = ast_program_with_constDef($1, $2); } ; empty : %empty { file_location *floc = file_location_make(lexer_filename(), lexer_line); $$ = ast_empty(floc); } ; Then we would need to add to the ast module - AST_type enums for program and empty ASTs - a struct type for empty lists - a struct type for programs - a generic struct type (for lists) - type_tag and next fields in const_def_t (laid out as in generic_t, so that constDefs can be linked into a list) // types of ASTs (type tags) typedef enum { program_ast, const_def_ast, token_ast, number_ast, ident_ast, empty_ast } AST_type; // empty ::= typedef struct { file_location *file_loc; AST_type type_tag; } empty_t; // program ::= empty | program constDef typedef struct { file_location *file_loc; AST_type type_tag; const_def_t *const_defs; // for lists } program_t; // The generic struct type (generic_t) has the fields that // should be in all alternatives for ASTs. typedef struct { file_location *file_loc; AST_type type_tag; // says what field of the union is active void *next; // for lists } generic_t; and the following functions (used in the .y file) // Return an AST for empty found in the given file location empty_t ast_empty(file_location *file_loc) { empty_t ret; ret.file_loc = file_loc; ret.type_tag = empty_ast; return ret; } // Return an AST for an empty program extern program_t ast_program_empty(empty_t emp_AST) { program_t ret; ret.file_loc = emp_AST.file_loc; ret.type_tag = program_ast; ret.const_defs = NULL; return ret; } // Return an AST for a program with an added constDef extern program_t ast_program_with_constDef(program_t prog, const_def_t const_def) { program_t ret = prog; // make a copy of const_def on the heap const_def_t *p = (const_def_t *) malloc(sizeof(const_def_t)); if (p == NULL) { bail_with_error("Cannot allocate space for %s!", "const_def_t"); } *p = const_def; p->next = NULL; const_def_t *last = ast_last_list_elem(ret.const_defs); if (last == NULL) { ret.const_defs = p; } else { last->next = p; } return ret; } // Requires: lst is 
a pointer to a non-circular // linked list with next pointers // as in generic_t // Return a pointer to the last element in lst. // This only returns NULL if lst == NULL. void *ast_last_list_elem(void *lst) { // debug_print("Entering ast_last_list_elem\n"); if (lst == NULL) { return lst; } // assert(lst != NULL); void *prev = NULL; while (lst != NULL) { prev = lst; lst = ((generic_t *)lst)->next; } // here lst == NULL; return prev; } If there is more time, we can add other alternatives, such as write statements and begin statements