PROBLEM: AMBIGUITY Consider: <stmt> ::= <ident> := <expr> | if <cond> then <stmt> <else-part> <else-part> ::= <empty> | else <stmt> and the statement: if b1 then if b2 then x := 2 else x := 3 Is this parsed as: / / \ -----|---\ if then | | | \ | | | / | \--------\ | / | \ \ \ b1 if then / | \ \ \ ... | \ x := 2 else / | \ ... x := 3 or as: / / \-----|--------\ if then | |\ / \ b1 /| \ else / | \ / | \ | | | ... | | | x := 3 | | | if then | /|\ \ b2 ... x := 2 FIXES FOR AMBIGUITY Change the language: a. Always have an else clause: <stmt> ::= if <cond> then <stmt> else <stmt> (use skip if don't want to do anything) b. Use an end marker <stmt> ::= if <cond> then <stmt> else <stmt> fi | if <cond> then <stmt> fi Give precedence to one production: <stmt> ::= if <cond> then <stmt> <else-part> <else-part> ::= else <stmt> // priority! | <empty> So we only get the parse tree: / / \ -----|---\ if then | | | \ | | | / | \--------\ | / | \ \ \ b1 if then / | \ \ \ ... | \ x := 2 else / | \ ... x := 3 BISON: A LALR(1) PARSER GENERATOR Input: pl0.y | | bison pl0.y v Output: pl0.tab.c and pl0.tab.h yyparse (def.) token decls. parse tables extern decls. yylval (user code) THE BIG PICTURE tokens - source --> [ Lexer] ------> [ Parser] code / / / ASTs / | v symbol <---- [ static table ----> analysis ] / / v [ code generator ] BISON AND FLEX, GENERATING A PARSER idea: ast.h (AST types) | bison v -----> pl0.tab.c / ^ yyparse (def.) / bison | pl0.y ----------> pl0.tab.h | token enum. | (decl.) flex v pl0_lexer.l-----> pl0_lexer.c yylex (def.) HOW IT ALL FITS TOGETHER IN HW3 machine_types.h: // ... typedef unsigned int address_type; typedef unsigned char byte_type; typedef int word_type; #define BYTES_PER_WORD 4 file_location.h: // location in a source file typedef struct { const char *filename; unsigned int line; // of first token } file_location; ast.h: #include "machine_types.h" #include "file_location.h" // types of ASTs (type tags) typedef enum { /* ... */ } AST_type; // typedefs for types N_t, // where N is a nonterminal // ... 
typedef struct ident_s { file_location *file_loc; AST_type type_tag; struct ident_s *next; // for lists const char *name; } ident_t; typedef struct { file_location *file_loc; AST_type type_tag; const char *text; word_type value; } number_t; // ... typedef struct block_s { file_location *file_loc; AST_type type_tag; const_decls_t const_decls; var_decls_t var_decls; proc_decls_t proc_decls; stmt_t stmt; } block_t; // ... typedef union AST_u { generic_t generic; block_t block; const_decls_t const_decls; const_decl_t const_decl; const_defs_t const_defs; const_def_t const_def; var_decls_t var_decls; var_decl_t var_decl; idents_t idents; // ... expr_t expr; binary_op_expr_t binary_op_expr; token_t token; number_t number; ident_t ident; empty_t empty; } AST; // ... extern block_t ast_block( const_decls_t const_decls, var_decls_t var_decls, proc_decls_t proc_decls, stmt_t stmt); // ... extern ident_t ast_ident( file_location *file_loc, const char *name); extern number_t ast_number( token_t sgn, word_type value); extern empty_t ast_empty( file_location *file_loc); // ... parser_types.h: #include "ast.h" typedef AST YYSTYPE; pl0.y (also pl0.tab.h): #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" // more below CONNECTING THE PARSER AND THE ASTs Parser model A stack of (terminals + nonterminals) A parallel stack of ASTs 1 token of lookahead Steps in parsing: - shift: 1. push lookahead on parse stack 2. push yylval on AST stack - reduce using a rule nt : a b c { $$ = f($1,$2,$3); }; 1. take a,b,c off parse stack 2. take aval,bval,cval off AST stack ntval = f(aval,bval,cval) 3. push nt on parse stack 4. push result (ntval) on AST stack CONNECTION WITH ASTs IN GRAMMAR FILE /* $Id: pl0.y ... */ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" /* ... */ } /* ... */ %token <ident> identsym %token <number> numbersym %token <token> plussym "+" %token <token> minussym "-" %token <token> multsym "*" %token <token> divsym "/" %token <token> periodsym "." 
%token <token> semisym ";" %token <token> eqsym "=" %token <token> commasym "," %token <token> becomessym ":=" %token <token> constsym "const" %token <token> varsym "var" /* ... */ %token <token> lparensym "(" %token <token> rparensym ")" %type <block> program %type <block> block %type <const_decls> constDecls %type <const_def> constDef %type <var_decls> varDecls %type <var_decl> varDecl %type <idents> idents %type <proc_decls> procDecls %type <empty> empty /* ... */ %type <expr> expr %type <token> relOp %type <expr> term %type <expr> factor %type <token> posSign %start program PUTTING A TOKEN VALUE ON THE AST STACK To put yylval on the AST Stack when the field name is "token" in the .y file have: %token <token> somesym "some" the generated parser has: #include "ast.h" #include "parser_types.h" // typedef AST YYSTYPE; YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].token = yylval; PUSHING AST FOR A NONTERMINAL ON AST STACK To put yylval on the AST Stack when the field name is "const_def" in the .y file have: %type <const_def> constDef the generated parser has: #include "ast.h" #include "parser_types.h" // typedef AST YYSTYPE; YYSTYPE yyvsa[]; // the AST stack yyvsa[yyi].const_def = yylval; THE CONST LANGUAGE programs all look like: const ident = 3402 ASTs FOR THE CONST LANGUAGE /* $Id: ast.h ... */ /* ... */ // types of ASTs (type tags) typedef enum { const_def_ast, token_ast, number_ast, ident_ast } AST_type; typedef struct ident_s { file_location *file_loc; const char *name; } ident_t; typedef struct { file_location *file_loc; const char *text; word_type value; } number_t; typedef struct { file_location *file_loc; const char *text; int code; } token_t; typedef struct const_def_s { file_location *file_loc; ident_t ident; number_t number; } const_def_t; typedef union AST_u { generic_t generic; const_def_t const_def; token_t token; number_t number; ident_t ident; } AST; extern const_def_t ast_const_def( ident_t ident, number_t number); // ... THE CONST.Y FILE FOR CONST LANGUAGE /* ... 
*/ %code requires { #include "ast.h" #include "machine_types.h" #include "parser_types.h" #include "lexer.h" /* ...*/ } /* ...*/ %token <token> constsym "const" %token <ident> identsym %token <token> eqsym "=" %token <number> numbersym %type <const_def> program %type <const_def> constDef %start program %% program : constDef { setProgAST($1); } ; constDef : "const" identsym "=" numbersym { $$ = ast_const_def($2,$4); }; %% // Set the program's ast to be t void setProgAST(const_def_t t) { progast = t; } Change the grammar of the const language to be <program> ::= { <const-def> } <const-def> ::= const <ident> = number