diff options
author | Matthew Sotoudeh <matthew@masot.net> | 2024-02-19 16:41:13 -0800 |
---|---|---|
committer | Matthew Sotoudeh <matthew@masot.net> | 2024-02-19 16:41:13 -0800 |
commit | 26a42b4a7ba077659f791208a2a7989bfdfb3663 (patch) | |
tree | 02069692c4d629d3108bbed43c4eb6eddfd2fbc7 | |
parent | e133f250761c67b4465181f41909e78c272901d3 (diff) |
playing with the C grammar
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | earlpy/earlpy.py | 52 | ||||
-rw-r--r-- | examples/file.c | 3 | ||||
-rw-r--r-- | examples/test.py | 8 | ||||
-rw-r--r-- | examples/type_ambiguous.c | 3 | ||||
-rw-r--r-- | grammars/c/disambiguate.c | 33 | ||||
-rw-r--r-- | grammars/c/grammar.txt | 76 |
7 files changed, 148 insertions, 29 deletions
diff --git a/Makefile b/Makefile deleted file mode 100644 index d6abf1c..0000000 --- a/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -CFLAGS += -fsanitize=address -g -# CFLAGS += -O3 -g diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py index 7fbf0f0..3b0deab 100644 --- a/earlpy/earlpy.py +++ b/earlpy/earlpy.py @@ -13,7 +13,8 @@ class Parser: parser_dir = parser_dir files = sorted([f"{parser_dir}/grammar.txt", *glob(f"{parser_dir}/*.c"), - f"{DIR}/parser.c"]) + f"{DIR}/parser.c", + __file__]) if f"{parser_dir}/parser.c" in files: files.remove(f"{parser_dir}/parser.c") hashes = ' '.join( @@ -110,17 +111,10 @@ class Parser: ordered_symbols = [] last_symbol = None for line in grammar: - if line[0] in ' \t': - if last_symbol.kind == "list": - last_symbol.contents.extend(line.split()) - elif last_symbol.kind == "regex": - assert not last_symbol.contents - last_symbol.contents = line.strip() - elif last_symbol.kind == "nonterm": - last_symbol.contents.append(line.split()) - else: raise NotImplementedError - elif line.strip().startswith("#"): + if line.strip().startswith("#"): continue + elif line[0] in ' \t': + last_symbol.process_subline(line.strip()) elif line.strip(): last_symbol = Symbol(line) self.name_to_symbol[last_symbol.name] = last_symbol @@ -179,6 +173,7 @@ class Parser: symbol.kind = "nonterm" symbol.contents = new_rule + symbol.production_names = [None for _ in new_rule] symbol.is_pseudo_node = True new_ordered_symbols.append(symbol) ordered_symbols = new_ordered_symbols @@ -232,27 +227,31 @@ class Parser: putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}") # [(production, Symbol), ...] - self.productions = [([], None)] + self.productions = [([], None, None)] for symbol in ordered_symbols: if symbol.kind == "nonterm": start_idx = len(self.productions) assert isinstance(symbol.contents[0], list) - for rule in symbol.contents: + for i, rule in enumerate(symbol.contents): rule = [self.name_to_symbol[x] for x in rule] - self.productions.append((rule, symbol)) + self.productions.append((rule, symbol, symbol.production_names[i])) prods = ', '.join(map(str, range(start_idx, len(self.productions)))) if prods: put(", {" + prods + ", 0}") else: put(", {0}") else: - self.productions.append(([], symbol)) + self.productions.append(([], symbol, None)) put(", {0}") put(" };") putl(f"#define N_PRODUCTIONS {len(self.productions)}") + for i, (_, _, name) in enumerate(self.productions): + if name: + putl(f"#define PRODUCTION_{name} {i}") + putl("symbol_id_t PRODUCTION_ID_TO_PRODUCTION[N_PRODUCTIONS][MAX_PRODUCTION_LEN] = { {0}") - for i, (production, _) in enumerate(self.productions): + for i, (production, _, _) in enumerate(self.productions): if i == 0: continue production = ', '.join(str(symbol.id) for symbol in production) if production: @@ -262,7 +261,7 @@ class Parser: put(" };") putl("symbol_id_t PRODUCTION_ID_TO_SYMBOL[N_PRODUCTIONS] = { 0") - for i, (_, symbol) in enumerate(self.productions): + for i, (_, symbol, _) in enumerate(self.productions): if i != 0: put(f", {symbol.id}") put(" };") @@ -294,9 +293,28 @@ class Symbol: self.kind = parts[1] self.is_start = ".start" in parts[2:] self.contents = [] + self.production_names = [] self.id = None self.is_pseudo_node = False + def process_subline(self, line): + if self.kind == "list": + self.contents.extend(line.split()) + elif self.kind == "regex": + assert not self.contents + self.contents = line.strip() + elif self.kind == "nonterm": + self.contents.append(line.split()) + self.production_names.append(None) + for i, part in enumerate(self.contents[-1]): + if part.startswith("."): + args = self.contents[-1][i:] + self.contents[-1] = self.contents[-1][:i] + for arg, value in zip(args[::2], args[1::2]): + if arg == ".name": + self.production_names[-1] = value + else: raise NotImplementedError + class Node: def __init__(self, symbol, production): self.symbol = symbol diff --git a/examples/file.c b/examples/file.c new file mode 100644 index 0000000..a94ceea --- /dev/null +++ b/examples/file.c @@ -0,0 +1,3 @@ +void hello(int x) { + x += 1; +} diff --git a/examples/test.py b/examples/test.py index e5783c5..ac7e223 100644 --- a/examples/test.py +++ b/examples/test.py @@ -1,14 +1,16 @@ import earlpy +p = earlpy.Parser("grammars/c") + if False: p = earlpy.Parser("grammars/expression") node = p.parse_string("1 + 1 + 2 + 3") print(node.pprint()) -elif True: +elif False: p = earlpy.Parser("grammars/c") - node = p.parse_file("examples/simple.c") + node = p.parse_file("examples/file.c") print(node.pprint()) -else: +elif False: p = earlpy.Parser("grammars/c") node = p.parse_file("examples/expr.c") print(node.pprint()) diff --git a/examples/type_ambiguous.c b/examples/type_ambiguous.c new file mode 100644 index 0000000..36602ae --- /dev/null +++ b/examples/type_ambiguous.c @@ -0,0 +1,3 @@ +void hello(int x) { + foo_t * x; +} diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c index a2d69d1..403d65f 100644 --- a/grammars/c/disambiguate.c +++ b/grammars/c/disambiguate.c @@ -1,14 +1,37 @@ +struct token *TYPE_NAMES[1024]; +size_t N_TYPE_NAMES; + +void alert_parse(struct state *state) { + if (PRODUCTION_ID_TO_SYMBOL[state->production_id] == SYMBOL_TYPEDEF) { + for (struct token *t = find_token(state, 2); t->symbol != DONE_SYMBOL; t++) { + if (t->symbol == SYMBOL_IDENT) { + TYPE_NAMES[N_TYPE_NAMES++] = t; + break; + } + } + } +} + +int is_typename(struct token *token) { + if (!strcmp("int", token->string)) return 1; + for (size_t i = 0; i < N_TYPE_NAMES; i++) + if (!strcmp(TYPE_NAMES[i]->string, token->string)) + return 1; + return 0; +} + int disambiguator(struct state *old, struct state *new) { // printf("Old tree:\n"); // print_parse_tree(old, 4); // printf("New tree:\n"); // print_parse_tree(new, 4); - if (old->start_idx != new->start_idx) { - // printf("\t\tIGNORING "); print_parse_tree2(old); - // printf("\t\tVS: "); print_parse_tree2(new); - return 2; - } + if (old->production_id == PRODUCTION_DECL_STMT) + if (!is_typename(find_token(old->reasons[0], 0))) + return 1; + if (new->production_id == PRODUCTION_DECL_STMT) + if (!is_typename(find_token(new->reasons[0], 0))) + return 0; // Prefer the earlier parsings in the grammar when two entirely different // productions are taken. diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt index 7959318..ffe85c3 100644 --- a/grammars/c/grammar.txt +++ b/grammars/c/grammar.txt @@ -1,5 +1,6 @@ KEYWORDS list switch volatile case while do else const for if + struct union typedef void IDENT regex [a-zA-Z_][0-9a-zA-Z_]* @@ -16,6 +17,63 @@ OP list < <= > >= = . -> ? : +############### TYPE PARSING +# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing, +# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression) +PRIMITIVE_TYPE nonterm + struct IDENT + union IDENT + struct IDENT AGGREGATE_DECLARATION + union IDENT AGGREGATE_DECLARATION + const PRIMITIVE_TYPE + volatile PRIMITIVE_TYPE + void + IDENT + +# A TYPE_EXPRESSION is basically an lvalue expression. +TYPE_EXPRESSION nonterm + IDENT + TYPE_EXPRESSION [ ] + TYPE_EXPRESSION [ INT ] + * TYPE_EXPRESSION + ( TYPE_EXPRESSION ) + TYPE_EXPRESSION ( ) + TYPE_EXPRESSION ( ARGS ) + +DECLARATION nonterm + PRIMITIVE_TYPE TYPE_EXPRESSION + +# An ANONYMOUS_TYPE has no name +ANONYMOUS_TYPE nonterm + PRIMITIVE_TYPE + ANONYMOUS_TYPE [ ] + ANONYMOUS_TYPE [ INT ] + * ANONYMOUS_TYPE + ( ANONYMOUS_TYPE ) + ANONYMOUS_TYPE ( ) + ANONYMOUS_TYPE ( ARGS ) + +############### TOP LEVEL +TOP_LEVEL nonterm .start + TYPEDEF + FUNCTION + +ARGS nonterm + ANONYMOUS_TYPE + ANONYMOUS_TYPE , ARGS + DECLARATION + DECLARATION , ARGS + +FUNCTION nonterm + DECLARATION ( ) TRUE_BLOCK + DECLARATION ( ARGS ) TRUE_BLOCK + +AGGREGATE_DECLARATION nonterm + { STMTS } + +TYPEDEF nonterm + typedef PRIMITIVE_TYPE TYPE_EXPRESSION ; + EXPR nonterm INT IDENT @@ -40,19 +98,33 @@ FOR nonterm SWITCH nonterm switch ( EXPR ) BLOCK +DECLARATION_CHAIN nonterm + TYPE_EXPRESSION + TYPE_EXPRESSION , DECLARATION_CHAIN + TYPE_EXPRESSION = EXPR + TYPE_EXPRESSION = EXPR , DECLARATION_CHAIN + +DECLARATION_STATEMENT nonterm + PRIMITIVE_TYPE DECLARATION_CHAIN ; + STMT nonterm IF WHILE DO FOR SWITCH + # NOTE: it auto-prefers declarations right now + DECLARATION_STATEMENT .name DECL_STMT EXPR ; -STMTS nonterm .start +STMTS nonterm STMT STMT STMTS -BLOCK nonterm +TRUE_BLOCK nonterm { } { STMTS } + +BLOCK nonterm + TRUE_BLOCK STMT |