From 26a42b4a7ba077659f791208a2a7989bfdfb3663 Mon Sep 17 00:00:00 2001 From: Matthew Sotoudeh Date: Mon, 19 Feb 2024 16:41:13 -0800 Subject: playing with the C grammar --- earlpy/earlpy.py | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'earlpy/earlpy.py') diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py index 7fbf0f0..3b0deab 100644 --- a/earlpy/earlpy.py +++ b/earlpy/earlpy.py @@ -13,7 +13,8 @@ class Parser: parser_dir = parser_dir files = sorted([f"{parser_dir}/grammar.txt", *glob(f"{parser_dir}/*.c"), - f"{DIR}/parser.c"]) + f"{DIR}/parser.c", + __file__]) if f"{parser_dir}/parser.c" in files: files.remove(f"{parser_dir}/parser.c") hashes = ' '.join( @@ -110,17 +111,10 @@ class Parser: ordered_symbols = [] last_symbol = None for line in grammar: - if line[0] in ' \t': - if last_symbol.kind == "list": - last_symbol.contents.extend(line.split()) - elif last_symbol.kind == "regex": - assert not last_symbol.contents - last_symbol.contents = line.strip() - elif last_symbol.kind == "nonterm": - last_symbol.contents.append(line.split()) - else: raise NotImplementedError - elif line.strip().startswith("#"): + if line.strip().startswith("#"): continue + elif line[0] in ' \t': + last_symbol.process_subline(line.strip()) elif line.strip(): last_symbol = Symbol(line) self.name_to_symbol[last_symbol.name] = last_symbol @@ -179,6 +173,7 @@ class Parser: symbol.kind = "nonterm" symbol.contents = new_rule + symbol.production_names = [None for _ in new_rule] symbol.is_pseudo_node = True new_ordered_symbols.append(symbol) ordered_symbols = new_ordered_symbols @@ -232,27 +227,31 @@ class Parser: putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}") # [(production, Symbol), ...] - self.productions = [([], None)] + self.productions = [([], None, None)] for symbol in ordered_symbols: if symbol.kind == "nonterm": start_idx = len(self.productions) assert isinstance(symbol.contents[0], list) - for rule in symbol.contents: + for i, rule in enumerate(symbol.contents): rule = [self.name_to_symbol[x] for x in rule] - self.productions.append((rule, symbol)) + self.productions.append((rule, symbol, symbol.production_names[i])) prods = ', '.join(map(str, range(start_idx, len(self.productions)))) if prods: put(", {" + prods + ", 0}") else: put(", {0}") else: - self.productions.append(([], symbol)) + self.productions.append(([], symbol, None)) put(", {0}") put(" };") putl(f"#define N_PRODUCTIONS {len(self.productions)}") + for i, (_, _, name) in enumerate(self.productions): + if name: + putl(f"#define PRODUCTION_{name} {i}") + putl("symbol_id_t PRODUCTION_ID_TO_PRODUCTION[N_PRODUCTIONS][MAX_PRODUCTION_LEN] = { {0}") - for i, (production, _) in enumerate(self.productions): + for i, (production, _, _) in enumerate(self.productions): if i == 0: continue production = ', '.join(str(symbol.id) for symbol in production) if production: @@ -262,7 +261,7 @@ class Parser: put(" };") putl("symbol_id_t PRODUCTION_ID_TO_SYMBOL[N_PRODUCTIONS] = { 0") - for i, (_, symbol) in enumerate(self.productions): + for i, (_, symbol, _) in enumerate(self.productions): if i != 0: put(f", {symbol.id}") put(" };") @@ -294,9 +293,28 @@ class Symbol: self.kind = parts[1] self.is_start = ".start" in parts[2:] self.contents = [] + self.production_names = [] self.id = None self.is_pseudo_node = False + def process_subline(self, line): + if self.kind == "list": + self.contents.extend(line.split()) + elif self.kind == "regex": + assert not self.contents + self.contents = line.strip() + elif self.kind == "nonterm": + self.contents.append(line.split()) + self.production_names.append(None) + for i, part in enumerate(self.contents[-1]): + if part.startswith("."): + args = self.contents[-1][i:] + self.contents[-1] = self.contents[-1][:i] + for arg, value in zip(args[::2], args[1::2]): + if arg == ".name": + self.production_names[-1] = value + else: raise NotImplementedError + class Node: def __init__(self, symbol, production): self.symbol = symbol -- cgit v1.2.3