From a9292a98cc6c65e2a4ad6da20937ef7568a4143d Mon Sep 17 00:00:00 2001 From: Matthew Sotoudeh Date: Mon, 11 Mar 2024 16:33:24 -0700 Subject: earlpy --- earlpy/earlpy.py | 146 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 130 insertions(+), 16 deletions(-) (limited to 'earlpy/earlpy.py') diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py index 3b0deab..2944c51 100644 --- a/earlpy/earlpy.py +++ b/earlpy/earlpy.py @@ -9,9 +9,8 @@ DIR = pathlib.Path(__file__).parent.resolve() class Parser: def __init__(self, parser_dir): - assert parser_dir and parser_dir[0] != '/' - parser_dir = parser_dir - files = sorted([f"{parser_dir}/grammar.txt", + assert parser_dir and parser_dir != '/' + files = sorted([f"{parser_dir}/grammar.earlpy", *glob(f"{parser_dir}/*.c"), f"{DIR}/parser.c", __file__]) @@ -27,7 +26,7 @@ class Parser: if open(lex_path, "r").readline()[3:][:-3].strip() == hashes: already_built = True - lines = self.parse_grammar(f"{parser_dir}/grammar.txt") + lines = self.parse_grammar(f"{parser_dir}/grammar.earlpy") if not already_built: if glob(f"{parser_dir}/parser"): subprocess.run(f"rm {parser_dir}/parser", shell=True) @@ -43,7 +42,7 @@ class Parser: shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if res.returncode: print(res.stderr.decode("utf-8")) assert res.returncode == 0 - res = subprocess.run(f"gcc -O3 {parser_dir}/parser.c -o {parser_dir}/parser", + res = subprocess.run(f"gcc -g -O3 {parser_dir}/parser.c -ljemalloc -o {parser_dir}/parser", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if res.returncode: print(res.stderr.decode("utf-8")) assert res.returncode == 0 @@ -66,12 +65,20 @@ class Parser: raise ValueError contents = open(path, "r").read() + offset_to_line = dict() + line = 1 + for i, c in enumerate(open(path, "rb").read()): + offset_to_line[i] = line + if c == '\n' or chr(c) == '\n': line += 1 n_tokens, = struct.unpack("Q", res.stdout[:8]) # symbol id, start idx, length tokens = list(struct.iter_unpack("QQQ", res.stdout[8:8+(8*3*n_tokens)])) - tokens = [(token, contents[token[1]:token[1]+token[2]]) - for token in tokens] + tokens = [Token(self.id_to_symbol[symbol], + contents[offset:offset+length], + offset_to_line[offset], + path) + for (symbol, offset, length) in tokens] # production id nodes = [t[0] for t in struct.iter_unpack("Q", res.stdout[8+(8*3*n_tokens):])] # print(nodes) @@ -83,13 +90,7 @@ class Parser: stack = [root] while stack: node = stack[-1] - # print(len(stack)) - # if isinstance(node, tuple): - # print("\t", node) - # else: - # print("\t", node.symbol.name, [s.name for s in node.production]) - - if (isinstance(node, tuple) + if (isinstance(node, Token) or len(node.production) == len(node.contents)): stack.pop() if stack: stack[-1].contents.append(node) @@ -220,11 +221,16 @@ class Parser: put(", \"" + symbol.name + "\"") put(" };") for symbol in ordered_symbols: - if symbol.name.isalnum(): + if symbol.name.replace("_", "").isalnum(): putl(f"#define SYMBOL_{symbol.name} {symbol.id}") if symbol.is_start: putl(f"#define START_SYMBOL {symbol.id}") + putl("char SYMBOL_TO_POISON[] = { 0") + for symbol in ordered_symbols: + put(", " + ("1" if symbol.poisoned else "0")) + put(" };") + putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}") # [(production, Symbol), ...] self.productions = [([], None, None)] @@ -265,6 +271,37 @@ class Parser: if i != 0: put(f", {symbol.id}") put(" };") + # Production hints: for this production, what does the leading symbol + # need to be? + # symbol -> symbol | True (multiple) + symbol_to_first = {symbol: symbol + for symbol in self.id_to_symbol.values() + if symbol.kind != "nonterm"} + fixedpoint = False + while not fixedpoint: + fixedpoint = True + for symbol in self.id_to_symbol.values(): + if symbol.kind != "nonterm": continue + head_symbols = [self.name_to_symbol[production[0]] + for production in symbol.contents] + firsts = [symbol_to_first.get(head, None) + for head in head_symbols] + new_first = (firsts[0] if all(f == firsts[0] for f in firsts) + else True) + if symbol_to_first.get(symbol, None) != new_first: + symbol_to_first[symbol] = new_first + fixedpoint = False + + putl("symbol_id_t PRODUCTION_ID_TO_FIRST[N_PRODUCTIONS] = { 0") + for i, (production, _, _) in enumerate(self.productions): + if i == 0: continue + if not production or symbol_to_first.get(production[0], True) is True: + put(", 0") + else: + put(f", {symbol_to_first[production[0]].id}") + put(" };") + + ##### DONE: output the lexer putl("void lex_symbol(symbol_id_t);") putl("%}") putl("%%") @@ -292,6 +329,7 @@ class Symbol: self.name = parts[0] self.kind = parts[1] self.is_start = ".start" in parts[2:] + self.poisoned = ".poison" in parts[2:] self.contents = [] self.production_names = [] self.id = None @@ -321,11 +359,87 @@ class Node: self.production = production self.contents = [] + def line_numbers(self): + return self.contents[0].line_numbers() + + def max_line_numbers(self): + return self.contents[-1].max_line_numbers() + + def file_name(self): + return self.contents[-1].file_name() + def pprint(self): def pprint(other): if isinstance(other, Node): return other.pprint() - return other[1] + return other.pprint() if len(self.contents) == 1: return pprint(self.contents[0]) return '(' + ' '.join(map(pprint, self.contents)) + ')' + + def print_tree(self, depth=0): + print((' ' * depth) + self.symbol.name) + for arg in self.contents: + arg.print_tree(depth + 2) + + def isa(self, *patterns): + for pattern in patterns: + if "->" in pattern: + symbol, production = pattern.split("->") + symbol = symbol.strip() + if symbol != self.symbol.name: continue + production = production.split() + if production[-1] != "..." and len(production) != len(self.pprint_production().split()[2:]): + continue + for desired, real in zip(production, self.pprint_production().split()[2:]): + if desired == "...": return True + if desired != real: break + else: return True + else: + symbol = pattern.strip() + if symbol == self.symbol.name: + return True + return False + + def hasa(self, symbol): + return any(sub.name == symbol for sub in self.production) + + def pprint_production(self): + parts = [] + for s in self.production: + if "::" in s.name: parts.append(s.name[s.name.index("::")+2:]) + else: parts.append(s.name) + return f"{self.symbol.name} -> {' '.join(parts)}" + + def find(self, kind, which=0, total=1): + found = [] + for s in self.subtrees(): + if s.symbol.name == kind: + found.append(s) + if len(found) != total: raise ValueError + return found[which] + + def subtrees(self): return self.contents + def __getitem__(self, i): return self.contents[i] + +class Token: + def __init__(self, symbol, string, line_number, file_name): + self.symbol = symbol + self.string = string + self.line_number = line_number + self.file_name_ = file_name + + def pprint(self): + return self.string + + def line_numbers(self): + return {self.line_number} + + def file_name(self): + return self.file_name_ + + def max_line_numbers(self): + return self.line_numbers() + + def print_tree(self, depth=0): + print((' ' * depth) + self.symbol.name , self.string , self.line_number) -- cgit v1.2.3