diff options
author | Matthew Sotoudeh <matthew@masot.net> | 2024-02-19 02:25:28 -0800 |
---|---|---|
committer | Matthew Sotoudeh <matthew@masot.net> | 2024-02-19 02:25:28 -0800 |
commit | ffc6388571004b17e3a3dee2511ec99076ee803a (patch) | |
tree | ba14dfc0fc0b1beae949e12b8853fa690d588d2a |
initial dump
-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | DESIGN.txt | 116 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | earlpy/__init__.py | 1 | ||||
-rw-r--r-- | earlpy/earlpy.py | 300 | ||||
-rw-r--r-- | examples/simple.c | 3 | ||||
-rw-r--r-- | examples/test.py | 4 | ||||
-rw-r--r-- | grammars/c/disambiguate.c | 49 | ||||
-rw-r--r-- | grammars/c/grammar.txt | 17 | ||||
-rw-r--r-- | grammars/expression/disambiguate.c | 49 | ||||
-rw-r--r-- | grammars/expression/grammar.txt | 18 |
11 files changed, 564 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2e325a5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.*.sw* +__pycache__ +parser +parser.c +parser.l diff --git a/DESIGN.txt b/DESIGN.txt new file mode 100644 index 0000000..0762ca6 --- /dev/null +++ b/DESIGN.txt @@ -0,0 +1,116 @@ +====== Disambiguation Issues ======== +Consider two possible parses of 0 + 1 * 5 + 4 + ( ( 0 + ( 1 * 5 ) ) + 4 ) + ( 0 + ( 1 * ( 5 + 4 ) ) ) + +Problem: no two subparses apply to the same range except at the very end, but +at that point it's too hard for us to disambiguate. + +I think this is getting too messed up. The semantics we want is just: every +range has exactly one parsing as a certain nonterm. + +The easiest way to handle this is to just keep a table: + (start_idx, symbol) -> [state 1, state 2, ...] +that we've completed at this iteration. When a new completion happens, look up +in that table to see if we've already completed that (start_idx, nonterm) in +this iteration. If so, compare them and overwrite one if desired. + +BUT, if the comparison is not total, might have a scenario where we have two +incomparable parses before one perfect parse... + +====== Handling disambiguation ======== +Basically, the issue occurs when we have the same nonterminal matched multiple +ways on the same range. + +Easy-to-express ways of disambiguating: +*** Precedence: given two different top-level productions used, the highest one + in the grammar is preferred. Problems with precedence: + + 1. Not clear what to do when the same top-level production is used. + + 2. In the case of, e.g., needing a symbol table, precedence is not good + enough and can be actively harmful. + +Associativity is easy: given two things of the form A + B, just pick the one +where the location of the '+' token is leftmost! + +--------------------------------------------------- +Issue with queue ordering: +What if we: + - Add (S -> Y . X, k) + - Queues (X -> ., k) + - Add (X -> ., k) + - Completes (S -> Y . X, k) + ... + - Add (C -> Y . X, k) + - Queues (X -> ., k) + - Ignore (X -> ., k) because already in set! + +But we also can't just not deduplicate; consider the grammar + S -> S S | ... +it would cause nontermination. it even causes nontermination if we only +deduplicate the queue. When is it safe to deduplicate? + +Can we avoid all of this by just disallowing EPSILON in the grammars? Does that +suffice to ensure that completions alway complete states with start_idx <k? I +believe so. And lifting epsilons out can be performed automatically if we +eventually we want to do that. +https://stackoverflow.com/questions/35524183/earley-cannot-handle-epsilon-states-already-contained-in-chart +says we can do it during prediction + +With that change, I think it's OK to do full deduplication: any time there's an +equivalent state either already in the set or the queue for that set, drop it. + +------------------------------------------------- +Earley parser + +Read character by character + +Each state is going to be a single production rule + a location pointer into +that production rule's pattern + a starting/origin position + +(From the Wiki page:) +1. Add new states matching possible nonterminals at this idx into the string +2. Eat this character where applicable, and add the corresponding states into + the next-char state set +3. If a production is matched here, bring forward older productions that are + made possible by them + +Triggers work like: +1. Whenever a state is added to S(k), it triggers the addition of states for + all productions from a certain nonterminal +2. Quickly find all states in S(k) where the location pointer is pointing to + the given terminal +3. Quickly find all states in S(j) that are waiting on a particular nonterminal + +2 and 3 can be solved together: just have a map + (i, x) -> [states in S(i) waiting on nonterminal x] +Each state is waiting on exactly one thing, so can just do a DLL for the list + +(1) is solved by just storing the productions in a table, or precompute an 'add +all' function for each table, etc. + +Algo: + k = 0 + QUEUE[0] = [(START -> ..., 0, 0)] + for k: + while QUEUE[k]: + prod, loc, start_i = QUEUE[k].pop() + if prod[loc] == END: + # COMPLETION + for state in WAITERS[prod.nonterm]: + enqueue(increment_loc(state), k) + elif prod[loc] is NONTERM: + # PREDICTION + add_all_rules_for(prod[loc]) + elif prod[loc] is TERM and prod[loc] == CURRENT: + # SCANNING + enqueue(increment_loc(prod, loc, start_i), k+1) + +To extract the parse tree, we need to add to each state a pointer for each loc +in the production to another state. That pointer is set when we perform the +'completion' action. + +Disambiguation: + Ambiguity pops up when we perform COMPLETION for the same nonterminal + with the same starting idx and current idx multiple times. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d6abf1c --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +CFLAGS += -fsanitize=address -g +# CFLAGS += -O3 -g diff --git a/earlpy/__init__.py b/earlpy/__init__.py new file mode 100644 index 0000000..ca36c3a --- /dev/null +++ b/earlpy/__init__.py @@ -0,0 +1 @@ +from .earlpy import * diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py new file mode 100644 index 0000000..d48c1be --- /dev/null +++ b/earlpy/earlpy.py @@ -0,0 +1,300 @@ +import tempfile +import subprocess +import hashlib +from glob import glob +import pathlib +import struct + +DIR = pathlib.Path(__file__).parent.resolve() + +class Parser: + def __init__(self, parser_dir): + assert parser_dir and parser_dir[0] != '/' + parser_dir = parser_dir + files = sorted([f"{parser_dir}/grammar.txt", + *glob(f"{parser_dir}/*.c"), + f"{DIR}/parser.c"]) + if f"{parser_dir}/parser.c" in files: + files.remove(f"{parser_dir}/parser.c") + hashes = ' '.join( + hashlib.sha256(b''.join(open(f, "rb").readlines())).hexdigest() + for f in files) + + already_built = False + lex_path = f"{parser_dir}/parser.l" + if glob(lex_path) and glob(f"{parser_dir}/parser"): + if open(lex_path, "r").readline()[3:][:-3].strip() == hashes: + already_built = True + + lines = self.parse_grammar(f"{parser_dir}/grammar.txt") + if not already_built: + if glob(f"{parser_dir}/parser"): + subprocess.run(f"rm {parser_dir}/parser", shell=True) + with open(f"{parser_dir}/parser.l", "w") as f: + f.write(f"/* {hashes} */\n") + for line in lines: + f.write(line + "\n") + f.write(open(f"{DIR}/parser.c", "r").read()) + for path in glob(f"{parser_dir}/*.c"): + if path == f"{parser_dir}/parser.c": continue + f.write(open(path, "r").read()) + res = subprocess.run(f"flex -o {parser_dir}/parser.c {parser_dir}/parser.l", + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode: print(res.stderr.decode("utf-8")) + assert res.returncode == 0 + res = subprocess.run(f"gcc -O3 {parser_dir}/parser.c -o {parser_dir}/parser", + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode: print(res.stderr.decode("utf-8")) + assert res.returncode == 0 + + self.parser = f"{parser_dir}/parser" + + def parse_string(self, string): + with tempfile.NamedTemporaryFile() as f: + f.write(string.encode("utf-8")) + f.flush() + result = self.parse_file(f.name) + f.close() + return result + + def parse_file(self, path): + res = subprocess.run([self.parser, path], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if res.returncode: + print("FAIL:", res.stderr.decode("utf-8")) + raise ValueError + + contents = open(path, "r").read() + + n_tokens, = struct.unpack("Q", res.stdout[:8]) + # symbol id, start idx, length + tokens = list(struct.iter_unpack("QQQ", res.stdout[8:8+(8*3*n_tokens)])) + tokens = [(token, contents[token[1]:token[1]+token[2]]) + for token in tokens] + # production id + nodes = [t[0] for t in struct.iter_unpack("Q", res.stdout[8+(8*3*n_tokens):])] + + # REPARSE the nodes + root = Node(self.productions[nodes[0]][1], + self.productions[nodes[0]][0]) + nodes = nodes[1:] + stack = [root] + while stack: + node = stack[-1] + if (isinstance(node, tuple) + or len(node.production) == len(node.contents)): + # COMPLETE! + stack.pop() + if stack: stack[-1].contents.append(node) + else: + symbol = node.production[len(node.contents)] + if symbol.kind == "nonterm": + prod_id = nodes.pop(0) + stack.append(Node(self.productions[prod_id][1], + self.productions[prod_id][0])) + else: + stack.append(tokens.pop(0)) + return root + + def parse_grammar(self, grammar_file): + grammar = open(grammar_file, "r") + + # (0) PARSE symbols from the grammar file + self.name_to_symbol = dict() + ordered_symbols = [] + last_symbol = None + for line in grammar: + if line[0] in ' \t': + if last_symbol.kind == "list": + last_symbol.contents.extend(line.split()) + elif last_symbol.kind == "regex": + assert not last_symbol.contents + last_symbol.contents = line.strip() + elif last_symbol.kind == "nonterm": + last_symbol.contents.append(line.split()) + else: raise NotImplementedError + elif line.strip().startswith("#"): + continue + elif line.strip(): + last_symbol = Symbol(line) + self.name_to_symbol[last_symbol.name] = last_symbol + ordered_symbols.append(last_symbol) + + # We allow mixing of concrete tokens and symbol names in the nonterminal + # patterns; this undoes that. + # (1a) map each concrete token to the list it belongs to + concrete_to_symbol = dict() + for symbol in ordered_symbols: + if symbol.kind != "list": continue + for token in symbol.contents: + assert token not in concrete_to_symbol + concrete_to_symbol[token] = symbol + # (1b) rewrite any rule involving concrete 'x' from list 'y' to 'y::x' + used_concretes = set() + for symbol in ordered_symbols: + if symbol.kind != "nonterm": continue + new_contents = [] + for rule in symbol.contents: + new_rule = [] + for token in rule: + if token in self.name_to_symbol: + new_rule.append(token) + else: + assert token in concrete_to_symbol, f"Token '{token}' is not in a list" + new_rule.append(f"{concrete_to_symbol[token].name}::{token}") + used_concretes.add(token) + new_contents.append(new_rule) + symbol.contents = new_contents + # (1c) if 'y::x' appeared, turn 'y' into a nonterminal + new_ordered_symbols = [] + for symbol in ordered_symbols.copy(): + if symbol.kind != "list": + new_ordered_symbols.append(symbol) + continue + split_out = set(symbol.contents) & used_concretes + if not split_out: + new_ordered_symbols.append(symbol) + continue + new_rule = [] + for token in split_out: + name = f"{symbol.name}::{token}" + self.name_to_symbol[name] = Symbol(name + " list") + self.name_to_symbol[name].contents = [token] + new_ordered_symbols.append(self.name_to_symbol[name]) + new_rule.append([name]) + + left_in = set(symbol.contents) - used_concretes + if left_in: + name = f"{symbol.name}::__rest__" + self.name_to_symbol[name] = Symbol(name + " list") + self.name_to_symbol[name].contents = sorted(left_in) + new_ordered_symbols.append(self.name_to_symbol[name]) + new_rule.append([name]) + + symbol.kind = "nonterm" + symbol.contents = new_rule + symbol.is_pseudo_node = True + new_ordered_symbols.append(symbol) + ordered_symbols = new_ordered_symbols + # Done! + + ##### DESCRIBE the lexer and the symbols + lines = [] + def put(x): lines[-1] += x + def putl(*x): lines.extend(x) + putl("%option noyywrap", + "%option reentrant", + "%{", + "typedef size_t prod_id_t;", + "typedef size_t symbol_id_t;", + "int OFFSET;", + # https://stackoverflow.com/questions/47094667/getting-the-current-index-in-the-input-string-flex-lexer + "#define YY_USER_ACTION OFFSET += yyleng;", + ) + self.max_n_productions = max(len(symbol.contents) + 1 + for symbol in ordered_symbols + if symbol.kind == "nonterm") + putl(f"#define MAX_N_PRODUCTIONS {self.max_n_productions}") + self.max_production_len = max(max(map(len, symbol.contents)) + 1 + for symbol in ordered_symbols + if symbol.kind == "nonterm") + putl(f"#define MAX_PRODUCTION_LEN {self.max_production_len}") + n_nonterms = len([symbol for symbol in ordered_symbols + if symbol.kind == "nonterm"]) + putl(f"#define N_NONTERMS {n_nonterms}") + putl(f"#define N_SYMBOLS {len(ordered_symbols) + 1}") + putl(f"#define DONE_SYMBOL 0") + # 0, nonterm1, nonterm2, ..., nontermN, term, ... + putl(f"#define IS_NONTERM(x) ((0 < (x)) && ((x) <= N_NONTERMS))") + + # put all the nonterminals at the beginning + ordered_symbols = sorted(ordered_symbols, + key=lambda s: (s.kind == "nonterm"), + reverse=True) + self.id_to_symbol = dict() + putl("char *SYMBOL_ID_TO_NAME[] = { \"DONE\"") + for i, symbol in enumerate(ordered_symbols): + symbol.id = i + 1 + self.id_to_symbol[symbol.id] = symbol + put(", \"" + symbol.name + "\"") + put(" };") + for symbol in ordered_symbols: + if symbol.name.isalnum(): + putl(f"#define SYMBOL_{symbol.name} {symbol.id}") + if symbol.is_start: + putl(f"#define START_SYMBOL {symbol.id}") + + putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}") + # [(production, Symbol), ...] + self.productions = [([], None)] + for symbol in ordered_symbols: + if symbol.kind == "nonterm": + start_idx = len(self.productions) + assert isinstance(symbol.contents[0], list) + for rule in symbol.contents: + rule = [self.name_to_symbol[x] for x in rule] + self.productions.append((rule, symbol)) + prods = ', '.join(map(str, range(start_idx, len(self.productions)))) + put(", {" + prods + ", 0}") + else: + put(", {0}") + put(" };") + putl(f"#define N_PRODUCTIONS {len(self.productions)}") + + putl("symbol_id_t PRODUCTION_ID_TO_PRODUCTION[N_PRODUCTIONS][MAX_PRODUCTION_LEN] = { {0}") + for i, (production, _) in enumerate(self.productions): + if i == 0: continue + production = ', '.join([symbol.name for symbol in production]) + put(", {" + production + ", 0}") + put(" };") + + putl("symbol_id_t PRODUCTION_ID_TO_SYMBOL[N_PRODUCTIONS] = { 0") + for i, (_, symbol) in enumerate(self.productions): + if i != 0: put(f", {symbol.id}") + put(" };") + + putl("void lex_symbol(symbol_id_t);") + putl("%}") + putl("%%") + + # Spit out the lexer! + def escape_literal(lit): + return '"' + lit.replace('\\', '\\\\') + '"' + for symbol in ordered_symbols: + if symbol.kind == "nonterm": continue + if symbol.kind == "list": + for token in symbol.contents: + putl(escape_literal(token) + f" {{ lex_symbol({symbol.id}); }}") + elif symbol.kind == "regex": + putl(symbol.contents + f" {{ lex_symbol({symbol.id}); }}") + else: raise NotImplementedError + putl(". { }") + putl("\\n { }") + putl("%%") + + return lines + +class Symbol: + def __init__(self, declaration): + parts = declaration.split() + self.name = parts[0] + self.kind = parts[1] + self.is_start = ".start" in parts[2:] + self.contents = [] + self.id = None + self.is_pseudo_node = False + +class Node: + def __init__(self, symbol, production): + self.symbol = symbol + self.production = production + self.contents = [] + + def pprint(self): + def pprint(other): + if isinstance(other, Node): + return other.pprint() + return other[1] + if len(self.contents) == 1: + return pprint(self.contents[0]) + return '(' + ' '.join(map(pprint, self.contents)) + ')' diff --git a/examples/simple.c b/examples/simple.c new file mode 100644 index 0000000..a8110a7 --- /dev/null +++ b/examples/simple.c @@ -0,0 +1,3 @@ +if (x + 5 == 10) { + x += 2; +} diff --git a/examples/test.py b/examples/test.py new file mode 100644 index 0000000..bf8c6d2 --- /dev/null +++ b/examples/test.py @@ -0,0 +1,4 @@ +import earlpy +p = earlpy.Parser("grammars/expression") +node = p.parse_string("1 + 1 + 2 + 3") +print(node.pprint()) diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c new file mode 100644 index 0000000..9a8bf08 --- /dev/null +++ b/grammars/c/disambiguate.c @@ -0,0 +1,49 @@ +int disambiguator(struct state *old, struct state *new) { + // printf("Old tree: "); + // print_parse_tree2(old); + // printf("New tree: "); + // print_parse_tree2(new); + + if (old->start_idx != new->start_idx) { + // printf("\t\tIGNORING "); print_parse_tree2(old); + // printf("\t\tVS: "); print_parse_tree2(new); + return 2; + } + + // Prefer the earlier parsings in the grammar when two entirely different + // productions are taken. + if (old->production_id != new->production_id) + return old->production_id < new->production_id + ? 0 : 1; + + // If they're the same production ... + prod_id_t prod = old->production_id; + if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) { + if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_OP) { + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + char *old_s = old_tok->string, *new_s = new_tok->string; + const char *precedence[] = {".", "->", "*", "/", "%", "+", "-", + "<<", ">>", "<", "<=", ">", ">=", "==", "!=", "&", "|", "&&", + "||", "=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", + "^=", "|=", ",", 0}; + if (strcmp(old_s, new_s)) { + for (const char **p = precedence; *p; p++) { + if (!strcmp(old_s, *p)) { + return 1; + } else if (!strcmp(new_s, *p)) { + return 0; + } + } + // BAD! + return 2; + } else { + // Associate RIGHT + if (old_tok < new_tok) return 1; + else if (old_tok > new_tok) return 0; + } + } + } + printf("TOTALLY UNKNOWN!\n"); + return 2; +} diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt new file mode 100644 index 0000000..486f319 --- /dev/null +++ b/grammars/c/grammar.txt @@ -0,0 +1,17 @@ +KEYWORDS list + switch volatile case while do else const for if + +IDENT regex + [a-zA-Z_][0-9a-zA-Z_]* + +INT regex + [0-9]+ + +OPS list + ( ) { } [ ] + ; , + - + ! % * & / << >> ^ | + -= += != %= *= &= /= <<= == >>= ^= |= + && || ++ -- + < <= > >= = + . -> ? : diff --git a/grammars/expression/disambiguate.c b/grammars/expression/disambiguate.c new file mode 100644 index 0000000..9a8bf08 --- /dev/null +++ b/grammars/expression/disambiguate.c @@ -0,0 +1,49 @@ +int disambiguator(struct state *old, struct state *new) { + // printf("Old tree: "); + // print_parse_tree2(old); + // printf("New tree: "); + // print_parse_tree2(new); + + if (old->start_idx != new->start_idx) { + // printf("\t\tIGNORING "); print_parse_tree2(old); + // printf("\t\tVS: "); print_parse_tree2(new); + return 2; + } + + // Prefer the earlier parsings in the grammar when two entirely different + // productions are taken. + if (old->production_id != new->production_id) + return old->production_id < new->production_id + ? 0 : 1; + + // If they're the same production ... + prod_id_t prod = old->production_id; + if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) { + if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_OP) { + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + char *old_s = old_tok->string, *new_s = new_tok->string; + const char *precedence[] = {".", "->", "*", "/", "%", "+", "-", + "<<", ">>", "<", "<=", ">", ">=", "==", "!=", "&", "|", "&&", + "||", "=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", + "^=", "|=", ",", 0}; + if (strcmp(old_s, new_s)) { + for (const char **p = precedence; *p; p++) { + if (!strcmp(old_s, *p)) { + return 1; + } else if (!strcmp(new_s, *p)) { + return 0; + } + } + // BAD! + return 2; + } else { + // Associate RIGHT + if (old_tok < new_tok) return 1; + else if (old_tok > new_tok) return 0; + } + } + } + printf("TOTALLY UNKNOWN!\n"); + return 2; +} diff --git a/grammars/expression/grammar.txt b/grammars/expression/grammar.txt new file mode 100644 index 0000000..a1590fa --- /dev/null +++ b/grammars/expression/grammar.txt @@ -0,0 +1,18 @@ +INT regex + [0-9]+ + +OP list + || && -- ++ = == ! + / * - + << >> % ^ + /= *= -= += <<= >>= &= |= %= ^= + -> . + < > <= >= != + ? : + ( ) { } [ ] + , ; + +EXPR nonterm .start + EXPR OP EXPR + INT + ( EXPR ) + EXPR ? EXPR : EXPR |