path: root/earlpy/earlpy.py
author    Matthew Sotoudeh <matthew@masot.net>  2024-03-11 16:33:24 -0700
committer Matthew Sotoudeh <matthew@masot.net>  2024-03-11 16:33:24 -0700
commit    a9292a98cc6c65e2a4ad6da20937ef7568a4143d (patch)
tree      f921866c475208359285ff3f606c0fafa1812a32 /earlpy/earlpy.py
parent    35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 (diff)
earlpy
Diffstat (limited to 'earlpy/earlpy.py')
-rw-r--r--  earlpy/earlpy.py  146
1 file changed, 130 insertions(+), 16 deletions(-)
diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py
index 3b0deab..2944c51 100644
--- a/earlpy/earlpy.py
+++ b/earlpy/earlpy.py
@@ -9,9 +9,8 @@ DIR = pathlib.Path(__file__).parent.resolve()
class Parser:
def __init__(self, parser_dir):
- assert parser_dir and parser_dir[0] != '/'
- parser_dir = parser_dir
- files = sorted([f"{parser_dir}/grammar.txt",
+ assert parser_dir and parser_dir != '/'
+ files = sorted([f"{parser_dir}/grammar.earlpy",
*glob(f"{parser_dir}/*.c"),
f"{DIR}/parser.c",
__file__])
@@ -27,7 +26,7 @@ class Parser:
if open(lex_path, "r").readline()[3:][:-3].strip() == hashes:
already_built = True
- lines = self.parse_grammar(f"{parser_dir}/grammar.txt")
+ lines = self.parse_grammar(f"{parser_dir}/grammar.earlpy")
if not already_built:
if glob(f"{parser_dir}/parser"):
subprocess.run(f"rm {parser_dir}/parser", shell=True)
@@ -43,7 +42,7 @@ class Parser:
shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode: print(res.stderr.decode("utf-8"))
assert res.returncode == 0
- res = subprocess.run(f"gcc -O3 {parser_dir}/parser.c -o {parser_dir}/parser",
+ res = subprocess.run(f"gcc -g -O3 {parser_dir}/parser.c -ljemalloc -o {parser_dir}/parser",
shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode: print(res.stderr.decode("utf-8"))
assert res.returncode == 0
@@ -66,12 +65,20 @@ class Parser:
raise ValueError
contents = open(path, "r").read()
+ offset_to_line = dict()
+ line = 1
+ for i, c in enumerate(open(path, "rb").read()):
+ offset_to_line[i] = line
+ if c == '\n' or chr(c) == '\n': line += 1
n_tokens, = struct.unpack("Q", res.stdout[:8])
# symbol id, start idx, length
tokens = list(struct.iter_unpack("QQQ", res.stdout[8:8+(8*3*n_tokens)]))
- tokens = [(token, contents[token[1]:token[1]+token[2]])
- for token in tokens]
+ tokens = [Token(self.id_to_symbol[symbol],
+ contents[offset:offset+length],
+ offset_to_line[offset],
+ path)
+ for (symbol, offset, length) in tokens]
# production id
nodes = [t[0] for t in struct.iter_unpack("Q", res.stdout[8+(8*3*n_tokens):])]
# print(nodes)
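
The hunk above builds a byte-offset-to-line-number table so each lexed token can carry the line it starts on. A minimal standalone sketch of the same technique, assuming a hypothetical input file (not part of earlpy's API):

    # Sketch: map each byte offset of a file to its 1-based line number,
    # mirroring the offset_to_line loop in the hunk above.
    def build_offset_to_line(path):
        offset_to_line = {}
        line = 1
        for i, c in enumerate(open(path, "rb").read()):
            offset_to_line[i] = line
            if c == ord('\n'):  # iterating over bytes yields ints in Python 3
                line += 1
        return offset_to_line

    # Hypothetical usage: offsets = build_offset_to_line("input.c")
    # offsets[0] == 1, and offsets just past the first newline map to line 2.
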
@@ -83,13 +90,7 @@ class Parser:
stack = [root]
while stack:
node = stack[-1]
- # print(len(stack))
- # if isinstance(node, tuple):
- # print("\t", node)
- # else:
- # print("\t", node.symbol.name, [s.name for s in node.production])
-
- if (isinstance(node, tuple)
+ if (isinstance(node, Token)
or len(node.production) == len(node.contents)):
stack.pop()
if stack: stack[-1].contents.append(node)
@@ -220,11 +221,16 @@ class Parser:
put(", \"" + symbol.name + "\"")
put(" };")
for symbol in ordered_symbols:
- if symbol.name.isalnum():
+ if symbol.name.replace("_", "").isalnum():
putl(f"#define SYMBOL_{symbol.name} {symbol.id}")
if symbol.is_start:
putl(f"#define START_SYMBOL {symbol.id}")
+ putl("char SYMBOL_TO_POISON[] = { 0")
+ for symbol in ordered_symbols:
+ put(", " + ("1" if symbol.poisoned else "0"))
+ put(" };")
+
putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}")
# [(production, Symbol), ...]
self.productions = [([], None, None)]
@@ -265,6 +271,37 @@ class Parser:
if i != 0: put(f", {symbol.id}")
put(" };")
+ # Production hints: for this production, what does the leading symbol
+ # need to be?
+ # symbol -> symbol | True (multiple)
+ symbol_to_first = {symbol: symbol
+ for symbol in self.id_to_symbol.values()
+ if symbol.kind != "nonterm"}
+ fixedpoint = False
+ while not fixedpoint:
+ fixedpoint = True
+ for symbol in self.id_to_symbol.values():
+ if symbol.kind != "nonterm": continue
+ head_symbols = [self.name_to_symbol[production[0]]
+ for production in symbol.contents]
+ firsts = [symbol_to_first.get(head, None)
+ for head in head_symbols]
+ new_first = (firsts[0] if all(f == firsts[0] for f in firsts)
+ else True)
+ if symbol_to_first.get(symbol, None) != new_first:
+ symbol_to_first[symbol] = new_first
+ fixedpoint = False
+
+ putl("symbol_id_t PRODUCTION_ID_TO_FIRST[N_PRODUCTIONS] = { 0")
+ for i, (production, _, _) in enumerate(self.productions):
+ if i == 0: continue
+ if not production or symbol_to_first.get(production[0], True) is True:
+ put(", 0")
+ else:
+ put(f", {symbol_to_first[production[0]].id}")
+ put(" };")
+
+ ##### DONE: output the lexer
putl("void lex_symbol(symbol_id_t);")
putl("%}")
putl("%%")
@@ -292,6 +329,7 @@ class Symbol:
self.name = parts[0]
self.kind = parts[1]
self.is_start = ".start" in parts[2:]
+ self.poisoned = ".poison" in parts[2:]
self.contents = []
self.production_names = []
self.id = None
@@ -321,11 +359,87 @@ class Node:
self.production = production
self.contents = []
+ def line_numbers(self):
+ return self.contents[0].line_numbers()
+
+ def max_line_numbers(self):
+ return self.contents[-1].max_line_numbers()
+
+ def file_name(self):
+ return self.contents[-1].file_name()
+
def pprint(self):
def pprint(other):
if isinstance(other, Node):
return other.pprint()
- return other[1]
+ return other.pprint()
if len(self.contents) == 1:
return pprint(self.contents[0])
return '(' + ' '.join(map(pprint, self.contents)) + ')'
+
+ def print_tree(self, depth=0):
+ print((' ' * depth) + self.symbol.name)
+ for arg in self.contents:
+ arg.print_tree(depth + 2)
+
+ def isa(self, *patterns):
+ for pattern in patterns:
+ if "->" in pattern:
+ symbol, production = pattern.split("->")
+ symbol = symbol.strip()
+ if symbol != self.symbol.name: continue
+ production = production.split()
+ if production[-1] != "..." and len(production) != len(self.pprint_production().split()[2:]):
+ continue
+ for desired, real in zip(production, self.pprint_production().split()[2:]):
+ if desired == "...": return True
+ if desired != real: break
+ else: return True
+ else:
+ symbol = pattern.strip()
+ if symbol == self.symbol.name:
+ return True
+ return False
+
+ def hasa(self, symbol):
+ return any(sub.name == symbol for sub in self.production)
+
+ def pprint_production(self):
+ parts = []
+ for s in self.production:
+ if "::" in s.name: parts.append(s.name[s.name.index("::")+2:])
+ else: parts.append(s.name)
+ return f"{self.symbol.name} -> {' '.join(parts)}"
+
+ def find(self, kind, which=0, total=1):
+ found = []
+ for s in self.subtrees():
+ if s.symbol.name == kind:
+ found.append(s)
+ if len(found) != total: raise ValueError
+ return found[which]
+
+ def subtrees(self): return self.contents
+ def __getitem__(self, i): return self.contents[i]
+
+class Token:
+ def __init__(self, symbol, string, line_number, file_name):
+ self.symbol = symbol
+ self.string = string
+ self.line_number = line_number
+ self.file_name_ = file_name
+
+ def pprint(self):
+ return self.string
+
+ def line_numbers(self):
+ return {self.line_number}
+
+ def file_name(self):
+ return self.file_name_
+
+ def max_line_numbers(self):
+ return self.line_numbers()
+
+ def print_tree(self, depth=0):
+ print((' ' * depth) + self.symbol.name , self.string , self.line_number)
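
For context, a hedged sketch of how the Token and Node helpers introduced in this commit might be used from calling code. The parser directory, the input file, the grammar symbols, and the parse() entry point below are assumptions made for illustration; this diff does not show the actual entry point's name.

    # Hypothetical usage of the Node/Token helpers added in this commit.
    # Import path assumed from the repository layout (earlpy/earlpy.py).
    from earlpy import earlpy

    parser = earlpy.Parser("my_parser")   # expects my_parser/grammar.earlpy
    tree = parser.parse("input.c")        # assumed entry point returning the root Node

    tree.print_tree()                     # indented dump of symbols, strings, line numbers
    if tree.isa("stmt -> expr SEMI", "stmt -> ..."):
        expr = tree.find("expr")          # unique direct 'expr' subtree, else ValueError
        print(expr.file_name(), expr.line_numbers(), expr.max_line_numbers())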