From a9292a98cc6c65e2a4ad6da20937ef7568a4143d Mon Sep 17 00:00:00 2001
From: Matthew Sotoudeh <matthew@masot.net>
Date: Mon, 11 Mar 2024 16:33:24 -0700
Subject: earlpy

---
 earlpy/earlpy.py | 146 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 130 insertions(+), 16 deletions(-)

(limited to 'earlpy/earlpy.py')

diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py
index 3b0deab..2944c51 100644
--- a/earlpy/earlpy.py
+++ b/earlpy/earlpy.py
@@ -9,9 +9,8 @@ DIR = pathlib.Path(__file__).parent.resolve()
 
 class Parser:
     def __init__(self, parser_dir):
-        assert parser_dir and parser_dir[0] != '/'
-        parser_dir = parser_dir
-        files = sorted([f"{parser_dir}/grammar.txt",
+        assert parser_dir and parser_dir != '/'
+        files = sorted([f"{parser_dir}/grammar.earlpy",
                         *glob(f"{parser_dir}/*.c"),
                         f"{DIR}/parser.c",
                         __file__])
@@ -27,7 +26,7 @@ class Parser:
             if open(lex_path, "r").readline()[3:][:-3].strip() == hashes:
                 already_built = True
 
-        lines = self.parse_grammar(f"{parser_dir}/grammar.txt")
+        lines = self.parse_grammar(f"{parser_dir}/grammar.earlpy")
         if not already_built:
             if glob(f"{parser_dir}/parser"):
                 subprocess.run(f"rm {parser_dir}/parser", shell=True)
@@ -43,7 +42,7 @@ class Parser:
                                  shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             if res.returncode: print(res.stderr.decode("utf-8"))
             assert res.returncode == 0
-            res = subprocess.run(f"gcc -O3 {parser_dir}/parser.c -o {parser_dir}/parser",
+            res = subprocess.run(f"gcc -g -O3 {parser_dir}/parser.c -ljemalloc -o {parser_dir}/parser",
                                  shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             if res.returncode: print(res.stderr.decode("utf-8"))
             assert res.returncode == 0
@@ -66,12 +65,20 @@ class Parser:
             raise ValueError
 
         contents = open(path, "r").read()
+        offset_to_line = dict()
+        line = 1
+        for i, c in enumerate(open(path, "rb").read()):
+            offset_to_line[i] = line
+            if c == '\n' or chr(c) == '\n': line += 1
 
         n_tokens, = struct.unpack("Q", res.stdout[:8])
         # symbol id, start idx, length
         tokens = list(struct.iter_unpack("QQQ", res.stdout[8:8+(8*3*n_tokens)]))
-        tokens = [(token, contents[token[1]:token[1]+token[2]])
-                  for token in tokens]
+        tokens = [Token(self.id_to_symbol[symbol],
+                        contents[offset:offset+length],
+                        offset_to_line[offset],
+                        path)
+                  for (symbol, offset, length) in tokens]
         # production id
         nodes = [t[0] for t in struct.iter_unpack("Q", res.stdout[8+(8*3*n_tokens):])]
         # print(nodes)
@@ -83,13 +90,7 @@ class Parser:
         stack = [root]
         while stack:
             node = stack[-1]
-            # print(len(stack))
-            # if isinstance(node, tuple):
-            #     print("\t", node)
-            # else:
-            #     print("\t", node.symbol.name, [s.name for s in node.production])
-
-            if (isinstance(node, tuple)
+            if (isinstance(node, Token)
                     or len(node.production) == len(node.contents)):
                 stack.pop()
                 if stack: stack[-1].contents.append(node)
@@ -220,11 +221,16 @@ class Parser:
             put(", \"" + symbol.name + "\"")
         put(" };")
         for symbol in ordered_symbols:
-            if symbol.name.isalnum():
+            if symbol.name.replace("_", "").isalnum():
                 putl(f"#define SYMBOL_{symbol.name} {symbol.id}")
             if symbol.is_start:
                 putl(f"#define START_SYMBOL {symbol.id}")
 
+        putl("char SYMBOL_TO_POISON[] = { 0")
+        for symbol in ordered_symbols:
+            put(", " + ("1" if symbol.poisoned else "0"))
+        put(" };")
+
         putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}")
         # [(production, Symbol), ...]
         self.productions = [([], None, None)]
@@ -265,6 +271,37 @@ class Parser:
             if i != 0: put(f", {symbol.id}")
         put(" };")
 
+        # Production hints: for this production, what does the leading symbol
+        # need to be?
+        # symbol -> symbol | True (multiple)
+        symbol_to_first = {symbol: symbol
+                           for symbol in self.id_to_symbol.values()
+                           if symbol.kind != "nonterm"}
+        fixedpoint = False
+        while not fixedpoint:
+            fixedpoint = True
+            for symbol in self.id_to_symbol.values():
+                if symbol.kind != "nonterm": continue
+                head_symbols = [self.name_to_symbol[production[0]]
+                                for production in symbol.contents]
+                firsts = [symbol_to_first.get(head, None)
+                          for head in head_symbols]
+                new_first = (firsts[0] if all(f == firsts[0] for f in firsts)
+                             else True)
+                if symbol_to_first.get(symbol, None) != new_first:
+                    symbol_to_first[symbol] = new_first
+                    fixedpoint = False
+
+        putl("symbol_id_t PRODUCTION_ID_TO_FIRST[N_PRODUCTIONS] = { 0")
+        for i, (production, _, _) in enumerate(self.productions):
+            if i == 0: continue
+            if not production or symbol_to_first.get(production[0], True) is True:
+                put(", 0")
+            else:
+                put(f", {symbol_to_first[production[0]].id}")
+        put(" };")
+
+        ##### DONE: output the lexer
         putl("void lex_symbol(symbol_id_t);")
         putl("%}")
         putl("%%")
@@ -292,6 +329,7 @@ class Symbol:
         self.name = parts[0]
         self.kind = parts[1]
         self.is_start = ".start" in parts[2:]
+        self.poisoned = ".poison" in parts[2:]
         self.contents = []
         self.production_names = []
         self.id = None
@@ -321,11 +359,87 @@ class Node:
         self.production = production
         self.contents = []
 
+    def line_numbers(self):
+        return self.contents[0].line_numbers()
+
+    def max_line_numbers(self):
+        return self.contents[-1].max_line_numbers()
+
+    def file_name(self):
+        return self.contents[-1].file_name()
+
     def pprint(self):
         def pprint(other):
             if isinstance(other, Node):
                 return other.pprint()
-            return other[1]
+            return other.pprint()
         if len(self.contents) == 1:
             return pprint(self.contents[0])
         return '(' + ' '.join(map(pprint, self.contents)) + ')'
+
+    def print_tree(self, depth=0):
+        print((' ' * depth) + self.symbol.name)
+        for arg in self.contents:
+            arg.print_tree(depth + 2)
+
+    def isa(self, *patterns):
+        for pattern in patterns:
+            if "->" in pattern:
+                symbol, production = pattern.split("->")
+                symbol = symbol.strip()
+                if symbol != self.symbol.name: continue
+                production = production.split()
+                if production[-1] != "..." and len(production) != len(self.pprint_production().split()[2:]):
+                    continue
+                for desired, real in zip(production, self.pprint_production().split()[2:]):
+                    if desired == "...": return True
+                    if desired != real: break
+                else: return True
+            else:
+                symbol = pattern.strip()
+                if symbol == self.symbol.name:
+                    return True
+        return False
+
+    def hasa(self, symbol):
+        return any(sub.name == symbol for sub in self.production)
+
+    def pprint_production(self):
+        parts = []
+        for s in self.production:
+            if "::" in s.name: parts.append(s.name[s.name.index("::")+2:])
+            else: parts.append(s.name)
+        return f"{self.symbol.name} -> {' '.join(parts)}"
+
+    def find(self, kind, which=0, total=1):
+        found = []
+        for s in self.subtrees():
+            if s.symbol.name == kind:
+                found.append(s)
+        if len(found) != total: raise ValueError
+        return found[which]
+
+    def subtrees(self): return self.contents
+    def __getitem__(self, i): return self.contents[i]
+
+class Token:
+    def __init__(self, symbol, string, line_number, file_name):
+        self.symbol = symbol
+        self.string = string
+        self.line_number = line_number
+        self.file_name_ = file_name
+
+    def pprint(self):
+        return self.string
+
+    def line_numbers(self):
+        return {self.line_number}
+
+    def file_name(self):
+        return self.file_name_
+
+    def max_line_numbers(self):
+        return self.line_numbers()
+
+    def print_tree(self, depth=0):
+        print((' ' * depth) + self.symbol.name , self.string , self.line_number)
-- 
cgit v1.2.3