playing with the C grammar

author: Matthew Sotoudeh <matthew@masot.net> 2024-02-19 16:41:13 -0800
committer: Matthew Sotoudeh <matthew@masot.net> 2024-02-19 16:41:13 -0800
commit: 26a42b4a7ba077659f791208a2a7989bfdfb3663 (patch)
tree: 02069692c4d629d3108bbed43c4eb6eddfd2fbc7
parent: e133f250761c67b4465181f41909e78c272901d3 (diff)
7 files changed, 148 insertions, 29 deletions
diff --git a/Makefile b/Makefile
deleted file mode 100644
index d6abf1c..0000000
--- a/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-CFLAGS += -fsanitize=address -g
-# CFLAGS += -O3 -g
diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py
index 7fbf0f0..3b0deab 100644
--- a/earlpy/earlpy.py
+++ b/earlpy/earlpy.py
@@ -13,7 +13,8 @@ class Parser:
         parser_dir = parser_dir
         files = sorted([f"{parser_dir}/grammar.txt",
                         *glob(f"{parser_dir}/*.c"),
-                        f"{DIR}/parser.c"])
+                        f"{DIR}/parser.c",
+                        __file__])
         if f"{parser_dir}/parser.c" in files:
             files.remove(f"{parser_dir}/parser.c")
         hashes = ' '.join(
@@ -110,17 +111,10 @@ class Parser:
         ordered_symbols = []
         last_symbol = None
         for line in grammar:
-            if line[0] in ' \t':
-                if last_symbol.kind == "list":
-                    last_symbol.contents.extend(line.split())
-                elif last_symbol.kind == "regex":
-                    assert not last_symbol.contents
-                    last_symbol.contents = line.strip()
-                elif last_symbol.kind == "nonterm":
-                    last_symbol.contents.append(line.split())
-                else: raise NotImplementedError
-            elif line.strip().startswith("#"):
+            if line.strip().startswith("#"):
                 continue
+            elif line[0] in ' \t':
+                last_symbol.process_subline(line.strip())
             elif line.strip():
                 last_symbol = Symbol(line)
                 self.name_to_symbol[last_symbol.name] = last_symbol
@@ -179,6 +173,7 @@ class Parser:
 
             symbol.kind = "nonterm"
             symbol.contents = new_rule
+            symbol.production_names = [None for _ in new_rule]
             symbol.is_pseudo_node = True
             new_ordered_symbols.append(symbol)
         ordered_symbols = new_ordered_symbols
@@ -232,27 +227,31 @@ class Parser:
 
         putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}")
         # [(production, Symbol), ...]
-        self.productions = [([], None)]
+        self.productions = [([], None, None)]
         for symbol in ordered_symbols:
             if symbol.kind == "nonterm":
                 start_idx = len(self.productions)
                 assert isinstance(symbol.contents[0], list)
-                for rule in symbol.contents:
+                for i, rule in enumerate(symbol.contents):
                     rule = [self.name_to_symbol[x] for x in rule]
-                    self.productions.append((rule, symbol))
+                    self.productions.append((rule, symbol, symbol.production_names[i]))
                 prods = ', '.join(map(str, range(start_idx, len(self.productions))))
                 if prods:
                     put(", {" + prods + ", 0}")
                 else:
                     put(", {0}")
             else:
-                self.productions.append(([], symbol))
+                self.productions.append(([], symbol, None))
                 put(", {0}")
         put(" };")
         putl(f"#define N_PRODUCTIONS {len(self.productions)}")
 
+        for i, (_, _, name) in enumerate(self.productions):
+            if name:
+                putl(f"#define PRODUCTION_{name} {i}")
+
         putl("symbol_id_t PRODUCTION_ID_TO_PRODUCTION[N_PRODUCTIONS][MAX_PRODUCTION_LEN] = { {0}")
-        for i, (production, _) in enumerate(self.productions):
+        for i, (production, _, _) in enumerate(self.productions):
             if i == 0: continue
             production = ', '.join(str(symbol.id) for symbol in production)
             if production:
@@ -262,7 +261,7 @@ class Parser:
         put(" };")
 
         putl("symbol_id_t PRODUCTION_ID_TO_SYMBOL[N_PRODUCTIONS] = { 0")
-        for i, (_, symbol) in enumerate(self.productions):
+        for i, (_, symbol, _) in enumerate(self.productions):
             if i != 0: put(f", {symbol.id}")
         put(" };")
 
@@ -294,9 +293,28 @@ class Symbol:
         self.kind = parts[1]
         self.is_start = ".start" in parts[2:]
         self.contents = []
+        self.production_names = []
         self.id = None
         self.is_pseudo_node = False
 
+    def process_subline(self, line):
+        if self.kind == "list":
+            self.contents.extend(line.split())
+        elif self.kind == "regex":
+            assert not self.contents
+            self.contents = line.strip()
+        elif self.kind == "nonterm":
+            self.contents.append(line.split())
+            self.production_names.append(None)
+            for i, part in enumerate(self.contents[-1]):
+                if part.startswith("."):
+                    args = self.contents[-1][i:]
+                    self.contents[-1] = self.contents[-1][:i]
+                    for arg, value in zip(args[::2], args[1::2]):
+                        if arg == ".name":
+                            self.production_names[-1] = value
+        else: raise NotImplementedError
+
 class Node:
     def __init__(self, symbol, production):
         self.symbol = symbol
diff --git a/examples/file.c b/examples/file.c
new file mode 100644
index 0000000..a94ceea
--- /dev/null
+++ b/examples/file.c
@@ -0,0 +1,3 @@
+void hello(int x) {
+    x += 1;
+}
diff --git a/examples/test.py b/examples/test.py
index e5783c5..ac7e223 100644
--- a/examples/test.py
+++ b/examples/test.py
@@ -1,14 +1,16 @@
 import earlpy
 
+p = earlpy.Parser("grammars/c")
+
 if False:
     p = earlpy.Parser("grammars/expression")
     node = p.parse_string("1 + 1 + 2 + 3")
     print(node.pprint())
-elif True:
+elif False:
     p = earlpy.Parser("grammars/c")
-    node = p.parse_file("examples/simple.c")
+    node = p.parse_file("examples/file.c")
     print(node.pprint())
-else:
+elif False:
     p = earlpy.Parser("grammars/c")
     node = p.parse_file("examples/expr.c")
     print(node.pprint())
diff --git a/examples/type_ambiguous.c b/examples/type_ambiguous.c
new file mode 100644
index 0000000..36602ae
--- /dev/null
+++ b/examples/type_ambiguous.c
@@ -0,0 +1,3 @@
+void hello(int x) {
+    foo_t * x;
+}
diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c
index a2d69d1..403d65f 100644
--- a/grammars/c/disambiguate.c
+++ b/grammars/c/disambiguate.c
@@ -1,14 +1,37 @@
+struct token *TYPE_NAMES[1024];
+size_t N_TYPE_NAMES;
+
+void alert_parse(struct state *state) {
+    if (PRODUCTION_ID_TO_SYMBOL[state->production_id] == SYMBOL_TYPEDEF) {
+        for (struct token *t = find_token(state, 2); t->symbol != DONE_SYMBOL; t++) {
+            if (t->symbol == SYMBOL_IDENT) {
+                TYPE_NAMES[N_TYPE_NAMES++] = t;
+                break;
+            }
+        }
+    }
+}
+
+int is_typename(struct token *token) {
+    if (!strcmp("int", token->string)) return 1;
+    for (size_t i = 0; i < N_TYPE_NAMES; i++)
+        if (!strcmp(TYPE_NAMES[i]->string, token->string))
+            return 1;
+    return 0;
+}
+
 int disambiguator(struct state *old, struct state *new) {
     // printf("Old tree:\n");
     // print_parse_tree(old, 4);
     // printf("New tree:\n");
     // print_parse_tree(new, 4);
 
-    if (old->start_idx != new->start_idx) {
-        // printf("\t\tIGNORING "); print_parse_tree2(old);
-        // printf("\t\tVS:      "); print_parse_tree2(new);
-        return 2;
-    }
+    if (old->production_id == PRODUCTION_DECL_STMT)
+        if (!is_typename(find_token(old->reasons[0], 0))) 
+            return 1;
+    if (new->production_id == PRODUCTION_DECL_STMT)
+        if (!is_typename(find_token(new->reasons[0], 0))) 
+            return 0;
 
     // Prefer the earlier parsings in the grammar when two entirely different
     // productions are taken.
diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt
index 7959318..ffe85c3 100644
--- a/grammars/c/grammar.txt
+++ b/grammars/c/grammar.txt
@@ -1,5 +1,6 @@
 KEYWORDS list
     switch volatile case while do else const for if
+    struct union typedef void
 
 IDENT regex
     [a-zA-Z_][0-9a-zA-Z_]*
@@ -16,6 +17,63 @@ OP list
     < <= > >= =
     . -> ? :
 
+############### TYPE PARSING
+# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing,
+# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression)
+PRIMITIVE_TYPE nonterm
+    struct IDENT
+    union IDENT
+    struct IDENT AGGREGATE_DECLARATION
+    union IDENT AGGREGATE_DECLARATION
+    const PRIMITIVE_TYPE
+    volatile PRIMITIVE_TYPE
+    void
+    IDENT
+
+# A TYPE_EXPRESSION is basically an lvalue expression.
+TYPE_EXPRESSION nonterm
+    IDENT
+    TYPE_EXPRESSION [ ]
+    TYPE_EXPRESSION [ INT ]
+    * TYPE_EXPRESSION
+    ( TYPE_EXPRESSION )
+    TYPE_EXPRESSION ( )
+    TYPE_EXPRESSION ( ARGS )
+
+DECLARATION nonterm
+    PRIMITIVE_TYPE TYPE_EXPRESSION
+
+# An ANONYMOUS_TYPE has no name
+ANONYMOUS_TYPE nonterm
+    PRIMITIVE_TYPE
+    ANONYMOUS_TYPE [ ]
+    ANONYMOUS_TYPE [ INT ]
+    * ANONYMOUS_TYPE
+    ( ANONYMOUS_TYPE )
+    ANONYMOUS_TYPE ( )
+    ANONYMOUS_TYPE ( ARGS )
+
+############### TOP LEVEL
+TOP_LEVEL nonterm .start
+    TYPEDEF
+    FUNCTION
+
+ARGS nonterm
+    ANONYMOUS_TYPE
+    ANONYMOUS_TYPE , ARGS
+    DECLARATION
+    DECLARATION , ARGS
+
+FUNCTION nonterm
+    DECLARATION ( ) TRUE_BLOCK
+    DECLARATION ( ARGS ) TRUE_BLOCK
+
+AGGREGATE_DECLARATION nonterm
+    { STMTS }
+
+TYPEDEF nonterm
+    typedef PRIMITIVE_TYPE TYPE_EXPRESSION ;
+
 EXPR nonterm
     INT
     IDENT
@@ -40,19 +98,33 @@ FOR nonterm
 SWITCH nonterm
     switch ( EXPR ) BLOCK
 
+DECLARATION_CHAIN nonterm
+    TYPE_EXPRESSION
+    TYPE_EXPRESSION , DECLARATION_CHAIN
+    TYPE_EXPRESSION = EXPR
+    TYPE_EXPRESSION = EXPR , DECLARATION_CHAIN
+
+DECLARATION_STATEMENT nonterm
+    PRIMITIVE_TYPE DECLARATION_CHAIN ;
+
 STMT nonterm
     IF
     WHILE
     DO
     FOR
     SWITCH
+    # NOTE: it auto-prefers declarations right now
+    DECLARATION_STATEMENT       .name DECL_STMT
     EXPR ;
 
-STMTS nonterm .start
+STMTS nonterm
     STMT
     STMT STMTS
 
-BLOCK nonterm
+TRUE_BLOCK nonterm
     { }
     { STMTS }
+
+BLOCK nonterm
+    TRUE_BLOCK
     STMT
author	Matthew Sotoudeh <matthew@masot.net>	2024-02-19 16:41:13 -0800
committer	Matthew Sotoudeh <matthew@masot.net>	2024-02-19 16:41:13 -0800
commit	26a42b4a7ba077659f791208a2a7989bfdfb3663 (patch)
tree	02069692c4d629d3108bbed43c4eb6eddfd2fbc7
parent	e133f250761c67b4465181f41909e78c272901d3 (diff)