summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <matthew@masot.net>2024-02-19 16:41:13 -0800
committerMatthew Sotoudeh <matthew@masot.net>2024-02-19 16:41:13 -0800
commit26a42b4a7ba077659f791208a2a7989bfdfb3663 (patch)
tree02069692c4d629d3108bbed43c4eb6eddfd2fbc7
parente133f250761c67b4465181f41909e78c272901d3 (diff)
playing with the C grammar
-rw-r--r--Makefile2
-rw-r--r--earlpy/earlpy.py52
-rw-r--r--examples/file.c3
-rw-r--r--examples/test.py8
-rw-r--r--examples/type_ambiguous.c3
-rw-r--r--grammars/c/disambiguate.c33
-rw-r--r--grammars/c/grammar.txt76
7 files changed, 148 insertions, 29 deletions
diff --git a/Makefile b/Makefile
deleted file mode 100644
index d6abf1c..0000000
--- a/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-CFLAGS += -fsanitize=address -g
-# CFLAGS += -O3 -g
diff --git a/earlpy/earlpy.py b/earlpy/earlpy.py
index 7fbf0f0..3b0deab 100644
--- a/earlpy/earlpy.py
+++ b/earlpy/earlpy.py
@@ -13,7 +13,8 @@ class Parser:
parser_dir = parser_dir
files = sorted([f"{parser_dir}/grammar.txt",
*glob(f"{parser_dir}/*.c"),
- f"{DIR}/parser.c"])
+ f"{DIR}/parser.c",
+ __file__])
if f"{parser_dir}/parser.c" in files:
files.remove(f"{parser_dir}/parser.c")
hashes = ' '.join(
@@ -110,17 +111,10 @@ class Parser:
ordered_symbols = []
last_symbol = None
for line in grammar:
- if line[0] in ' \t':
- if last_symbol.kind == "list":
- last_symbol.contents.extend(line.split())
- elif last_symbol.kind == "regex":
- assert not last_symbol.contents
- last_symbol.contents = line.strip()
- elif last_symbol.kind == "nonterm":
- last_symbol.contents.append(line.split())
- else: raise NotImplementedError
- elif line.strip().startswith("#"):
+ if line.strip().startswith("#"):
continue
+ elif line[0] in ' \t':
+ last_symbol.process_subline(line.strip())
elif line.strip():
last_symbol = Symbol(line)
self.name_to_symbol[last_symbol.name] = last_symbol
@@ -179,6 +173,7 @@ class Parser:
symbol.kind = "nonterm"
symbol.contents = new_rule
+ symbol.production_names = [None for _ in new_rule]
symbol.is_pseudo_node = True
new_ordered_symbols.append(symbol)
ordered_symbols = new_ordered_symbols
@@ -232,27 +227,31 @@ class Parser:
putl("prod_id_t SYMBOL_ID_TO_PRODUCTION_IDS[N_SYMBOLS][MAX_N_PRODUCTIONS] = { {0}")
# [(production, Symbol), ...]
- self.productions = [([], None)]
+ self.productions = [([], None, None)]
for symbol in ordered_symbols:
if symbol.kind == "nonterm":
start_idx = len(self.productions)
assert isinstance(symbol.contents[0], list)
- for rule in symbol.contents:
+ for i, rule in enumerate(symbol.contents):
rule = [self.name_to_symbol[x] for x in rule]
- self.productions.append((rule, symbol))
+ self.productions.append((rule, symbol, symbol.production_names[i]))
prods = ', '.join(map(str, range(start_idx, len(self.productions))))
if prods:
put(", {" + prods + ", 0}")
else:
put(", {0}")
else:
- self.productions.append(([], symbol))
+ self.productions.append(([], symbol, None))
put(", {0}")
put(" };")
putl(f"#define N_PRODUCTIONS {len(self.productions)}")
+ for i, (_, _, name) in enumerate(self.productions):
+ if name:
+ putl(f"#define PRODUCTION_{name} {i}")
+
putl("symbol_id_t PRODUCTION_ID_TO_PRODUCTION[N_PRODUCTIONS][MAX_PRODUCTION_LEN] = { {0}")
- for i, (production, _) in enumerate(self.productions):
+ for i, (production, _, _) in enumerate(self.productions):
if i == 0: continue
production = ', '.join(str(symbol.id) for symbol in production)
if production:
@@ -262,7 +261,7 @@ class Parser:
put(" };")
putl("symbol_id_t PRODUCTION_ID_TO_SYMBOL[N_PRODUCTIONS] = { 0")
- for i, (_, symbol) in enumerate(self.productions):
+ for i, (_, symbol, _) in enumerate(self.productions):
if i != 0: put(f", {symbol.id}")
put(" };")
@@ -294,9 +293,28 @@ class Symbol:
self.kind = parts[1]
self.is_start = ".start" in parts[2:]
self.contents = []
+ self.production_names = []
self.id = None
self.is_pseudo_node = False
+ def process_subline(self, line):
+ if self.kind == "list":
+ self.contents.extend(line.split())
+ elif self.kind == "regex":
+ assert not self.contents
+ self.contents = line.strip()
+ elif self.kind == "nonterm":
+ self.contents.append(line.split())
+ self.production_names.append(None)
+ for i, part in enumerate(self.contents[-1]):
+ if part.startswith("."):
+ args = self.contents[-1][i:]
+ self.contents[-1] = self.contents[-1][:i]
+ for arg, value in zip(args[::2], args[1::2]):
+ if arg == ".name":
+ self.production_names[-1] = value
+ else: raise NotImplementedError
+
class Node:
def __init__(self, symbol, production):
self.symbol = symbol
diff --git a/examples/file.c b/examples/file.c
new file mode 100644
index 0000000..a94ceea
--- /dev/null
+++ b/examples/file.c
@@ -0,0 +1,3 @@
+void hello(int x) {
+ x += 1;
+}
diff --git a/examples/test.py b/examples/test.py
index e5783c5..ac7e223 100644
--- a/examples/test.py
+++ b/examples/test.py
@@ -1,14 +1,16 @@
import earlpy
+p = earlpy.Parser("grammars/c")
+
if False:
p = earlpy.Parser("grammars/expression")
node = p.parse_string("1 + 1 + 2 + 3")
print(node.pprint())
-elif True:
+elif False:
p = earlpy.Parser("grammars/c")
- node = p.parse_file("examples/simple.c")
+ node = p.parse_file("examples/file.c")
print(node.pprint())
-else:
+elif False:
p = earlpy.Parser("grammars/c")
node = p.parse_file("examples/expr.c")
print(node.pprint())
diff --git a/examples/type_ambiguous.c b/examples/type_ambiguous.c
new file mode 100644
index 0000000..36602ae
--- /dev/null
+++ b/examples/type_ambiguous.c
@@ -0,0 +1,3 @@
+void hello(int x) {
+ foo_t * x;
+}
diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c
index a2d69d1..403d65f 100644
--- a/grammars/c/disambiguate.c
+++ b/grammars/c/disambiguate.c
@@ -1,14 +1,37 @@
+struct token *TYPE_NAMES[1024];
+size_t N_TYPE_NAMES;
+
+void alert_parse(struct state *state) {
+ if (PRODUCTION_ID_TO_SYMBOL[state->production_id] == SYMBOL_TYPEDEF) {
+ for (struct token *t = find_token(state, 2); t->symbol != DONE_SYMBOL; t++) {
+ if (t->symbol == SYMBOL_IDENT) {
+ TYPE_NAMES[N_TYPE_NAMES++] = t;
+ break;
+ }
+ }
+ }
+}
+
+int is_typename(struct token *token) {
+ if (!strcmp("int", token->string)) return 1;
+ for (size_t i = 0; i < N_TYPE_NAMES; i++)
+ if (!strcmp(TYPE_NAMES[i]->string, token->string))
+ return 1;
+ return 0;
+}
+
int disambiguator(struct state *old, struct state *new) {
// printf("Old tree:\n");
// print_parse_tree(old, 4);
// printf("New tree:\n");
// print_parse_tree(new, 4);
- if (old->start_idx != new->start_idx) {
- // printf("\t\tIGNORING "); print_parse_tree2(old);
- // printf("\t\tVS: "); print_parse_tree2(new);
- return 2;
- }
+ if (old->production_id == PRODUCTION_DECL_STMT)
+ if (!is_typename(find_token(old->reasons[0], 0)))
+ return 1;
+ if (new->production_id == PRODUCTION_DECL_STMT)
+ if (!is_typename(find_token(new->reasons[0], 0)))
+ return 0;
// Prefer the earlier parsings in the grammar when two entirely different
// productions are taken.
diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt
index 7959318..ffe85c3 100644
--- a/grammars/c/grammar.txt
+++ b/grammars/c/grammar.txt
@@ -1,5 +1,6 @@
KEYWORDS list
switch volatile case while do else const for if
+ struct union typedef void
IDENT regex
[a-zA-Z_][0-9a-zA-Z_]*
@@ -16,6 +17,63 @@ OP list
< <= > >= =
. -> ? :
+############### TYPE PARSING
+# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing,
+# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression)
+PRIMITIVE_TYPE nonterm
+ struct IDENT
+ union IDENT
+ struct IDENT AGGREGATE_DECLARATION
+ union IDENT AGGREGATE_DECLARATION
+ const PRIMITIVE_TYPE
+ volatile PRIMITIVE_TYPE
+ void
+ IDENT
+
+# A TYPE_EXPRESSION is basically an lvalue expression.
+TYPE_EXPRESSION nonterm
+ IDENT
+ TYPE_EXPRESSION [ ]
+ TYPE_EXPRESSION [ INT ]
+ * TYPE_EXPRESSION
+ ( TYPE_EXPRESSION )
+ TYPE_EXPRESSION ( )
+ TYPE_EXPRESSION ( ARGS )
+
+DECLARATION nonterm
+ PRIMITIVE_TYPE TYPE_EXPRESSION
+
+# An ANONYMOUS_TYPE has no name
+ANONYMOUS_TYPE nonterm
+ PRIMITIVE_TYPE
+ ANONYMOUS_TYPE [ ]
+ ANONYMOUS_TYPE [ INT ]
+ * ANONYMOUS_TYPE
+ ( ANONYMOUS_TYPE )
+ ANONYMOUS_TYPE ( )
+ ANONYMOUS_TYPE ( ARGS )
+
+############### TOP LEVEL
+TOP_LEVEL nonterm .start
+ TYPEDEF
+ FUNCTION
+
+ARGS nonterm
+ ANONYMOUS_TYPE
+ ANONYMOUS_TYPE , ARGS
+ DECLARATION
+ DECLARATION , ARGS
+
+FUNCTION nonterm
+ DECLARATION ( ) TRUE_BLOCK
+ DECLARATION ( ARGS ) TRUE_BLOCK
+
+AGGREGATE_DECLARATION nonterm
+ { STMTS }
+
+TYPEDEF nonterm
+ typedef PRIMITIVE_TYPE TYPE_EXPRESSION ;
+
EXPR nonterm
INT
IDENT
@@ -40,19 +98,33 @@ FOR nonterm
SWITCH nonterm
switch ( EXPR ) BLOCK
+DECLARATION_CHAIN nonterm
+ TYPE_EXPRESSION
+ TYPE_EXPRESSION , DECLARATION_CHAIN
+ TYPE_EXPRESSION = EXPR
+ TYPE_EXPRESSION = EXPR , DECLARATION_CHAIN
+
+DECLARATION_STATEMENT nonterm
+ PRIMITIVE_TYPE DECLARATION_CHAIN ;
+
STMT nonterm
IF
WHILE
DO
FOR
SWITCH
+ # NOTE: it auto-prefers declarations right now
+ DECLARATION_STATEMENT .name DECL_STMT
EXPR ;
-STMTS nonterm .start
+STMTS nonterm
STMT
STMT STMTS
-BLOCK nonterm
+TRUE_BLOCK nonterm
{ }
{ STMTS }
+
+BLOCK nonterm
+ TRUE_BLOCK
STMT
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback