summaryrefslogtreecommitdiff
path: root/grammars
diff options
context:
space:
mode:
authorMatthew Sotoudeh <matthew@masot.net>2024-03-11 16:33:24 -0700
committerMatthew Sotoudeh <matthew@masot.net>2024-03-11 16:33:24 -0700
commita9292a98cc6c65e2a4ad6da20937ef7568a4143d (patch)
treef921866c475208359285ff3f606c0fafa1812a32 /grammars
parent35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 (diff)
earlpy
Diffstat (limited to 'grammars')
-rw-r--r--grammars/c/disambiguate.c84
-rw-r--r--grammars/c/grammar.earlpy265
-rw-r--r--grammars/c/grammar.txt130
-rw-r--r--grammars/c/preprocess.c45
4 files changed, 356 insertions, 168 deletions
diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c
index 403d65f..b6a99e8 100644
--- a/grammars/c/disambiguate.c
+++ b/grammars/c/disambiguate.c
@@ -1,47 +1,41 @@
-struct token *TYPE_NAMES[1024];
-size_t N_TYPE_NAMES;
-
-void alert_parse(struct state *state) {
- if (PRODUCTION_ID_TO_SYMBOL[state->production_id] == SYMBOL_TYPEDEF) {
- for (struct token *t = find_token(state, 2); t->symbol != DONE_SYMBOL; t++) {
- if (t->symbol == SYMBOL_IDENT) {
- TYPE_NAMES[N_TYPE_NAMES++] = t;
- break;
- }
- }
- }
-}
-
-int is_typename(struct token *token) {
- if (!strcmp("int", token->string)) return 1;
- for (size_t i = 0; i < N_TYPE_NAMES; i++)
- if (!strcmp(TYPE_NAMES[i]->string, token->string))
- return 1;
- return 0;
-}
+void alert_parse(struct state *state) { }
int disambiguator(struct state *old, struct state *new) {
- // printf("Old tree:\n");
- // print_parse_tree(old, 4);
- // printf("New tree:\n");
- // print_parse_tree(new, 4);
+ // fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
+ // print_parse_tree(old, 0, stderr);
+ // print_parse_tree(new, 0, stderr);
+ // fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
- if (old->production_id == PRODUCTION_DECL_STMT)
- if (!is_typename(find_token(old->reasons[0], 0)))
- return 1;
- if (new->production_id == PRODUCTION_DECL_STMT)
- if (!is_typename(find_token(new->reasons[0], 0)))
- return 0;
+ if (old->n_poisoned < new->n_poisoned) return 0;
+ if (new->n_poisoned < old->n_poisoned) return 1;
// Prefer the earlier parsings in the grammar when two entirely different
// productions are taken.
if (old->production_id != new->production_id)
- return old->production_id < new->production_id
- ? 0 : 1;
+ return (old->production_id < new->production_id) ? 0 : 1;
// If they're the same production ...
prod_id_t prod = old->production_id;
- if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) {
+
+ if (PRODUCTION_ID_TO_SYMBOL[prod] == START_SYMBOL
+ && PRODUCTION_ID_TO_PRODUCTION[prod][1] != DONE_SYMBOL) {
+ struct token *old_tok = find_token(old, 1),
+ *new_tok = find_token(new, 1);
+ if (old_tok < new_tok) return 0;
+ else if (old_tok > new_tok) return 1;
+ }
+
+ if (PRODUCTION_ID_TO_PRODUCTION[prod][0] == SYMBOL_ERROR && PRODUCTION_ID_TO_PRODUCTION[prod][1] != DONE_SYMBOL) {
+ struct token *old_tok = find_token(old, 1),
+ *new_tok = find_token(new, 1);
+ if (old_tok < new_tok) return 0;
+ else if (old_tok > new_tok) return 1;
+ } else if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_ERROR) {
+ struct token *old_tok = find_token(old, 1),
+ *new_tok = find_token(new, 1);
+ if (old_tok < new_tok) return 1;
+ else if (old_tok > new_tok) return 0;
+ } else if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) {
if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_OP) {
struct token *old_tok = find_token(old, 1),
*new_tok = find_token(new, 1);
@@ -49,7 +43,7 @@ int disambiguator(struct state *old, struct state *new) {
const char *precedence[] = {".", "->", "*", "/", "%", "+", "-",
"<<", ">>", "<", "<=", ">", ">=", "==", "!=", "&", "|", "&&",
"||", "=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=",
- "^=", "|=", ",", 0};
+ "^=", "|=", ",", ":", 0};
if (strcmp(old_s, new_s)) {
for (const char **p = precedence; *p; p++) {
if (!strcmp(old_s, *p)) {
@@ -58,16 +52,30 @@ int disambiguator(struct state *old, struct state *new) {
return 0;
}
}
- // BAD!
- return 2;
+ fprintf(stderr, "ERROR: didn't find operator '%s'\n", old_s);
+ exit(1);
} else {
- // Associate RIGHT
if (old_tok < new_tok) return 1;
else if (old_tok > new_tok) return 0;
}
}
}
+ // Generally speaking, we want left associativity to avoid long chains of
+ // completions.
+ struct token *old_tok = find_token(old, 1),
+ *new_tok = find_token(new, 1);
+ if (old_tok < new_tok) return 1;
+ else if (old_tok > new_tok) return 0;
+
fprintf(stderr, "TOTALLY UNKNOWN!\n");
+ fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n");
+ pprint_state(old);
+ // print_parse_tree(old, 0, stderr);
+ fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n");
+ pprint_state(new);
+ // print_parse_tree(new, 0, stderr);
+ fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n");
+ exit(1);
return 2;
}
diff --git a/grammars/c/grammar.earlpy b/grammars/c/grammar.earlpy
new file mode 100644
index 0000000..99cafe9
--- /dev/null
+++ b/grammars/c/grammar.earlpy
@@ -0,0 +1,265 @@
+#### OPTIMIZATIONS:
+# where possible, we want parse trees that look like:
+# (((A + B) + C) + D), i.e., left-associativity because it avoids long chains
+# of completions. Another explanation for it is that then ambiguity is resolved
+# as early in the left-to-right parse as possible.
+
+KEYWORDS list
+ switch volatile case while do else const for if
+ struct union typedef void return break continue
+ sizeof
+
+IDENT regex
+ [a-zA-Z_][0-9a-zA-Z_]*
+
+INT regex
+ ((0x[0-9a-fA-F]*)|([0-9]*))([uUlL])*
+
+# https://stackoverflow.com/questions/2039795/regular-expression-for-a-string-literal-in-flex-lex
+STRING regex
+ ["]([^\\"]|\\.)*["]
+
+CHAR regex
+ [']([\\][']|[^'][^'])*[^']?[']
+
+OP list
+ ; ,
+ - + ! % * & / << >> ^ |
+ -= += != %= *= &= /= <<= == >>= ^= |=
+ && || ++ --
+ < <= > >= =
+ . ->
+
+TERNARY list
+ : ?
+
+PARENS list
+ ( ) { } [ ]
+
+############### ERROR RECOVERY
+# These rules match either a single token, or a pair of balanced parentheses
+
+NONPAREN nonterm
+ KEYWORDS
+ IDENT
+ INT
+ STRING
+ CHAR
+ TERNARY
+ OP
+
+ERROR_INNER nonterm .poison
+ ERROR
+ ERROR_INNER ERROR
+
+ERROR nonterm .poison
+ ( ERROR_INNER )
+ { ERROR_INNER }
+ [ ERROR_INNER ]
+ ( )
+ { }
+ [ ]
+ NONPAREN
+
+############### TYPE PARSING
+# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing,
+# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression)
+PRIMITIVE_TYPE nonterm
+ struct IDENT
+ union IDENT
+ struct IDENT AGGREGATE_DECLARATION
+ union IDENT AGGREGATE_DECLARATION
+ const PRIMITIVE_TYPE
+ volatile PRIMITIVE_TYPE
+ void
+ IDENT
+
+# A TYPE_EXPRESSION is basically an lvalue expression.
+TYPE_EXPRESSION nonterm
+ IDENT
+ TYPE_EXPRESSION [ ]
+ TYPE_EXPRESSION [ EXPR ]
+ * TYPE_EXPRESSION
+ const TYPE_EXPRESSION
+ ( TYPE_EXPRESSION )
+ TYPE_EXPRESSION ( )
+ TYPE_EXPRESSION ( ARGS )
+
+DECLARATION nonterm
+ PRIMITIVE_TYPE TYPE_EXPRESSION
+
+# An ANONYMOUS_TYPE has no name
+ANONYMOUS_TYPE nonterm
+ PRIMITIVE_TYPE
+ ANONYMOUS_TYPE [ ]
+ ANONYMOUS_TYPE [ EXPR ]
+ ANONYMOUS_TYPE *
+ ANONYMOUS_TYPE const *
+ const ANONYMOUS_TYPE
+ ( ANONYMOUS_TYPE )
+ ANONYMOUS_TYPE ( )
+ ANONYMOUS_TYPE ( ARGS )
+
+############### TOP LEVEL
+TOP_LEVEL nonterm .start
+ TOP_LEVEL TYPEDEF
+ TOP_LEVEL STRUCTDECL
+ TOP_LEVEL FUNCTION
+ TOP_LEVEL DECLARATION_STATEMENT
+ TYPEDEF
+ STRUCTDECL
+ FUNCTION
+ DECLARATION_STATEMENT
+ TOP_LEVEL ERROR
+ ERROR
+
+ARGS nonterm
+ ANONYMOUS_TYPE
+ ARGS , ANONYMOUS_TYPE
+ DECLARATION
+ ARGS , DECLARATION
+
+CALL_ARGS nonterm
+ CALL_ARGS , EXPR
+ EXPR
+
+OLD_ARGS nonterm
+ OLD_ARGS , IDENT
+ IDENT
+
+OLD_ARG_DECLS nonterm
+ OLD_ARG_DECLS DECLARATION_STATEMENT
+ DECLARATION_STATEMENT
+
+FUNCTION nonterm
+ DECLARATION ( ) TRUE_BLOCK
+ DECLARATION ( ARGS ) TRUE_BLOCK
+ DECLARATION ( OLD_ARGS ) OLD_ARG_DECLS TRUE_BLOCK
+ IDENT ( OLD_ARGS ) OLD_ARG_DECLS TRUE_BLOCK
+
+AGGREGATE_DECLARATION nonterm
+ { STMTS }
+ { }
+
+TYPEDEF nonterm
+ typedef PRIMITIVE_TYPE TYPE_EXPRESSION ;
+
+STRUCTDECL nonterm
+ struct IDENT AGGREGATE_DECLARATION ;
+
+UNIONDECL nonterm
+ union IDENT AGGREGATE_DECLARATION ;
+
+EXPR nonterm
+ INT
+ STRING
+ CHAR
+ IDENT
+ EXPR --
+ EXPR ++
+ -- EXPR
+ ++ EXPR
+ - EXPR
+ + EXPR
+ & EXPR
+ * EXPR
+ ( ANONYMOUS_TYPE ) EXPR
+ EXPR ( )
+ EXPR ( CALL_ARGS )
+ EXPR OP EXPR
+ EXPR ? EXPR : EXPR
+ EXPR ? : EXPR
+ EXPR [ EXPR ]
+ ! EXPR
+ ( EXPR )
+ sizeof EXPR
+ sizeof ANONYMOUS_TYPE
+ INITIALIZER_LIST
+ EXPR EXPR
+
+INITIALIZER_LIST nonterm
+ { INNER_INITIALIZER_LIST }
+ { }
+
+INNER_INITIALIZER_LIST nonterm
+ EXPR
+ INNER_INITIALIZER_LIST , EXPR
+ INNER_INITIALIZER_LIST ,
+
+IF nonterm
+ if ( EXPR ) BLOCK
+ if ( EXPR ) BLOCK else BLOCK
+
+WHILE nonterm
+ while ( EXPR ) BLOCK
+
+DO nonterm
+ do BLOCK while ( EXPR )
+
+FOR nonterm
+ for ( ; ; ) BLOCK
+ for ( ; ; EXPR ) BLOCK
+ for ( ; EXPR ; ) BLOCK
+ for ( ; EXPR ; EXPR ) BLOCK
+ for ( EXPR ; ; ) BLOCK
+ for ( EXPR ; ; EXPR ) BLOCK
+ for ( EXPR ; EXPR ; ) BLOCK
+ for ( EXPR ; EXPR ; EXPR ) BLOCK
+
+SWITCH nonterm
+ switch ( EXPR ) BLOCK
+
+DECLARATION_CHAIN nonterm
+ DECLARATION_CHAIN , TYPE_EXPRESSION
+ TYPE_EXPRESSION
+ DECLARATION_CHAIN , TYPE_EXPRESSION = EXPR
+ TYPE_EXPRESSION = EXPR
+
+DECLARATION_STATEMENT nonterm
+ PRIMITIVE_TYPE DECLARATION_CHAIN ;
+
+RETURN nonterm
+ return EXPR ;
+ return ;
+
+BREAK nonterm
+ break ;
+
+CONTINUE nonterm
+ continue ;
+
+LABEL nonterm
+ IDENT : STMT
+
+CASE nonterm
+ case EXPR : STMT
+
+STMT nonterm
+ TRUE_BLOCK
+ LABEL
+ CASE
+ BREAK
+ CONTINUE
+ RETURN
+ IF
+ WHILE
+ DO
+ FOR
+ SWITCH
+ DECLARATION_STATEMENT
+ EXPR ;
+ ;
+
+STMTS nonterm
+ STMTS STMT
+ STMT
+ STMTS ERROR
+ ERROR
+
+TRUE_BLOCK nonterm
+ { }
+ { STMTS }
+
+BLOCK nonterm
+ TRUE_BLOCK
+ STMT
diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt
deleted file mode 100644
index ffe85c3..0000000
--- a/grammars/c/grammar.txt
+++ /dev/null
@@ -1,130 +0,0 @@
-KEYWORDS list
- switch volatile case while do else const for if
- struct union typedef void
-
-IDENT regex
- [a-zA-Z_][0-9a-zA-Z_]*
-
-INT regex
- [0-9]+
-
-OP list
- ( ) { } [ ]
- ; ,
- - + ! % * & / << >> ^ |
- -= += != %= *= &= /= <<= == >>= ^= |=
- && || ++ --
- < <= > >= =
- . -> ? :
-
-############### TYPE PARSING
-# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing,
-# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression)
-PRIMITIVE_TYPE nonterm
- struct IDENT
- union IDENT
- struct IDENT AGGREGATE_DECLARATION
- union IDENT AGGREGATE_DECLARATION
- const PRIMITIVE_TYPE
- volatile PRIMITIVE_TYPE
- void
- IDENT
-
-# A TYPE_EXPRESSION is basically an lvalue expression.
-TYPE_EXPRESSION nonterm
- IDENT
- TYPE_EXPRESSION [ ]
- TYPE_EXPRESSION [ INT ]
- * TYPE_EXPRESSION
- ( TYPE_EXPRESSION )
- TYPE_EXPRESSION ( )
- TYPE_EXPRESSION ( ARGS )
-
-DECLARATION nonterm
- PRIMITIVE_TYPE TYPE_EXPRESSION
-
-# An ANONYMOUS_TYPE has no name
-ANONYMOUS_TYPE nonterm
- PRIMITIVE_TYPE
- ANONYMOUS_TYPE [ ]
- ANONYMOUS_TYPE [ INT ]
- * ANONYMOUS_TYPE
- ( ANONYMOUS_TYPE )
- ANONYMOUS_TYPE ( )
- ANONYMOUS_TYPE ( ARGS )
-
-############### TOP LEVEL
-TOP_LEVEL nonterm .start
- TYPEDEF
- FUNCTION
-
-ARGS nonterm
- ANONYMOUS_TYPE
- ANONYMOUS_TYPE , ARGS
- DECLARATION
- DECLARATION , ARGS
-
-FUNCTION nonterm
- DECLARATION ( ) TRUE_BLOCK
- DECLARATION ( ARGS ) TRUE_BLOCK
-
-AGGREGATE_DECLARATION nonterm
- { STMTS }
-
-TYPEDEF nonterm
- typedef PRIMITIVE_TYPE TYPE_EXPRESSION ;
-
-EXPR nonterm
- INT
- IDENT
- EXPR --
- EXPR ++
- EXPR OP EXPR
- EXPR ? EXPR : EXPR
-
-IF nonterm
- if ( EXPR ) BLOCK
- if ( EXPR ) BLOCK else BLOCK
-
-WHILE nonterm
- while ( EXPR ) BLOCK
-
-DO nonterm
- do BLOCK while ( EXPR )
-
-FOR nonterm
- for ( EXPR ; EXPR ; EXPR ) BLOCK
-
-SWITCH nonterm
- switch ( EXPR ) BLOCK
-
-DECLARATION_CHAIN nonterm
- TYPE_EXPRESSION
- TYPE_EXPRESSION , DECLARATION_CHAIN
- TYPE_EXPRESSION = EXPR
- TYPE_EXPRESSION = EXPR , DECLARATION_CHAIN
-
-DECLARATION_STATEMENT nonterm
- PRIMITIVE_TYPE DECLARATION_CHAIN ;
-
-STMT nonterm
- IF
- WHILE
- DO
- FOR
- SWITCH
- # NOTE: it auto-prefers declarations right now
- DECLARATION_STATEMENT .name DECL_STMT
- EXPR ;
-
-STMTS nonterm
- STMT
- STMT STMTS
-
-TRUE_BLOCK nonterm
- { }
- { STMTS }
-
-BLOCK nonterm
- TRUE_BLOCK
- STMT
diff --git a/grammars/c/preprocess.c b/grammars/c/preprocess.c
new file mode 100644
index 0000000..3ae7406
--- /dev/null
+++ b/grammars/c/preprocess.c
@@ -0,0 +1,45 @@
+void preprocess(char *string, size_t length) {
+ int on_newline = 1;
+ for (int i = 0; i < length;) {
+ switch (string[i]) {
+ case '/': {
+ on_newline = 0;
+ if (string[i+1] == '*') {
+ for (; i+1 < length; i++) {
+ if (string[i] == '*' && string[i+1] == '/') {
+ string[i] = ' ';
+ string[i+1] = ' ';
+ break;
+ }
+ string[i] = ' ';
+ }
+ continue;
+ } else if (string[i+1] == '/') {
+ for (; i < length; i++) {
+ if (string[i] == '\n') {
+ string[i] = ' ';
+ break;
+ }
+ string[i] = ' ';
+ }
+ continue;
+ }
+ break;
+ }
+ case '#': {
+ if (on_newline) {
+ int escaped = 0;
+ for (i++; i < length; i++) {
+ if (string[i] == '\n' && !escaped) break;
+ escaped = (string[i] == '\\');
+ string[i] = ' ';
+ }
+ break;
+ }
+ }
+ case '\n': on_newline = 1; break;
+ default: on_newline = 0; break;
+ }
+ i++;
+ }
+}
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback