diff options
Diffstat (limited to 'grammars/c')
-rw-r--r-- | grammars/c/disambiguate.c | 84 | ||||
-rw-r--r-- | grammars/c/grammar.earlpy | 265 | ||||
-rw-r--r-- | grammars/c/grammar.txt | 130 | ||||
-rw-r--r-- | grammars/c/preprocess.c | 45 |
4 files changed, 356 insertions, 168 deletions
diff --git a/grammars/c/disambiguate.c b/grammars/c/disambiguate.c index 403d65f..b6a99e8 100644 --- a/grammars/c/disambiguate.c +++ b/grammars/c/disambiguate.c @@ -1,47 +1,41 @@ -struct token *TYPE_NAMES[1024]; -size_t N_TYPE_NAMES; - -void alert_parse(struct state *state) { - if (PRODUCTION_ID_TO_SYMBOL[state->production_id] == SYMBOL_TYPEDEF) { - for (struct token *t = find_token(state, 2); t->symbol != DONE_SYMBOL; t++) { - if (t->symbol == SYMBOL_IDENT) { - TYPE_NAMES[N_TYPE_NAMES++] = t; - break; - } - } - } -} - -int is_typename(struct token *token) { - if (!strcmp("int", token->string)) return 1; - for (size_t i = 0; i < N_TYPE_NAMES; i++) - if (!strcmp(TYPE_NAMES[i]->string, token->string)) - return 1; - return 0; -} +void alert_parse(struct state *state) { } int disambiguator(struct state *old, struct state *new) { - // printf("Old tree:\n"); - // print_parse_tree(old, 4); - // printf("New tree:\n"); - // print_parse_tree(new, 4); + // fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); + // print_parse_tree(old, 0, stderr); + // print_parse_tree(new, 0, stderr); + // fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - if (old->production_id == PRODUCTION_DECL_STMT) - if (!is_typename(find_token(old->reasons[0], 0))) - return 1; - if (new->production_id == PRODUCTION_DECL_STMT) - if (!is_typename(find_token(new->reasons[0], 0))) - return 0; + if (old->n_poisoned < new->n_poisoned) return 0; + if (new->n_poisoned < old->n_poisoned) return 1; // Prefer the earlier parsings in the grammar when two entirely different // productions are taken. if (old->production_id != new->production_id) - return old->production_id < new->production_id - ? 0 : 1; + return (old->production_id < new->production_id) ? 0 : 1; // If they're the same production ... prod_id_t prod = old->production_id; - if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) { + + if (PRODUCTION_ID_TO_SYMBOL[prod] == START_SYMBOL + && PRODUCTION_ID_TO_PRODUCTION[prod][1] != DONE_SYMBOL) { + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + if (old_tok < new_tok) return 0; + else if (old_tok > new_tok) return 1; + } + + if (PRODUCTION_ID_TO_PRODUCTION[prod][0] == SYMBOL_ERROR && PRODUCTION_ID_TO_PRODUCTION[prod][1] != DONE_SYMBOL) { + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + if (old_tok < new_tok) return 0; + else if (old_tok > new_tok) return 1; + } else if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_ERROR) { + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + if (old_tok < new_tok) return 1; + else if (old_tok > new_tok) return 0; + } else if (PRODUCTION_ID_TO_SYMBOL[prod] == SYMBOL_EXPR) { if (PRODUCTION_ID_TO_PRODUCTION[prod][1] == SYMBOL_OP) { struct token *old_tok = find_token(old, 1), *new_tok = find_token(new, 1); @@ -49,7 +43,7 @@ int disambiguator(struct state *old, struct state *new) { const char *precedence[] = {".", "->", "*", "/", "%", "+", "-", "<<", ">>", "<", "<=", ">", ">=", "==", "!=", "&", "|", "&&", "||", "=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", - "^=", "|=", ",", 0}; + "^=", "|=", ",", ":", 0}; if (strcmp(old_s, new_s)) { for (const char **p = precedence; *p; p++) { if (!strcmp(old_s, *p)) { @@ -58,16 +52,30 @@ int disambiguator(struct state *old, struct state *new) { return 0; } } - // BAD! - return 2; + fprintf(stderr, "ERROR: didn't find operator '%s'\n", old_s); + exit(1); } else { - // Associate RIGHT if (old_tok < new_tok) return 1; else if (old_tok > new_tok) return 0; } } } + // Generally speaking, we want left associativity to avoid long chains of + // completions. + struct token *old_tok = find_token(old, 1), + *new_tok = find_token(new, 1); + if (old_tok < new_tok) return 1; + else if (old_tok > new_tok) return 0; + fprintf(stderr, "TOTALLY UNKNOWN!\n"); + fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n"); + pprint_state(old); + // print_parse_tree(old, 0, stderr); + fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n"); + pprint_state(new); + // print_parse_tree(new, 0, stderr); + fprintf(stderr, "~~~~~~~~~~~~~~~~~~~~~\n"); + exit(1); return 2; } diff --git a/grammars/c/grammar.earlpy b/grammars/c/grammar.earlpy new file mode 100644 index 0000000..99cafe9 --- /dev/null +++ b/grammars/c/grammar.earlpy @@ -0,0 +1,265 @@ +#### OPTIMIZATIONS: +# where possible, we want parse trees that look like: +# (((A + B) + C) + D), i.e., left-associativity because it avoids long chains +# of completions. Another explanation for it is that then ambiguity is resolved +# as early in the left-to-right parse as possible. + +KEYWORDS list + switch volatile case while do else const for if + struct union typedef void return break continue + sizeof + +IDENT regex + [a-zA-Z_][0-9a-zA-Z_]* + +INT regex + ((0x[0-9a-fA-F]*)|([0-9]*))([uUlL])* + +# https://stackoverflow.com/questions/2039795/regular-expression-for-a-string-literal-in-flex-lex +STRING regex + ["]([^\\"]|\\.)*["] + +CHAR regex + [']([\\][']|[^'][^'])*[^']?['] + +OP list + ; , + - + ! % * & / << >> ^ | + -= += != %= *= &= /= <<= == >>= ^= |= + && || ++ -- + < <= > >= = + . -> + +TERNARY list + : ? + +PARENS list + ( ) { } [ ] + +############### ERROR RECOVERY +# These rules match either a single token, or a pair of balanced parentheses + +NONPAREN nonterm + KEYWORDS + IDENT + INT + STRING + CHAR + TERNARY + OP + +ERROR_INNER nonterm .poison + ERROR + ERROR_INNER ERROR + +ERROR nonterm .poison + ( ERROR_INNER ) + { ERROR_INNER } + [ ERROR_INNER ] + ( ) + { } + [ ] + NONPAREN + +############### TYPE PARSING +# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing, +# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression) +PRIMITIVE_TYPE nonterm + struct IDENT + union IDENT + struct IDENT AGGREGATE_DECLARATION + union IDENT AGGREGATE_DECLARATION + const PRIMITIVE_TYPE + volatile PRIMITIVE_TYPE + void + IDENT + +# A TYPE_EXPRESSION is basically an lvalue expression. +TYPE_EXPRESSION nonterm + IDENT + TYPE_EXPRESSION [ ] + TYPE_EXPRESSION [ EXPR ] + * TYPE_EXPRESSION + const TYPE_EXPRESSION + ( TYPE_EXPRESSION ) + TYPE_EXPRESSION ( ) + TYPE_EXPRESSION ( ARGS ) + +DECLARATION nonterm + PRIMITIVE_TYPE TYPE_EXPRESSION + +# An ANONYMOUS_TYPE has no name +ANONYMOUS_TYPE nonterm + PRIMITIVE_TYPE + ANONYMOUS_TYPE [ ] + ANONYMOUS_TYPE [ EXPR ] + ANONYMOUS_TYPE * + ANONYMOUS_TYPE const * + const ANONYMOUS_TYPE + ( ANONYMOUS_TYPE ) + ANONYMOUS_TYPE ( ) + ANONYMOUS_TYPE ( ARGS ) + +############### TOP LEVEL +TOP_LEVEL nonterm .start + TOP_LEVEL TYPEDEF + TOP_LEVEL STRUCTDECL + TOP_LEVEL FUNCTION + TOP_LEVEL DECLARATION_STATEMENT + TYPEDEF + STRUCTDECL + FUNCTION + DECLARATION_STATEMENT + TOP_LEVEL ERROR + ERROR + +ARGS nonterm + ANONYMOUS_TYPE + ARGS , ANONYMOUS_TYPE + DECLARATION + ARGS , DECLARATION + +CALL_ARGS nonterm + CALL_ARGS , EXPR + EXPR + +OLD_ARGS nonterm + OLD_ARGS , IDENT + IDENT + +OLD_ARG_DECLS nonterm + OLD_ARG_DECLS DECLARATION_STATEMENT + DECLARATION_STATEMENT + +FUNCTION nonterm + DECLARATION ( ) TRUE_BLOCK + DECLARATION ( ARGS ) TRUE_BLOCK + DECLARATION ( OLD_ARGS ) OLD_ARG_DECLS TRUE_BLOCK + IDENT ( OLD_ARGS ) OLD_ARG_DECLS TRUE_BLOCK + +AGGREGATE_DECLARATION nonterm + { STMTS } + { } + +TYPEDEF nonterm + typedef PRIMITIVE_TYPE TYPE_EXPRESSION ; + +STRUCTDECL nonterm + struct IDENT AGGREGATE_DECLARATION ; + +UNIONDECL nonterm + union IDENT AGGREGATE_DECLARATION ; + +EXPR nonterm + INT + STRING + CHAR + IDENT + EXPR -- + EXPR ++ + -- EXPR + ++ EXPR + - EXPR + + EXPR + & EXPR + * EXPR + ( ANONYMOUS_TYPE ) EXPR + EXPR ( ) + EXPR ( CALL_ARGS ) + EXPR OP EXPR + EXPR ? EXPR : EXPR + EXPR ? : EXPR + EXPR [ EXPR ] + ! EXPR + ( EXPR ) + sizeof EXPR + sizeof ANONYMOUS_TYPE + INITIALIZER_LIST + EXPR EXPR + +INITIALIZER_LIST nonterm + { INNER_INITIALIZER_LIST } + { } + +INNER_INITIALIZER_LIST nonterm + EXPR + INNER_INITIALIZER_LIST , EXPR + INNER_INITIALIZER_LIST , + +IF nonterm + if ( EXPR ) BLOCK + if ( EXPR ) BLOCK else BLOCK + +WHILE nonterm + while ( EXPR ) BLOCK + +DO nonterm + do BLOCK while ( EXPR ) + +FOR nonterm + for ( ; ; ) BLOCK + for ( ; ; EXPR ) BLOCK + for ( ; EXPR ; ) BLOCK + for ( ; EXPR ; EXPR ) BLOCK + for ( EXPR ; ; ) BLOCK + for ( EXPR ; ; EXPR ) BLOCK + for ( EXPR ; EXPR ; ) BLOCK + for ( EXPR ; EXPR ; EXPR ) BLOCK + +SWITCH nonterm + switch ( EXPR ) BLOCK + +DECLARATION_CHAIN nonterm + DECLARATION_CHAIN , TYPE_EXPRESSION + TYPE_EXPRESSION + DECLARATION_CHAIN , TYPE_EXPRESSION = EXPR + TYPE_EXPRESSION = EXPR + +DECLARATION_STATEMENT nonterm + PRIMITIVE_TYPE DECLARATION_CHAIN ; + +RETURN nonterm + return EXPR ; + return ; + +BREAK nonterm + break ; + +CONTINUE nonterm + continue ; + +LABEL nonterm + IDENT : STMT + +CASE nonterm + case EXPR : STMT + +STMT nonterm + TRUE_BLOCK + LABEL + CASE + BREAK + CONTINUE + RETURN + IF + WHILE + DO + FOR + SWITCH + DECLARATION_STATEMENT + EXPR ; + ; + +STMTS nonterm + STMTS STMT + STMT + STMTS ERROR + ERROR + +TRUE_BLOCK nonterm + { } + { STMTS } + +BLOCK nonterm + TRUE_BLOCK + STMT diff --git a/grammars/c/grammar.txt b/grammars/c/grammar.txt deleted file mode 100644 index ffe85c3..0000000 --- a/grammars/c/grammar.txt +++ /dev/null @@ -1,130 +0,0 @@ -KEYWORDS list - switch volatile case while do else const for if - struct union typedef void - -IDENT regex - [a-zA-Z_][0-9a-zA-Z_]* - -INT regex - [0-9]+ - -OP list - ( ) { } [ ] - ; , - - + ! % * & / << >> ^ | - -= += != %= *= &= /= <<= == >>= ^= |= - && || ++ -- - < <= > >= = - . -> ? : - -############### TYPE PARSING -# A PRIMITIVE_TYPE is the core object that takes up space after dereferencing, -# calling, etc. A normal variable declaration is PRIMITIVE_TYPE (expression) -PRIMITIVE_TYPE nonterm - struct IDENT - union IDENT - struct IDENT AGGREGATE_DECLARATION - union IDENT AGGREGATE_DECLARATION - const PRIMITIVE_TYPE - volatile PRIMITIVE_TYPE - void - IDENT - -# A TYPE_EXPRESSION is basically an lvalue expression. -TYPE_EXPRESSION nonterm - IDENT - TYPE_EXPRESSION [ ] - TYPE_EXPRESSION [ INT ] - * TYPE_EXPRESSION - ( TYPE_EXPRESSION ) - TYPE_EXPRESSION ( ) - TYPE_EXPRESSION ( ARGS ) - -DECLARATION nonterm - PRIMITIVE_TYPE TYPE_EXPRESSION - -# An ANONYMOUS_TYPE has no name -ANONYMOUS_TYPE nonterm - PRIMITIVE_TYPE - ANONYMOUS_TYPE [ ] - ANONYMOUS_TYPE [ INT ] - * ANONYMOUS_TYPE - ( ANONYMOUS_TYPE ) - ANONYMOUS_TYPE ( ) - ANONYMOUS_TYPE ( ARGS ) - -############### TOP LEVEL -TOP_LEVEL nonterm .start - TYPEDEF - FUNCTION - -ARGS nonterm - ANONYMOUS_TYPE - ANONYMOUS_TYPE , ARGS - DECLARATION - DECLARATION , ARGS - -FUNCTION nonterm - DECLARATION ( ) TRUE_BLOCK - DECLARATION ( ARGS ) TRUE_BLOCK - -AGGREGATE_DECLARATION nonterm - { STMTS } - -TYPEDEF nonterm - typedef PRIMITIVE_TYPE TYPE_EXPRESSION ; - -EXPR nonterm - INT - IDENT - EXPR -- - EXPR ++ - EXPR OP EXPR - EXPR ? EXPR : EXPR - -IF nonterm - if ( EXPR ) BLOCK - if ( EXPR ) BLOCK else BLOCK - -WHILE nonterm - while ( EXPR ) BLOCK - -DO nonterm - do BLOCK while ( EXPR ) - -FOR nonterm - for ( EXPR ; EXPR ; EXPR ) BLOCK - -SWITCH nonterm - switch ( EXPR ) BLOCK - -DECLARATION_CHAIN nonterm - TYPE_EXPRESSION - TYPE_EXPRESSION , DECLARATION_CHAIN - TYPE_EXPRESSION = EXPR - TYPE_EXPRESSION = EXPR , DECLARATION_CHAIN - -DECLARATION_STATEMENT nonterm - PRIMITIVE_TYPE DECLARATION_CHAIN ; - -STMT nonterm - IF - WHILE - DO - FOR - SWITCH - # NOTE: it auto-prefers declarations right now - DECLARATION_STATEMENT .name DECL_STMT - EXPR ; - -STMTS nonterm - STMT - STMT STMTS - -TRUE_BLOCK nonterm - { } - { STMTS } - -BLOCK nonterm - TRUE_BLOCK - STMT diff --git a/grammars/c/preprocess.c b/grammars/c/preprocess.c new file mode 100644 index 0000000..3ae7406 --- /dev/null +++ b/grammars/c/preprocess.c @@ -0,0 +1,45 @@ +void preprocess(char *string, size_t length) { + int on_newline = 1; + for (int i = 0; i < length;) { + switch (string[i]) { + case '/': { + on_newline = 0; + if (string[i+1] == '*') { + for (; i+1 < length; i++) { + if (string[i] == '*' && string[i+1] == '/') { + string[i] = ' '; + string[i+1] = ' '; + break; + } + string[i] = ' '; + } + continue; + } else if (string[i+1] == '/') { + for (; i < length; i++) { + if (string[i] == '\n') { + string[i] = ' '; + break; + } + string[i] = ' '; + } + continue; + } + break; + } + case '#': { + if (on_newline) { + int escaped = 0; + for (i++; i < length; i++) { + if (string[i] == '\n' && !escaped) break; + escaped = (string[i] == '\\'); + string[i] = ' '; + } + break; + } + } + case '\n': on_newline = 1; break; + default: on_newline = 0; break; + } + i++; + } +} |