#include #include #include #include #define VERBOSE 0 #if VERBOSE #define vprintf(...) printf(__VA_ARGS__) #else #define vprintf(...) #endif //////// This file will be copy/pasted below the generated code yyscan_t SCANNER; struct token { symbol_id_t symbol; size_t offset; size_t length; char *string; }; char *STRING; struct token *TOKENS = 0; size_t N_TOKENS = 0, CAP_TOKENS = 0; void lex_symbol(symbol_id_t which) { if (N_TOKENS == CAP_TOKENS) { CAP_TOKENS = (CAP_TOKENS + 1) * 2; TOKENS = realloc(TOKENS, CAP_TOKENS * sizeof(*TOKENS)); } TOKENS[N_TOKENS].symbol = which; TOKENS[N_TOKENS].offset = OFFSET; TOKENS[N_TOKENS].length = 0; if (STRING) { int length = yyget_leng(SCANNER); TOKENS[N_TOKENS].offset = OFFSET - length; TOKENS[N_TOKENS].length = length; TOKENS[N_TOKENS++].string = strndup(STRING + OFFSET - length, length); } else TOKENS[N_TOKENS++].string = strdup(""); } struct token *lex(char *string, size_t *n_tokens) { STRING = string; TOKENS = 0; CAP_TOKENS = 0; N_TOKENS = 0; yylex_init(&SCANNER); yy_scan_string(string, SCANNER); yylex(SCANNER); yylex_destroy(SCANNER); STRING = 0; lex_symbol(DONE_SYMBOL); *n_tokens = N_TOKENS; return TOKENS; } //////// PARSER struct state { prod_id_t production_id; void *reasons[MAX_PRODUCTION_LEN + 1]; size_t location; size_t start_idx; struct state *next_watcher, **pprev_watcher; size_t n_poisoned; size_t hash[2]; struct state *next_completion; struct state *next_free; }; int disambiguator(struct state *old, struct state *new); void alert_parse(struct state *state); void pprint_state(struct state *state) { fprintf(stderr, "%s ->", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]); size_t i = 0; for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; *s; s++) { if (i++ == state->location) fprintf(stderr, " ."); fprintf(stderr, " %s", SYMBOL_ID_TO_NAME[*s]); } if (i == state->location) fprintf(stderr, " ."); fprintf(stderr, " %lu\n", state->start_idx); } void print_parse_tree(struct state *state, int depth, FILE *f) { for (int i = 0; i < depth; i++) fprintf(f, " "); assert(state); fprintf(f, "%s\n", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]); size_t i = 0; for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; *s; s++, i++) { if (IS_NONTERM(*s)) { if (!(state->reasons[i])) { printf("BAD! Should be a %s\n", SYMBOL_ID_TO_NAME[*s]); } print_parse_tree(state->reasons[i], depth + 2, f); } else { for (int i = 0; i < depth + 2; i++) fprintf(f, " "); struct token *t = state->reasons[i]; // printf("HELLO: %lu, %s\n", *s, SYMBOL_ID_TO_NAME[*s]); fprintf(f, "%s ", SYMBOL_ID_TO_NAME[t->symbol]); fprintf(f, "%s\n", t->string); } } } void print_parse_tree2_(struct state *state) { if (!state) printf("?\n"); else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0] && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1] && IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0])) print_parse_tree2_(state->reasons[0]); else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0] && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1] && !IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0])) printf("%s", ((struct token *)state->reasons[0])->string); else { printf("("); int i = 0; for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; *s; s++, i++) { printf(" "); if (IS_NONTERM(*s)) print_parse_tree2_(state->reasons[i]); else { struct token *t = state->reasons[i]; printf("%s", t->string); } } printf(" )"); } } void print_parse_tree2(struct state *state) { print_parse_tree2_(state); printf("\n"); } void print_parse_tree_binary(struct state *state, struct token *tokens, size_t n_tokens) { n_tokens--; // done token at the end write(1, &n_tokens, sizeof(n_tokens)); for (size_t i = 0; i < n_tokens; i++) { write(1, &(tokens[i].symbol), sizeof(tokens[i].symbol)); write(1, &(tokens[i].offset), sizeof(tokens[i].offset)); write(1, &(tokens[i].length), sizeof(tokens[i].length)); } size_t token_idx = 0, zero = 0; size_t n_stack = 0, cap_stack = 1024; struct state **stack = malloc(cap_stack * sizeof(*stack)); stack[n_stack++] = state; while (n_stack) { struct state *next = stack[--n_stack]; if ((MAX_PRODUCTION_LEN + n_stack) >= cap_stack) { cap_stack = (cap_stack * 2) + MAX_PRODUCTION_LEN; stack = realloc(stack, cap_stack * sizeof(*stack)); } // production_id write(1, &next->production_id, sizeof(next->production_id)); int i = 0; symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[next->production_id]; while (*s) s++, i++; while (s--, i --> 0) if (IS_NONTERM(*s)) stack[n_stack++] = next->reasons[i]; } free(stack); } //////////// HASH SETS size_t hash_chars(char *v, int n) { size_t h = 0; for (int i = 0; i < n; i++) h = (h * 33) ^ v[i]; return h; } size_t hash(struct state *state, int hash_kind) { if (state->hash[hash_kind]) return state->hash[hash_kind]; size_t h = 0; if (hash_kind == 0) { h = state->production_id; h = (h * N_PRODUCTIONS) ^ hash_chars((char*)&state->location, sizeof(state->location)); } else { h = PRODUCTION_ID_TO_SYMBOL[state->production_id]; } h = (h * 33) ^ hash_chars((char*)&state->start_idx, sizeof(state->start_idx)); state->hash[hash_kind] = h; return h; } size_t eq(struct state *A, struct state *B, int cmp_kind) { if (cmp_kind == 0) return A->production_id == B->production_id && A ->location == B->location && A->start_idx == B->start_idx; return PRODUCTION_ID_TO_SYMBOL[A->production_id] == PRODUCTION_ID_TO_SYMBOL[B->production_id] && A->start_idx == B->start_idx; } struct hash_set { struct state **data; size_t cap, n_items; }; struct hash_set *init_hash_set() { struct hash_set *hash_set = malloc(sizeof *hash_set); hash_set->data = calloc(2048, sizeof(hash_set->data[0])); hash_set->cap = 2048; hash_set->n_items = 0; return hash_set; } void clear_hash_set(struct hash_set *set) { memset(set->data, 0, set->cap * sizeof(set->data[0])); set->n_items = 0; } void _insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { size_t i = hash(state, cmp_kind) & (set->cap - 1); size_t delta = 0, overlap = 0, true_overlap = 0; while (set->data[i]) if ((++i) == set->cap) i = 0; set->data[i] = state; } void insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { if ((set->n_items * 2) >= set->cap) { size_t old_cap = set->cap; struct state **old_data = set->data; set->cap *= 2; set->data = calloc(set->cap, sizeof(set->data[0])); for (size_t i = 0; i < old_cap; i++) if (old_data[i]) _insert_hash_set(set, old_data[i], cmp_kind); free(old_data); } _insert_hash_set(set, state, cmp_kind); set->n_items++; } struct state *in_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { size_t i = hash(state, cmp_kind) & (set->cap - 1); while (set->data[i]) { if (eq(state, set->data[i], cmp_kind)) return set->data[i]; if ((++i) == set->cap) i = 0; } return 0; } ///////////////// HASH QUEUES struct hash_queue { struct state **entries; size_t count; size_t capacity; struct hash_set *set; }; struct hash_queue *init_hash_queue() { struct hash_queue *q = calloc(1, sizeof(*q)); q->set = init_hash_set(); q->capacity = 1024; q->entries = calloc(q->capacity, sizeof(q->entries[0])); return q; } void queue_swap(struct hash_queue *queue, int a, int b) { struct state *tmp = queue->entries[a]; queue->entries[a] = queue->entries[b]; queue->entries[b] = tmp; } struct state *pop_from_queue(struct hash_queue *queue) { assert(queue->count); struct state *head = queue->entries[0]; queue->entries[0] = queue->entries[--queue->count]; if (!queue->count) return head; // Bubble the root down int i = 0; size_t val = queue->entries[0]->n_poisoned; while (1) { size_t left_idx = 2*i + 1, right_idx = 2*i + 2, has_left = left_idx < queue->count, has_right = right_idx < queue->count, left_val = has_left ? queue->entries[left_idx]->n_poisoned : val, right_val = has_right ? queue->entries[right_idx]->n_poisoned : val; if (right_val < left_val && right_val < val) { queue_swap(queue, i, right_idx); i = right_idx; } else if (left_val < val) { queue_swap(queue, i, left_idx); i = left_idx; } else break; } return head; } void enqueue(struct hash_queue *queue, struct state *state) { if (!in_hash_set(queue->set, state, 0)) insert_hash_set(queue->set, state, 0); if (queue->count == queue->capacity) { queue->capacity *= 2; queue->entries = realloc(queue->entries, queue->capacity * sizeof(queue->entries[0])); } queue->entries[queue->count++] = state; // Bubble it up int i = queue->count - 1; while (i && state->n_poisoned < queue->entries[(i - 1) / 2]->n_poisoned) { queue_swap(queue, i, (i - 1) / 2); i = (i - 1) / 2; } } ////////// PARSER STATE struct parser_state { size_t k; // [start_idx, symbol] -> state ptr struct state **watcher_table; struct hash_queue *current_queue; struct hash_queue *next_queue; struct token *tokens; }; void flip_queues_and_memos(struct parser_state *parser_state) { assert(!parser_state->current_queue->count); struct hash_queue *tmp = parser_state->current_queue; parser_state->current_queue = parser_state->next_queue; parser_state->next_queue = tmp; clear_hash_set(parser_state->next_queue->set); } struct state **lookup_watcher(symbol_id_t symbol, size_t start_idx, struct parser_state *parser_state) { return parser_state->watcher_table + (start_idx * N_SYMBOLS) + symbol; } struct state *copy_state(struct state *state) { struct state *copy = malloc(sizeof *copy); *copy = *state; copy->next_watcher = 0; copy->pprev_watcher = 0; copy->next_completion = 0; return copy; } symbol_id_t symbol(struct state *state) { return PRODUCTION_ID_TO_PRODUCTION[state->production_id][state->location]; } void link_watchers(struct state *state, int for_next, struct parser_state *parser_state) { // link to what we want to be watching now if (IS_NONTERM(symbol(state))) { struct state **ptr = lookup_watcher(symbol(state), parser_state->k + for_next, parser_state); if (*ptr) (*ptr)->pprev_watcher = &(state->next_watcher); state->pprev_watcher = ptr; state->next_watcher = *ptr; *ptr = state; } } void unlink_watchers(struct state *state) { if (state->pprev_watcher) { *(state->pprev_watcher) = state->next_watcher; if (state->next_watcher) state->next_watcher->pprev_watcher = state->pprev_watcher; state->pprev_watcher = 0; state->next_watcher = 0; } } struct state *advance(struct state *state, void *reason, int for_next, struct parser_state *parser_state) { state->reasons[state->location++] = reason; unlink_watchers(state); link_watchers(state, for_next, parser_state); return state; } void queue_all_for_nonterm(symbol_id_t symbol, struct parser_state *parser_state) { symbol_id_t scan = parser_state->tokens[parser_state->k].symbol; struct state *s = calloc(1, sizeof(struct state)); for (prod_id_t *p = SYMBOL_ID_TO_PRODUCTION_IDS[symbol]; *p; p++) { // FILTER symbol_id_t sym = PRODUCTION_ID_TO_FIRST[*p]; if (sym != DONE_SYMBOL && !IS_NONTERM(sym) && sym != scan) continue; if (!s) s = calloc(1, sizeof(struct state)); s->n_poisoned = SYMBOL_TO_POISON[symbol]; s->production_id = *p; s->start_idx = parser_state->k; s->hash[0] = s->hash[1] = 0; if (in_hash_set(parser_state->current_queue->set, s, 0)) continue; link_watchers(s, 0, parser_state); enqueue(parser_state->current_queue, s); assert(parser_state->current_queue->count); s = 0; } if (s) free(s); } void overwrite_state(struct state *dst, struct state *src) { dst->production_id = src->production_id; memcpy(dst->reasons, src->reasons, sizeof(src->reasons)); dst->location = src->location; dst->start_idx = src->start_idx; assert(src->n_poisoned == dst->n_poisoned); dst->hash[0] = dst->hash[1] = 0; } // https://en.wikipedia.org/wiki/Earley_algorithm void parse(struct token *tokens, size_t n_tokens) { struct parser_state parser_state = { .k = 0, .watcher_table = calloc(n_tokens * N_SYMBOLS, sizeof(struct state *)), .current_queue = init_hash_queue(), .next_queue = init_hash_queue(), .tokens = tokens, }; struct hash_set *completion_set = init_hash_set(); queue_all_for_nonterm(START_SYMBOL, &parser_state); for (size_t k = 0; k < n_tokens; k++) { clear_hash_set(completion_set); parser_state.k = k; struct state *free_queue = 0; while (parser_state.current_queue->count) { struct state *state = pop_from_queue(parser_state.current_queue); symbol_id_t next_symbol = symbol(state); // SCANNING if (next_symbol != DONE_SYMBOL && !IS_NONTERM(next_symbol)) { if (next_symbol == tokens[k].symbol) { // NOTE: we need to copy here because @state might already be // in some hash sets... struct state *advanced = advance(copy_state(state), tokens + k, 1, &parser_state); enqueue(parser_state.next_queue, advanced); } state->next_free = free_queue; free_queue = state; } // COMPLETION else if (next_symbol == DONE_SYMBOL) { alert_parse(state); int is_very_end = (k+1) == n_tokens && state->start_idx == 0 && PRODUCTION_ID_TO_SYMBOL[state->production_id] == START_SYMBOL; struct state *last = in_hash_set(completion_set, state, 1); if (last) { struct state *first_last = last; int do_an_add = 1; for (; last; last = last->next_completion) { switch (disambiguator(last, state)) { case 0: /* keep @last */ do_an_add = 0; break; case 1: /* kill @last */ do_an_add = 0; overwrite_state(last, state); break; case 2: /* keep both */ break; } } if (!do_an_add) { state->next_free = free_queue; free_queue = state; continue; } last = first_last; state->next_completion = last->next_completion; last->next_completion = state; } else { insert_hash_set(completion_set, state, 1); last = state; } symbol_id_t completing = PRODUCTION_ID_TO_SYMBOL[state->production_id]; struct state *watcher = *lookup_watcher(completing, state->start_idx, &parser_state); for (; watcher; watcher = watcher->next_watcher) { struct state *new = advance(copy_state(watcher), state, 0, &parser_state); new->n_poisoned += state->n_poisoned; enqueue(parser_state.current_queue, new); } if (is_very_end) { struct state *copy = copy_state(state); if (last) { copy->next_completion = last->next_completion; last->next_completion = copy; } else insert_hash_set(completion_set, copy, 1); enqueue(parser_state.next_queue, copy); } } // PREDICTION else if (tokens[k].symbol != DONE_SYMBOL && IS_NONTERM(next_symbol)) { queue_all_for_nonterm(next_symbol, &parser_state); } } for (struct state *s = free_queue; s;) { struct state *s_old = s; s = s->next_free; free(s_old); } flip_queues_and_memos(&parser_state); } while (parser_state.current_queue->count) { struct state *state = pop_from_queue(parser_state.current_queue); // print_parse_tree(state, 0); if (isatty(1)) { print_parse_tree(state, 0, stdout); } else { print_parse_tree_binary(state, tokens, n_tokens); if (parser_state.current_queue->count) { fprintf(stderr, "AMBIGUOUS PARSE\n"); exit(1); } } } } void preprocess(char *string, size_t length); int main(int argc, char **argv) { assert(argc == 2); // Read the file struct stat statbuf; stat(argv[1], &statbuf); char *buf = malloc(statbuf.st_size + 1); buf[statbuf.st_size] = '\0'; int f = open(argv[1], O_RDONLY); read(f, buf, statbuf.st_size); close(f); preprocess(buf, statbuf.st_size); // Lex the file size_t n_tokens = 0; struct token *tokens = lex(buf, &n_tokens); // Parse the file parse(tokens, n_tokens); return 0; } struct token *find_token(struct state *state, int idx) { if (IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][idx])) return find_token(state->reasons[idx], 0); return state->reasons[idx]; }