From 35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 Mon Sep 17 00:00:00 2001 From: Matthew Sotoudeh Date: Mon, 11 Mar 2024 00:44:33 -0700 Subject: core parser --- earlpy/parser.c | 572 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 572 insertions(+) create mode 100644 earlpy/parser.c (limited to 'earlpy/parser.c') diff --git a/earlpy/parser.c b/earlpy/parser.c new file mode 100644 index 0000000..fafc9b8 --- /dev/null +++ b/earlpy/parser.c @@ -0,0 +1,572 @@ +#include +#include +#include +#include + +#define VERBOSE 0 +#if VERBOSE +#define vprintf(...) printf(__VA_ARGS__) +#else +#define vprintf(...) +#endif + +//////// This file will be copy/pasted below the generated code +yyscan_t SCANNER; + +struct token { symbol_id_t symbol; size_t offset; size_t length; char *string; }; + +char *STRING; +struct token *TOKENS = 0; +size_t N_TOKENS = 0, CAP_TOKENS = 0; +void lex_symbol(symbol_id_t which) { + if (N_TOKENS == CAP_TOKENS) { + CAP_TOKENS = (CAP_TOKENS + 1) * 2; + TOKENS = realloc(TOKENS, CAP_TOKENS * sizeof(*TOKENS)); + } + TOKENS[N_TOKENS].symbol = which; + TOKENS[N_TOKENS].offset = OFFSET; + TOKENS[N_TOKENS].length = 0; + if (STRING) { + int length = yyget_leng(SCANNER); + TOKENS[N_TOKENS].offset = OFFSET - length; + TOKENS[N_TOKENS].length = length; + TOKENS[N_TOKENS++].string = strndup(STRING + OFFSET - length, length); + + } else TOKENS[N_TOKENS++].string = strdup(""); +} + +struct token *lex(char *string, size_t *n_tokens) { + STRING = string; + TOKENS = 0; + CAP_TOKENS = 0; + N_TOKENS = 0; + + yylex_init(&SCANNER); + yy_scan_string(string, SCANNER); + yylex(SCANNER); + yylex_destroy(SCANNER); + + STRING = 0; + lex_symbol(DONE_SYMBOL); + *n_tokens = N_TOKENS; + return TOKENS; +} + +//////// PARSER +struct state { + prod_id_t production_id; + void *reasons[MAX_PRODUCTION_LEN + 1]; + + size_t location; + size_t start_idx; + struct state *next_watcher, **pprev_watcher; + + size_t n_poisoned; + + size_t hash[2]; + + struct state *next_completion; + struct state *next_free; +}; +int disambiguator(struct state *old, struct state *new); +void alert_parse(struct state *state); + +void pprint_state(struct state *state) { + fprintf(stderr, "%s ->", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]); + size_t i = 0; + for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; + *s; s++) { + if (i++ == state->location) fprintf(stderr, " ."); + fprintf(stderr, " %s", SYMBOL_ID_TO_NAME[*s]); + } + if (i == state->location) fprintf(stderr, " ."); + fprintf(stderr, " %lu\n", state->start_idx); +} + +void print_parse_tree(struct state *state, int depth, FILE *f) { + for (int i = 0; i < depth; i++) fprintf(f, " "); + assert(state); + fprintf(f, "%s\n", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]); + size_t i = 0; + for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; + *s; s++, i++) { + if (IS_NONTERM(*s)) { + if (!(state->reasons[i])) { + printf("BAD! Should be a %s\n", SYMBOL_ID_TO_NAME[*s]); + } + print_parse_tree(state->reasons[i], depth + 2, f); + } else { + for (int i = 0; i < depth + 2; i++) fprintf(f, " "); + struct token *t = state->reasons[i]; + // printf("HELLO: %lu, %s\n", *s, SYMBOL_ID_TO_NAME[*s]); + fprintf(f, "%s ", SYMBOL_ID_TO_NAME[t->symbol]); + fprintf(f, "%s\n", t->string); + } + } +} + +void print_parse_tree2_(struct state *state) { + if (!state) printf("?\n"); + else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0] + && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1] + && IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0])) + print_parse_tree2_(state->reasons[0]); + else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0] + && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1] + && !IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0])) + printf("%s", ((struct token *)state->reasons[0])->string); + else { + printf("("); + int i = 0; + for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id]; + *s; s++, i++) { + printf(" "); + if (IS_NONTERM(*s)) + print_parse_tree2_(state->reasons[i]); + else { + struct token *t = state->reasons[i]; + printf("%s", t->string); + } + } + printf(" )"); + } +} +void print_parse_tree2(struct state *state) { + print_parse_tree2_(state); printf("\n"); +} +void print_parse_tree_binary(struct state *state, struct token *tokens, size_t n_tokens) { + n_tokens--; // done token at the end + write(1, &n_tokens, sizeof(n_tokens)); + for (size_t i = 0; i < n_tokens; i++) { + write(1, &(tokens[i].symbol), sizeof(tokens[i].symbol)); + write(1, &(tokens[i].offset), sizeof(tokens[i].offset)); + write(1, &(tokens[i].length), sizeof(tokens[i].length)); + } + size_t token_idx = 0, zero = 0; + size_t n_stack = 0, cap_stack = 1024; + struct state **stack = malloc(cap_stack * sizeof(*stack)); + stack[n_stack++] = state; + while (n_stack) { + struct state *next = stack[--n_stack]; + if ((MAX_PRODUCTION_LEN + n_stack) >= cap_stack) { + cap_stack = (cap_stack * 2) + MAX_PRODUCTION_LEN; + stack = realloc(stack, cap_stack * sizeof(*stack)); + } + // production_id + write(1, &next->production_id, sizeof(next->production_id)); + int i = 0; + symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[next->production_id]; + while (*s) s++, i++; + while (s--, i --> 0) + if (IS_NONTERM(*s)) + stack[n_stack++] = next->reasons[i]; + } + free(stack); +} + +//////////// HASH SETS +size_t hash_chars(char *v, int n) { + size_t h = 0; + for (int i = 0; i < n; i++) + h = (h * 33) ^ v[i]; + return h; +} + +size_t hash(struct state *state, int hash_kind) { + if (state->hash[hash_kind]) return state->hash[hash_kind]; + + size_t h = 0; + if (hash_kind == 0) { + h = state->production_id; + h = (h * N_PRODUCTIONS) + ^ hash_chars((char*)&state->location, sizeof(state->location)); + } else { + h = PRODUCTION_ID_TO_SYMBOL[state->production_id]; + } + h = (h * 33) ^ hash_chars((char*)&state->start_idx, sizeof(state->start_idx)); + + state->hash[hash_kind] = h; + return h; +} + +size_t eq(struct state *A, struct state *B, int cmp_kind) { + if (cmp_kind == 0) + return A->production_id == B->production_id + && A ->location == B->location + && A->start_idx == B->start_idx; + return PRODUCTION_ID_TO_SYMBOL[A->production_id] == PRODUCTION_ID_TO_SYMBOL[B->production_id] + && A->start_idx == B->start_idx; +} + +struct hash_set { + struct state **data; + size_t cap, n_items; +}; + +struct hash_set *init_hash_set() { + struct hash_set *hash_set = malloc(sizeof *hash_set); + hash_set->data = calloc(2048, sizeof(hash_set->data[0])); + hash_set->cap = 2048; + hash_set->n_items = 0; + return hash_set; +} + +void clear_hash_set(struct hash_set *set) { + memset(set->data, 0, set->cap * sizeof(set->data[0])); + set->n_items = 0; +} + +void _insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { + size_t i = hash(state, cmp_kind) & (set->cap - 1); + size_t delta = 0, overlap = 0, true_overlap = 0; + while (set->data[i]) if ((++i) == set->cap) i = 0; + set->data[i] = state; +} + +void insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { + if ((set->n_items * 2) >= set->cap) { + size_t old_cap = set->cap; + struct state **old_data = set->data; + set->cap *= 2; + set->data = calloc(set->cap, sizeof(set->data[0])); + for (size_t i = 0; i < old_cap; i++) + if (old_data[i]) + _insert_hash_set(set, old_data[i], cmp_kind); + free(old_data); + } + _insert_hash_set(set, state, cmp_kind); + set->n_items++; +} + +struct state *in_hash_set(struct hash_set *set, struct state *state, int cmp_kind) { + size_t i = hash(state, cmp_kind) & (set->cap - 1); + while (set->data[i]) { + if (eq(state, set->data[i], cmp_kind)) return set->data[i]; + if ((++i) == set->cap) i = 0; + } + return 0; +} + +///////////////// HASH QUEUES +struct hash_queue { + struct state **entries; + size_t count; + size_t capacity; + struct hash_set *set; +}; + +struct hash_queue *init_hash_queue() { + struct hash_queue *q = calloc(1, sizeof(*q)); + q->set = init_hash_set(); + q->capacity = 1024; + q->entries = calloc(q->capacity, sizeof(q->entries[0])); + return q; +} + +void queue_swap(struct hash_queue *queue, int a, int b) { + struct state *tmp = queue->entries[a]; + queue->entries[a] = queue->entries[b]; + queue->entries[b] = tmp; +} + +struct state *pop_from_queue(struct hash_queue *queue) { + assert(queue->count); + struct state *head = queue->entries[0]; + queue->entries[0] = queue->entries[--queue->count]; + if (!queue->count) return head; + + // Bubble the root down + int i = 0; + size_t val = queue->entries[0]->n_poisoned; + while (1) { + size_t left_idx = 2*i + 1, + right_idx = 2*i + 2, + has_left = left_idx < queue->count, + has_right = right_idx < queue->count, + left_val = has_left ? queue->entries[left_idx]->n_poisoned : val, + right_val = has_right ? queue->entries[right_idx]->n_poisoned : val; + if (right_val < left_val && right_val < val) { + queue_swap(queue, i, right_idx); + i = right_idx; + } else if (left_val < val) { + queue_swap(queue, i, left_idx); + i = left_idx; + } else break; + } + + return head; +} + +void enqueue(struct hash_queue *queue, struct state *state) { + if (!in_hash_set(queue->set, state, 0)) + insert_hash_set(queue->set, state, 0); + + if (queue->count == queue->capacity) { + queue->capacity *= 2; + queue->entries = realloc(queue->entries, + queue->capacity * sizeof(queue->entries[0])); + } + + queue->entries[queue->count++] = state; + // Bubble it up + int i = queue->count - 1; + while (i && state->n_poisoned < queue->entries[(i - 1) / 2]->n_poisoned) { + queue_swap(queue, i, (i - 1) / 2); + i = (i - 1) / 2; + } +} + +////////// PARSER STATE +struct parser_state { + size_t k; + // [start_idx, symbol] -> state ptr + struct state **watcher_table; + + struct hash_queue *current_queue; + struct hash_queue *next_queue; + + struct token *tokens; +}; + +void flip_queues_and_memos(struct parser_state *parser_state) { + assert(!parser_state->current_queue->count); + + struct hash_queue *tmp = parser_state->current_queue; + parser_state->current_queue = parser_state->next_queue; + parser_state->next_queue = tmp; + clear_hash_set(parser_state->next_queue->set); +} + +struct state **lookup_watcher(symbol_id_t symbol, size_t start_idx, + struct parser_state *parser_state) { + return parser_state->watcher_table + + (start_idx * N_SYMBOLS) + + symbol; +} + +struct state *copy_state(struct state *state) { + struct state *copy = malloc(sizeof *copy); + *copy = *state; + copy->next_watcher = 0; + copy->pprev_watcher = 0; + copy->next_completion = 0; + return copy; +} + +symbol_id_t symbol(struct state *state) { + return PRODUCTION_ID_TO_PRODUCTION[state->production_id][state->location]; +} + +void link_watchers(struct state *state, int for_next, struct parser_state *parser_state) { + // link to what we want to be watching now + if (IS_NONTERM(symbol(state))) { + struct state **ptr + = lookup_watcher(symbol(state), parser_state->k + for_next, + parser_state); + if (*ptr) (*ptr)->pprev_watcher = &(state->next_watcher); + state->pprev_watcher = ptr; + state->next_watcher = *ptr; + *ptr = state; + } +} + +void unlink_watchers(struct state *state) { + if (state->pprev_watcher) { + *(state->pprev_watcher) = state->next_watcher; + if (state->next_watcher) + state->next_watcher->pprev_watcher = state->pprev_watcher; + state->pprev_watcher = 0; + state->next_watcher = 0; + } +} + +struct state *advance(struct state *state, void *reason, + int for_next, struct parser_state *parser_state) { + state->reasons[state->location++] = reason; + unlink_watchers(state); + link_watchers(state, for_next, parser_state); + return state; +} + +void queue_all_for_nonterm(symbol_id_t symbol, struct parser_state *parser_state) { + symbol_id_t scan = parser_state->tokens[parser_state->k].symbol; + struct state *s = calloc(1, sizeof(struct state)); + for (prod_id_t *p = SYMBOL_ID_TO_PRODUCTION_IDS[symbol]; *p; p++) { + // FILTER + symbol_id_t sym = PRODUCTION_ID_TO_FIRST[*p]; + if (sym != DONE_SYMBOL && !IS_NONTERM(sym) && sym != scan) + continue; + + if (!s) s = calloc(1, sizeof(struct state)); + + s->n_poisoned = SYMBOL_TO_POISON[symbol]; + s->production_id = *p; + s->start_idx = parser_state->k; + s->hash[0] = s->hash[1] = 0; + if (in_hash_set(parser_state->current_queue->set, s, 0)) + continue; + link_watchers(s, 0, parser_state); + enqueue(parser_state->current_queue, s); + assert(parser_state->current_queue->count); + s = 0; + } + if (s) free(s); +} + +void overwrite_state(struct state *dst, struct state *src) { + dst->production_id = src->production_id; + memcpy(dst->reasons, src->reasons, sizeof(src->reasons)); + dst->location = src->location; + dst->start_idx = src->start_idx; + assert(src->n_poisoned == dst->n_poisoned); + dst->hash[0] = dst->hash[1] = 0; +} + +// https://en.wikipedia.org/wiki/Earley_algorithm +void parse(struct token *tokens, size_t n_tokens) { + struct parser_state parser_state = { + .k = 0, + .watcher_table = calloc(n_tokens * N_SYMBOLS, sizeof(struct state *)), + .current_queue = init_hash_queue(), + .next_queue = init_hash_queue(), + .tokens = tokens, + }; + + struct hash_set *completion_set = init_hash_set(); + + queue_all_for_nonterm(START_SYMBOL, &parser_state); + + for (size_t k = 0; k < n_tokens; k++) { + clear_hash_set(completion_set); + + parser_state.k = k; + struct state *free_queue = 0; + + while (parser_state.current_queue->count) { + struct state *state = pop_from_queue(parser_state.current_queue); + symbol_id_t next_symbol = symbol(state); + + // SCANNING + if (next_symbol != DONE_SYMBOL && !IS_NONTERM(next_symbol)) { + if (next_symbol == tokens[k].symbol) { + // NOTE: we need to copy here because @state might already be + // in some hash sets... + struct state *advanced + = advance(copy_state(state), tokens + k, 1, &parser_state); + enqueue(parser_state.next_queue, advanced); + } + state->next_free = free_queue; + free_queue = state; + } + + // COMPLETION + else if (next_symbol == DONE_SYMBOL) { + alert_parse(state); + int is_very_end = + (k+1) == n_tokens && state->start_idx == 0 + && PRODUCTION_ID_TO_SYMBOL[state->production_id] == START_SYMBOL; + + struct state *last = in_hash_set(completion_set, state, 1); + if (last) { + struct state *first_last = last; + int do_an_add = 1; + for (; last; last = last->next_completion) { + switch (disambiguator(last, state)) { + case 0: /* keep @last */ do_an_add = 0; break; + case 1: /* kill @last */ + do_an_add = 0; + overwrite_state(last, state); + break; + case 2: /* keep both */ break; + } + } + if (!do_an_add) { + state->next_free = free_queue; + free_queue = state; + continue; + } + last = first_last; + state->next_completion = last->next_completion; + last->next_completion = state; + } else { + insert_hash_set(completion_set, state, 1); + last = state; + } + + symbol_id_t completing = PRODUCTION_ID_TO_SYMBOL[state->production_id]; + struct state *watcher + = *lookup_watcher(completing, state->start_idx, &parser_state); + + for (; watcher; watcher = watcher->next_watcher) { + struct state *new = advance(copy_state(watcher), state, 0, &parser_state); + new->n_poisoned += state->n_poisoned; + enqueue(parser_state.current_queue, new); + } + + if (is_very_end) { + struct state *copy = copy_state(state); + if (last) { + copy->next_completion = last->next_completion; + last->next_completion = copy; + } else insert_hash_set(completion_set, copy, 1); + enqueue(parser_state.next_queue, copy); + } + } + + // PREDICTION + else if (tokens[k].symbol != DONE_SYMBOL && IS_NONTERM(next_symbol)) { + queue_all_for_nonterm(next_symbol, &parser_state); + } + } + + for (struct state *s = free_queue; s;) { + struct state *s_old = s; + s = s->next_free; + free(s_old); + } + + flip_queues_and_memos(&parser_state); + } + while (parser_state.current_queue->count) { + struct state *state = pop_from_queue(parser_state.current_queue); + // print_parse_tree(state, 0); + if (isatty(1)) { + print_parse_tree(state, 0, stdout); + } else { + print_parse_tree_binary(state, tokens, n_tokens); + if (parser_state.current_queue->count) { + fprintf(stderr, "AMBIGUOUS PARSE\n"); + exit(1); + } + } + } +} + +void preprocess(char *string, size_t length); +int main(int argc, char **argv) { + assert(argc == 2); + + // Read the file + struct stat statbuf; + stat(argv[1], &statbuf); + char *buf = malloc(statbuf.st_size + 1); + buf[statbuf.st_size] = '\0'; + int f = open(argv[1], O_RDONLY); + read(f, buf, statbuf.st_size); + close(f); + + preprocess(buf, statbuf.st_size); + + // Lex the file + size_t n_tokens = 0; + struct token *tokens = lex(buf, &n_tokens); + // Parse the file + parse(tokens, n_tokens); + return 0; +} + +struct token *find_token(struct state *state, int idx) { + if (IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][idx])) + return find_token(state->reasons[idx], 0); + return state->reasons[idx]; +} -- cgit v1.2.3