core parser

author: Matthew Sotoudeh <matthew@masot.net> 2024-03-11 00:44:33 -0700
committer: Matthew Sotoudeh <matthew@masot.net> 2024-03-11 00:44:33 -0700
commit: 35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 (patch)
tree: 18de46f436f0f91f0b6da1b2e393399620ca5ce0
parent: 26a42b4a7ba077659f791208a2a7989bfdfb3663 (diff)
1 files changed, 572 insertions, 0 deletions
diff --git a/earlpy/parser.c b/earlpy/parser.c
new file mode 100644
index 0000000..fafc9b8
--- /dev/null
+++ b/earlpy/parser.c
@@ -0,0 +1,572 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <stdio.h>
+
+#define VERBOSE 0
+#if VERBOSE
+#define vprintf(...) printf(__VA_ARGS__)
+#else
+#define vprintf(...)
+#endif
+
+//////// This file will be copy/pasted below the generated code
+yyscan_t SCANNER;
+
+struct token { symbol_id_t symbol; size_t offset; size_t length; char *string; };
+
+char *STRING;
+struct token *TOKENS = 0;
+size_t N_TOKENS = 0, CAP_TOKENS = 0;
+void lex_symbol(symbol_id_t which) {
+    if (N_TOKENS == CAP_TOKENS) {
+        CAP_TOKENS = (CAP_TOKENS + 1) * 2;
+        TOKENS = realloc(TOKENS, CAP_TOKENS * sizeof(*TOKENS));
+    }
+    TOKENS[N_TOKENS].symbol = which;
+    TOKENS[N_TOKENS].offset = OFFSET;
+    TOKENS[N_TOKENS].length = 0;
+    if (STRING) {
+        int length = yyget_leng(SCANNER);
+        TOKENS[N_TOKENS].offset = OFFSET - length;
+        TOKENS[N_TOKENS].length = length;
+        TOKENS[N_TOKENS++].string = strndup(STRING + OFFSET - length, length);
+
+    } else TOKENS[N_TOKENS++].string = strdup("");
+}
+
+struct token *lex(char *string, size_t *n_tokens) {
+    STRING = string;
+    TOKENS = 0;
+    CAP_TOKENS = 0;
+    N_TOKENS = 0;
+
+    yylex_init(&SCANNER);
+    yy_scan_string(string, SCANNER);
+    yylex(SCANNER);
+    yylex_destroy(SCANNER);
+
+    STRING = 0;
+    lex_symbol(DONE_SYMBOL);
+    *n_tokens = N_TOKENS;
+    return TOKENS;
+}
+
+//////// PARSER
+struct state {
+    prod_id_t production_id;
+    void *reasons[MAX_PRODUCTION_LEN + 1];
+
+    size_t location;
+    size_t start_idx;
+    struct state *next_watcher, **pprev_watcher;
+
+    size_t n_poisoned;
+
+    size_t hash[2];
+
+    struct state *next_completion;
+    struct state *next_free;
+};
+int disambiguator(struct state *old, struct state *new);
+void alert_parse(struct state *state);
+
+void pprint_state(struct state *state) {
+    fprintf(stderr, "%s ->", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]);
+    size_t i = 0;
+    for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+            *s; s++) {
+        if (i++ == state->location) fprintf(stderr, " .");
+        fprintf(stderr, " %s", SYMBOL_ID_TO_NAME[*s]);
+    }
+    if (i == state->location) fprintf(stderr, " .");
+    fprintf(stderr, "       %lu\n", state->start_idx);
+}
+
+void print_parse_tree(struct state *state, int depth, FILE *f) {
+    for (int i = 0; i < depth; i++) fprintf(f, " ");
+    assert(state);
+    fprintf(f, "%s\n", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]);
+    size_t i = 0;
+    for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+            *s; s++, i++) {
+        if (IS_NONTERM(*s)) {
+            if (!(state->reasons[i])) {
+                printf("BAD! Should be a %s\n", SYMBOL_ID_TO_NAME[*s]);
+            }
+            print_parse_tree(state->reasons[i], depth + 2, f);
+        } else {
+            for (int i = 0; i < depth + 2; i++) fprintf(f, " ");
+            struct token *t = state->reasons[i];
+            // printf("HELLO: %lu, %s\n", *s, SYMBOL_ID_TO_NAME[*s]);
+            fprintf(f, "%s ", SYMBOL_ID_TO_NAME[t->symbol]);
+            fprintf(f, "%s\n", t->string);
+        }
+    }
+}
+
+void print_parse_tree2_(struct state *state) {
+    if (!state) printf("?\n");
+    else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]
+          && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1]
+          && IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]))
+        print_parse_tree2_(state->reasons[0]);
+    else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]
+          && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1]
+          && !IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]))
+        printf("%s", ((struct token *)state->reasons[0])->string);
+    else {
+        printf("(");
+        int i = 0;
+        for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+                *s; s++, i++) {
+            printf(" ");
+            if (IS_NONTERM(*s))
+                print_parse_tree2_(state->reasons[i]);
+            else {
+                struct token *t = state->reasons[i];
+                printf("%s", t->string);
+            }
+        }
+        printf(" )");
+    }
+}
+void print_parse_tree2(struct state *state) {
+    print_parse_tree2_(state); printf("\n");
+}
+void print_parse_tree_binary(struct state *state, struct token *tokens, size_t n_tokens) {
+    n_tokens--; // done token at the end
+    write(1, &n_tokens, sizeof(n_tokens));
+    for (size_t i = 0; i < n_tokens; i++) {
+        write(1, &(tokens[i].symbol), sizeof(tokens[i].symbol));
+        write(1, &(tokens[i].offset), sizeof(tokens[i].offset));
+        write(1, &(tokens[i].length), sizeof(tokens[i].length));
+    }
+    size_t token_idx = 0, zero = 0;
+    size_t n_stack = 0, cap_stack = 1024;
+    struct state **stack = malloc(cap_stack * sizeof(*stack));
+    stack[n_stack++] = state;
+    while (n_stack) {
+        struct state *next = stack[--n_stack];
+        if ((MAX_PRODUCTION_LEN + n_stack) >= cap_stack) {
+            cap_stack = (cap_stack * 2) + MAX_PRODUCTION_LEN;
+            stack = realloc(stack, cap_stack * sizeof(*stack));
+        }
+        // production_id
+        write(1, &next->production_id, sizeof(next->production_id));
+        int i = 0;
+        symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[next->production_id];
+        while (*s) s++, i++;
+        while (s--, i --> 0)
+            if (IS_NONTERM(*s))
+                stack[n_stack++] = next->reasons[i];
+    }
+    free(stack);
+}
+
+//////////// HASH SETS
+size_t hash_chars(char *v, int n) {
+    size_t h = 0;
+    for (int i = 0; i < n; i++)
+        h = (h * 33) ^ v[i];
+    return h;
+}
+
+size_t hash(struct state *state, int hash_kind) {
+    if (state->hash[hash_kind]) return state->hash[hash_kind];
+
+    size_t h = 0;
+    if (hash_kind == 0) {
+        h = state->production_id;
+        h = (h * N_PRODUCTIONS)
+            ^ hash_chars((char*)&state->location, sizeof(state->location));
+    } else {
+        h = PRODUCTION_ID_TO_SYMBOL[state->production_id];
+    }
+    h = (h * 33) ^ hash_chars((char*)&state->start_idx, sizeof(state->start_idx));
+
+    state->hash[hash_kind] = h;
+    return h;
+}
+
+size_t eq(struct state *A, struct state *B, int cmp_kind) {
+    if (cmp_kind == 0)
+        return A->production_id == B->production_id
+            && A ->location == B->location
+            && A->start_idx == B->start_idx;
+    return PRODUCTION_ID_TO_SYMBOL[A->production_id] == PRODUCTION_ID_TO_SYMBOL[B->production_id]
+        && A->start_idx == B->start_idx;
+}
+
+struct hash_set {
+    struct state **data;
+    size_t cap, n_items;
+};
+
+struct hash_set *init_hash_set() {
+    struct hash_set *hash_set = malloc(sizeof *hash_set);
+    hash_set->data = calloc(2048, sizeof(hash_set->data[0]));
+    hash_set->cap = 2048;
+    hash_set->n_items = 0;
+    return hash_set;
+}
+
+void clear_hash_set(struct hash_set *set) {
+    memset(set->data, 0, set->cap * sizeof(set->data[0]));
+    set->n_items = 0;
+}
+
+void _insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+    size_t i = hash(state, cmp_kind) & (set->cap - 1);
+    size_t delta = 0, overlap = 0, true_overlap = 0;
+    while (set->data[i]) if ((++i) == set->cap) i = 0;
+    set->data[i] = state;
+}
+
+void insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+    if ((set->n_items * 2) >= set->cap) {
+        size_t old_cap = set->cap;
+        struct state **old_data = set->data;
+        set->cap *= 2;
+        set->data = calloc(set->cap, sizeof(set->data[0]));
+        for (size_t i = 0; i < old_cap; i++)
+            if (old_data[i])
+                _insert_hash_set(set, old_data[i], cmp_kind);
+        free(old_data);
+    }
+    _insert_hash_set(set, state, cmp_kind);
+    set->n_items++;
+}
+
+struct state *in_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+    size_t i = hash(state, cmp_kind) & (set->cap - 1);
+    while (set->data[i]) {
+        if (eq(state, set->data[i], cmp_kind)) return set->data[i];
+        if ((++i) == set->cap) i = 0;
+    }
+    return 0;
+}
+
+///////////////// HASH QUEUES
+struct hash_queue {
+    struct state **entries;
+    size_t count;
+    size_t capacity;
+    struct hash_set *set;
+};
+
+struct hash_queue *init_hash_queue() {
+    struct hash_queue *q = calloc(1, sizeof(*q));
+    q->set = init_hash_set();
+    q->capacity = 1024;
+    q->entries = calloc(q->capacity, sizeof(q->entries[0]));
+    return q;
+}
+
+void queue_swap(struct hash_queue *queue, int a, int b) {
+    struct state *tmp = queue->entries[a];
+    queue->entries[a] = queue->entries[b];
+    queue->entries[b] = tmp;
+}
+
+struct state *pop_from_queue(struct hash_queue *queue) {
+    assert(queue->count);
+    struct state *head = queue->entries[0];
+    queue->entries[0] = queue->entries[--queue->count];
+    if (!queue->count) return head;
+
+    // Bubble the root down
+    int i = 0;
+    size_t val = queue->entries[0]->n_poisoned;
+    while (1) {
+        size_t left_idx = 2*i + 1,
+               right_idx = 2*i + 2,
+               has_left = left_idx < queue->count,
+               has_right = right_idx < queue->count,
+               left_val = has_left ? queue->entries[left_idx]->n_poisoned : val,
+               right_val = has_right ? queue->entries[right_idx]->n_poisoned : val;
+        if (right_val < left_val && right_val < val) {
+            queue_swap(queue, i, right_idx);
+            i = right_idx;
+        } else if (left_val < val) {
+            queue_swap(queue, i, left_idx);
+            i = left_idx;
+        } else break;
+    }
+
+    return head;
+}
+
+void enqueue(struct hash_queue *queue, struct state *state) {
+    if (!in_hash_set(queue->set, state, 0))
+        insert_hash_set(queue->set, state, 0);
+
+    if (queue->count == queue->capacity) {
+        queue->capacity *= 2;
+        queue->entries = realloc(queue->entries,
+            queue->capacity * sizeof(queue->entries[0]));
+    }
+
+    queue->entries[queue->count++] = state;
+    // Bubble it up
+    int i = queue->count - 1;
+    while (i && state->n_poisoned < queue->entries[(i - 1) / 2]->n_poisoned) {
+        queue_swap(queue, i, (i - 1) / 2);
+        i = (i - 1) / 2;
+    }
+}
+
+////////// PARSER STATE
+struct parser_state {
+    size_t k;
+    // [start_idx, symbol] -> state ptr
+    struct state **watcher_table;
+
+    struct hash_queue *current_queue;
+    struct hash_queue *next_queue;
+
+    struct token *tokens;
+};
+
+void flip_queues_and_memos(struct parser_state *parser_state) {
+    assert(!parser_state->current_queue->count);
+
+    struct hash_queue *tmp = parser_state->current_queue;
+    parser_state->current_queue = parser_state->next_queue;
+    parser_state->next_queue = tmp;
+    clear_hash_set(parser_state->next_queue->set);
+}
+
+struct state **lookup_watcher(symbol_id_t symbol, size_t start_idx,
+                              struct parser_state *parser_state) {
+    return parser_state->watcher_table
+        + (start_idx * N_SYMBOLS)
+        + symbol;
+}
+
+struct state *copy_state(struct state *state) {
+    struct state *copy = malloc(sizeof *copy);
+    *copy = *state;
+    copy->next_watcher = 0;
+    copy->pprev_watcher = 0;
+    copy->next_completion = 0;
+    return copy;
+}
+
+symbol_id_t symbol(struct state *state) {
+    return PRODUCTION_ID_TO_PRODUCTION[state->production_id][state->location];
+}
+
+void link_watchers(struct state *state, int for_next, struct parser_state *parser_state) {
+    // link to what we want to be watching now
+    if (IS_NONTERM(symbol(state))) {
+        struct state **ptr
+            = lookup_watcher(symbol(state), parser_state->k + for_next,
+                             parser_state);
+        if (*ptr) (*ptr)->pprev_watcher = &(state->next_watcher);
+        state->pprev_watcher = ptr;
+        state->next_watcher = *ptr;
+        *ptr = state;
+    }
+}
+
+void unlink_watchers(struct state *state) {
+    if (state->pprev_watcher) {
+        *(state->pprev_watcher) = state->next_watcher;
+        if (state->next_watcher)
+            state->next_watcher->pprev_watcher = state->pprev_watcher;
+        state->pprev_watcher = 0;
+        state->next_watcher = 0;
+    }
+}
+
+struct state *advance(struct state *state, void *reason,
+                      int for_next, struct parser_state *parser_state) {
+    state->reasons[state->location++] = reason;
+    unlink_watchers(state);
+    link_watchers(state, for_next, parser_state);
+    return state;
+}
+
+void queue_all_for_nonterm(symbol_id_t symbol, struct parser_state *parser_state) {
+    symbol_id_t scan = parser_state->tokens[parser_state->k].symbol;
+    struct state *s = calloc(1, sizeof(struct state));
+    for (prod_id_t *p = SYMBOL_ID_TO_PRODUCTION_IDS[symbol]; *p; p++) {
+        // FILTER
+        symbol_id_t sym = PRODUCTION_ID_TO_FIRST[*p];
+        if (sym != DONE_SYMBOL && !IS_NONTERM(sym) && sym != scan)
+            continue;
+
+        if (!s) s = calloc(1, sizeof(struct state));
+
+        s->n_poisoned = SYMBOL_TO_POISON[symbol];
+        s->production_id = *p;
+        s->start_idx = parser_state->k;
+        s->hash[0] = s->hash[1] = 0;
+        if (in_hash_set(parser_state->current_queue->set, s, 0))
+            continue;
+        link_watchers(s, 0, parser_state);
+        enqueue(parser_state->current_queue, s);
+        assert(parser_state->current_queue->count);
+        s = 0;
+    }
+    if (s) free(s);
+}
+
+void overwrite_state(struct state *dst, struct state *src) {
+    dst->production_id = src->production_id;
+    memcpy(dst->reasons, src->reasons, sizeof(src->reasons));
+    dst->location = src->location;
+    dst->start_idx = src->start_idx;
+    assert(src->n_poisoned == dst->n_poisoned);
+    dst->hash[0] = dst->hash[1] = 0;
+}
+
+// https://en.wikipedia.org/wiki/Earley_algorithm
+void parse(struct token *tokens, size_t n_tokens) {
+    struct parser_state parser_state = {
+        .k              = 0,
+        .watcher_table  = calloc(n_tokens * N_SYMBOLS, sizeof(struct state *)),
+        .current_queue  = init_hash_queue(),
+        .next_queue     = init_hash_queue(),
+        .tokens         = tokens,
+    };
+
+    struct hash_set *completion_set = init_hash_set();
+
+    queue_all_for_nonterm(START_SYMBOL, &parser_state);
+
+    for (size_t k = 0; k < n_tokens; k++) {
+        clear_hash_set(completion_set);
+
+        parser_state.k = k;
+        struct state *free_queue = 0;
+
+        while (parser_state.current_queue->count) {
+            struct state *state = pop_from_queue(parser_state.current_queue);
+            symbol_id_t next_symbol = symbol(state);
+
+            // SCANNING
+            if (next_symbol != DONE_SYMBOL && !IS_NONTERM(next_symbol)) {
+                if (next_symbol == tokens[k].symbol) {
+                    // NOTE: we need to copy here because @state might already be
+                    // in some hash sets...
+                    struct state *advanced
+                        = advance(copy_state(state), tokens + k, 1, &parser_state);
+                    enqueue(parser_state.next_queue, advanced);
+                }
+                state->next_free = free_queue;
+                free_queue = state;
+            }
+
+            // COMPLETION
+            else if (next_symbol == DONE_SYMBOL) {
+                alert_parse(state);
+                int is_very_end =
+                    (k+1) == n_tokens && state->start_idx == 0
+                    && PRODUCTION_ID_TO_SYMBOL[state->production_id] == START_SYMBOL;
+
+                struct state *last = in_hash_set(completion_set, state, 1);
+                if (last) {
+                    struct state *first_last = last;
+                    int do_an_add = 1;
+                    for (; last; last = last->next_completion) {
+                        switch (disambiguator(last, state)) {
+                            case 0: /* keep @last */ do_an_add = 0; break;
+                            case 1: /* kill @last */
+                                do_an_add = 0;
+                                overwrite_state(last, state);
+                                break;
+                            case 2: /* keep both */ break;
+                        }
+                    }
+                    if (!do_an_add) {
+                        state->next_free = free_queue;
+                        free_queue = state;
+                        continue;
+                    }
+                    last = first_last;
+                    state->next_completion = last->next_completion;
+                    last->next_completion = state;
+                } else {
+                    insert_hash_set(completion_set, state, 1);
+                    last = state;
+                }
+
+                symbol_id_t completing = PRODUCTION_ID_TO_SYMBOL[state->production_id];
+                struct state *watcher
+                    = *lookup_watcher(completing, state->start_idx, &parser_state);
+
+                for (; watcher; watcher = watcher->next_watcher) {
+                    struct state *new = advance(copy_state(watcher), state, 0, &parser_state);
+                    new->n_poisoned += state->n_poisoned;
+                    enqueue(parser_state.current_queue, new);
+                }
+
+                if (is_very_end) {
+                    struct state *copy = copy_state(state);
+                    if (last) {
+                        copy->next_completion = last->next_completion;
+                        last->next_completion = copy;
+                    } else insert_hash_set(completion_set, copy, 1);
+                    enqueue(parser_state.next_queue, copy);
+                }
+            }
+
+            // PREDICTION
+            else if (tokens[k].symbol != DONE_SYMBOL && IS_NONTERM(next_symbol)) {
+                queue_all_for_nonterm(next_symbol, &parser_state);
+            }
+        }
+
+        for (struct state *s = free_queue; s;) {
+            struct state *s_old = s;
+            s = s->next_free;
+            free(s_old);
+        }
+
+        flip_queues_and_memos(&parser_state);
+    }
+    while (parser_state.current_queue->count) {
+        struct state *state = pop_from_queue(parser_state.current_queue);
+        // print_parse_tree(state, 0);
+        if (isatty(1)) {
+            print_parse_tree(state, 0, stdout);
+        } else {
+            print_parse_tree_binary(state, tokens, n_tokens);
+            if (parser_state.current_queue->count) {
+                fprintf(stderr, "AMBIGUOUS PARSE\n");
+                exit(1);
+            }
+        }
+    }
+}
+
+void preprocess(char *string, size_t length);
+int main(int argc, char **argv) {
+    assert(argc == 2);
+
+    // Read the file
+    struct stat statbuf;
+    stat(argv[1], &statbuf);
+    char *buf = malloc(statbuf.st_size + 1);
+    buf[statbuf.st_size] = '\0';
+    int f = open(argv[1], O_RDONLY);
+    read(f, buf, statbuf.st_size);
+    close(f);
+
+    preprocess(buf, statbuf.st_size);
+
+    // Lex the file
+    size_t n_tokens = 0;
+    struct token *tokens = lex(buf, &n_tokens);
+    // Parse the file
+    parse(tokens, n_tokens);
+    return 0;
+}
+
+struct token *find_token(struct state *state, int idx) {
+    if (IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][idx]))
+        return find_token(state->reasons[idx], 0);
+    return state->reasons[idx];
+}
author	Matthew Sotoudeh <matthew@masot.net>	2024-03-11 00:44:33 -0700
committer	Matthew Sotoudeh <matthew@masot.net>	2024-03-11 00:44:33 -0700
commit	35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 (patch)
tree	18de46f436f0f91f0b6da1b2e393399620ca5ce0
parent	26a42b4a7ba077659f791208a2a7989bfdfb3663 (diff)