summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <matthew@masot.net>2024-03-11 00:44:33 -0700
committerMatthew Sotoudeh <matthew@masot.net>2024-03-11 00:44:33 -0700
commit35fa21e59ad44de3ac5d075a3c1ae60d462a1a13 (patch)
tree18de46f436f0f91f0b6da1b2e393399620ca5ce0
parent26a42b4a7ba077659f791208a2a7989bfdfb3663 (diff)
core parser
-rw-r--r--earlpy/parser.c572
1 files changed, 572 insertions, 0 deletions
diff --git a/earlpy/parser.c b/earlpy/parser.c
new file mode 100644
index 0000000..fafc9b8
--- /dev/null
+++ b/earlpy/parser.c
@@ -0,0 +1,572 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <stdio.h>
+
+#define VERBOSE 0
+#if VERBOSE
+#define vprintf(...) printf(__VA_ARGS__)
+#else
+#define vprintf(...)
+#endif
+
+//////// This file will be copy/pasted below the generated code
+yyscan_t SCANNER;
+
+struct token { symbol_id_t symbol; size_t offset; size_t length; char *string; };
+
+char *STRING;
+struct token *TOKENS = 0;
+size_t N_TOKENS = 0, CAP_TOKENS = 0;
+void lex_symbol(symbol_id_t which) {
+ if (N_TOKENS == CAP_TOKENS) {
+ CAP_TOKENS = (CAP_TOKENS + 1) * 2;
+ TOKENS = realloc(TOKENS, CAP_TOKENS * sizeof(*TOKENS));
+ }
+ TOKENS[N_TOKENS].symbol = which;
+ TOKENS[N_TOKENS].offset = OFFSET;
+ TOKENS[N_TOKENS].length = 0;
+ if (STRING) {
+ int length = yyget_leng(SCANNER);
+ TOKENS[N_TOKENS].offset = OFFSET - length;
+ TOKENS[N_TOKENS].length = length;
+ TOKENS[N_TOKENS++].string = strndup(STRING + OFFSET - length, length);
+
+ } else TOKENS[N_TOKENS++].string = strdup("");
+}
+
+struct token *lex(char *string, size_t *n_tokens) {
+ STRING = string;
+ TOKENS = 0;
+ CAP_TOKENS = 0;
+ N_TOKENS = 0;
+
+ yylex_init(&SCANNER);
+ yy_scan_string(string, SCANNER);
+ yylex(SCANNER);
+ yylex_destroy(SCANNER);
+
+ STRING = 0;
+ lex_symbol(DONE_SYMBOL);
+ *n_tokens = N_TOKENS;
+ return TOKENS;
+}
+
+//////// PARSER
+struct state {
+ prod_id_t production_id;
+ void *reasons[MAX_PRODUCTION_LEN + 1];
+
+ size_t location;
+ size_t start_idx;
+ struct state *next_watcher, **pprev_watcher;
+
+ size_t n_poisoned;
+
+ size_t hash[2];
+
+ struct state *next_completion;
+ struct state *next_free;
+};
+int disambiguator(struct state *old, struct state *new);
+void alert_parse(struct state *state);
+
+void pprint_state(struct state *state) {
+ fprintf(stderr, "%s ->", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]);
+ size_t i = 0;
+ for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+ *s; s++) {
+ if (i++ == state->location) fprintf(stderr, " .");
+ fprintf(stderr, " %s", SYMBOL_ID_TO_NAME[*s]);
+ }
+ if (i == state->location) fprintf(stderr, " .");
+ fprintf(stderr, " %lu\n", state->start_idx);
+}
+
+void print_parse_tree(struct state *state, int depth, FILE *f) {
+ for (int i = 0; i < depth; i++) fprintf(f, " ");
+ assert(state);
+ fprintf(f, "%s\n", SYMBOL_ID_TO_NAME[PRODUCTION_ID_TO_SYMBOL[state->production_id]]);
+ size_t i = 0;
+ for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+ *s; s++, i++) {
+ if (IS_NONTERM(*s)) {
+ if (!(state->reasons[i])) {
+ printf("BAD! Should be a %s\n", SYMBOL_ID_TO_NAME[*s]);
+ }
+ print_parse_tree(state->reasons[i], depth + 2, f);
+ } else {
+ for (int i = 0; i < depth + 2; i++) fprintf(f, " ");
+ struct token *t = state->reasons[i];
+ // printf("HELLO: %lu, %s\n", *s, SYMBOL_ID_TO_NAME[*s]);
+ fprintf(f, "%s ", SYMBOL_ID_TO_NAME[t->symbol]);
+ fprintf(f, "%s\n", t->string);
+ }
+ }
+}
+
+void print_parse_tree2_(struct state *state) {
+ if (!state) printf("?\n");
+ else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]
+ && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1]
+ && IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]))
+ print_parse_tree2_(state->reasons[0]);
+ else if (PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]
+ && !PRODUCTION_ID_TO_PRODUCTION[state->production_id][1]
+ && !IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][0]))
+ printf("%s", ((struct token *)state->reasons[0])->string);
+ else {
+ printf("(");
+ int i = 0;
+ for (symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[state->production_id];
+ *s; s++, i++) {
+ printf(" ");
+ if (IS_NONTERM(*s))
+ print_parse_tree2_(state->reasons[i]);
+ else {
+ struct token *t = state->reasons[i];
+ printf("%s", t->string);
+ }
+ }
+ printf(" )");
+ }
+}
+void print_parse_tree2(struct state *state) {
+ print_parse_tree2_(state); printf("\n");
+}
+void print_parse_tree_binary(struct state *state, struct token *tokens, size_t n_tokens) {
+ n_tokens--; // done token at the end
+ write(1, &n_tokens, sizeof(n_tokens));
+ for (size_t i = 0; i < n_tokens; i++) {
+ write(1, &(tokens[i].symbol), sizeof(tokens[i].symbol));
+ write(1, &(tokens[i].offset), sizeof(tokens[i].offset));
+ write(1, &(tokens[i].length), sizeof(tokens[i].length));
+ }
+ size_t token_idx = 0, zero = 0;
+ size_t n_stack = 0, cap_stack = 1024;
+ struct state **stack = malloc(cap_stack * sizeof(*stack));
+ stack[n_stack++] = state;
+ while (n_stack) {
+ struct state *next = stack[--n_stack];
+ if ((MAX_PRODUCTION_LEN + n_stack) >= cap_stack) {
+ cap_stack = (cap_stack * 2) + MAX_PRODUCTION_LEN;
+ stack = realloc(stack, cap_stack * sizeof(*stack));
+ }
+ // production_id
+ write(1, &next->production_id, sizeof(next->production_id));
+ int i = 0;
+ symbol_id_t *s = PRODUCTION_ID_TO_PRODUCTION[next->production_id];
+ while (*s) s++, i++;
+ while (s--, i --> 0)
+ if (IS_NONTERM(*s))
+ stack[n_stack++] = next->reasons[i];
+ }
+ free(stack);
+}
+
+//////////// HASH SETS
+size_t hash_chars(char *v, int n) {
+ size_t h = 0;
+ for (int i = 0; i < n; i++)
+ h = (h * 33) ^ v[i];
+ return h;
+}
+
+size_t hash(struct state *state, int hash_kind) {
+ if (state->hash[hash_kind]) return state->hash[hash_kind];
+
+ size_t h = 0;
+ if (hash_kind == 0) {
+ h = state->production_id;
+ h = (h * N_PRODUCTIONS)
+ ^ hash_chars((char*)&state->location, sizeof(state->location));
+ } else {
+ h = PRODUCTION_ID_TO_SYMBOL[state->production_id];
+ }
+ h = (h * 33) ^ hash_chars((char*)&state->start_idx, sizeof(state->start_idx));
+
+ state->hash[hash_kind] = h;
+ return h;
+}
+
+size_t eq(struct state *A, struct state *B, int cmp_kind) {
+ if (cmp_kind == 0)
+ return A->production_id == B->production_id
+ && A ->location == B->location
+ && A->start_idx == B->start_idx;
+ return PRODUCTION_ID_TO_SYMBOL[A->production_id] == PRODUCTION_ID_TO_SYMBOL[B->production_id]
+ && A->start_idx == B->start_idx;
+}
+
+struct hash_set {
+ struct state **data;
+ size_t cap, n_items;
+};
+
+struct hash_set *init_hash_set() {
+ struct hash_set *hash_set = malloc(sizeof *hash_set);
+ hash_set->data = calloc(2048, sizeof(hash_set->data[0]));
+ hash_set->cap = 2048;
+ hash_set->n_items = 0;
+ return hash_set;
+}
+
+void clear_hash_set(struct hash_set *set) {
+ memset(set->data, 0, set->cap * sizeof(set->data[0]));
+ set->n_items = 0;
+}
+
+void _insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+ size_t i = hash(state, cmp_kind) & (set->cap - 1);
+ size_t delta = 0, overlap = 0, true_overlap = 0;
+ while (set->data[i]) if ((++i) == set->cap) i = 0;
+ set->data[i] = state;
+}
+
+void insert_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+ if ((set->n_items * 2) >= set->cap) {
+ size_t old_cap = set->cap;
+ struct state **old_data = set->data;
+ set->cap *= 2;
+ set->data = calloc(set->cap, sizeof(set->data[0]));
+ for (size_t i = 0; i < old_cap; i++)
+ if (old_data[i])
+ _insert_hash_set(set, old_data[i], cmp_kind);
+ free(old_data);
+ }
+ _insert_hash_set(set, state, cmp_kind);
+ set->n_items++;
+}
+
+struct state *in_hash_set(struct hash_set *set, struct state *state, int cmp_kind) {
+ size_t i = hash(state, cmp_kind) & (set->cap - 1);
+ while (set->data[i]) {
+ if (eq(state, set->data[i], cmp_kind)) return set->data[i];
+ if ((++i) == set->cap) i = 0;
+ }
+ return 0;
+}
+
+///////////////// HASH QUEUES
+struct hash_queue {
+ struct state **entries;
+ size_t count;
+ size_t capacity;
+ struct hash_set *set;
+};
+
+struct hash_queue *init_hash_queue() {
+ struct hash_queue *q = calloc(1, sizeof(*q));
+ q->set = init_hash_set();
+ q->capacity = 1024;
+ q->entries = calloc(q->capacity, sizeof(q->entries[0]));
+ return q;
+}
+
+void queue_swap(struct hash_queue *queue, int a, int b) {
+ struct state *tmp = queue->entries[a];
+ queue->entries[a] = queue->entries[b];
+ queue->entries[b] = tmp;
+}
+
+struct state *pop_from_queue(struct hash_queue *queue) {
+ assert(queue->count);
+ struct state *head = queue->entries[0];
+ queue->entries[0] = queue->entries[--queue->count];
+ if (!queue->count) return head;
+
+ // Bubble the root down
+ int i = 0;
+ size_t val = queue->entries[0]->n_poisoned;
+ while (1) {
+ size_t left_idx = 2*i + 1,
+ right_idx = 2*i + 2,
+ has_left = left_idx < queue->count,
+ has_right = right_idx < queue->count,
+ left_val = has_left ? queue->entries[left_idx]->n_poisoned : val,
+ right_val = has_right ? queue->entries[right_idx]->n_poisoned : val;
+ if (right_val < left_val && right_val < val) {
+ queue_swap(queue, i, right_idx);
+ i = right_idx;
+ } else if (left_val < val) {
+ queue_swap(queue, i, left_idx);
+ i = left_idx;
+ } else break;
+ }
+
+ return head;
+}
+
+void enqueue(struct hash_queue *queue, struct state *state) {
+ if (!in_hash_set(queue->set, state, 0))
+ insert_hash_set(queue->set, state, 0);
+
+ if (queue->count == queue->capacity) {
+ queue->capacity *= 2;
+ queue->entries = realloc(queue->entries,
+ queue->capacity * sizeof(queue->entries[0]));
+ }
+
+ queue->entries[queue->count++] = state;
+ // Bubble it up
+ int i = queue->count - 1;
+ while (i && state->n_poisoned < queue->entries[(i - 1) / 2]->n_poisoned) {
+ queue_swap(queue, i, (i - 1) / 2);
+ i = (i - 1) / 2;
+ }
+}
+
+////////// PARSER STATE
+struct parser_state {
+ size_t k;
+ // [start_idx, symbol] -> state ptr
+ struct state **watcher_table;
+
+ struct hash_queue *current_queue;
+ struct hash_queue *next_queue;
+
+ struct token *tokens;
+};
+
+void flip_queues_and_memos(struct parser_state *parser_state) {
+ assert(!parser_state->current_queue->count);
+
+ struct hash_queue *tmp = parser_state->current_queue;
+ parser_state->current_queue = parser_state->next_queue;
+ parser_state->next_queue = tmp;
+ clear_hash_set(parser_state->next_queue->set);
+}
+
+struct state **lookup_watcher(symbol_id_t symbol, size_t start_idx,
+ struct parser_state *parser_state) {
+ return parser_state->watcher_table
+ + (start_idx * N_SYMBOLS)
+ + symbol;
+}
+
+struct state *copy_state(struct state *state) {
+ struct state *copy = malloc(sizeof *copy);
+ *copy = *state;
+ copy->next_watcher = 0;
+ copy->pprev_watcher = 0;
+ copy->next_completion = 0;
+ return copy;
+}
+
+symbol_id_t symbol(struct state *state) {
+ return PRODUCTION_ID_TO_PRODUCTION[state->production_id][state->location];
+}
+
+void link_watchers(struct state *state, int for_next, struct parser_state *parser_state) {
+ // link to what we want to be watching now
+ if (IS_NONTERM(symbol(state))) {
+ struct state **ptr
+ = lookup_watcher(symbol(state), parser_state->k + for_next,
+ parser_state);
+ if (*ptr) (*ptr)->pprev_watcher = &(state->next_watcher);
+ state->pprev_watcher = ptr;
+ state->next_watcher = *ptr;
+ *ptr = state;
+ }
+}
+
+void unlink_watchers(struct state *state) {
+ if (state->pprev_watcher) {
+ *(state->pprev_watcher) = state->next_watcher;
+ if (state->next_watcher)
+ state->next_watcher->pprev_watcher = state->pprev_watcher;
+ state->pprev_watcher = 0;
+ state->next_watcher = 0;
+ }
+}
+
+struct state *advance(struct state *state, void *reason,
+ int for_next, struct parser_state *parser_state) {
+ state->reasons[state->location++] = reason;
+ unlink_watchers(state);
+ link_watchers(state, for_next, parser_state);
+ return state;
+}
+
+void queue_all_for_nonterm(symbol_id_t symbol, struct parser_state *parser_state) {
+ symbol_id_t scan = parser_state->tokens[parser_state->k].symbol;
+ struct state *s = calloc(1, sizeof(struct state));
+ for (prod_id_t *p = SYMBOL_ID_TO_PRODUCTION_IDS[symbol]; *p; p++) {
+ // FILTER
+ symbol_id_t sym = PRODUCTION_ID_TO_FIRST[*p];
+ if (sym != DONE_SYMBOL && !IS_NONTERM(sym) && sym != scan)
+ continue;
+
+ if (!s) s = calloc(1, sizeof(struct state));
+
+ s->n_poisoned = SYMBOL_TO_POISON[symbol];
+ s->production_id = *p;
+ s->start_idx = parser_state->k;
+ s->hash[0] = s->hash[1] = 0;
+ if (in_hash_set(parser_state->current_queue->set, s, 0))
+ continue;
+ link_watchers(s, 0, parser_state);
+ enqueue(parser_state->current_queue, s);
+ assert(parser_state->current_queue->count);
+ s = 0;
+ }
+ if (s) free(s);
+}
+
+void overwrite_state(struct state *dst, struct state *src) {
+ dst->production_id = src->production_id;
+ memcpy(dst->reasons, src->reasons, sizeof(src->reasons));
+ dst->location = src->location;
+ dst->start_idx = src->start_idx;
+ assert(src->n_poisoned == dst->n_poisoned);
+ dst->hash[0] = dst->hash[1] = 0;
+}
+
+// https://en.wikipedia.org/wiki/Earley_algorithm
+void parse(struct token *tokens, size_t n_tokens) {
+ struct parser_state parser_state = {
+ .k = 0,
+ .watcher_table = calloc(n_tokens * N_SYMBOLS, sizeof(struct state *)),
+ .current_queue = init_hash_queue(),
+ .next_queue = init_hash_queue(),
+ .tokens = tokens,
+ };
+
+ struct hash_set *completion_set = init_hash_set();
+
+ queue_all_for_nonterm(START_SYMBOL, &parser_state);
+
+ for (size_t k = 0; k < n_tokens; k++) {
+ clear_hash_set(completion_set);
+
+ parser_state.k = k;
+ struct state *free_queue = 0;
+
+ while (parser_state.current_queue->count) {
+ struct state *state = pop_from_queue(parser_state.current_queue);
+ symbol_id_t next_symbol = symbol(state);
+
+ // SCANNING
+ if (next_symbol != DONE_SYMBOL && !IS_NONTERM(next_symbol)) {
+ if (next_symbol == tokens[k].symbol) {
+ // NOTE: we need to copy here because @state might already be
+ // in some hash sets...
+ struct state *advanced
+ = advance(copy_state(state), tokens + k, 1, &parser_state);
+ enqueue(parser_state.next_queue, advanced);
+ }
+ state->next_free = free_queue;
+ free_queue = state;
+ }
+
+ // COMPLETION
+ else if (next_symbol == DONE_SYMBOL) {
+ alert_parse(state);
+ int is_very_end =
+ (k+1) == n_tokens && state->start_idx == 0
+ && PRODUCTION_ID_TO_SYMBOL[state->production_id] == START_SYMBOL;
+
+ struct state *last = in_hash_set(completion_set, state, 1);
+ if (last) {
+ struct state *first_last = last;
+ int do_an_add = 1;
+ for (; last; last = last->next_completion) {
+ switch (disambiguator(last, state)) {
+ case 0: /* keep @last */ do_an_add = 0; break;
+ case 1: /* kill @last */
+ do_an_add = 0;
+ overwrite_state(last, state);
+ break;
+ case 2: /* keep both */ break;
+ }
+ }
+ if (!do_an_add) {
+ state->next_free = free_queue;
+ free_queue = state;
+ continue;
+ }
+ last = first_last;
+ state->next_completion = last->next_completion;
+ last->next_completion = state;
+ } else {
+ insert_hash_set(completion_set, state, 1);
+ last = state;
+ }
+
+ symbol_id_t completing = PRODUCTION_ID_TO_SYMBOL[state->production_id];
+ struct state *watcher
+ = *lookup_watcher(completing, state->start_idx, &parser_state);
+
+ for (; watcher; watcher = watcher->next_watcher) {
+ struct state *new = advance(copy_state(watcher), state, 0, &parser_state);
+ new->n_poisoned += state->n_poisoned;
+ enqueue(parser_state.current_queue, new);
+ }
+
+ if (is_very_end) {
+ struct state *copy = copy_state(state);
+ if (last) {
+ copy->next_completion = last->next_completion;
+ last->next_completion = copy;
+ } else insert_hash_set(completion_set, copy, 1);
+ enqueue(parser_state.next_queue, copy);
+ }
+ }
+
+ // PREDICTION
+ else if (tokens[k].symbol != DONE_SYMBOL && IS_NONTERM(next_symbol)) {
+ queue_all_for_nonterm(next_symbol, &parser_state);
+ }
+ }
+
+ for (struct state *s = free_queue; s;) {
+ struct state *s_old = s;
+ s = s->next_free;
+ free(s_old);
+ }
+
+ flip_queues_and_memos(&parser_state);
+ }
+ while (parser_state.current_queue->count) {
+ struct state *state = pop_from_queue(parser_state.current_queue);
+ // print_parse_tree(state, 0);
+ if (isatty(1)) {
+ print_parse_tree(state, 0, stdout);
+ } else {
+ print_parse_tree_binary(state, tokens, n_tokens);
+ if (parser_state.current_queue->count) {
+ fprintf(stderr, "AMBIGUOUS PARSE\n");
+ exit(1);
+ }
+ }
+ }
+}
+
+void preprocess(char *string, size_t length);
+int main(int argc, char **argv) {
+ assert(argc == 2);
+
+ // Read the file
+ struct stat statbuf;
+ stat(argv[1], &statbuf);
+ char *buf = malloc(statbuf.st_size + 1);
+ buf[statbuf.st_size] = '\0';
+ int f = open(argv[1], O_RDONLY);
+ read(f, buf, statbuf.st_size);
+ close(f);
+
+ preprocess(buf, statbuf.st_size);
+
+ // Lex the file
+ size_t n_tokens = 0;
+ struct token *tokens = lex(buf, &n_tokens);
+ // Parse the file
+ parse(tokens, n_tokens);
+ return 0;
+}
+
+struct token *find_token(struct state *state, int idx) {
+ if (IS_NONTERM(PRODUCTION_ID_TO_PRODUCTION[state->production_id][idx]))
+ return find_token(state->reasons[idx], 0);
+ return state->reasons[idx];
+}
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback