author    | Matthew Sotoudeh <matthew@masot.net> | 2024-03-19 11:21:55 -0700
committer | Matthew Sotoudeh <matthew@masot.net> | 2024-03-19 11:21:55 -0700
commit    | a55ef8495f79d297c5fbf74c47c3243b3caf68d1 (patch)
tree      | 3b76715d9fbc0fbe9b0630d78432fa0ad1a5b1cd
done
-rw-r--r-- | .gitignore                 |   3
-rw-r--r-- | checker.py                 |  21
-rw-r--r-- | examples/A.regex           |   1
-rw-r--r-- | examples/B.regex           |   1
-rw-r--r-- | examples/correct.nfa       |  29
-rw-r--r-- | examples/correct.regex     |   1
-rw-r--r-- | examples/incorrect.nfa     |  29
-rw-r--r-- | examples/incorrect.regex   |   1
-rw-r--r-- | examples/other_correct.nfa |  30
-rw-r--r-- | nfa.py                     | 122
-rw-r--r-- | parsers.py                 | 134
11 files changed, 372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..956b5ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.*.sw*
+.sw*
+__pycache__
diff --git a/checker.py b/checker.py
new file mode 100644
index 0000000..eab49e6
--- /dev/null
+++ b/checker.py
@@ -0,0 +1,21 @@
+import sys
+from parsers import *
+
+arg1, arg2 = sys.argv[1:]
+if arg1.endswith(".nfa"):
+    arg1 = parse_nfa(arg1)
+elif arg1.endswith(".regex"):
+    arg1 = parse_regex(arg1)
+else: raise NotImplementedError
+
+if arg2.endswith(".nfa"):
+    arg2 = parse_nfa(arg2)
+elif arg2.endswith(".regex"):
+    arg2 = parse_regex(arg2)
+else: raise NotImplementedError
+
+equiv, cex = arg1.equivalent_to(arg2)
+if not equiv:
+    print(f"They are NOT equal; they differ on the string '{cex}'")
+else:
+    print(f"They ARE equal")
diff --git a/examples/A.regex b/examples/A.regex
new file mode 100644
index 0000000..a546203
--- /dev/null
+++ b/examples/A.regex
@@ -0,0 +1 @@
+c(S*)
diff --git a/examples/B.regex b/examples/B.regex
new file mode 100644
index 0000000..f2ad6c7
--- /dev/null
+++ b/examples/B.regex
@@ -0,0 +1 @@
+c
diff --git a/examples/correct.nfa b/examples/correct.nfa
new file mode 100644
index 0000000..281d64a
--- /dev/null
+++ b/examples/correct.nfa
@@ -0,0 +1,29 @@
+start start
+    eps top1
+    eps bot1
+
+top1
+    p top1
+    h top1
+    c top2
+
+top2
+    c top2
+    eps top3
+
+top3 accepting
+    p top3
+    h top3
+
+bot1
+    p bot1
+    c bot1
+    h bot2
+
+bot2
+    h bot2
+    eps bot3
+
+bot3 accepting
+    p bot3
+    c bot3
diff --git a/examples/correct.regex b/examples/correct.regex
new file mode 100644
index 0000000..c2db374
--- /dev/null
+++ b/examples/correct.regex
@@ -0,0 +1 @@
+((S*)c(S*)c)|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/incorrect.nfa b/examples/incorrect.nfa
new file mode 100644
index 0000000..cfc285f
--- /dev/null
+++ b/examples/incorrect.nfa
@@ -0,0 +1,29 @@
+start start
+    eps top1
+
+top1
+    p top1
+    h top1
+    c top2
+
+top2
+    c top2
+    eps top3
+    p top3
+
+top3 accepting
+    p top3
+    h top3
+
+bot1
+    p bot1
+    c bot1
+    h bot2
+
+bot2
+    h bot2
+    eps bot3
+
+bot3 accepting
+    p bot3
+    c bot3
diff --git a/examples/incorrect.regex b/examples/incorrect.regex
new file mode 100644
index 0000000..c313595
--- /dev/null
+++ b/examples/incorrect.regex
@@ -0,0 +1 @@
+((S*)c(S*)c(S*))|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/other_correct.nfa b/examples/other_correct.nfa
new file mode 100644
index 0000000..44ddd85
--- /dev/null
+++ b/examples/other_correct.nfa
@@ -0,0 +1,30 @@
+start start
+    eps top1
+    eps bot1
+
+top1
+    p top1
+    h top1
+    c top2
+
+top2
+    c top2
+    eps top3
+    p top3
+
+top3 accepting
+    p top3
+    h top3
+
+bot1
+    p top1
+    c top1
+    h top2
+
+bot2
+    h top2
+    eps top3
+
+bot3 accepting
+    p top3
+    c top3
diff --git a/nfa.py b/nfa.py
new file mode 100644
--- /dev/null
+++ b/nfa.py
@@ -0,0 +1,122 @@
+import itertools
+from collections import defaultdict
+
+EPSILON = ""
+ALPHABET = ("c", "h", "p")
+def strings_of_length(n):
+    for chars in itertools.product(ALPHABET, repeat=n):
+        yield ''.join(chars)
+
+class NFA:
+    def __init__(self):
+        # {int: {str: { int, ... }}}
+        self.transitions = {}
+        self.start_set = set()
+        self.accepting = set()
+        self.counter = 0
+
+    def count(self):
+        self.counter += 1
+        return self.counter
+
+    @property
+    def states(self):
+        return list(self.transitions.keys())
+
+    def add_state(self, name, is_start, is_accepting):
+        self.transitions[name] = defaultdict(set)
+        if is_start:
+            self.start_set.add(name)
+        if is_accepting:
+            self.accepting.add(name)
+        return name
+
+    def init_set(self):
+        return frozenset(self.start_set)
+
+    def transition_set(self, prev, c):
+        result = set()
+        for s in prev:
+            result.update(self.transitions[s][c])
+        return frozenset(result)
+
+    def equivalent_to(self, other):
+        self = self.no_epsilons()
+        other = other.no_epsilons()
+
+        def is_ok(pair):
+            self_set, other_set = pair
+            self_accepts = bool(self.accepting & self_set)
+            other_accepts = bool(other.accepting & other_set)
+            return self_accepts == other_accepts
+
+        initial_pair = (self.init_set(), other.init_set())
+        if not is_ok(initial_pair):
+            return (False, EPSILON)
+
+        pair_to_prefix = {initial_pair: EPSILON}
+        last_step_pairs = [initial_pair]
+        while True:
+            # for pair in last_step_pairs:
+            #     print("\t'" + pair_to_prefix[pair] + "' -> ", pair)
+            next_step_pairs = []
+            for self_set, other_set in last_step_pairs:
+                prefix = pair_to_prefix[(self_set, other_set)]
+                for c in ALPHABET:
+                    next_prefix = prefix + c
+                    next_self_set = self.transition_set(self_set, c)
+                    next_other_set = other.transition_set(other_set, c)
+                    next_pair = (next_self_set, next_other_set)
+                    if next_pair in pair_to_prefix:
+                        continue
+                    if not is_ok(next_pair):
+                        return (False, next_prefix)
+                    pair_to_prefix[next_pair] = next_prefix
+                    next_step_pairs.append(next_pair)
+            if not next_step_pairs:
+                return (True, None)
+            last_step_pairs = next_step_pairs
+
+    def no_epsilons(self):
+        epsilon_reachable = {
+            k: {k} | trans[EPSILON]
+            for k, trans in self.transitions.items()
+        }
+        fixedpoint = False
+        while not fixedpoint:
+            fixedpoint = True
+            for k in self.transitions:
+                new_reachable = epsilon_reachable[k].copy()
+                for r in epsilon_reachable[k]:
+                    new_reachable.update(epsilon_reachable[r])
+                if new_reachable == epsilon_reachable[k]:
+                    continue
+                epsilon_reachable[k] = new_reachable
+                fixedpoint = False
+
+        new_starts = self.start_set.copy()
+        for s in self.start_set:
+            new_starts.update(epsilon_reachable[s])
+
+        new_accepting = self.accepting.copy()
+        for s, reach in epsilon_reachable.items():
+            if reach & self.accepting:
+                new_accepting.add(s)
+
+        new_transitions = {s: defaultdict(set) for s in self.states}
+        for s in self.states:
+            for r in sorted(epsilon_reachable[s]):
+                for c in self.transitions[r]:
+                    if c == EPSILON: continue
+                    new_transitions[s][c].update(self.transitions[r][c])
+        new = NFA()
+        new.transitions = new_transitions
+        new.start_set = new_starts
+        new.accepting = new_accepting
+        return new
+
+    def pprint(self):
+        for state, transitions in self.transitions.items():
+            print(state, state in self.start_set, state in self.accepting)
+            for k, vs in transitions.items():
+                print("\t", state, "[", k, "]", "->", vs)
diff --git a/parsers.py b/parsers.py
new file mode 100644
index 0000000..65c2614
--- /dev/null
+++ b/parsers.py
@@ -0,0 +1,134 @@
+from nfa import *
+
+def parse_nfa(path):
+    nfa = NFA()
+    name2state = dict()
+    # First, find all the state
+    for line in open(path, 'r'):
+        indented = line[0].isspace()
+        line = line.strip()
+        if not line: continue
+        if line.startswith("#"): continue
+        if indented: continue
+        name, *args = line.split()
+        is_start = "start" in args
+        is_accepting = "accepting" in args
+        name2state[name] = nfa.add_state(name, is_start, is_accepting)
+
+    # then add the transitions
+    current_state = None
+    for line in open(path, 'r'):
+        indented = line[0].isspace()
+        line = line.strip()
+        if not line: continue
+        if line.startswith("#"): continue
+        if indented:
+            assert current_state is not None
+            letter, next_state = line.split()
+            if letter == "eps":
+                letter = EPSILON
+            next_state = name2state[next_state]
+            nfa.transitions[current_state][letter].add(next_state)
+        else:
+            name, *_ = line.split()
+            current_state = name2state[name]
+    return nfa
+
+def parse_regex(path):
+    string = open(path, "r").read().strip()
+    ast = regex_to_ast(string)
+    nfa = NFA()
+    def visit(node):
+        if node[0] == "LETTER":
+            start = nfa.add_state(nfa.count(), False, False)
+            end = nfa.add_state(nfa.count(), False, False)
+            nfa.transitions[start][node[1]].add(end)
+            return start, end
+        elif node[0] == "CONCAT":
+            start1, end1 = visit(node[1])
+            start2, end2 = visit(node[2])
+            nfa.transitions[end1][EPSILON].add(start2)
+            return start1, end2
+        elif node[0] == "KLEENE":
+            start, end = visit(node[1])
+            nfa.transitions[start][EPSILON].add(end)
+            nfa.transitions[end][EPSILON].add(start)
+            return start, end
+        elif node[0] == "OR":
+            start1, end1 = visit(node[1])
+            start2, end2 = visit(node[2])
+
+            start = nfa.add_state(nfa.count(), False, False)
+            end = nfa.add_state(nfa.count(), False, False)
+            nfa.transitions[start][EPSILON].add(start1)
+            nfa.transitions[start][EPSILON].add(start2)
+            nfa.transitions[end1][EPSILON].add(end)
+            nfa.transitions[end2][EPSILON].add(end)
+            return start, end
+        else:
+            print(node)
+            raise NotImplementedError
+
+    start, end = visit(ast)
+    nfa.start_set.add(start)
+    nfa.accepting.add(end)
+    return nfa
+
+def pprint_regex_ast(ast):
+    if ast[0] == "LETTER":
+        return ast[1]
+    if ast[0] == "CONCAT":
+        return f"({pprint_regex_ast(ast[1])}{pprint_regex_ast(ast[2])})"
+    if ast[0] == "KLEENE":
+        return f"({pprint_regex_ast(ast[1])})*"
+    if ast[0] == "OR":
+        return f"({pprint_regex_ast(ast[1])}|{pprint_regex_ast(ast[2])})"
+
+def regex_to_ast(string):
+    tokens = match_parens(lex_regex(string))
+    return _regex_to_ast(tokens)
+
+def _regex_to_ast(tokens):
+    assert tokens
+    if len(tokens) == 1 and isinstance(tokens[0], list):
+        return _regex_to_ast(tokens[0])
+    if len(tokens) == 1:
+        return ("LETTER", tokens[0])
+    if '|' in tokens:
+        i = tokens.index('|')
+        return ('OR', _regex_to_ast(tokens[:i]), _regex_to_ast(tokens[i+1:]))
+    if tokens[-1] == '*':
+        return ('KLEENE', _regex_to_ast(tokens[:-1]))
+    return ('CONCAT', _regex_to_ast(tokens[:-1]), _regex_to_ast(tokens[-1:]))
+
+def match_parens(tokens):
+    tokens = tokens.copy()
+    open_stack = []
+    i = 0
+    while i < len(tokens):
+        if tokens[i] == '(':
+            open_stack.append(i)
+        elif tokens[i] == ')':
+            start_i = open_stack.pop(-1)
+            tokens = tokens[:start_i] + [tokens[start_i+1:i]] + tokens[i+1:]
+            i = start_i
+        i += 1
+    return tokens
+
+def lex_regex(string):
+    tokens = []
+    while string:
+        if string[0].isspace():
+            string = string[1:]
+        elif string[0] in ("(", ")", "|", "*") + ALPHABET:
+            tokens.append(string[0])
+            string = string[1:]
+        elif string[0] == "E":
+            tokens.append(EPSILON)
+            string = string[1:]
+        elif string[0] == "S":
+            string = "(" + '|'.join(ALPHABET) + ')' + string[1:]
+        else:
+            print(string)
+            raise NotImplementedError
+    return tokens
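A minimal sketch of exercising the new modules, mirroring what checker.py does with its two command-line arguments (assumptions: Python 3, the repository root as the working directory, and these particular example files chosen only for illustration):

    # hypothetical driver, not part of this commit
    from parsers import parse_nfa, parse_regex

    nfa = parse_nfa("examples/correct.nfa")        # state-machine input format
    regex = parse_regex("examples/correct.regex")  # regex input format (S expands to any alphabet letter)
    equal, counterexample = nfa.equivalent_to(regex)
    if equal:
        print("They ARE equal")
    else:
        print("They are NOT equal; they differ on", repr(counterexample))

Running "python3 checker.py examples/correct.nfa examples/correct.regex" from a shell performs the same check; when the two inputs disagree, the counterexample is a distinguishing string found by the breadth-first product-set search in nfa.py.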