summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <matthew@masot.net>2024-03-19 11:21:55 -0700
committerMatthew Sotoudeh <matthew@masot.net>2024-03-19 11:21:55 -0700
commita55ef8495f79d297c5fbf74c47c3243b3caf68d1 (patch)
tree3b76715d9fbc0fbe9b0630d78432fa0ad1a5b1cd
done
-rw-r--r--.gitignore3
-rw-r--r--checker.py21
-rw-r--r--examples/A.regex1
-rw-r--r--examples/B.regex1
-rw-r--r--examples/correct.nfa29
-rw-r--r--examples/correct.regex1
-rw-r--r--examples/incorrect.nfa29
-rw-r--r--examples/incorrect.regex1
-rw-r--r--examples/other_correct.nfa30
-rw-r--r--nfa.py122
-rw-r--r--parsers.py134
11 files changed, 372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..956b5ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.*.sw*
+.sw*
+__pycache__
diff --git a/checker.py b/checker.py
new file mode 100644
index 0000000..eab49e6
--- /dev/null
+++ b/checker.py
@@ -0,0 +1,21 @@
# Command-line entry point: decide whether two automata / regexes accept the
# same language.  Usage: python checker.py A.{nfa,regex} B.{nfa,regex}
import sys
from parsers import *

def _load(path):
    """Parse *path* into an NFA, dispatching on the file extension.

    Raises NotImplementedError for any extension other than .nfa/.regex.
    """
    if path.endswith(".nfa"):
        return parse_nfa(path)
    if path.endswith(".regex"):
        return parse_regex(path)
    raise NotImplementedError(f"unsupported input format: {path}")

# Exactly two inputs expected; unpacking raises ValueError otherwise.
arg1, arg2 = sys.argv[1:]
left = _load(arg1)
right = _load(arg2)

equiv, cex = left.equivalent_to(right)
if not equiv:
    print(f"They are NOT equal; they differ on the string '{cex}'")
else:
    print("They ARE equal")
diff --git a/examples/A.regex b/examples/A.regex
new file mode 100644
index 0000000..a546203
--- /dev/null
+++ b/examples/A.regex
@@ -0,0 +1 @@
+c(S*)
diff --git a/examples/B.regex b/examples/B.regex
new file mode 100644
index 0000000..f2ad6c7
--- /dev/null
+++ b/examples/B.regex
@@ -0,0 +1 @@
+c
diff --git a/examples/correct.nfa b/examples/correct.nfa
new file mode 100644
index 0000000..281d64a
--- /dev/null
+++ b/examples/correct.nfa
@@ -0,0 +1,29 @@
+start start
+ eps top1
+ eps bot1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p bot1
+ c bot1
+ h bot2
+
+bot2
+ h bot2
+ eps bot3
+
+bot3 accepting
+ p bot3
+ c bot3
diff --git a/examples/correct.regex b/examples/correct.regex
new file mode 100644
index 0000000..c2db374
--- /dev/null
+++ b/examples/correct.regex
@@ -0,0 +1 @@
+((S*)c(S*)c)|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/incorrect.nfa b/examples/incorrect.nfa
new file mode 100644
index 0000000..cfc285f
--- /dev/null
+++ b/examples/incorrect.nfa
@@ -0,0 +1,29 @@
+start start
+ eps top1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+ p top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p bot1
+ c bot1
+ h bot2
+
+bot2
+ h bot2
+ eps bot3
+
+bot3 accepting
+ p bot3
+ c bot3
diff --git a/examples/incorrect.regex b/examples/incorrect.regex
new file mode 100644
index 0000000..c313595
--- /dev/null
+++ b/examples/incorrect.regex
@@ -0,0 +1 @@
+((S*)c(S*)c(S*))|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/other_correct.nfa b/examples/other_correct.nfa
new file mode 100644
index 0000000..44ddd85
--- /dev/null
+++ b/examples/other_correct.nfa
@@ -0,0 +1,30 @@
+start start
+ eps top1
+ eps bot1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+ p top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p top1
+ c top1
+ h top2
+
+bot2
+ h top2
+ eps top3
+
+bot3 accepting
+ p top3
+ c top3
diff --git a/nfa.py b/nfa.py
new file mode 100644
index 0000000..aba0a50
--- /dev/null
+++ b/nfa.py
@@ -0,0 +1,122 @@
+import itertools
+from collections import defaultdict
+
# The empty string doubles as the label for epsilon transitions.
EPSILON = ""
# The fixed input alphabet shared by every NFA and regex in this project.
ALPHABET = ("c", "h", "p")
def strings_of_length(n):
    """Yield every string of exactly *n* characters over ALPHABET."""
    yield from map(''.join, itertools.product(ALPHABET, repeat=n))
+
class NFA:
    """A nondeterministic finite automaton with epsilon moves.

    States are arbitrary hashable values (parse_nfa uses strings,
    parse_regex uses ints from count()).  Transition letters are
    single-character strings; EPSILON ("") labels epsilon moves.
    """

    def __init__(self):
        # {state: {letter: {next_state, ...}}} -- each state added via
        # add_state maps to a defaultdict(set), so missing letters read
        # as the empty set.
        self.transitions = {}
        self.start_set = set()   # states the machine may start in
        self.accepting = set()   # accepting states
        self.counter = 0         # backing counter for count()

    def count(self):
        """Return a fresh integer, usable as a unique new state name."""
        self.counter += 1
        return self.counter

    @property
    def states(self):
        """All state names, as a list."""
        return list(self.transitions.keys())

    def add_state(self, name, is_start, is_accepting):
        """Register state *name* (resetting its transitions if it already
        existed) and return it."""
        self.transitions[name] = defaultdict(set)
        if is_start:
            self.start_set.add(name)
        if is_accepting:
            self.accepting.add(name)
        return name

    def init_set(self):
        """Initial subset for the subset construction: all start states."""
        return frozenset(self.start_set)

    def transition_set(self, prev, c):
        """One subset-construction step: every state reachable from any
        state in *prev* on letter *c*.  No epsilon closure is applied --
        callers run no_epsilons() first (see equivalent_to)."""
        result = set()
        for s in prev:
            result.update(self.transitions[s][c])
        return frozenset(result)

    def equivalent_to(self, other):
        """Decide language equality against *other*.

        Returns (True, None) if both automata accept the same language,
        otherwise (False, cex) where cex is a distinguishing string.
        Runs the subset construction on both machines in lockstep,
        breadth-first, so the counterexample found has minimal length.
        """
        # Rebind to epsilon-free copies so transition_set() is a full step.
        self = self.no_epsilons()
        other = other.no_epsilons()

        def is_ok(pair):
            # A pair of subsets agrees iff both or neither can accept here.
            self_set, other_set = pair
            self_accepts = bool(self.accepting & self_set)
            other_accepts = bool(other.accepting & other_set)
            return self_accepts == other_accepts

        initial_pair = (self.init_set(), other.init_set())
        if not is_ok(initial_pair):
            return (False, EPSILON)

        # BFS over pairs of subsets.  pair_to_prefix doubles as the
        # visited set, remembering one shortest string reaching each pair.
        pair_to_prefix = {initial_pair: EPSILON}
        last_step_pairs = [initial_pair]
        while True:
            # for pair in last_step_pairs:
            #     print("\t'" + pair_to_prefix[pair] + "' -> ", pair)
            next_step_pairs = []
            for self_set, other_set in last_step_pairs:
                prefix = pair_to_prefix[(self_set, other_set)]
                for c in ALPHABET:
                    next_prefix = prefix + c
                    next_self_set = self.transition_set(self_set, c)
                    next_other_set = other.transition_set(other_set, c)
                    next_pair = (next_self_set, next_other_set)
                    if next_pair in pair_to_prefix:
                        continue
                    if not is_ok(next_pair):
                        return (False, next_prefix)
                    pair_to_prefix[next_pair] = next_prefix
                    next_step_pairs.append(next_pair)
            # No new pairs reachable: every reachable pair agreed, so the
            # languages are equal (the pair space is finite).
            if not next_step_pairs:
                return (True, None)
            last_step_pairs = next_step_pairs

    def no_epsilons(self):
        """Return an equivalent NFA with no epsilon transitions.

        Computes each state's epsilon closure, then (a) closes the start
        set, (b) marks as accepting any state whose closure touches an
        accepting state, and (c) copies every closure-member's non-epsilon
        transitions onto the original state.
        """
        # epsilon_reachable[k]: states reachable from k via 0+ eps moves.
        epsilon_reachable = {
            k: {k} | trans[EPSILON]
            for k, trans in self.transitions.items()
        }
        # Iterate to a fixed point: transitive closure of the eps relation.
        fixedpoint = False
        while not fixedpoint:
            fixedpoint = True
            for k in self.transitions:
                new_reachable = epsilon_reachable[k].copy()
                for r in epsilon_reachable[k]:
                    new_reachable.update(epsilon_reachable[r])
                if new_reachable == epsilon_reachable[k]:
                    continue
                epsilon_reachable[k] = new_reachable
                fixedpoint = False

        new_starts = self.start_set.copy()
        for s in self.start_set:
            new_starts.update(epsilon_reachable[s])

        new_accepting = self.accepting.copy()
        for s, reach in epsilon_reachable.items():
            if reach & self.accepting:
                new_accepting.add(s)

        new_transitions = {s: defaultdict(set) for s in self.states}
        for s in self.states:
            # sorted() -- presumably just for a deterministic build order.
            for r in sorted(epsilon_reachable[s]):
                for c in self.transitions[r]:
                    if c == EPSILON: continue
                    new_transitions[s][c].update(self.transitions[r][c])
        new = NFA()
        new.transitions = new_transitions
        new.start_set = new_starts
        new.accepting = new_accepting
        return new

    def pprint(self):
        """Debug dump: one line per state (name, is_start, is_accepting),
        then its outgoing transitions."""
        for state, transitions in self.transitions.items():
            print(state, state in self.start_set, state in self.accepting)
            for k, vs in transitions.items():
                print("\t", state, "[", k, "]", "->", vs)
diff --git a/parsers.py b/parsers.py
new file mode 100644
index 0000000..65c2614
--- /dev/null
+++ b/parsers.py
@@ -0,0 +1,134 @@
+from nfa import *
+
def parse_nfa(path):
    """Parse the NFA description at *path* into an NFA.

    Format: an unindented line declares a state ("NAME [start] [accepting]");
    the indented lines below it are its transitions ("LETTER NEXT", where the
    letter "eps" means an epsilon move).  Blank lines and lines starting with
    '#' are ignored.
    """
    # Read once, with the file properly closed (the original opened the file
    # twice and relied on GC to close it).
    with open(path, 'r') as f:
        raw_lines = f.readlines()

    def meaningful():
        # Yield (indented, stripped_line) for every non-blank, non-comment line.
        for raw in raw_lines:
            indented = raw[0].isspace()
            line = raw.strip()
            if not line:
                continue
            if line.startswith("#"):
                continue
            yield indented, line

    nfa = NFA()
    name2state = dict()
    # First pass: declare every state, so transitions may reference states
    # defined later in the file.
    for indented, line in meaningful():
        if indented:
            continue
        name, *args = line.split()
        name2state[name] = nfa.add_state(name, "start" in args, "accepting" in args)

    # Second pass: add the transitions under each state header.
    current_state = None
    for indented, line in meaningful():
        if indented:
            assert current_state is not None
            letter, next_state = line.split()
            if letter == "eps":
                letter = EPSILON
            nfa.transitions[current_state][letter].add(name2state[next_state])
        else:
            current_state = name2state[line.split()[0]]
    return nfa
+
def parse_regex(path):
    """Parse the regex at *path* (a single pattern) into an NFA.

    Uses Thompson's construction: every AST node becomes a fragment with
    one start and one end state, glued together with epsilon moves.
    """
    with open(path, "r") as f:
        string = f.read().strip()
    ast = regex_to_ast(string)
    nfa = NFA()
    def visit(node):
        # Returns (start, end) states of the fragment built for *node*.
        if node[0] == "LETTER":
            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][node[1]].add(end)
            return start, end
        elif node[0] == "CONCAT":
            start1, end1 = visit(node[1])
            start2, end2 = visit(node[2])
            nfa.transitions[end1][EPSILON].add(start2)
            return start1, end2
        elif node[0] == "KLEENE":
            inner_start, inner_end = visit(node[1])
            # BUG FIX: the old code reused the inner fragment's own
            # start/end and just added eps edges start->end and end->start.
            # That is the classic broken star: the back edge plus the
            # forward edge let a *partial* match of the inner expression
            # escape -- e.g. the NFA for (S*c)* accepted "p".  Thompson's
            # construction allocates fresh start/end states instead.
            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][EPSILON].add(inner_start)      # enter loop
            nfa.transitions[start][EPSILON].add(end)              # 0 repetitions
            nfa.transitions[inner_end][EPSILON].add(inner_start)  # repeat
            nfa.transitions[inner_end][EPSILON].add(end)          # leave loop
            return start, end
        elif node[0] == "OR":
            start1, end1 = visit(node[1])
            start2, end2 = visit(node[2])

            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][EPSILON].add(start1)
            nfa.transitions[start][EPSILON].add(start2)
            nfa.transitions[end1][EPSILON].add(end)
            nfa.transitions[end2][EPSILON].add(end)
            return start, end
        else:
            raise NotImplementedError(f"unknown AST node: {node!r}")

    start, end = visit(ast)
    nfa.start_set.add(start)
    nfa.accepting.add(end)
    return nfa
+
def pprint_regex_ast(ast):
    """Render a regex AST back into a fully-parenthesized pattern string.

    Returns None for an unrecognized node kind.
    """
    kind = ast[0]
    if kind == "LETTER":
        return ast[1]
    if kind == "KLEENE":
        return f"({pprint_regex_ast(ast[1])})*"
    if kind in ("CONCAT", "OR"):
        sep = "|" if kind == "OR" else ""
        return f"({pprint_regex_ast(ast[1])}{sep}{pprint_regex_ast(ast[2])})"
+
def regex_to_ast(string):
    """Lex *string*, group its parentheses, and parse it into a regex AST."""
    grouped = match_parens(lex_regex(string))
    return _regex_to_ast(grouped)
+
+def _regex_to_ast(tokens):
+ assert tokens
+ if len(tokens) == 1 and isinstance(tokens[0], list):
+ return _regex_to_ast(tokens[0])
+ if len(tokens) == 1:
+ return ("LETTER", tokens[0])
+ if '|' in tokens:
+ i = tokens.index('|')
+ return ('OR', _regex_to_ast(tokens[:i]), _regex_to_ast(tokens[i+1:]))
+ if tokens[-1] == '*':
+ return ('KLEENE', _regex_to_ast(tokens[:-1]))
+ return ('CONCAT', _regex_to_ast(tokens[:-1]), _regex_to_ast(tokens[-1:]))
+
def match_parens(tokens):
    """Group every '(' ... ')' run into a nested sublist, dropping the parens.

    E.g. ['(', 'a', ')', 'b'] becomes [['a'], 'b'].  Innermost groups are
    collapsed first; a ')' without a matching '(' raises IndexError.
    """
    out = list(tokens)
    pending_opens = []
    pos = 0
    while pos < len(out):
        tok = out[pos]
        if tok == '(':
            pending_opens.append(pos)
        elif tok == ')':
            opened = pending_opens.pop()
            inner = out[opened + 1:pos]
            # Splice the grouped sublist over the '( ... )' span, then
            # resume scanning just past it.
            out[opened:pos + 1] = [inner]
            pos = opened
        pos += 1
    return out
+
def lex_regex(string):
    """Tokenize a regex string into a flat list of single-character tokens.

    'E' lexes to EPSILON; 'S' expands to the alternation of the whole
    alphabet, i.e. the tokens of "(c|h|p)"; whitespace is skipped.
    Raises NotImplementedError on any other character.
    """
    # Hoisted loop invariant: the original rebuilt this tuple -- and
    # re-sliced the entire remaining string -- on every character (O(n^2)).
    literal_tokens = ("(", ")", "|", "*") + ALPHABET
    tokens = []
    for ch in string:
        if ch.isspace():
            continue
        elif ch in literal_tokens:
            tokens.append(ch)
        elif ch == "E":
            tokens.append(EPSILON)
        elif ch == "S":
            # Append the expansion's tokens directly; this matches lexing
            # the textual "(c|h|p)" since every expansion char is literal.
            tokens.append("(")
            for i, letter in enumerate(ALPHABET):
                if i:
                    tokens.append("|")
                tokens.append(letter)
            tokens.append(")")
        else:
            raise NotImplementedError(f"unexpected character {ch!r} in regex")
    return tokens
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback