summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Sotoudeh <matthew@masot.net>2024-03-19 11:21:55 -0700
committerMatthew Sotoudeh <matthew@masot.net>2024-03-19 11:21:55 -0700
commita55ef8495f79d297c5fbf74c47c3243b3caf68d1 (patch)
tree3b76715d9fbc0fbe9b0630d78432fa0ad1a5b1cd
done
-rw-r--r--.gitignore3
-rw-r--r--checker.py21
-rw-r--r--examples/A.regex1
-rw-r--r--examples/B.regex1
-rw-r--r--examples/correct.nfa29
-rw-r--r--examples/correct.regex1
-rw-r--r--examples/incorrect.nfa29
-rw-r--r--examples/incorrect.regex1
-rw-r--r--examples/other_correct.nfa30
-rw-r--r--nfa.py122
-rw-r--r--parsers.py134
11 files changed, 372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..956b5ab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.*.sw*
+.sw*
+__pycache__
diff --git a/checker.py b/checker.py
new file mode 100644
index 0000000..eab49e6
--- /dev/null
+++ b/checker.py
@@ -0,0 +1,21 @@
# Command-line entry point: decide whether two automata / regexes accept the
# same language.  Usage: python checker.py A.{nfa,regex} B.{nfa,regex}
import sys
from parsers import *

def _load(path):
    """Parse *path* into an NFA, dispatching on the file extension.

    Raises NotImplementedError for any extension other than .nfa/.regex.
    """
    if path.endswith(".nfa"):
        return parse_nfa(path)
    if path.endswith(".regex"):
        return parse_regex(path)
    raise NotImplementedError(f"unsupported input format: {path}")

# Exactly two inputs expected; unpacking raises ValueError otherwise.
arg1, arg2 = sys.argv[1:]
left = _load(arg1)
right = _load(arg2)

equiv, cex = left.equivalent_to(right)
if not equiv:
    print(f"They are NOT equal; they differ on the string '{cex}'")
else:
    print("They ARE equal")
diff --git a/examples/A.regex b/examples/A.regex
new file mode 100644
index 0000000..a546203
--- /dev/null
+++ b/examples/A.regex
@@ -0,0 +1 @@
+c(S*)
diff --git a/examples/B.regex b/examples/B.regex
new file mode 100644
index 0000000..f2ad6c7
--- /dev/null
+++ b/examples/B.regex
@@ -0,0 +1 @@
+c
diff --git a/examples/correct.nfa b/examples/correct.nfa
new file mode 100644
index 0000000..281d64a
--- /dev/null
+++ b/examples/correct.nfa
@@ -0,0 +1,29 @@
+start start
+ eps top1
+ eps bot1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p bot1
+ c bot1
+ h bot2
+
+bot2
+ h bot2
+ eps bot3
+
+bot3 accepting
+ p bot3
+ c bot3
diff --git a/examples/correct.regex b/examples/correct.regex
new file mode 100644
index 0000000..c2db374
--- /dev/null
+++ b/examples/correct.regex
@@ -0,0 +1 @@
+((S*)c(S*)c)|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/incorrect.nfa b/examples/incorrect.nfa
new file mode 100644
index 0000000..cfc285f
--- /dev/null
+++ b/examples/incorrect.nfa
@@ -0,0 +1,29 @@
+start start
+ eps top1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+ p top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p bot1
+ c bot1
+ h bot2
+
+bot2
+ h bot2
+ eps bot3
+
+bot3 accepting
+ p bot3
+ c bot3
diff --git a/examples/incorrect.regex b/examples/incorrect.regex
new file mode 100644
index 0000000..c313595
--- /dev/null
+++ b/examples/incorrect.regex
@@ -0,0 +1 @@
+((S*)c(S*)c(S*))|((S*)h(S*)h)|((S*)p(S*)p)
diff --git a/examples/other_correct.nfa b/examples/other_correct.nfa
new file mode 100644
index 0000000..44ddd85
--- /dev/null
+++ b/examples/other_correct.nfa
@@ -0,0 +1,30 @@
+start start
+ eps top1
+ eps bot1
+
+top1
+ p top1
+ h top1
+ c top2
+
+top2
+ c top2
+ eps top3
+ p top3
+
+top3 accepting
+ p top3
+ h top3
+
+bot1
+ p top1
+ c top1
+ h top2
+
+bot2
+ h top2
+ eps top3
+
+bot3 accepting
+ p top3
+ c top3
diff --git a/nfa.py b/nfa.py
new file mode 100644
index 0000000..aba0a50
--- /dev/null
+++ b/nfa.py
@@ -0,0 +1,122 @@
+import itertools
+from collections import defaultdict
+
# The empty string doubles as the label for epsilon transitions.
EPSILON = ""
# The fixed input alphabet shared by every NFA and regex in this project.
ALPHABET = ("c", "h", "p")
def strings_of_length(n):
    """Yield every string of exactly *n* characters over ALPHABET."""
    yield from map(''.join, itertools.product(ALPHABET, repeat=n))
+
class NFA:
    """A nondeterministic finite automaton with epsilon moves.

    States are arbitrary hashable values (parse_nfa uses strings,
    parse_regex uses ints from count()).  Transition letters are
    single-character strings; EPSILON ("") labels epsilon moves.
    """

    def __init__(self):
        # {state: {letter: {next_state, ...}}} -- each state added via
        # add_state maps to a defaultdict(set), so missing letters read
        # as the empty set.
        self.transitions = {}
        self.start_set = set()   # states the machine may start in
        self.accepting = set()   # accepting states
        self.counter = 0         # backing counter for count()

    def count(self):
        """Return a fresh integer, usable as a unique new state name."""
        self.counter += 1
        return self.counter

    @property
    def states(self):
        """All state names, as a list."""
        return list(self.transitions.keys())

    def add_state(self, name, is_start, is_accepting):
        """Register state *name* (resetting its transitions if it already
        existed) and return it."""
        self.transitions[name] = defaultdict(set)
        if is_start:
            self.start_set.add(name)
        if is_accepting:
            self.accepting.add(name)
        return name

    def init_set(self):
        """Initial subset for the subset construction: all start states."""
        return frozenset(self.start_set)

    def transition_set(self, prev, c):
        """One subset-construction step: every state reachable from any
        state in *prev* on letter *c*.  No epsilon closure is applied --
        callers run no_epsilons() first (see equivalent_to)."""
        result = set()
        for s in prev:
            result.update(self.transitions[s][c])
        return frozenset(result)

    def equivalent_to(self, other):
        """Decide language equality against *other*.

        Returns (True, None) if both automata accept the same language,
        otherwise (False, cex) where cex is a distinguishing string.
        Runs the subset construction on both machines in lockstep,
        breadth-first, so the counterexample found has minimal length.
        """
        # Rebind to epsilon-free copies so transition_set() is a full step.
        self = self.no_epsilons()
        other = other.no_epsilons()

        def is_ok(pair):
            # A pair of subsets agrees iff both or neither can accept here.
            self_set, other_set = pair
            self_accepts = bool(self.accepting & self_set)
            other_accepts = bool(other.accepting & other_set)
            return self_accepts == other_accepts

        initial_pair = (self.init_set(), other.init_set())
        if not is_ok(initial_pair):
            return (False, EPSILON)

        # BFS over pairs of subsets.  pair_to_prefix doubles as the
        # visited set, remembering one shortest string reaching each pair.
        pair_to_prefix = {initial_pair: EPSILON}
        last_step_pairs = [initial_pair]
        while True:
            # for pair in last_step_pairs:
            #     print("\t'" + pair_to_prefix[pair] + "' -> ", pair)
            next_step_pairs = []
            for self_set, other_set in last_step_pairs:
                prefix = pair_to_prefix[(self_set, other_set)]
                for c in ALPHABET:
                    next_prefix = prefix + c
                    next_self_set = self.transition_set(self_set, c)
                    next_other_set = other.transition_set(other_set, c)
                    next_pair = (next_self_set, next_other_set)
                    if next_pair in pair_to_prefix:
                        continue
                    if not is_ok(next_pair):
                        return (False, next_prefix)
                    pair_to_prefix[next_pair] = next_prefix
                    next_step_pairs.append(next_pair)
            # No new pairs reachable: every reachable pair agreed, so the
            # languages are equal (the pair space is finite).
            if not next_step_pairs:
                return (True, None)
            last_step_pairs = next_step_pairs

    def no_epsilons(self):
        """Return an equivalent NFA with no epsilon transitions.

        Computes each state's epsilon closure, then (a) closes the start
        set, (b) marks as accepting any state whose closure touches an
        accepting state, and (c) copies every closure-member's non-epsilon
        transitions onto the original state.
        """
        # epsilon_reachable[k]: states reachable from k via 0+ eps moves.
        epsilon_reachable = {
            k: {k} | trans[EPSILON]
            for k, trans in self.transitions.items()
        }
        # Iterate to a fixed point: transitive closure of the eps relation.
        fixedpoint = False
        while not fixedpoint:
            fixedpoint = True
            for k in self.transitions:
                new_reachable = epsilon_reachable[k].copy()
                for r in epsilon_reachable[k]:
                    new_reachable.update(epsilon_reachable[r])
                if new_reachable == epsilon_reachable[k]:
                    continue
                epsilon_reachable[k] = new_reachable
                fixedpoint = False

        new_starts = self.start_set.copy()
        for s in self.start_set:
            new_starts.update(epsilon_reachable[s])

        new_accepting = self.accepting.copy()
        for s, reach in epsilon_reachable.items():
            if reach & self.accepting:
                new_accepting.add(s)

        new_transitions = {s: defaultdict(set) for s in self.states}
        for s in self.states:
            # sorted() -- presumably just for a deterministic build order.
            for r in sorted(epsilon_reachable[s]):
                for c in self.transitions[r]:
                    if c == EPSILON: continue
                    new_transitions[s][c].update(self.transitions[r][c])
        new = NFA()
        new.transitions = new_transitions
        new.start_set = new_starts
        new.accepting = new_accepting
        return new

    def pprint(self):
        """Debug dump: one line per state (name, is_start, is_accepting),
        then its outgoing transitions."""
        for state, transitions in self.transitions.items():
            print(state, state in self.start_set, state in self.accepting)
            for k, vs in transitions.items():
                print("\t", state, "[", k, "]", "->", vs)
diff --git a/parsers.py b/parsers.py
new file mode 100644
index 0000000..65c2614
--- /dev/null
+++ b/parsers.py
@@ -0,0 +1,134 @@
+from nfa import *
+
def parse_nfa(path):
    """Parse the NFA description at *path* into an NFA.

    Format: an unindented line declares a state ("NAME [start] [accepting]");
    the indented lines below it are its transitions ("LETTER NEXT", where the
    letter "eps" means an epsilon move).  Blank lines and lines starting with
    '#' are ignored.
    """
    # Read once, with the file properly closed (the original opened the file
    # twice and relied on GC to close it).
    with open(path, 'r') as f:
        raw_lines = f.readlines()

    def meaningful():
        # Yield (indented, stripped_line) for every non-blank, non-comment line.
        for raw in raw_lines:
            indented = raw[0].isspace()
            line = raw.strip()
            if not line:
                continue
            if line.startswith("#"):
                continue
            yield indented, line

    nfa = NFA()
    name2state = dict()
    # First pass: declare every state, so transitions may reference states
    # defined later in the file.
    for indented, line in meaningful():
        if indented:
            continue
        name, *args = line.split()
        name2state[name] = nfa.add_state(name, "start" in args, "accepting" in args)

    # Second pass: add the transitions under each state header.
    current_state = None
    for indented, line in meaningful():
        if indented:
            assert current_state is not None
            letter, next_state = line.split()
            if letter == "eps":
                letter = EPSILON
            nfa.transitions[current_state][letter].add(name2state[next_state])
        else:
            current_state = name2state[line.split()[0]]
    return nfa
+
def parse_regex(path):
    """Parse the regex at *path* (a single pattern) into an NFA.

    Uses Thompson's construction: every AST node becomes a fragment with
    one start and one end state, glued together with epsilon moves.
    """
    with open(path, "r") as f:
        string = f.read().strip()
    ast = regex_to_ast(string)
    nfa = NFA()
    def visit(node):
        # Returns (start, end) states of the fragment built for *node*.
        if node[0] == "LETTER":
            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][node[1]].add(end)
            return start, end
        elif node[0] == "CONCAT":
            start1, end1 = visit(node[1])
            start2, end2 = visit(node[2])
            nfa.transitions[end1][EPSILON].add(start2)
            return start1, end2
        elif node[0] == "KLEENE":
            inner_start, inner_end = visit(node[1])
            # BUG FIX: the old code reused the inner fragment's own
            # start/end and just added eps edges start->end and end->start.
            # That is the classic broken star: the back edge plus the
            # forward edge let a *partial* match of the inner expression
            # escape -- e.g. the NFA for (S*c)* accepted "p".  Thompson's
            # construction allocates fresh start/end states instead.
            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][EPSILON].add(inner_start)      # enter loop
            nfa.transitions[start][EPSILON].add(end)              # 0 repetitions
            nfa.transitions[inner_end][EPSILON].add(inner_start)  # repeat
            nfa.transitions[inner_end][EPSILON].add(end)          # leave loop
            return start, end
        elif node[0] == "OR":
            start1, end1 = visit(node[1])
            start2, end2 = visit(node[2])

            start = nfa.add_state(nfa.count(), False, False)
            end = nfa.add_state(nfa.count(), False, False)
            nfa.transitions[start][EPSILON].add(start1)
            nfa.transitions[start][EPSILON].add(start2)
            nfa.transitions[end1][EPSILON].add(end)
            nfa.transitions[end2][EPSILON].add(end)
            return start, end
        else:
            raise NotImplementedError(f"unknown AST node: {node!r}")

    start, end = visit(ast)
    nfa.start_set.add(start)
    nfa.accepting.add(end)
    return nfa
+
def pprint_regex_ast(ast):
    """Render a regex AST back into a fully-parenthesized pattern string.

    Returns None for an unrecognized node kind.
    """
    kind = ast[0]
    if kind == "LETTER":
        return ast[1]
    if kind == "KLEENE":
        return f"({pprint_regex_ast(ast[1])})*"
    if kind in ("CONCAT", "OR"):
        sep = "|" if kind == "OR" else ""
        return f"({pprint_regex_ast(ast[1])}{sep}{pprint_regex_ast(ast[2])})"
+
def regex_to_ast(string):
    """Lex *string*, group its parentheses, and parse it into a regex AST."""
    grouped = match_parens(lex_regex(string))
    return _regex_to_ast(grouped)
+
+def _regex_to_ast(tokens):
+ assert tokens
+ if len(tokens) == 1 and isinstance(tokens[0], list):
+ return _regex_to_ast(tokens[0])
+ if len(tokens) == 1:
+ return ("LETTER", tokens[0])
+ if '|' in tokens:
+ i = tokens.index('|')
+ return ('OR', _regex_to_ast(tokens[:i]), _regex_to_ast(tokens[i+1:]))
+ if tokens[-1] == '*':
+ return ('KLEENE', _regex_to_ast(tokens[:-1]))
+ return ('CONCAT', _regex_to_ast(tokens[:-1]), _regex_to_ast(tokens[-1:]))
+
def match_parens(tokens):
    """Group every '(' ... ')' run into a nested sublist, dropping the parens.

    E.g. ['(', 'a', ')', 'b'] becomes [['a'], 'b'].  Innermost groups are
    collapsed first; a ')' without a matching '(' raises IndexError.
    """
    out = list(tokens)
    pending_opens = []
    pos = 0
    while pos < len(out):
        tok = out[pos]
        if tok == '(':
            pending_opens.append(pos)
        elif tok == ')':
            opened = pending_opens.pop()
            inner = out[opened + 1:pos]
            # Splice the grouped sublist over the '( ... )' span, then
            # resume scanning just past it.
            out[opened:pos + 1] = [inner]
            pos = opened
        pos += 1
    return out
+
def lex_regex(string):
    """Tokenize a regex string into a flat list of single-character tokens.

    'E' lexes to EPSILON; 'S' expands to the alternation of the whole
    alphabet, i.e. the tokens of "(c|h|p)"; whitespace is skipped.
    Raises NotImplementedError on any other character.
    """
    # Hoisted loop invariant: the original rebuilt this tuple -- and
    # re-sliced the entire remaining string -- on every character (O(n^2)).
    literal_tokens = ("(", ")", "|", "*") + ALPHABET
    tokens = []
    for ch in string:
        if ch.isspace():
            continue
        elif ch in literal_tokens:
            tokens.append(ch)
        elif ch == "E":
            tokens.append(EPSILON)
        elif ch == "S":
            # Append the expansion's tokens directly; this matches lexing
            # the textual "(c|h|p)" since every expansion char is literal.
            tokens.append("(")
            for i, letter in enumerate(ALPHABET):
                if i:
                    tokens.append("|")
                tokens.append(letter)
            tokens.append(")")
        else:
            raise NotImplementedError(f"unexpected character {ch!r} in regex")
    return tokens
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback