diff options
author | Andres Noetzli <noetzli@stanford.edu> | 2019-08-29 19:38:17 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-08-29 19:38:17 -0700 |
commit | 974fc1d23c2b6091c26cf316964c4c16c5e2733f (patch) | |
tree | b8e4b597ffc46194ee37687a56248309a63235b1 /src/theory | |
parent | cc7546ff0a4e418de9a21c03ef12b1d5e8801bb8 (diff) |
Infer conflicts based on regular expression inclusion (#3234)
We have a conflict if we have `str.in.re(x, R1)` and `~str.in.re(x, R2)`
and `R2` includes `R1` because there is no possible value for `x` that
satisfies both memberships. This commit adds code to detect regular
expression inclusion for a small fragment of regular expressions: string
literals with single char (`re.allchar`) and multichar wildcards
(`re.*(re.allchar)`).
Signed-off-by: Andres Noetzli <anoetzli@amazon.com>
Diffstat (limited to 'src/theory')
-rw-r--r-- | src/theory/strings/regexp_operation.cpp | 110 | ||||
-rw-r--r-- | src/theory/strings/regexp_operation.h | 16 | ||||
-rw-r--r-- | src/theory/strings/regexp_solver.cpp | 92 | ||||
-rw-r--r-- | src/theory/strings/regexp_solver.h | 17 | ||||
-rw-r--r-- | src/theory/strings/theory_strings_utils.cpp | 67 | ||||
-rw-r--r-- | src/theory/strings/theory_strings_utils.h | 33 |
6 files changed, 334 insertions, 1 deletions
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index 7286e2fb4..b410670e5 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -19,6 +19,7 @@ #include "expr/kind.h" #include "options/strings_options.h" #include "theory/strings/theory_strings_rewriter.h" +#include "theory/strings/theory_strings_utils.h" using namespace CVC4; using namespace CVC4::kind; @@ -1652,6 +1653,115 @@ std::string RegExpOpr::mkString( Node r ) { return retStr; } +bool RegExpOpr::regExpIncludes(Node r1, Node r2) +{ + Assert(Rewriter::rewrite(r1) == r1); + Assert(Rewriter::rewrite(r2) == r2); + + if (r1 == r2) + { + return true; + } + + // This method only works on a fragment of regular expressions + if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2)) + { + return false; + } + + const auto& it = d_inclusionCache.find(std::make_pair(r1, r2)); + if (it != d_inclusionCache.end()) + { + return (*it).second; + } + + std::vector<Node> v1, v2; + utils::getRegexpComponents(r1, v1); + utils::getRegexpComponents(r2, v2); + + // In the following, we iterate over `r2` (the "includee") and try to + // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible + // positions in `r1` that we could currently be at. + std::unordered_set<size_t> newIdxs = {0}; + std::unordered_set<size_t> idxs; + for (const Node& n2 : v2) + { + // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are + // removed and for (re.* re.allchar), we additionally include the + // option of skipping it. + idxs.clear(); + for (size_t idx : newIdxs) + { + if (idx < v1.size()) + { + idxs.insert(idx); + if (v1[idx] == d_sigma_star) + { + Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star); + idxs.insert(idx + 1); + } + } + } + newIdxs.clear(); + + if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma) + { + Assert(n2 == d_sigma + || (n2[0].isConst() && n2[0].getConst<String>().size() == 1)); + for (size_t idx : idxs) + { + if (v1[idx] == d_sigma || v1[idx] == n2) + { + // Given a character or an re.allchar in `r2`, we can either + // match it with a corresponding character in `r1` or an + // re.allchar + newIdxs.insert(idx + 1); + } + } + } + + for (size_t idx : idxs) + { + if (v1[idx] == d_sigma_star) + { + // (re.* re.allchar) can match an arbitrary amount of `r2` + newIdxs.insert(idx); + } + else if (utils::isUnboundedWildcard(v1, idx)) + { + // If a series of re.allchar is followed by (re.* re.allchar), we + // can decide not to "waste" the re.allchar because the order of + // the two wildcards is not observable (i.e. it does not change + // the sequences matched by the regular expression) + newIdxs.insert(idx); + } + } + + if (newIdxs.empty()) + { + // If there are no candidates, we can't match the remainder of r2 + d_inclusionCache[std::make_pair(r1, r2)] = false; + return false; + } + } + + // We have processed all of `r2`. We are now looking if there was also a + // path to the end in `r1`. This makes sure that we don't have leftover + // bits in `r1` that don't match anything in `r2`. + bool result = false; + for (size_t idx : newIdxs) + { + if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star)) + { + result = true; + break; + } + } + + d_inclusionCache[std::make_pair(r1, r2)] = result; + return result; +} + }/* CVC4::theory::strings namespace */ }/* CVC4::theory namespace */ }/* CVC4 namespace */ diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index cb35b37c6..117449d35 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -85,6 +85,7 @@ class RegExpOpr { std::map<Node, Node> d_rm_inter_cache; std::map<Node, bool> d_norv_cache; std::map<Node, std::vector<PairNodes> > d_split_cache; + std::map<PairNodes, bool> d_inclusionCache; void simplifyPRegExp(Node s, Node r, std::vector<Node> &new_nodes); void simplifyNRegExp(Node s, Node r, std::vector<Node> &new_nodes); /** @@ -129,6 +130,21 @@ class RegExpOpr { Node intersect(Node r1, Node r2, bool &spflag); /** Get the pretty printed version of the regular expression r */ static std::string mkString(Node r); + + /** + * Returns true if we can show that the regular expression `r1` includes + * the regular expression `r2` (i.e. `r1` matches a superset of sequences + * that `r2` matches). This method only works on a fragment of regular + * expressions, specifically regular expressions that pass the + * `isSimpleRegExp` check. + * + * @param r1 The regular expression that may include `r2` (must be in + * rewritten form) + * @param r2 The regular expression that may be included by `r1` (must be + * in rewritten form) + * @return True if the inclusion can be shown, false otherwise + */ + bool regExpIncludes(Node r1, Node r2); }; }/* CVC4::theory::strings namespace */ diff --git a/src/theory/strings/regexp_solver.cpp b/src/theory/strings/regexp_solver.cpp index ea38c4dd9..db4c9012c 100644 --- a/src/theory/strings/regexp_solver.cpp +++ b/src/theory/strings/regexp_solver.cpp @@ -65,9 +65,15 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems) Trace("regexp-process") << "Checking Memberships ... " << std::endl; for (const std::pair<const Node, std::vector<Node> >& mr : mems) { + std::vector<Node> mems = mr.second; Trace("regexp-process") << "Memberships(" << mr.first << ") = " << mr.second << std::endl; - if (!checkEqcIntersect(mr.second)) + if (!checkEqcInclusion(mems)) + { + // conflict discovered, return + return; + } + if (!checkEqcIntersect(mems)) { // conflict discovered, return return; @@ -271,6 +277,90 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems) } } +bool RegExpSolver::checkEqcInclusion(std::vector<Node>& mems) +{ + std::unordered_set<Node, NodeHashFunction> remove; + + for (const Node& m1 : mems) + { + bool m1Neg = m1.getKind() == NOT; + Node m1Lit = m1Neg ? m1[0] : m1; + + if (remove.find(m1) != remove.end()) + { + // Skip memberships marked for removal + continue; + } + + for (const Node& m2 : mems) + { + if (m1 == m2) + { + continue; + } + + bool m2Neg = m2.getKind() == NOT; + Node m2Lit = m2Neg ? m2[0] : m2; + + // Both regular expression memberships have the same polarity + if (m1Neg == m2Neg) + { + if (d_regexp_opr.regExpIncludes(m1Lit[1], m2Lit[1])) + { + if (m1Neg) + { + // ~str.in.re(x, R1) includes ~str.in.re(x, R2) ---> + // mark ~str.in.re(x, R2) as reduced + d_parent.getExtTheory()->markReduced(m2Lit); + remove.insert(m2); + } + else + { + // str.in.re(x, R1) includes str.in.re(x, R2) ---> + // mark str.in.re(x, R1) as reduced + d_parent.getExtTheory()->markReduced(m1Lit); + remove.insert(m1); + + // We don't need to process m1 anymore + break; + } + } + } + else + { + Node pos = m1Neg ? m2Lit : m1Lit; + Node neg = m1Neg ? m1Lit : m2Lit; + if (d_regexp_opr.regExpIncludes(neg[1], pos[1])) + { + // We have a conflict because we have a case where str.in.re(x, R1) + // and ~str.in.re(x, R2) but R2 includes R1, so there is no + // possible value for x that satisfies both memberships. + std::vector<Node> vec_nodes; + vec_nodes.push_back(pos); + vec_nodes.push_back(neg.negate()); + + if (pos[0] != neg[0]) + { + vec_nodes.push_back(pos[0].eqNode(neg[0])); + } + + Node conc; + d_im.sendInference(vec_nodes, conc, "Intersect inclusion", true); + return false; + } + } + } + } + + mems.erase(std::remove_if( + mems.begin(), + mems.end(), + [&remove](Node& n) { return remove.find(n) != remove.end(); }), + mems.end()); + + return true; +} + bool RegExpSolver::checkEqcIntersect(const std::vector<Node>& mems) { // do not compute intersections if the re intersection mode is none diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h index f3abb2a1d..c4d6afda0 100644 --- a/src/theory/strings/regexp_solver.h +++ b/src/theory/strings/regexp_solver.h @@ -64,6 +64,23 @@ class RegExpSolver private: /** + * Check memberships in equivalence class for regular expression + * inclusion. + * + * This method returns false if it discovered a conflict for this set of + * assertions, and true otherwise. It discovers a conflict e.g. if mems + * contains str.in.re(xi, Ri) and ~str.in.re(xj, Rj) and Rj includes Ri. + * + * @param mems Vector of memberships of the form: (~)str.in.re(x1, R1) + * ... (~)str.in.re(xn, Rn) where x1 = ... = xn in the + * current context. The function removes elements from this + * vector that were marked as reduced. + * @param expForRe Additional explanations for regular expressions. + * @return False if a conflict was detected, true otherwise + */ + bool checkEqcInclusion(std::vector<Node>& mems); + + /** * Check memberships for equivalence class. * The vector mems is a vector of memberships of the form: * (~) (x1 in R1 ) ... (~) (xn in Rn) diff --git a/src/theory/strings/theory_strings_utils.cpp b/src/theory/strings/theory_strings_utils.cpp index d3ae5384e..d764b87c1 100644 --- a/src/theory/strings/theory_strings_utils.cpp +++ b/src/theory/strings/theory_strings_utils.cpp @@ -134,6 +134,73 @@ Node getConstantEndpoint(Node e, bool isSuf) return getConstantComponent(e); } +bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start) +{ + size_t i = start; + while (i < rs.size() && rs[i].getKind() == REGEXP_SIGMA) + { + i++; + } + + if (i >= rs.size()) + { + return false; + } + + return rs[i].getKind() == REGEXP_STAR && rs[i][0].getKind() == REGEXP_SIGMA; +} + +bool isSimpleRegExp(Node r) +{ + Assert(r.getType().isRegExp()); + + std::vector<Node> v; + utils::getConcat(r, v); + for (const Node& n : v) + { + if (n.getKind() == STRING_TO_REGEXP) + { + if (!n[0].isConst()) + { + return false; + } + } + else if (n.getKind() != REGEXP_SIGMA + && (n.getKind() != REGEXP_STAR || n[0].getKind() != REGEXP_SIGMA)) + { + return false; + } + } + return true; +} + +void getRegexpComponents(Node r, std::vector<Node>& result) +{ + Assert(r.getType().isRegExp()); + + NodeManager* nm = NodeManager::currentNM(); + if (r.getKind() == REGEXP_CONCAT) + { + for (const Node& n : r) + { + getRegexpComponents(n, result); + } + } + else if (r.getKind() == STRING_TO_REGEXP && r[0].isConst()) + { + String s = r[0].getConst<String>(); + for (size_t i = 0, size = s.size(); i < size; i++) + { + result.push_back( + nm->mkNode(STRING_TO_REGEXP, nm->mkConst(s.substr(i, 1)))); + } + } + else + { + result.push_back(r); + } +} + } // namespace utils } // namespace strings } // namespace theory diff --git a/src/theory/strings/theory_strings_utils.h b/src/theory/strings/theory_strings_utils.h index 57d882625..cadc98df3 100644 --- a/src/theory/strings/theory_strings_utils.h +++ b/src/theory/strings/theory_strings_utils.h @@ -78,6 +78,39 @@ Node getConstantComponent(Node t); */ Node getConstantEndpoint(Node e, bool isSuf); +/** + * Given a vector of regular expression nodes and a start index that points to + * a wildcard, returns true if the wildcard is unbounded (i.e. it is followed + * by an arbitrary number of `re.allchar`s and then an `re.*(re.allchar)`. If + * the start index is not a wilcard or the wildcards are not followed by + * `re.*(re.allchar)`, the function returns false. + * + * @param rs The vector of regular expression nodes + * @param start The start index to consider + * @return True if the wilcard pointed to by `start` is unbounded, false + * otherwise + */ +bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start); + +/** + * Returns true iff the given regular expression only consists of re.++, + * re.allchar, (re.* re.allchar), and str.to.re of string literals. + * + * @param r The regular expression to check + * @return True if the regular expression is simple + */ +bool isSimpleRegExp(Node r); + +/** + * Helper function that takes a regular expression concatenation and + * returns the components of the concatenation. Letters of string literals + * are treated as individual components. + * + * @param r The regular expression + * @param result The resulting components + */ +void getRegexpComponents(Node r, std::vector<Node>& result); + } // namespace utils } // namespace strings } // namespace theory |