summaryrefslogtreecommitdiff
path: root/src/theory/strings
diff options
context:
space:
mode:
authorAndres Noetzli <noetzli@stanford.edu>2019-08-29 19:38:17 -0700
committerGitHub <noreply@github.com>2019-08-29 19:38:17 -0700
commit974fc1d23c2b6091c26cf316964c4c16c5e2733f (patch)
treeb8e4b597ffc46194ee37687a56248309a63235b1 /src/theory/strings
parentcc7546ff0a4e418de9a21c03ef12b1d5e8801bb8 (diff)
Infer conflicts based on regular expression inclusion (#3234)
We have a conflict if we have `str.in.re(x, R1)` and `~str.in.re(x, R2)` and `R2` includes `R1` because there is no possible value for `x` that satisfies both memberships. This commit adds code to detect regular expression inclusion for a small fragment of regular expressions: string literals with single char (`re.allchar`) and multichar wildcards (`re.*(re.allchar)`). Signed-off-by: Andres Noetzli <anoetzli@amazon.com>
Diffstat (limited to 'src/theory/strings')
-rw-r--r--src/theory/strings/regexp_operation.cpp110
-rw-r--r--src/theory/strings/regexp_operation.h16
-rw-r--r--src/theory/strings/regexp_solver.cpp92
-rw-r--r--src/theory/strings/regexp_solver.h17
-rw-r--r--src/theory/strings/theory_strings_utils.cpp67
-rw-r--r--src/theory/strings/theory_strings_utils.h33
6 files changed, 334 insertions, 1 deletions
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp
index 7286e2fb4..b410670e5 100644
--- a/src/theory/strings/regexp_operation.cpp
+++ b/src/theory/strings/regexp_operation.cpp
@@ -19,6 +19,7 @@
#include "expr/kind.h"
#include "options/strings_options.h"
#include "theory/strings/theory_strings_rewriter.h"
+#include "theory/strings/theory_strings_utils.h"
using namespace CVC4;
using namespace CVC4::kind;
@@ -1652,6 +1653,115 @@ std::string RegExpOpr::mkString( Node r ) {
return retStr;
}
+bool RegExpOpr::regExpIncludes(Node r1, Node r2)
+{
+ Assert(Rewriter::rewrite(r1) == r1);
+ Assert(Rewriter::rewrite(r2) == r2);
+
+ if (r1 == r2)
+ {
+ return true;
+ }
+
+ // This method only works on a fragment of regular expressions
+ if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2))
+ {
+ return false;
+ }
+
+ const auto& it = d_inclusionCache.find(std::make_pair(r1, r2));
+ if (it != d_inclusionCache.end())
+ {
+ return (*it).second;
+ }
+
+ std::vector<Node> v1, v2;
+ utils::getRegexpComponents(r1, v1);
+ utils::getRegexpComponents(r2, v2);
+
+ // In the following, we iterate over `r2` (the "includee") and try to
+ // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible
+ // positions in `r1` that we could currently be at.
+ std::unordered_set<size_t> newIdxs = {0};
+ std::unordered_set<size_t> idxs;
+ for (const Node& n2 : v2)
+ {
+ // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are
+ // removed and for (re.* re.allchar), we additionally include the
+ // option of skipping it.
+ idxs.clear();
+ for (size_t idx : newIdxs)
+ {
+ if (idx < v1.size())
+ {
+ idxs.insert(idx);
+ if (v1[idx] == d_sigma_star)
+ {
+ Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star);
+ idxs.insert(idx + 1);
+ }
+ }
+ }
+ newIdxs.clear();
+
+ if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma)
+ {
+ Assert(n2 == d_sigma
+ || (n2[0].isConst() && n2[0].getConst<String>().size() == 1));
+ for (size_t idx : idxs)
+ {
+ if (v1[idx] == d_sigma || v1[idx] == n2)
+ {
+ // Given a character or an re.allchar in `r2`, we can either
+ // match it with a corresponding character in `r1` or an
+ // re.allchar
+ newIdxs.insert(idx + 1);
+ }
+ }
+ }
+
+ for (size_t idx : idxs)
+ {
+ if (v1[idx] == d_sigma_star)
+ {
+ // (re.* re.allchar) can match an arbitrary amount of `r2`
+ newIdxs.insert(idx);
+ }
+ else if (utils::isUnboundedWildcard(v1, idx))
+ {
+ // If a series of re.allchar is followed by (re.* re.allchar), we
+ // can decide not to "waste" the re.allchar because the order of
+ // the two wildcards is not observable (i.e. it does not change
+ // the sequences matched by the regular expression)
+ newIdxs.insert(idx);
+ }
+ }
+
+ if (newIdxs.empty())
+ {
+ // If there are no candidates, we can't match the remainder of r2
+ d_inclusionCache[std::make_pair(r1, r2)] = false;
+ return false;
+ }
+ }
+
+ // We have processed all of `r2`. We are now looking if there was also a
+ // path to the end in `r1`. This makes sure that we don't have leftover
+ // bits in `r1` that don't match anything in `r2`.
+ bool result = false;
+ for (size_t idx : newIdxs)
+ {
+ if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star))
+ {
+ result = true;
+ break;
+ }
+ }
+
+ d_inclusionCache[std::make_pair(r1, r2)] = result;
+ return result;
+}
+
}/* CVC4::theory::strings namespace */
}/* CVC4::theory namespace */
}/* CVC4 namespace */
diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h
index cb35b37c6..117449d35 100644
--- a/src/theory/strings/regexp_operation.h
+++ b/src/theory/strings/regexp_operation.h
@@ -85,6 +85,7 @@ class RegExpOpr {
std::map<Node, Node> d_rm_inter_cache;
std::map<Node, bool> d_norv_cache;
std::map<Node, std::vector<PairNodes> > d_split_cache;
+ std::map<PairNodes, bool> d_inclusionCache;
void simplifyPRegExp(Node s, Node r, std::vector<Node> &new_nodes);
void simplifyNRegExp(Node s, Node r, std::vector<Node> &new_nodes);
/**
@@ -129,6 +130,21 @@ class RegExpOpr {
Node intersect(Node r1, Node r2, bool &spflag);
/** Get the pretty printed version of the regular expression r */
static std::string mkString(Node r);
+
+ /**
+ * Returns true if we can show that the regular expression `r1` includes
+ * the regular expression `r2` (i.e. `r1` matches a superset of sequences
+ * that `r2` matches). This method only works on a fragment of regular
+ * expressions, specifically regular expressions that pass the
+ * `isSimpleRegExp` check.
+ *
+ * @param r1 The regular expression that may include `r2` (must be in
+ * rewritten form)
+ * @param r2 The regular expression that may be included by `r1` (must be
+ * in rewritten form)
+ * @return True if the inclusion can be shown, false otherwise
+ */
+ bool regExpIncludes(Node r1, Node r2);
};
}/* CVC4::theory::strings namespace */
diff --git a/src/theory/strings/regexp_solver.cpp b/src/theory/strings/regexp_solver.cpp
index ea38c4dd9..db4c9012c 100644
--- a/src/theory/strings/regexp_solver.cpp
+++ b/src/theory/strings/regexp_solver.cpp
@@ -65,9 +65,15 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems)
Trace("regexp-process") << "Checking Memberships ... " << std::endl;
for (const std::pair<const Node, std::vector<Node> >& mr : mems)
{
+ std::vector<Node> mems = mr.second;
Trace("regexp-process")
<< "Memberships(" << mr.first << ") = " << mr.second << std::endl;
- if (!checkEqcIntersect(mr.second))
+ if (!checkEqcInclusion(mems))
+ {
+ // conflict discovered, return
+ return;
+ }
+ if (!checkEqcIntersect(mems))
{
// conflict discovered, return
return;
@@ -271,6 +277,90 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems)
}
}
+bool RegExpSolver::checkEqcInclusion(std::vector<Node>& mems)
+{
+ std::unordered_set<Node, NodeHashFunction> remove;
+
+ for (const Node& m1 : mems)
+ {
+ bool m1Neg = m1.getKind() == NOT;
+ Node m1Lit = m1Neg ? m1[0] : m1;
+
+ if (remove.find(m1) != remove.end())
+ {
+ // Skip memberships marked for removal
+ continue;
+ }
+
+ for (const Node& m2 : mems)
+ {
+ if (m1 == m2)
+ {
+ continue;
+ }
+
+ bool m2Neg = m2.getKind() == NOT;
+ Node m2Lit = m2Neg ? m2[0] : m2;
+
+ // Both regular expression memberships have the same polarity
+ if (m1Neg == m2Neg)
+ {
+ if (d_regexp_opr.regExpIncludes(m1Lit[1], m2Lit[1]))
+ {
+ if (m1Neg)
+ {
+ // ~str.in.re(x, R1) includes ~str.in.re(x, R2) --->
+ // mark ~str.in.re(x, R2) as reduced
+ d_parent.getExtTheory()->markReduced(m2Lit);
+ remove.insert(m2);
+ }
+ else
+ {
+ // str.in.re(x, R1) includes str.in.re(x, R2) --->
+ // mark str.in.re(x, R1) as reduced
+ d_parent.getExtTheory()->markReduced(m1Lit);
+ remove.insert(m1);
+
+ // We don't need to process m1 anymore
+ break;
+ }
+ }
+ }
+ else
+ {
+ Node pos = m1Neg ? m2Lit : m1Lit;
+ Node neg = m1Neg ? m1Lit : m2Lit;
+ if (d_regexp_opr.regExpIncludes(neg[1], pos[1]))
+ {
+ // We have a conflict because we have a case where str.in.re(x, R1)
+ // and ~str.in.re(x, R2) but R2 includes R1, so there is no
+ // possible value for x that satisfies both memberships.
+ std::vector<Node> vec_nodes;
+ vec_nodes.push_back(pos);
+ vec_nodes.push_back(neg.negate());
+
+ if (pos[0] != neg[0])
+ {
+ vec_nodes.push_back(pos[0].eqNode(neg[0]));
+ }
+
+ Node conc;
+ d_im.sendInference(vec_nodes, conc, "Intersect inclusion", true);
+ return false;
+ }
+ }
+ }
+ }
+
+ mems.erase(std::remove_if(
+ mems.begin(),
+ mems.end(),
+ [&remove](Node& n) { return remove.find(n) != remove.end(); }),
+ mems.end());
+
+ return true;
+}
+
bool RegExpSolver::checkEqcIntersect(const std::vector<Node>& mems)
{
// do not compute intersections if the re intersection mode is none
diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h
index f3abb2a1d..c4d6afda0 100644
--- a/src/theory/strings/regexp_solver.h
+++ b/src/theory/strings/regexp_solver.h
@@ -64,6 +64,23 @@ class RegExpSolver
private:
/**
+ * Check memberships in equivalence class for regular expression
+ * inclusion.
+ *
+ * This method returns false if it discovered a conflict for this set of
+ * assertions, and true otherwise. It discovers a conflict e.g. if mems
+ * contains str.in.re(xi, Ri) and ~str.in.re(xj, Rj) and Rj includes Ri.
+ *
+ * @param mems Vector of memberships of the form: (~)str.in.re(x1, R1)
+ * ... (~)str.in.re(xn, Rn) where x1 = ... = xn in the
+ * current context. The function removes elements from this
+ * vector that were marked as reduced.
+ * @param expForRe Additional explanations for regular expressions.
+ * @return False if a conflict was detected, true otherwise
+ */
+ bool checkEqcInclusion(std::vector<Node>& mems);
+
+ /**
* Check memberships for equivalence class.
* The vector mems is a vector of memberships of the form:
* (~) (x1 in R1 ) ... (~) (xn in Rn)
diff --git a/src/theory/strings/theory_strings_utils.cpp b/src/theory/strings/theory_strings_utils.cpp
index d3ae5384e..d764b87c1 100644
--- a/src/theory/strings/theory_strings_utils.cpp
+++ b/src/theory/strings/theory_strings_utils.cpp
@@ -134,6 +134,73 @@ Node getConstantEndpoint(Node e, bool isSuf)
return getConstantComponent(e);
}
+bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start)
+{
+ size_t i = start;
+ while (i < rs.size() && rs[i].getKind() == REGEXP_SIGMA)
+ {
+ i++;
+ }
+
+ if (i >= rs.size())
+ {
+ return false;
+ }
+
+ return rs[i].getKind() == REGEXP_STAR && rs[i][0].getKind() == REGEXP_SIGMA;
+}
+
+bool isSimpleRegExp(Node r)
+{
+ Assert(r.getType().isRegExp());
+
+ std::vector<Node> v;
+ utils::getConcat(r, v);
+ for (const Node& n : v)
+ {
+ if (n.getKind() == STRING_TO_REGEXP)
+ {
+ if (!n[0].isConst())
+ {
+ return false;
+ }
+ }
+ else if (n.getKind() != REGEXP_SIGMA
+ && (n.getKind() != REGEXP_STAR || n[0].getKind() != REGEXP_SIGMA))
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+void getRegexpComponents(Node r, std::vector<Node>& result)
+{
+ Assert(r.getType().isRegExp());
+
+ NodeManager* nm = NodeManager::currentNM();
+ if (r.getKind() == REGEXP_CONCAT)
+ {
+ for (const Node& n : r)
+ {
+ getRegexpComponents(n, result);
+ }
+ }
+ else if (r.getKind() == STRING_TO_REGEXP && r[0].isConst())
+ {
+ String s = r[0].getConst<String>();
+ for (size_t i = 0, size = s.size(); i < size; i++)
+ {
+ result.push_back(
+ nm->mkNode(STRING_TO_REGEXP, nm->mkConst(s.substr(i, 1))));
+ }
+ }
+ else
+ {
+ result.push_back(r);
+ }
+}
+
} // namespace utils
} // namespace strings
} // namespace theory
diff --git a/src/theory/strings/theory_strings_utils.h b/src/theory/strings/theory_strings_utils.h
index 57d882625..cadc98df3 100644
--- a/src/theory/strings/theory_strings_utils.h
+++ b/src/theory/strings/theory_strings_utils.h
@@ -78,6 +78,39 @@ Node getConstantComponent(Node t);
*/
Node getConstantEndpoint(Node e, bool isSuf);
+/**
+ * Given a vector of regular expression nodes and a start index that points to
+ * a wildcard, returns true if the wildcard is unbounded (i.e. it is followed
+ * by an arbitrary number of `re.allchar`s and then an `re.*(re.allchar)`. If
+ * the start index is not a wilcard or the wildcards are not followed by
+ * `re.*(re.allchar)`, the function returns false.
+ *
+ * @param rs The vector of regular expression nodes
+ * @param start The start index to consider
+ * @return True if the wilcard pointed to by `start` is unbounded, false
+ * otherwise
+ */
+bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start);
+
+/**
+ * Returns true iff the given regular expression only consists of re.++,
+ * re.allchar, (re.* re.allchar), and str.to.re of string literals.
+ *
+ * @param r The regular expression to check
+ * @return True if the regular expression is simple
+ */
+bool isSimpleRegExp(Node r);
+
+/**
+ * Helper function that takes a regular expression concatenation and
+ * returns the components of the concatenation. Letters of string literals
+ * are treated as individual components.
+ *
+ * @param r The regular expression
+ * @param result The resulting components
+ */
+void getRegexpComponents(Node r, std::vector<Node>& result);
+
} // namespace utils
} // namespace strings
} // namespace theory
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback