Infer conflicts based on regular expression inclusion (#3234)

We have a conflict if we have `str.in.re(x, R1)` and `~str.in.re(x, R2)` and `R2` includes `R1` because there is no possible value for `x` that satisfies both memberships. This commit adds code to detect regular expression inclusion for a small fragment of regular expressions: string literals with single char (`re.allchar`) and multichar wildcards (`re.*(re.allchar)`). Signed-off-by: Andres Noetzli <anoetzli@amazon.com>
author: Andres Noetzli <noetzli@stanford.edu> 2019-08-29 19:38:17 -0700
committer: GitHub <noreply@github.com> 2019-08-29 19:38:17 -0700
commit: 974fc1d23c2b6091c26cf316964c4c16c5e2733f (patch)
tree: b8e4b597ffc46194ee37687a56248309a63235b1 /src/theory/strings
parent: cc7546ff0a4e418de9a21c03ef12b1d5e8801bb8 (diff)
6 files changed, 334 insertions, 1 deletions
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp
index 7286e2fb4..b410670e5 100644
--- a/src/theory/strings/regexp_operation.cpp
+++ b/src/theory/strings/regexp_operation.cpp
@@ -19,6 +19,7 @@
 #include "expr/kind.h"
 #include "options/strings_options.h"
 #include "theory/strings/theory_strings_rewriter.h"
+#include "theory/strings/theory_strings_utils.h"
 
 using namespace CVC4;
 using namespace CVC4::kind;
@@ -1652,6 +1653,115 @@ std::string RegExpOpr::mkString( Node r ) {
   return retStr;
 }
 
+bool RegExpOpr::regExpIncludes(Node r1, Node r2)
+{
+  Assert(Rewriter::rewrite(r1) == r1);
+  Assert(Rewriter::rewrite(r2) == r2);
+
+  if (r1 == r2)
+  {
+    return true;
+  }
+
+  // This method only works on a fragment of regular expressions
+  if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2))
+  {
+    return false;
+  }
+
+  const auto& it = d_inclusionCache.find(std::make_pair(r1, r2));
+  if (it != d_inclusionCache.end())
+  {
+    return (*it).second;
+  }
+
+  std::vector<Node> v1, v2;
+  utils::getRegexpComponents(r1, v1);
+  utils::getRegexpComponents(r2, v2);
+
+  // In the following, we iterate over `r2` (the "includee") and try to
+  // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible
+  // positions in `r1` that we could currently be at.
+  std::unordered_set<size_t> newIdxs = {0};
+  std::unordered_set<size_t> idxs;
+  for (const Node& n2 : v2)
+  {
+    // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are
+    // removed and for (re.* re.allchar), we additionally include the
+    // option of skipping it.
+    idxs.clear();
+    for (size_t idx : newIdxs)
+    {
+      if (idx < v1.size())
+      {
+        idxs.insert(idx);
+        if (v1[idx] == d_sigma_star)
+        {
+          Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star);
+          idxs.insert(idx + 1);
+        }
+      }
+    }
+    newIdxs.clear();
+
+    if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma)
+    {
+      Assert(n2 == d_sigma
+             || (n2[0].isConst() && n2[0].getConst<String>().size() == 1));
+      for (size_t idx : idxs)
+      {
+        if (v1[idx] == d_sigma || v1[idx] == n2)
+        {
+          // Given a character or an re.allchar in `r2`, we can either
+          // match it with a corresponding character in `r1` or an
+          // re.allchar
+          newIdxs.insert(idx + 1);
+        }
+      }
+    }
+
+    for (size_t idx : idxs)
+    {
+      if (v1[idx] == d_sigma_star)
+      {
+        // (re.* re.allchar) can match an arbitrary amount of `r2`
+        newIdxs.insert(idx);
+      }
+      else if (utils::isUnboundedWildcard(v1, idx))
+      {
+        // If a series of re.allchar is followed by (re.* re.allchar), we
+        // can decide not to "waste" the re.allchar because the order of
+        // the two wildcards is not observable (i.e. it does not change
+        // the sequences matched by the regular expression)
+        newIdxs.insert(idx);
+      }
+    }
+
+    if (newIdxs.empty())
+    {
+      // If there are no candidates, we can't match the remainder of r2
+      d_inclusionCache[std::make_pair(r1, r2)] = false;
+      return false;
+    }
+  }
+
+  // We have processed all of `r2`. We are now looking if there was also a
+  // path to the end in `r1`. This makes sure that we don't have leftover
+  // bits in `r1` that don't match anything in `r2`.
+  bool result = false;
+  for (size_t idx : newIdxs)
+  {
+    if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star))
+    {
+      result = true;
+      break;
+    }
+  }
+
+  d_inclusionCache[std::make_pair(r1, r2)] = result;
+  return result;
+}
+
 }/* CVC4::theory::strings namespace */
 }/* CVC4::theory namespace */
 }/* CVC4 namespace */
diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h
index cb35b37c6..117449d35 100644
--- a/src/theory/strings/regexp_operation.h
+++ b/src/theory/strings/regexp_operation.h
@@ -85,6 +85,7 @@ class RegExpOpr {
   std::map<Node, Node> d_rm_inter_cache;
   std::map<Node, bool> d_norv_cache;
   std::map<Node, std::vector<PairNodes> > d_split_cache;
+  std::map<PairNodes, bool> d_inclusionCache;
   void simplifyPRegExp(Node s, Node r, std::vector<Node> &new_nodes);
   void simplifyNRegExp(Node s, Node r, std::vector<Node> &new_nodes);
   /**
@@ -129,6 +130,21 @@ class RegExpOpr {
   Node intersect(Node r1, Node r2, bool &spflag);
   /** Get the pretty printed version of the regular expression r */
   static std::string mkString(Node r);
+
+  /**
+   * Returns true if we can show that the regular expression `r1` includes
+   * the regular expression `r2` (i.e. `r1` matches a superset of sequences
+   * that `r2` matches). This method only works on a fragment of regular
+   * expressions, specifically regular expressions that pass the
+   * `isSimpleRegExp` check.
+   *
+   * @param r1 The regular expression that may include `r2` (must be in
+   *           rewritten form)
+   * @param r2 The regular expression that may be included by `r1` (must be
+   *           in rewritten form)
+   * @return True if the inclusion can be shown, false otherwise
+   */
+  bool regExpIncludes(Node r1, Node r2);
 };
 
 }/* CVC4::theory::strings namespace */
diff --git a/src/theory/strings/regexp_solver.cpp b/src/theory/strings/regexp_solver.cpp
index ea38c4dd9..db4c9012c 100644
--- a/src/theory/strings/regexp_solver.cpp
+++ b/src/theory/strings/regexp_solver.cpp
@@ -65,9 +65,15 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems)
   Trace("regexp-process") << "Checking Memberships ... " << std::endl;
   for (const std::pair<const Node, std::vector<Node> >& mr : mems)
   {
+    std::vector<Node> mems = mr.second;
     Trace("regexp-process")
         << "Memberships(" << mr.first << ") = " << mr.second << std::endl;
-    if (!checkEqcIntersect(mr.second))
+    if (!checkEqcInclusion(mems))
+    {
+      // conflict discovered, return
+      return;
+    }
+    if (!checkEqcIntersect(mems))
     {
       // conflict discovered, return
       return;
@@ -271,6 +277,90 @@ void RegExpSolver::check(const std::map<Node, std::vector<Node> >& mems)
   }
 }
 
+bool RegExpSolver::checkEqcInclusion(std::vector<Node>& mems)
+{
+  std::unordered_set<Node, NodeHashFunction> remove;
+
+  for (const Node& m1 : mems)
+  {
+    bool m1Neg = m1.getKind() == NOT;
+    Node m1Lit = m1Neg ? m1[0] : m1;
+
+    if (remove.find(m1) != remove.end())
+    {
+      // Skip memberships marked for removal
+      continue;
+    }
+
+    for (const Node& m2 : mems)
+    {
+      if (m1 == m2)
+      {
+        continue;
+      }
+
+      bool m2Neg = m2.getKind() == NOT;
+      Node m2Lit = m2Neg ? m2[0] : m2;
+
+      // Both regular expression memberships have the same polarity
+      if (m1Neg == m2Neg)
+      {
+        if (d_regexp_opr.regExpIncludes(m1Lit[1], m2Lit[1]))
+        {
+          if (m1Neg)
+          {
+            // ~str.in.re(x, R1) includes ~str.in.re(x, R2) --->
+            //   mark ~str.in.re(x, R2) as reduced
+            d_parent.getExtTheory()->markReduced(m2Lit);
+            remove.insert(m2);
+          }
+          else
+          {
+            // str.in.re(x, R1) includes str.in.re(x, R2) --->
+            //   mark str.in.re(x, R1) as reduced
+            d_parent.getExtTheory()->markReduced(m1Lit);
+            remove.insert(m1);
+
+            // We don't need to process m1 anymore
+            break;
+          }
+        }
+      }
+      else
+      {
+        Node pos = m1Neg ? m2Lit : m1Lit;
+        Node neg = m1Neg ? m1Lit : m2Lit;
+        if (d_regexp_opr.regExpIncludes(neg[1], pos[1]))
+        {
+          // We have a conflict because we have a case where str.in.re(x, R1)
+          // and ~str.in.re(x, R2) but R2 includes R1, so there is no
+          // possible value for x that satisfies both memberships.
+          std::vector<Node> vec_nodes;
+          vec_nodes.push_back(pos);
+          vec_nodes.push_back(neg.negate());
+
+          if (pos[0] != neg[0])
+          {
+            vec_nodes.push_back(pos[0].eqNode(neg[0]));
+          }
+
+          Node conc;
+          d_im.sendInference(vec_nodes, conc, "Intersect inclusion", true);
+          return false;
+        }
+      }
+    }
+  }
+
+  mems.erase(std::remove_if(
+                 mems.begin(),
+                 mems.end(),
+                 [&remove](Node& n) { return remove.find(n) != remove.end(); }),
+             mems.end());
+
+  return true;
+}
+
 bool RegExpSolver::checkEqcIntersect(const std::vector<Node>& mems)
 {
   // do not compute intersections if the re intersection mode is none
diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h
index f3abb2a1d..c4d6afda0 100644
--- a/src/theory/strings/regexp_solver.h
+++ b/src/theory/strings/regexp_solver.h
@@ -64,6 +64,23 @@ class RegExpSolver
 
  private:
   /**
+   * Check memberships in equivalence class for regular expression
+   * inclusion.
+   *
+   * This method returns false if it discovered a conflict for this set of
+   * assertions, and true otherwise. It discovers a conflict e.g. if mems
+   * contains str.in.re(xi, Ri) and ~str.in.re(xj, Rj) and Rj includes Ri.
+   *
+   * @param mems Vector of memberships of the form: (~)str.in.re(x1, R1)
+   *             ... (~)str.in.re(xn, Rn) where x1 = ... = xn in the
+   *             current context. The function removes elements from this
+   *             vector that were marked as reduced.
+   * @param expForRe Additional explanations for regular expressions.
+   * @return False if a conflict was detected, true otherwise
+   */
+  bool checkEqcInclusion(std::vector<Node>& mems);
+
+  /**
    * Check memberships for equivalence class.
    * The vector mems is a vector of memberships of the form:
    *   (~) (x1 in R1 ) ... (~) (xn in Rn)
diff --git a/src/theory/strings/theory_strings_utils.cpp b/src/theory/strings/theory_strings_utils.cpp
index d3ae5384e..d764b87c1 100644
--- a/src/theory/strings/theory_strings_utils.cpp
+++ b/src/theory/strings/theory_strings_utils.cpp
@@ -134,6 +134,73 @@ Node getConstantEndpoint(Node e, bool isSuf)
   return getConstantComponent(e);
 }
 
+bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start)
+{
+  size_t i = start;
+  while (i < rs.size() && rs[i].getKind() == REGEXP_SIGMA)
+  {
+    i++;
+  }
+
+  if (i >= rs.size())
+  {
+    return false;
+  }
+
+  return rs[i].getKind() == REGEXP_STAR && rs[i][0].getKind() == REGEXP_SIGMA;
+}
+
+bool isSimpleRegExp(Node r)
+{
+  Assert(r.getType().isRegExp());
+
+  std::vector<Node> v;
+  utils::getConcat(r, v);
+  for (const Node& n : v)
+  {
+    if (n.getKind() == STRING_TO_REGEXP)
+    {
+      if (!n[0].isConst())
+      {
+        return false;
+      }
+    }
+    else if (n.getKind() != REGEXP_SIGMA
+             && (n.getKind() != REGEXP_STAR || n[0].getKind() != REGEXP_SIGMA))
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+void getRegexpComponents(Node r, std::vector<Node>& result)
+{
+  Assert(r.getType().isRegExp());
+
+  NodeManager* nm = NodeManager::currentNM();
+  if (r.getKind() == REGEXP_CONCAT)
+  {
+    for (const Node& n : r)
+    {
+      getRegexpComponents(n, result);
+    }
+  }
+  else if (r.getKind() == STRING_TO_REGEXP && r[0].isConst())
+  {
+    String s = r[0].getConst<String>();
+    for (size_t i = 0, size = s.size(); i < size; i++)
+    {
+      result.push_back(
+          nm->mkNode(STRING_TO_REGEXP, nm->mkConst(s.substr(i, 1))));
+    }
+  }
+  else
+  {
+    result.push_back(r);
+  }
+}
+
 }  // namespace utils
 }  // namespace strings
 }  // namespace theory
diff --git a/src/theory/strings/theory_strings_utils.h b/src/theory/strings/theory_strings_utils.h
index 57d882625..cadc98df3 100644
--- a/src/theory/strings/theory_strings_utils.h
+++ b/src/theory/strings/theory_strings_utils.h
@@ -78,6 +78,39 @@ Node getConstantComponent(Node t);
  */
 Node getConstantEndpoint(Node e, bool isSuf);
 
+/**
+ * Given a vector of regular expression nodes and a start index that points to
+ * a wildcard, returns true if the wildcard is unbounded (i.e. it is followed
+ * by an arbitrary number of `re.allchar`s and then an `re.*(re.allchar)`. If
+ * the start index is not a wilcard or the wildcards are not followed by
+ * `re.*(re.allchar)`, the function returns false.
+ *
+ * @param rs The vector of regular expression nodes
+ * @param start The start index to consider
+ * @return True if the wilcard pointed to by `start` is unbounded, false
+ *         otherwise
+ */
+bool isUnboundedWildcard(const std::vector<Node>& rs, size_t start);
+
+/**
+ * Returns true iff the given regular expression only consists of re.++,
+ * re.allchar, (re.* re.allchar), and str.to.re of string literals.
+ *
+ * @param r The regular expression to check
+ * @return True if the regular expression is simple
+ */
+bool isSimpleRegExp(Node r);
+
+/**
+ * Helper function that takes a regular expression concatenation and
+ * returns the components of the concatenation. Letters of string literals
+ * are treated as individual components.
+ *
+ * @param r The regular expression
+ * @param result The resulting components
+ */
+void getRegexpComponents(Node r, std::vector<Node>& result);
+
 }  // namespace utils
 }  // namespace strings
 }  // namespace theory
author	Andres Noetzli <noetzli@stanford.edu>	2019-08-29 19:38:17 -0700
committer	GitHub <noreply@github.com>	2019-08-29 19:38:17 -0700
commit	974fc1d23c2b6091c26cf316964c4c16c5e2733f (patch)
tree	b8e4b597ffc46194ee37687a56248309a63235b1 /src/theory/strings
parent	cc7546ff0a4e418de9a21c03ef12b1d5e8801bb8 (diff)