Fix `str.replace_re` and `str.replace_re_all` (#6615)

Fixes #6057. The reductions of `str.replace_re` and `str.replace_re_all` were not correctly enforcing that the operations replace the _first_ occurrence of some regular expression in a string. This commit fixes the issue by introducing a new operator `str.indexof_re(s, r, n)`, which, analoguously to `str.indexof`, returns the index of the first match of the regular expression `r` in `s`. The commit adds basic rewrites for evaluating the operator as well as its reduction. Additionally, it converts the reductions of `str.replace_re` and `str.replace_re_all` to use that new operator. This simplifies the reductions of the two operators and ensures that the semantics are consistent between the two.
author: Andres Noetzli <andres.noetzli@gmail.com> 2021-05-27 15:42:10 -0700
committer: GitHub <noreply@github.com> 2021-05-27 22:42:10 +0000
commit: 29f0b8f378377ed836bddaaf88883d0b2eeb545d (patch)
tree: 5d9edb57c60ab8c8a07dab52f18b72dd441d4fdf /src/theory/strings
parent: 631032b15327c28c44b51490dceb434a38f3419a (diff)
12 files changed, 296 insertions, 93 deletions
diff --git a/src/theory/strings/extf_solver.cpp b/src/theory/strings/extf_solver.cpp
index a1de5e295..9576c1d81 100644
--- a/src/theory/strings/extf_solver.cpp
+++ b/src/theory/strings/extf_solver.cpp
@@ -53,6 +53,7 @@ ExtfSolver::ExtfSolver(SolverState& s,
   d_extt.addFunctionKind(kind::STRING_SUBSTR);
   d_extt.addFunctionKind(kind::STRING_UPDATE);
   d_extt.addFunctionKind(kind::STRING_STRIDOF);
+  d_extt.addFunctionKind(kind::STRING_INDEXOF_RE);
   d_extt.addFunctionKind(kind::STRING_ITOS);
   d_extt.addFunctionKind(kind::STRING_STOI);
   d_extt.addFunctionKind(kind::STRING_STRREPL);
@@ -191,11 +192,11 @@ bool ExtfSolver::doReduction(int effort, Node n)
   {
     NodeManager* nm = NodeManager::currentNM();
     Assert(k == STRING_SUBSTR || k == STRING_UPDATE || k == STRING_STRCTN
-           || k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI
-           || k == STRING_STRREPL || k == STRING_STRREPLALL || k == SEQ_NTH
-           || k == STRING_REPLACE_RE || k == STRING_REPLACE_RE_ALL
-           || k == STRING_LEQ || k == STRING_TOLOWER || k == STRING_TOUPPER
-           || k == STRING_REV);
+           || k == STRING_STRIDOF || k == STRING_INDEXOF_RE || k == STRING_ITOS
+           || k == STRING_STOI || k == STRING_STRREPL || k == STRING_STRREPLALL
+           || k == SEQ_NTH || k == STRING_REPLACE_RE
+           || k == STRING_REPLACE_RE_ALL || k == STRING_LEQ
+           || k == STRING_TOLOWER || k == STRING_TOUPPER || k == STRING_REV);
     std::vector<Node> new_nodes;
     Node res = d_preproc.simplify(n, new_nodes);
     Assert(res != n);
diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds
index 1353ca964..743a5c006 100644
--- a/src/theory/strings/kinds
+++ b/src/theory/strings/kinds
@@ -21,13 +21,12 @@ operator STRING_CHARAT 2 "string charat"
 operator STRING_STRCTN 2 "string contains"
 operator STRING_LT 2 "string less than"
 operator STRING_LEQ 2 "string less than or equal"
-operator STRING_STRIDOF 3 "string indexof"
+operator STRING_STRIDOF 3 "string index of substring"
+operator STRING_INDEXOF_RE 3 "string index of regular expression match"
 operator STRING_STRREPL 3 "string replace"
 operator STRING_STRREPLALL 3 "string replace all"
 operator STRING_REPLACE_RE 3 "string replace regular expression match"
 operator STRING_REPLACE_RE_ALL 3 "string replace all regular expression matches"
-typerule STRING_REPLACE_RE "SimpleTypeRule<RString, AString, ARegExp, AString>"
-typerule STRING_REPLACE_RE_ALL "SimpleTypeRule<RString, AString, ARegExp, AString>"
 operator STRING_PREFIX 2 "string prefixof"
 operator STRING_SUFFIX 2 "string suffixof"
 operator STRING_IS_DIGIT 1 "string isdigit, returns true if argument is a string of length one that represents a digit"
@@ -152,8 +151,11 @@ typerule STRING_UPDATE ::cvc5::theory::strings::StringUpdateTypeRule
 typerule STRING_CHARAT ::cvc5::theory::strings::StringAtTypeRule
 typerule STRING_STRCTN ::cvc5::theory::strings::StringRelationTypeRule
 typerule STRING_STRIDOF ::cvc5::theory::strings::StringIndexOfTypeRule
+typerule STRING_INDEXOF_RE "SimpleTypeRule<RInteger, AString, ARegExp, AInteger>"
 typerule STRING_STRREPL ::cvc5::theory::strings::StringReplaceTypeRule
 typerule STRING_STRREPLALL ::cvc5::theory::strings::StringReplaceTypeRule
+typerule STRING_REPLACE_RE "SimpleTypeRule<RString, AString, ARegExp, AString>"
+typerule STRING_REPLACE_RE_ALL "SimpleTypeRule<RString, AString, ARegExp, AString>"
 typerule STRING_PREFIX ::cvc5::theory::strings::StringStrToBoolTypeRule
 typerule STRING_SUFFIX ::cvc5::theory::strings::StringStrToBoolTypeRule
 typerule STRING_REV ::cvc5::theory::strings::StringStrToStrTypeRule
diff --git a/src/theory/strings/rewrites.cpp b/src/theory/strings/rewrites.cpp
index 0c4e35df7..549d33efc 100644
--- a/src/theory/strings/rewrites.cpp
+++ b/src/theory/strings/rewrites.cpp
@@ -68,6 +68,10 @@ const char* toString(Rewrite r)
     case Rewrite::IDOF_PULL_ENDPT: return "IDOF_PULL_ENDPT";
     case Rewrite::IDOF_STRIP_CNST_ENDPTS: return "IDOF_STRIP_CNST_ENDPTS";
     case Rewrite::IDOF_STRIP_SYM_LEN: return "IDOF_STRIP_SYM_LEN";
+    case Rewrite::INDEXOF_RE_EMP_RE: return "INDEXOF_RE_EMP_RE";
+    case Rewrite::INDEXOF_RE_EVAL: return "INDEXOF_RE_EVAL";
+    case Rewrite::INDEXOF_RE_INVALID_INDEX: return "INDEXOF_RE_INVALID_INDEX";
+    case Rewrite::INDEXOF_RE_MAX_INDEX: return "INDEXOF_RE_MAX_INDEX";
     case Rewrite::ITOS_EVAL: return "ITOS_EVAL";
     case Rewrite::RE_AND_EMPTY: return "RE_AND_EMPTY";
     case Rewrite::RE_ANDOR_FLATTEN: return "RE_ANDOR_FLATTEN";
diff --git a/src/theory/strings/rewrites.h b/src/theory/strings/rewrites.h
index 482666faa..5e63c55c8 100644
--- a/src/theory/strings/rewrites.h
+++ b/src/theory/strings/rewrites.h
@@ -73,6 +73,10 @@ enum class Rewrite : uint32_t
   IDOF_PULL_ENDPT,
   IDOF_STRIP_CNST_ENDPTS,
   IDOF_STRIP_SYM_LEN,
+  INDEXOF_RE_EMP_RE,
+  INDEXOF_RE_EVAL,
+  INDEXOF_RE_INVALID_INDEX,
+  INDEXOF_RE_MAX_INDEX,
   ITOS_EVAL,
   RE_AND_EMPTY,
   RE_ANDOR_FLATTEN,
diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp
index 15c37070a..7ef1242c6 100644
--- a/src/theory/strings/sequences_rewriter.cpp
+++ b/src/theory/strings/sequences_rewriter.cpp
@@ -1461,6 +1461,10 @@ RewriteResponse SequencesRewriter::postRewrite(TNode node)
   {
     retNode = rewriteIndexof(node);
   }
+  else if (nk == kind::STRING_INDEXOF_RE)
+  {
+    retNode = rewriteIndexofRe(node);
+  }
   else if (nk == kind::STRING_STRREPL)
   {
     retNode = rewriteReplace(node);
@@ -2529,6 +2533,59 @@ Node SequencesRewriter::rewriteIndexof(Node node)
   return node;
 }
 
+Node SequencesRewriter::rewriteIndexofRe(Node node)
+{
+  Assert(node.getKind() == STRING_INDEXOF_RE);
+  NodeManager* nm = NodeManager::currentNM();
+  Node s = node[0];
+  Node r = node[1];
+  Node n = node[2];
+  Node zero = nm->mkConst(Rational(0));
+  Node slen = nm->mkNode(STRING_LENGTH, s);
+
+  if (ArithEntail::check(zero, n, true) || ArithEntail::check(n, slen, true))
+  {
+    Node ret = nm->mkConst(Rational(-1));
+    return returnRewrite(node, ret, Rewrite::INDEXOF_RE_INVALID_INDEX);
+  }
+
+  if (RegExpEntail::isConstRegExp(r))
+  {
+    if (s.isConst() && n.isConst())
+    {
+      Rational nrat = n.getConst<Rational>();
+      cvc5::Rational rMaxInt(cvc5::String::maxSize());
+      if (nrat > rMaxInt)
+      {
+        // We know that, due to limitations on the size of string constants
+        // in our implementation, that accessing a position greater than
+        // rMaxInt is guaranteed to be out of bounds.
+        Node negone = nm->mkConst(Rational(-1));
+        return returnRewrite(node, negone, Rewrite::INDEXOF_RE_MAX_INDEX);
+      }
+
+      uint32_t start = nrat.getNumerator().toUnsignedInt();
+      Node rem = nm->mkConst(s.getConst<String>().substr(start));
+      std::pair<size_t, size_t> match = firstMatch(rem, r);
+      Node ret = nm->mkConst(
+          Rational(match.first == string::npos
+                       ? -1
+                       : static_cast<int64_t>(start + match.first)));
+      return returnRewrite(node, ret, Rewrite::INDEXOF_RE_EVAL);
+    }
+
+    if (ArithEntail::check(n, zero) && ArithEntail::check(slen, n))
+    {
+      String emptyStr("");
+      if (RegExpEntail::testConstStringInRegExp(emptyStr, 0, r))
+      {
+        return returnRewrite(node, n, Rewrite::INDEXOF_RE_EMP_RE);
+      }
+    }
+  }
+  return node;
+}
+
 Node SequencesRewriter::rewriteReplace(Node node)
 {
   Assert(node.getKind() == kind::STRING_STRREPL);
@@ -3150,7 +3207,7 @@ Node SequencesRewriter::rewriteReplaceReAll(Node node)
       std::vector<Node> res;
       String rem = x.getConst<String>();
       std::pair<size_t, size_t> match(0, 0);
-      while (rem.size() >= 0)
+      while (rem.size() != 0)
       {
         match = firstMatch(nm->mkConst(rem), yp);
         if (match.first == string::npos)
diff --git a/src/theory/strings/sequences_rewriter.h b/src/theory/strings/sequences_rewriter.h
index 7af24596a..37ed78786 100644
--- a/src/theory/strings/sequences_rewriter.h
+++ b/src/theory/strings/sequences_rewriter.h
@@ -194,6 +194,12 @@ class SequencesRewriter : public TheoryRewriter
    * Returns the rewritten form of node.
    */
   Node rewriteIndexof(Node node);
+  /** rewrite indexof regular expression match
+   * This is the entry point for post-rewriting terms n of the form
+   *   str.indexof_re( s, r, n )
+   * Returns the rewritten form of node.
+   */
+  Node rewriteIndexofRe(Node node);
   /** rewrite replace
    * This is the entry point for post-rewriting terms n of the form
    *   str.replace( s, t, r )
diff --git a/src/theory/strings/skolem_cache.cpp b/src/theory/strings/skolem_cache.cpp
index 9b23301f3..913518bc8 100644
--- a/src/theory/strings/skolem_cache.cpp
+++ b/src/theory/strings/skolem_cache.cpp
@@ -40,6 +40,16 @@ struct IndexVarAttributeId
 };
 typedef expr::Attribute<IndexVarAttributeId, Node> IndexVarAttribute;
 
+/**
+ * A bound variable corresponding to the universally quantified integer
+ * variable used to range over the valid lengths of a string, used for
+ * axiomatizing the behavior of some term.
+ */
+struct LengthVarAttributeId
+{
+};
+typedef expr::Attribute<LengthVarAttributeId, Node> LengthVarAttribute;
+
 SkolemCache::SkolemCache(bool useOpts) : d_useOpts(useOpts)
 {
   NodeManager* nm = NodeManager::currentNM();
@@ -300,6 +310,14 @@ Node SkolemCache::mkIndexVar(Node t)
   return bvm->mkBoundVar<IndexVarAttribute>(t, intType);
 }
 
+Node SkolemCache::mkLengthVar(Node t)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  TypeNode intType = nm->integerType();
+  BoundVarManager* bvm = nm->getBoundVarManager();
+  return bvm->mkBoundVar<LengthVarAttribute>(t, intType);
+}
+
 }  // namespace strings
 }  // namespace theory
 }  // namespace cvc5
diff --git a/src/theory/strings/skolem_cache.h b/src/theory/strings/skolem_cache.h
index 126ee313d..f0376dbc6 100644
--- a/src/theory/strings/skolem_cache.h
+++ b/src/theory/strings/skolem_cache.h
@@ -107,9 +107,14 @@ class SkolemCache
     // in_re(a, re.++(_*, b, _*)) =>
     //    exists k_pre, k_match, k_post.
     //       a = k_pre ++ k_match ++ k_post ^
-    //       ~in_re(k_pre ++ substr(k_match, 0, str.len(k_match) - 1),
-    //              re.++(_*, b, _*)) ^
-    //       in_re(k2, y)
+    //       len(k_pre) = indexof_re(x, y, 0) ^
+    //       (forall l. 0 < l < len(k_match) =>
+    //         ~in_re(substr(k_match, 0, l), r)) ^
+    //       in_re(k_match, b)
+    //
+    // k_pre is the prefix before the first, shortest match of b in a. k_match
+    // is the substring of a matched by b. It is either empty or there is no
+    // shorter string that matches b.
     SK_FIRST_MATCH_PRE,
     SK_FIRST_MATCH,
     SK_FIRST_MATCH_POST,
@@ -180,6 +185,16 @@ class SkolemCache
    */
   static Node mkIndexVar(Node t);
 
+  /** Make length variable
+   *
+   * This returns an integer variable of kind BOUND_VARIABLE that is used for
+   * axiomatizing the behavior of a term or predicate t. It refers to lengths
+   * of strings in the reduction of t. For example, the length variable for the
+   * term str.indexof(s, r, n) is used to quantify over the lengths of strings
+   * that could be matched by r.
+   */
+  static Node mkLengthVar(Node t);
+
  private:
   /**
    * Simplifies the arguments for a string skolem used for indexing into the
diff --git a/src/theory/strings/term_registry.cpp b/src/theory/strings/term_registry.cpp
index af2eecde8..c7b3b5300 100644
--- a/src/theory/strings/term_registry.cpp
+++ b/src/theory/strings/term_registry.cpp
@@ -88,10 +88,11 @@ Node TermRegistry::eagerReduce(Node t, SkolemCache* sc)
             LT, t, nm->mkConst(Rational(utils::getAlphabetCardinality()))));
     lemma = nm->mkNode(ITE, code_len, code_range, code_eq_neg1);
   }
-  else if (tk == STRING_STRIDOF)
+  else if (tk == STRING_STRIDOF || tk == STRING_INDEXOF_RE)
   {
-    // (and (>= (str.indexof x y n) (- 1)) (<= (str.indexof x y n) (str.len
-    // x)))
+    // (and (>= (f x y n) (- 1)) (<= (f x y n) (str.len x)))
+    //
+    // where f in { str.indexof, str.indexof_re }
     Node l = nm->mkNode(STRING_LENGTH, t[0]);
     lemma = nm->mkNode(AND,
                        nm->mkNode(GEQ, t, nm->mkConst(Rational(-1))),
@@ -143,11 +144,12 @@ void TermRegistry::preRegisterTerm(TNode n)
   Kind k = n.getKind();
   if (!options::stringExp())
   {
-    if (k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI
-        || k == STRING_STRREPL || k == STRING_SUBSTR || k == STRING_STRREPLALL
-        || k == SEQ_NTH || k == STRING_REPLACE_RE || k == STRING_REPLACE_RE_ALL
-        || k == STRING_STRCTN || k == STRING_LEQ || k == STRING_TOLOWER
-        || k == STRING_TOUPPER || k == STRING_REV || k == STRING_UPDATE)
+    if (k == STRING_STRIDOF || k == STRING_INDEXOF_RE || k == STRING_ITOS
+        || k == STRING_STOI || k == STRING_STRREPL || k == STRING_SUBSTR
+        || k == STRING_STRREPLALL || k == SEQ_NTH || k == STRING_REPLACE_RE
+        || k == STRING_REPLACE_RE_ALL || k == STRING_STRCTN || k == STRING_LEQ
+        || k == STRING_TOLOWER || k == STRING_TOUPPER || k == STRING_REV
+        || k == STRING_UPDATE)
     {
       std::stringstream ss;
       ss << "Term of kind " << k
diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp
index 06a39ec64..b1f5a0765 100644
--- a/src/theory/strings/theory_strings.cpp
+++ b/src/theory/strings/theory_strings.cpp
@@ -130,6 +130,7 @@ void TheoryStrings::finishInit()
   d_equalityEngine->addFunctionKind(kind::STRING_ITOS, eagerEval);
   d_equalityEngine->addFunctionKind(kind::STRING_STOI, eagerEval);
   d_equalityEngine->addFunctionKind(kind::STRING_STRIDOF, eagerEval);
+  d_equalityEngine->addFunctionKind(kind::STRING_INDEXOF_RE, eagerEval);
   d_equalityEngine->addFunctionKind(kind::STRING_STRREPL, eagerEval);
   d_equalityEngine->addFunctionKind(kind::STRING_STRREPLALL, eagerEval);
   d_equalityEngine->addFunctionKind(kind::STRING_REPLACE_RE, eagerEval);
diff --git a/src/theory/strings/theory_strings_preprocess.cpp b/src/theory/strings/theory_strings_preprocess.cpp
index 356ae28ac..83a4fa04a 100644
--- a/src/theory/strings/theory_strings_preprocess.cpp
+++ b/src/theory/strings/theory_strings_preprocess.cpp
@@ -253,6 +253,89 @@ Node StringsPreprocess::reduce(Node t,
     // Thus, indexof( x, y, n ) = skk.
     retNode = skk;
   }
+  else if (t.getKind() == kind::STRING_INDEXOF_RE)
+  {
+    // processing term:  indexof_re(s, r, n)
+    Node s = t[0];
+    Node r = t[1];
+    Node n = t[2];
+    Node skk = sc->mkTypedSkolemCached(
+        nm->integerType(), t, SkolemCache::SK_PURIFY, "iork");
+    Node sLen = nm->mkNode(STRING_LENGTH, s);
+
+    // n > len(x)
+    Node ub = nm->mkNode(GT, n, sLen);
+    // 0 > n
+    Node lb = nm->mkNode(GT, zero, n);
+    // n > len(x) OR 0 > n
+    Node condNegOne = nm->mkNode(OR, ub, lb);
+    // skk = -1
+    Node retNegOne = skk.eqNode(negOne);
+
+    Node emp = Word::mkEmptyWord(s.getType());
+    // in_re("", r)
+    Node matchEmptyStr = nm->mkNode(STRING_IN_REGEXP, emp, r);
+    // skk = n
+    Node retN = skk.eqNode(n);
+
+    Node i = SkolemCache::mkIndexVar(t);
+    Node l = SkolemCache::mkLengthVar(t);
+    Node bvl = nm->mkNode(BOUND_VAR_LIST, i, l);
+    Node bound =
+        nm->mkNode(AND,
+                   {
+                       nm->mkNode(GEQ, i, n),
+                       nm->mkNode(LT, i, nm->mkNode(ITE, retNegOne, sLen, skk)),
+                       nm->mkNode(GT, l, zero),
+                       nm->mkNode(LEQ, l, nm->mkNode(MINUS, sLen, i)),
+                   });
+    Node body = nm->mkNode(
+        OR,
+        bound.negate(),
+        nm->mkNode(STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, s, i, l), r)
+            .negate());
+    // forall il.
+    //   n <= i < ite(skk = -1, len(s), skk) ^ 0 < l <= len(s) - i =>
+    //     ~in_re(substr(s, i, l), r)
+    Node firstMatch = mkForallInternal(bvl, body);
+    Node bvll = nm->mkNode(BOUND_VAR_LIST, l);
+    Node validLen =
+        nm->mkNode(AND,
+                   nm->mkNode(GEQ, l, n),
+                   nm->mkNode(LEQ, l, nm->mkNode(MINUS, sLen, skk)));
+    Node matchBody = nm->mkNode(
+        AND,
+        validLen,
+        nm->mkNode(STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, s, skk, l), r));
+    // skk != -1 =>
+    //   exists l. n <= l < len(s) - skk ^ in_re(substr(s, skk, l), r))
+    Node match = nm->mkNode(
+        OR, retNegOne, mkForallInternal(bvll, matchBody.negate()).negate());
+
+    // assert:
+    // IF:   n > len(s) OR 0 > n
+    // THEN: skk = -1
+    // ELIF: in_re("", r)
+    // THEN: skk = n
+    // ELSE: (forall il.
+    //         n <= i < ite(skk = -1, len(s), skk) ^ 0 < l <= len(s) - i =>
+    //           ~in_re(substr(s, i, l), r)) ^
+    //       (skk != -1 =>
+    //          exists l. n <= l < len(s) - skk ^ in_re(substr(s, skk, l), r))
+    //
+    // Note that this reduction relies on eager reduction lemmas being sent to
+    // properly limit the range of skk.
+    Node rr = nm->mkNode(
+        ITE,
+        condNegOne,
+        retNegOne,
+        nm->mkNode(
+            ITE, matchEmptyStr, retN, nm->mkNode(AND, firstMatch, match)));
+    asserts.push_back(rr);
+
+    // Thus, indexof_re(s, r, n) = skk.
+    retNode = skk;
+  }
   else if (t.getKind() == STRING_ITOS)
   {
     // processing term:  int.to.str( n )
@@ -596,17 +679,14 @@ Node StringsPreprocess::reduce(Node t,
   }
   else if (t.getKind() == STRING_REPLACE_RE)
   {
+    // processing term: replace_re( x, y, z )
     Node x = t[0];
     Node y = t[1];
     Node z = t[2];
     Node k = sc->mkSkolemCached(t, SkolemCache::SK_PURIFY, "k");
 
-    std::vector<Node> emptyVec;
-    Node sigmaStar =
-        nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec));
-    Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, y, sigmaStar);
-    // in_re(x, re.++(_*, y, _*))
-    Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re);
+    // indexof_re(x, y, 0)
+    Node idx = nm->mkNode(STRING_INDEXOF_RE, x, y, zero);
 
     // in_re("", y)
     Node matchesEmpty =
@@ -620,42 +700,40 @@ Node StringsPreprocess::reduce(Node t,
         sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH, "rre_match");
     Node k3 =
         sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH_POST, "rre_post");
+    Node k2Len = nm->mkNode(STRING_LENGTH, k2);
     // x = k1 ++ k2 ++ k3
-    Node splitX = x.eqNode(nm->mkNode(STRING_CONCAT, k1, k2, k3));
-    // ~in_re(k1 ++ str.substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*))
-    Node k2len = nm->mkNode(STRING_LENGTH, k2);
-    Node firstMatch =
-        nm->mkNode(
-              STRING_IN_REGEXP,
-              nm->mkNode(
-                  STRING_CONCAT,
-                  k1,
-                  nm->mkNode(
-                      STRING_SUBSTR, k2, zero, nm->mkNode(MINUS, k2len, one))),
-              re)
-            .negate();
+    Node split = x.eqNode(nm->mkNode(STRING_CONCAT, k1, k2, k3));
+    // len(k1) = indexof_re(x, y, 0)
+    Node k1Len = nm->mkNode(STRING_LENGTH, k1).eqNode(idx);
+    Node l = SkolemCache::mkLengthVar(t);
+    Node bvll = nm->mkNode(BOUND_VAR_LIST, l);
+    Node bound =
+        nm->mkNode(AND, nm->mkNode(LEQ, zero, l), nm->mkNode(LT, l, k2Len));
+    Node body = nm->mkNode(
+        OR,
+        bound.negate(),
+        nm->mkNode(STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, k2, zero, l), y)
+            .negate());
+    // forall l. 0 <= l < len(k2) => ~in_re(substr(k2, 0, l), r)
+    Node shortestMatch = mkForallInternal(bvll, body);
     // in_re(k2, y)
-    Node k2Match = nm->mkNode(STRING_IN_REGEXP, k2, y);
+    Node match = nm->mkNode(STRING_IN_REGEXP, k2, y);
     // k = k1 ++ z ++ k3
-    Node res2 = k.eqNode(nm->mkNode(STRING_CONCAT, k1, z, k3));
+    Node res = k.eqNode(nm->mkNode(STRING_CONCAT, k1, z, k3));
 
-    // IF in_re(x, re.++(_*, y, _*))
-    // THEN:
-    //   IF in_re("", y)
-    //   THEN: k = z ++ x
-    //   ELSE:
-    //     x = k1 ++ k2 ++ k3 ^
-    //     ~in_re(k1 ++ substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*)) ^
-    //     in_re(k2, y) ^ k = k1 ++ z ++ k3
-    // ELSE: k = x
-    asserts.push_back(nm->mkNode(
-        ITE,
-        hasMatch,
+    // IF: indexof_re(x, y, 0) = -1
+    // THEN: k = x
+    // ELSE:
+    //   x = k1 ++ k2 ++ k3 ^
+    //   len(k1) = indexof_re(x, y, 0) ^
+    //   (forall l. 0 <= l < len(k2) => ~in_re(substr(k2, 0, l), r)) ^
+    //   in_re(k2, y) ^
+    //   k = k1 ++ z ++ k3
+    asserts.push_back(
         nm->mkNode(ITE,
-                   matchesEmpty,
-                   res1,
-                   nm->mkNode(AND, {splitX, firstMatch, k2Match, res2})),
-        k.eqNode(x)));
+                   idx.eqNode(negOne),
+                   k.eqNode(x),
+                   nm->mkNode(AND, {split, k1Len, shortestMatch, match, res})));
     retNode = k;
   }
   else if (t.getKind() == STRING_REPLACE_RE_ALL)
@@ -679,13 +757,10 @@ Node StringsPreprocess::reduce(Node t,
 
     Node emp = Word::mkEmptyWord(t.getType());
 
-    std::vector<Node> emptyVec;
-    Node sigmaStar =
-        nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec));
     Node yp = nm->mkNode(REGEXP_DIFF, y, nm->mkNode(STRING_TO_REGEXP, emp));
-    Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, yp, sigmaStar);
-    // in_re(x, _* ++ y' ++ _*)
-    Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re);
+    Node idx = nm->mkNode(STRING_INDEXOF_RE, x, yp, zero);
+    // indexof_re(x, y', 0) = -1
+    Node noMatch = idx.eqNode(negOne);
 
     Node ufno = nm->mkNode(APPLY_UF, uf, numOcc);
     Node usno = nm->mkNode(APPLY_UF, us, numOcc);
@@ -700,8 +775,9 @@ Node StringsPreprocess::reduce(Node t,
     lemmas.push_back(usno.eqNode(rem));
     // Uf(0) = 0
     lemmas.push_back(nm->mkNode(APPLY_UF, uf, zero).eqNode(zero));
-    // not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*)))
-    lemmas.push_back(nm->mkNode(STRING_IN_REGEXP, rem, re).negate());
+    // indexof_re(substr(x, Uf(numOcc)), y', 0) = -1
+    lemmas.push_back(
+        nm->mkNode(STRING_INDEXOF_RE, rem, yp, zero).eqNode(negOne));
 
     Node i = SkolemCache::mkIndexVar(t);
     Node bvli = nm->mkNode(BOUND_VAR_LIST, i);
@@ -709,50 +785,57 @@ Node StringsPreprocess::reduce(Node t,
         nm->mkNode(AND, nm->mkNode(GEQ, i, zero), nm->mkNode(LT, i, numOcc));
     Node ip1 = nm->mkNode(PLUS, i, one);
     Node ufi = nm->mkNode(APPLY_UF, uf, i);
-    Node uli = nm->mkNode(APPLY_UF, ul, i);
     Node ufip1 = nm->mkNode(APPLY_UF, uf, ip1);
-    Node ii = nm->mkNode(MINUS, ufip1, uli);
-    Node match = nm->mkNode(STRING_SUBSTR, x, ii, uli);
-    Node pfxMatch =
-        nm->mkNode(STRING_SUBSTR, x, ufi, nm->mkNode(MINUS, ii, ufi));
-    Node nonMatch =
-        nm->mkNode(STRING_SUBSTR,
-                   x,
-                   ufi,
-                   nm->mkNode(MINUS, nm->mkNode(MINUS, ufip1, one), ufi));
+    Node ulip1 = nm->mkNode(APPLY_UF, ul, ip1);
+    // ii = Uf(i + 1) - Ul(i + 1)
+    Node ii = nm->mkNode(MINUS, ufip1, ulip1);
 
     std::vector<Node> flem;
     // Ul(i) > 0
-    flem.push_back(nm->mkNode(GT, uli, zero));
-    // Uf(i + 1) >= Uf(i) + Ul(i)
-    flem.push_back(nm->mkNode(GEQ, ufip1, nm->mkNode(PLUS, ufi, uli)));
-    // in_re(substr(x, ii, Ul(i)), y')
-    flem.push_back(nm->mkNode(STRING_IN_REGEXP, match, yp));
-    // ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*))
-    flem.push_back(nm->mkNode(STRING_IN_REGEXP, nonMatch, re).negate());
+    flem.push_back(nm->mkNode(GT, ulip1, zero));
+    // Uf(i + 1) = indexof_re(x, yp, Uf(i)) + Ul(i + 1)
+    flem.push_back(ufip1.eqNode(
+        nm->mkNode(PLUS, nm->mkNode(STRING_INDEXOF_RE, x, yp, ufi), ulip1)));
+    // in_re(substr(x, ii, Ul(i + 1)), y')
+    flem.push_back(nm->mkNode(
+        STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, ii, ulip1), yp));
+    Node l = SkolemCache::mkLengthVar(t);
+    Node bvll = nm->mkNode(BOUND_VAR_LIST, l);
+    Node lenBound =
+        nm->mkNode(AND, nm->mkNode(LT, zero, l), nm->mkNode(LT, l, ulip1));
+    Node shortestMatchBody = nm->mkNode(
+        OR,
+        lenBound.negate(),
+        nm->mkNode(STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, ii, l), y)
+            .negate());
+    // forall l. 0 < l < Ul(i + 1) =>
+    //   ~in_re(substr(x, Uf(i + 1) - Ul(i + 1), l), y')
+    flem.push_back(mkForallInternal(bvll, shortestMatchBody));
+    Node pfxMatch =
+        nm->mkNode(STRING_SUBSTR, x, ufi, nm->mkNode(MINUS, ii, ufi));
     // Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1)
     flem.push_back(
         nm->mkNode(APPLY_UF, us, i)
             .eqNode(nm->mkNode(
                 STRING_CONCAT, pfxMatch, z, nm->mkNode(APPLY_UF, us, ip1))));
-
     Node body = nm->mkNode(OR, bound.negate(), nm->mkNode(AND, flem));
     Node forall = mkForallInternal(bvli, body);
     lemmas.push_back(forall);
 
-    // IF in_re(x, re.++(_*, y', _*))
-    // THEN:
+    // IF indexof(x, y', 0) = -1
+    // THEN: k = x
+    // ELSE:
     //   numOcc > 0 ^
     //   k = Us(0) ^ Us(numOcc) = substr(x, Uf(numOcc)) ^
-    //   Uf(0) = 0 ^ not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*)))
+    //   Uf(0) = 0 ^ indexof_re(substr(x, Uf(numOcc)), y', 0) = -1 ^
     //   forall i. 0 <= i < nummOcc =>
     //     Ul(i) > 0 ^
-    //     Uf(i + 1) >= Uf(i) + Ul(i) ^
-    //     in_re(substr(x, ii, Ul(i)), y') ^
-    //     ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*)) ^
+    //     Uf(i + 1) = indexof_re(x, yp, Uf(i)) + Ul(i + 1) ^
+    //     in_re(substr(x, ii, Ul(i + 1)), y') ^
+    //     forall l. 0 < l < Ul(i + 1) =>
+    //       ~in_re(substr(x, Uf(i + 1) - Ul(i + 1), l), y')
     //     Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1)
     //     where ii = Uf(i + 1) - Ul(i)
-    // ELSE: k = x
     // where y' = re.diff(y, "")
     //
     // Conceptually, y' is the regex y but excluding the empty string (because
@@ -760,8 +843,17 @@ Node StringsPreprocess::reduce(Node t,
     // matches of y' in x, Uf(i) is the end position of the i-th match, Ul(i)
     // is the length of the i^th match, and Us(i) is the result of processing
     // the remainder after processing the i^th occurrence of y in x.
+    //
+    // Visualization of Uf(i) and Ul(i):
+    //
+    // x = |------------| match 1 |-----------| match 2   |---|
+    //     |                      |                       |
+    //     Uf(0)                  Uf(1)                   Uf(2)
+    //
+    //                  |---------|           |-----------|
+    //                    Ul(1)                 Ul(2)
     asserts.push_back(
-        nm->mkNode(ITE, hasMatch, nm->mkNode(AND, lemmas), k.eqNode(x)));
+        nm->mkNode(ITE, noMatch, k.eqNode(x), nm->mkNode(AND, lemmas)));
     retNode = k;
   }
   else if (t.getKind() == STRING_TOLOWER || t.getKind() == STRING_TOUPPER)
diff --git a/src/theory/strings/theory_strings_utils.cpp b/src/theory/strings/theory_strings_utils.cpp
index 6897d5272..c763557ca 100644
--- a/src/theory/strings/theory_strings_utils.cpp
+++ b/src/theory/strings/theory_strings_utils.cpp
@@ -383,8 +383,9 @@ TypeNode getOwnerStringType(Node n)
 {
   TypeNode tn;
   Kind k = n.getKind();
-  if (k == STRING_STRIDOF || k == STRING_LENGTH || k == STRING_STRCTN
-      || k == SEQ_NTH || k == STRING_PREFIX || k == STRING_SUFFIX)
+  if (k == STRING_STRIDOF || k == STRING_INDEXOF_RE || k == STRING_LENGTH
+      || k == STRING_STRCTN || k == SEQ_NTH || k == STRING_PREFIX
+      || k == STRING_SUFFIX)
   {
     // owning string type is the type of first argument
     tn = n[0].getType();
author	Andres Noetzli <andres.noetzli@gmail.com>	2021-05-27 15:42:10 -0700
committer	GitHub <noreply@github.com>	2021-05-27 22:42:10 +0000
commit	29f0b8f378377ed836bddaaf88883d0b2eeb545d (patch)
tree	5d9edb57c60ab8c8a07dab52f18b72dd441d4fdf /src/theory/strings
parent	631032b15327c28c44b51490dceb434a38f3419a (diff)