diff options
author | Andres Noetzli <andres.noetzli@gmail.com> | 2020-06-10 12:50:52 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-10 12:50:52 -0700 |
commit | 2da2646dd65e0458311a2dccfb04850c0b7d9e3c (patch) | |
tree | 00d31835b3ad0c00064ae2b43a2a59844f418dd0 /src/theory | |
parent | 05c099890ae908e495ceaf26509b87896fd0ad54 (diff) |
Add support for str.replace_re/str.replace_re_all (#4594)
This commit adds support for the last remaining string operators from
the new SMT-LIB standard for the theory of strings. The commit adds the
kinds, type checking, reductions, and evaluation rewrites for
`str.replace_re` and `str.replace_re_all`.
Diffstat (limited to 'src/theory')
-rw-r--r-- | src/theory/strings/extf_solver.cpp | 7 | ||||
-rw-r--r-- | src/theory/strings/kinds | 4 | ||||
-rw-r--r-- | src/theory/strings/rewrites.cpp | 2 | ||||
-rw-r--r-- | src/theory/strings/rewrites.h | 2 | ||||
-rw-r--r-- | src/theory/strings/sequences_rewriter.cpp | 116 | ||||
-rw-r--r-- | src/theory/strings/sequences_rewriter.h | 27 | ||||
-rw-r--r-- | src/theory/strings/skolem_cache.h | 18 | ||||
-rw-r--r-- | src/theory/strings/term_registry.cpp | 7 | ||||
-rw-r--r-- | src/theory/strings/theory_strings.cpp | 3 | ||||
-rw-r--r-- | src/theory/strings/theory_strings_preprocess.cpp | 169 |
10 files changed, 350 insertions, 5 deletions
diff --git a/src/theory/strings/extf_solver.cpp b/src/theory/strings/extf_solver.cpp index dd4144313..9b7af4f13 100644 --- a/src/theory/strings/extf_solver.cpp +++ b/src/theory/strings/extf_solver.cpp @@ -56,6 +56,8 @@ ExtfSolver::ExtfSolver(context::Context* c, d_extt.addFunctionKind(kind::STRING_STOI); d_extt.addFunctionKind(kind::STRING_STRREPL); d_extt.addFunctionKind(kind::STRING_STRREPLALL); + d_extt.addFunctionKind(kind::STRING_REPLACE_RE); + d_extt.addFunctionKind(kind::STRING_REPLACE_RE_ALL); d_extt.addFunctionKind(kind::STRING_STRCTN); d_extt.addFunctionKind(kind::STRING_IN_REGEXP); d_extt.addFunctionKind(kind::STRING_LEQ); @@ -180,8 +182,9 @@ bool ExtfSolver::doReduction(int effort, Node n) NodeManager* nm = NodeManager::currentNM(); Assert(k == STRING_SUBSTR || k == STRING_STRCTN || k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI || k == STRING_STRREPL - || k == STRING_STRREPLALL || k == STRING_LEQ || k == STRING_TOLOWER - || k == STRING_TOUPPER || k == STRING_REV); + || k == STRING_STRREPLALL || k == STRING_REPLACE_RE + || k == STRING_REPLACE_RE_ALL || k == STRING_LEQ + || k == STRING_TOLOWER || k == STRING_TOUPPER || k == STRING_REV); std::vector<Node> new_nodes; Node res = d_preproc.simplify(n, new_nodes); Assert(res != n); diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds index 800847ffe..96ba82a44 100644 --- a/src/theory/strings/kinds +++ b/src/theory/strings/kinds @@ -23,6 +23,10 @@ operator STRING_LEQ 2 "string less than or equal" operator STRING_STRIDOF 3 "string indexof" operator STRING_STRREPL 3 "string replace" operator STRING_STRREPLALL 3 "string replace all" +operator STRING_REPLACE_RE 3 "string replace regular expression match" +operator STRING_REPLACE_RE_ALL 3 "string replace all regular expression matches" +typerule STRING_REPLACE_RE "SimpleTypeRule<RString, AString, ARegExp, AString>" +typerule STRING_REPLACE_RE_ALL "SimpleTypeRule<RString, AString, ARegExp, AString>" operator STRING_PREFIX 2 "string prefixof" operator STRING_SUFFIX 2 "string suffixof" operator STRING_IS_DIGIT 1 "string isdigit, returns true if argument is a string of length one that represents a digit" diff --git a/src/theory/strings/rewrites.cpp b/src/theory/strings/rewrites.cpp index a4055c4f9..5f3c83869 100644 --- a/src/theory/strings/rewrites.cpp +++ b/src/theory/strings/rewrites.cpp @@ -111,6 +111,8 @@ const char* toString(Rewrite r) case Rewrite::RPL_RPL_EMPTY: return "RPL_RPL_EMPTY"; case Rewrite::RPL_RPL_LEN_ID: return "RPL_RPL_LEN_ID"; case Rewrite::RPL_X_Y_X_SIMP: return "RPL_X_Y_X_SIMP"; + case Rewrite::REPLACE_RE_EVAL: return "REPLACE_RE_EVAL"; + case Rewrite::REPLACE_RE_ALL_EVAL: return "REPLACE_RE_ALL_EVAL"; case Rewrite::SPLIT_EQ: return "SPLIT_EQ"; case Rewrite::SPLIT_EQ_STRIP_L: return "SPLIT_EQ_STRIP_L"; case Rewrite::SPLIT_EQ_STRIP_R: return "SPLIT_EQ_STRIP_R"; diff --git a/src/theory/strings/rewrites.h b/src/theory/strings/rewrites.h index 96a3b65fd..62350c403 100644 --- a/src/theory/strings/rewrites.h +++ b/src/theory/strings/rewrites.h @@ -116,6 +116,8 @@ enum class Rewrite : uint32_t RPL_RPL_EMPTY, RPL_RPL_LEN_ID, RPL_X_Y_X_SIMP, + REPLACE_RE_EVAL, + REPLACE_RE_ALL_EVAL, SPLIT_EQ, SPLIT_EQ_STRIP_L, SPLIT_EQ_STRIP_R, diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp index 4f74d7c15..3c40062f5 100644 --- a/src/theory/strings/sequences_rewriter.cpp +++ b/src/theory/strings/sequences_rewriter.cpp @@ -1413,6 +1413,14 @@ RewriteResponse SequencesRewriter::postRewrite(TNode node) { retNode = rewriteReplaceAll(node); } + else if (nk == kind::STRING_REPLACE_RE) + { + retNode = rewriteReplaceRe(node); + } + else if (nk == kind::STRING_REPLACE_RE_ALL) + { + retNode = rewriteReplaceReAll(node); + } else if (nk == STRING_REV) { retNode = rewriteStrReverse(node); @@ -2914,6 +2922,114 @@ Node SequencesRewriter::rewriteReplaceInternal(Node node) return Node::null(); } +Node SequencesRewriter::rewriteReplaceRe(Node node) +{ + Assert(node.getKind() == STRING_REPLACE_RE); + NodeManager* nm = NodeManager::currentNM(); + Node x = node[0]; + Node y = node[1]; + Node z = node[2]; + + if (x.isConst()) + { + // str.replace_re("ZABCZ", re.++("A", _*, "C"), y) ---> "Z" ++ y ++ "Z" + std::pair<size_t, size_t> match = firstMatch(x, y); + if (match.first != string::npos) + { + String s = x.getConst<String>(); + Node ret = nm->mkNode(STRING_CONCAT, + nm->mkConst(s.substr(0, match.first)), + z, + nm->mkConst(s.substr(match.second))); + return returnRewrite(node, ret, Rewrite::REPLACE_RE_EVAL); + } + else + { + return returnRewrite(node, x, Rewrite::REPLACE_RE_EVAL); + } + } + return node; +} + +Node SequencesRewriter::rewriteReplaceReAll(Node node) +{ + Assert(node.getKind() == STRING_REPLACE_RE_ALL); + NodeManager* nm = NodeManager::currentNM(); + Node x = node[0]; + Node y = node[1]; + Node z = node[2]; + + if (x.isConst()) + { + // str.replace_re_all("ZABCZAB", re.++("A", _*, "C"), y) ---> + // "Z" ++ y ++ "Z" ++ y + TypeNode t = x.getType(); + Node emp = Word::mkEmptyWord(t); + Node yp = Rewriter::rewrite( + nm->mkNode(REGEXP_DIFF, y, nm->mkNode(STRING_TO_REGEXP, emp))); + std::vector<Node> res; + String rem = x.getConst<String>(); + std::pair<size_t, size_t> match(0, 0); + while (rem.size() >= 0) + { + match = firstMatch(nm->mkConst(rem), yp); + if (match.first == string::npos) + { + break; + } + res.push_back(nm->mkConst(rem.substr(0, match.first))); + res.push_back(z); + rem = rem.substr(match.second); + } + res.push_back(nm->mkConst(rem)); + Node ret = utils::mkConcat(res, t); + return returnRewrite(node, ret, Rewrite::REPLACE_RE_ALL_EVAL); + } + + return node; +} + +std::pair<size_t, size_t> SequencesRewriter::firstMatch(Node n, Node r) +{ + Assert(n.isConst() && n.getType().isStringLike()); + Assert(r.getType().isRegExp()); + NodeManager* nm = NodeManager::currentNM(); + + std::vector<Node> emptyVec; + Node sigmaStar = nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node re = nm->mkNode(REGEXP_CONCAT, r, sigmaStar); + String s = n.getConst<String>(); + + if (s.size() == 0) + { + if (RegExpEntail::testConstStringInRegExp(s, 0, r)) + { + return std::make_pair(0, 0); + } + else + { + return std::make_pair(string::npos, string::npos); + } + } + + for (size_t i = 0, size = s.size(); i < size; i++) + { + if (RegExpEntail::testConstStringInRegExp(s, i, re)) + { + for (size_t j = i; j <= size; j++) + { + String substr = s.substr(i, j - i); + if (RegExpEntail::testConstStringInRegExp(substr, 0, r)) + { + return std::make_pair(i, j); + } + } + } + } + + return std::make_pair(string::npos, string::npos); +} + Node SequencesRewriter::rewriteStrReverse(Node node) { Assert(node.getKind() == STRING_REV); diff --git a/src/theory/strings/sequences_rewriter.h b/src/theory/strings/sequences_rewriter.h index 490dd8b3c..00ae21634 100644 --- a/src/theory/strings/sequences_rewriter.h +++ b/src/theory/strings/sequences_rewriter.h @@ -204,6 +204,33 @@ class SequencesRewriter : public TheoryRewriter * str.replaceall. If it returns a non-null ret, then node rewrites to ret. */ Node rewriteReplaceInternal(Node node); + /** rewrite regular expression replace + * + * This method implements rewrite rules that apply to terms of the form + * str.replace_re(s, r, t). + * + * @param node The node to rewrite + * @return The rewritten node + */ + Node rewriteReplaceRe(Node node); + /** rewrite regular expression replace + * + * This method implements rewrite rules that apply to terms of the form + * str.replace_re_all(s, r, t). + * + * @param node The node to rewrite + * @return The rewritten node + */ + Node rewriteReplaceReAll(Node node); + /** + * Returns the first, shortest sequence in n that matches r. + * + * @param n The constant string or sequence to search in. + * @param r The regular expression to search for. + * @return A pair holding the start position and the end position of the + * match or a pair of string::npos if r does not appear in n. + */ + std::pair<size_t, size_t> firstMatch(Node n, Node r); /** rewrite string reverse * * This is the entry point for post-rewriting terms n of the form diff --git a/src/theory/strings/skolem_cache.h b/src/theory/strings/skolem_cache.h index 7ce507f29..bb4d0de55 100644 --- a/src/theory/strings/skolem_cache.h +++ b/src/theory/strings/skolem_cache.h @@ -102,6 +102,16 @@ class SkolemCache // of b in a as the witness for contains( a, b ). SK_FIRST_CTN_PRE, SK_FIRST_CTN_POST, + // For sequence a and regular expression b, + // in_re(a, re.++(_*, b, _*)) => + // exists k_pre, k_match, k_post. + // a = k_pre ++ k_match ++ k_post ^ + // ~in_re(k_pre ++ substr(k_match, 0, str.len(k_match) - 1), + // re.++(_*, b, _*)) ^ + // in_re(k2, y) + SK_FIRST_MATCH_PRE, + SK_FIRST_MATCH, + SK_FIRST_MATCH_POST, // For integer b, // len( a ) > b => // exists k. a = k ++ a' ^ len( k ) = b @@ -120,6 +130,14 @@ class SkolemCache // k(x) is the end index of the x^th occurrence of b in a // where n is the number of occurrences of b in a, and k(0)=0. SK_OCCUR_INDEX, + // For function k: Int -> Int + // exists k. + // forall 0 <= x < n, + // k(x) is the length of the x^th occurrence of b in a (excluding + // matches of empty strings) + // where b is a regular expression, n is the number of occurrences of b + // in a, and k(0)=0. + SK_OCCUR_LEN, }; /** * Returns a skolem of type string that is cached for (a,b,id) and has diff --git a/src/theory/strings/term_registry.cpp b/src/theory/strings/term_registry.cpp index 6330d7c10..395604f76 100644 --- a/src/theory/strings/term_registry.cpp +++ b/src/theory/strings/term_registry.cpp @@ -76,9 +76,10 @@ void TermRegistry::preRegisterTerm(TNode n) if (!options::stringExp()) { if (k == STRING_STRIDOF || k == STRING_ITOS || k == STRING_STOI - || k == STRING_STRREPL || k == STRING_STRREPLALL || k == STRING_STRCTN - || k == STRING_LEQ || k == STRING_TOLOWER || k == STRING_TOUPPER - || k == STRING_REV) + || k == STRING_STRREPL || k == STRING_STRREPLALL + || k == STRING_REPLACE_RE || k == STRING_REPLACE_RE_ALL + || k == STRING_STRCTN || k == STRING_LEQ || k == STRING_TOLOWER + || k == STRING_TOUPPER || k == STRING_REV) { std::stringstream ss; ss << "Term of kind " << k diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp index 5107fa3f9..d83d5ca49 100644 --- a/src/theory/strings/theory_strings.cpp +++ b/src/theory/strings/theory_strings.cpp @@ -118,6 +118,9 @@ TheoryStrings::TheoryStrings(context::Context* c, d_equalityEngine.addFunctionKind(kind::STRING_STRIDOF); d_equalityEngine.addFunctionKind(kind::STRING_STRREPL); d_equalityEngine.addFunctionKind(kind::STRING_STRREPLALL); + d_equalityEngine.addFunctionKind(kind::STRING_REPLACE_RE); + d_equalityEngine.addFunctionKind(kind::STRING_REPLACE_RE_ALL); + d_equalityEngine.addFunctionKind(kind::STRING_STRREPLALL); d_equalityEngine.addFunctionKind(kind::STRING_TOLOWER); d_equalityEngine.addFunctionKind(kind::STRING_TOUPPER); d_equalityEngine.addFunctionKind(kind::STRING_REV); diff --git a/src/theory/strings/theory_strings_preprocess.cpp b/src/theory/strings/theory_strings_preprocess.cpp index 50c6ede62..550a7ba79 100644 --- a/src/theory/strings/theory_strings_preprocess.cpp +++ b/src/theory/strings/theory_strings_preprocess.cpp @@ -488,6 +488,175 @@ Node StringsPreprocess::reduce(Node t, // Thus, replaceall( x, y, z ) = rpaw retNode = rpaw; } + else if (t.getKind() == STRING_REPLACE_RE) + { + Node x = t[0]; + Node y = t[1]; + Node z = t[2]; + Node k = sc->mkSkolemCached(t, SkolemCache::SK_PURIFY, "k"); + + std::vector<Node> emptyVec; + Node sigmaStar = + nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, y, sigmaStar); + // in_re(x, re.++(_*, y, _*)) + Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re); + + // in_re("", y) + Node matchesEmpty = + nm->mkNode(STRING_IN_REGEXP, nm->mkConst(String("")), y); + // k = z ++ x + Node res1 = k.eqNode(nm->mkNode(STRING_CONCAT, z, x)); + + Node k1 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH_PRE, "rre_pre"); + Node k2 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH, "rre_match"); + Node k3 = + sc->mkSkolemCached(x, y, SkolemCache::SK_FIRST_MATCH_POST, "rre_post"); + // x = k1 ++ k2 ++ k3 + Node splitX = x.eqNode(nm->mkNode(STRING_CONCAT, k1, k2, k3)); + // ~in_re(k1 ++ str.substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*)) + Node k2len = nm->mkNode(STRING_LENGTH, k2); + Node firstMatch = + nm->mkNode( + STRING_IN_REGEXP, + nm->mkNode( + STRING_CONCAT, + k1, + nm->mkNode( + STRING_SUBSTR, k2, zero, nm->mkNode(MINUS, k2len, one))), + re) + .negate(); + // in_re(k2, y) + Node k2Match = nm->mkNode(STRING_IN_REGEXP, k2, y); + // k = k1 ++ z ++ k3 + Node res2 = k.eqNode(nm->mkNode(STRING_CONCAT, k1, z, k3)); + + // IF in_re(x, re.++(_*, y, _*)) + // THEN: + // IF in_re("", y) + // THEN: k = z ++ x + // ELSE: + // x = k1 ++ k2 ++ k3 ^ + // ~in_re(k1 ++ substr(k2, 0, str.len(k2) - 1), re.++(_*, y, _*)) ^ + // in_re(k2, y) ^ k = k1 ++ z ++ k3 + // ELSE: k = x + asserts.push_back(nm->mkNode( + ITE, + hasMatch, + nm->mkNode(ITE, + matchesEmpty, + res1, + nm->mkNode(AND, splitX, firstMatch, k2Match, res2)), + k.eqNode(x))); + retNode = k; + } + else if (t.getKind() == STRING_REPLACE_RE_ALL) + { + Node x = t[0]; + Node y = t[1]; + Node z = t[2]; + Node k = sc->mkSkolemCached(t, SkolemCache::SK_PURIFY, "k"); + + Node numOcc = sc->mkTypedSkolemCached( + nm->integerType(), x, y, SkolemCache::SK_NUM_OCCUR, "numOcc"); + std::vector<TypeNode> argTypes; + argTypes.push_back(nm->integerType()); + Node us = nm->mkSkolem("Us", nm->mkFunctionType(argTypes, t.getType())); + TypeNode ufType = nm->mkFunctionType(argTypes, nm->integerType()); + Node uf = sc->mkTypedSkolemCached( + ufType, x, y, SkolemCache::SK_OCCUR_INDEX, "Uf"); + Node ul = + sc->mkTypedSkolemCached(ufType, x, y, SkolemCache::SK_OCCUR_LEN, "Ul"); + + Node emp = Word::mkEmptyWord(t.getType()); + + std::vector<Node> emptyVec; + Node sigmaStar = + nm->mkNode(REGEXP_STAR, nm->mkNode(REGEXP_SIGMA, emptyVec)); + Node yp = nm->mkNode(REGEXP_DIFF, y, nm->mkNode(STRING_TO_REGEXP, emp)); + Node re = nm->mkNode(REGEXP_CONCAT, sigmaStar, yp, sigmaStar); + // in_re(x, _* ++ y' ++ _*) + Node hasMatch = nm->mkNode(STRING_IN_REGEXP, x, re); + + Node ufno = nm->mkNode(APPLY_UF, uf, numOcc); + Node usno = nm->mkNode(APPLY_UF, us, numOcc); + Node rem = nm->mkNode(STRING_SUBSTR, x, ufno, nm->mkNode(STRING_LENGTH, x)); + + std::vector<Node> lemmas; + // numOcc > 0 + lemmas.push_back(nm->mkNode(GT, numOcc, zero)); + // k = Us(0) + lemmas.push_back(k.eqNode(nm->mkNode(APPLY_UF, us, zero))); + // Us(numOcc) = substr(x, Uf(numOcc)) + lemmas.push_back(usno.eqNode(rem)); + // Uf(0) = 0 + lemmas.push_back(nm->mkNode(APPLY_UF, uf, zero).eqNode(zero)); + // not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*))) + lemmas.push_back(nm->mkNode(STRING_IN_REGEXP, rem, re).negate()); + + Node i = SkolemCache::mkIndexVar(t); + Node bvli = nm->mkNode(BOUND_VAR_LIST, i); + Node bound = + nm->mkNode(AND, nm->mkNode(GEQ, i, zero), nm->mkNode(LT, i, numOcc)); + Node ip1 = nm->mkNode(PLUS, i, one); + Node ufi = nm->mkNode(APPLY_UF, uf, i); + Node uli = nm->mkNode(APPLY_UF, ul, i); + Node ufip1 = nm->mkNode(APPLY_UF, uf, ip1); + Node ii = nm->mkNode(MINUS, ufip1, uli); + Node match = nm->mkNode(STRING_SUBSTR, x, ii, uli); + Node pfxMatch = + nm->mkNode(STRING_SUBSTR, x, ufi, nm->mkNode(MINUS, ii, ufi)); + Node nonMatch = + nm->mkNode(STRING_SUBSTR, + x, + ufi, + nm->mkNode(MINUS, nm->mkNode(MINUS, ufip1, one), ufi)); + + std::vector<Node> flem; + // Ul(i) > 0 + flem.push_back(nm->mkNode(GT, uli, zero)); + // Uf(i + 1) >= Uf(i) + Ul(i) + flem.push_back(nm->mkNode(GEQ, ufip1, nm->mkNode(PLUS, ufi, uli))); + // in_re(substr(x, ii, Ul(i)), y') + flem.push_back(nm->mkNode(STRING_IN_REGEXP, match, yp)); + // ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*)) + flem.push_back(nm->mkNode(STRING_IN_REGEXP, nonMatch, re).negate()); + // Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1) + flem.push_back( + nm->mkNode(APPLY_UF, us, i) + .eqNode(nm->mkNode( + STRING_CONCAT, pfxMatch, z, nm->mkNode(APPLY_UF, us, ip1)))); + + Node forall = nm->mkNode( + FORALL, bvli, nm->mkNode(OR, bound.negate(), nm->mkNode(AND, flem))); + lemmas.push_back(forall); + + // IF in_re(x, re.++(_*, y', _*)) + // THEN: + // numOcc > 0 ^ + // k = Us(0) ^ Us(numOcc) = substr(x, Uf(numOcc)) ^ + // Uf(0) = 0 ^ not(in_re(substr(x, Uf(numOcc)), re.++(_*, y', _*))) + // forall i. 0 <= i < nummOcc => + // Ul(i) > 0 ^ + // Uf(i + 1) >= Uf(i) + Ul(i) ^ + // in_re(substr(x, ii, Ul(i)), y') ^ + // ~in_re(substr(x, Uf(i), Uf(i + 1) - 1 - Uf(i)), re.++(_*, y', _*)) ^ + // Us(i) = substr(x, Uf(i), ii - Uf(i)) ++ z ++ Us(i + 1) + // where ii = Uf(i + 1) - Ul(i) + // ELSE: k = x + // where y' = re.diff(y, "") + // + // Conceptually, y' is the regex y but excluding the empty string (because + // we do not want to match empty strings), numOcc is the number of shortest + // matches of y' in x, Uf(i) is the end position of the i-th match, Ul(i) + // is the length of the i^th match, and Us(i) is the result of processing + // the remainder after processing the i^th occurrence of y in x. + asserts.push_back( + nm->mkNode(ITE, hasMatch, nm->mkNode(AND, lemmas), k.eqNode(x))); + retNode = k; + } else if (t.getKind() == STRING_TOLOWER || t.getKind() == STRING_TOUPPER) { Node x = t[0]; |