/****************************************************************************** * Top contributors (to current version): * Andrew Reynolds, Andres Noetzli * * This file is part of the cvc5 project. * * Copyright (c) 2009-2021 by the authors listed in the file AUTHORS * in the top-level source directory and their institutional affiliations. * All rights reserved. See the file COPYING in the top-level source * directory for licensing information. * **************************************************************************** * * Entailment tests involving strings. */ #include "cvc5_private.h" #ifndef CVC5__THEORY__STRINGS__STRING_ENTAIL_H #define CVC5__THEORY__STRINGS__STRING_ENTAIL_H #include #include "expr/node.h" #include "theory/strings/arith_entail.h" namespace cvc5 { namespace theory { class Rewriter; namespace strings { class SequencesRewriter; /** * Entailment tests involving strings. * Some of these techniques are described in Reynolds et al, "High Level * Abstractions for Simplifying Extended String Constraints in SMT", CAV 2019. */ class StringsEntail { public: StringsEntail(Rewriter* r, ArithEntail& aent, SequencesRewriter& rewriter); /** can constant contain list * return true if constant c can contain the list l in order * firstc/lastc store which indices in l were used to determine the return * value. * (This is typically used when this function returns false, for minimizing * explanations) * * For example: * canConstantContainList( "abc", { x, "c", y } ) returns true * firstc/lastc are updated to 1/1 * canConstantContainList( "abc", { x, "d", y } ) returns false * firstc/lastc are updated to 1/1 * canConstantContainList( "abcdef", { x, "b", y, "a", z, "c", w } * returns false * firstc/lastc are updated to 1/3 * canConstantContainList( "abcdef", { x, "b", y, "e", z, "c", w } * returns false * firstc/lastc are updated to 1/5 */ static bool canConstantContainList(Node c, std::vector& l, int& firstc, int& lastc); /** can constant contain concat * same as above but with n = str.++( l ) instead of l */ bool canConstantContainConcat(Node c, Node n, int& firstc, int& lastc); /** strip symbolic length * * This function strips off components of n1 whose length is less than or * equal to argument curr, and stores them in nr. The direction dir * determines whether the components are removed from the start or end of n1. * If `strict` is set to true, then the function only returns true if full * length `curr` can be stripped. * * In detail, this function updates n1 to n1' such that: * If dir=1, * n1 = str.++( nr, n1' ) * If dir=-1 * n1 = str.++( n1', nr ) * It updates curr to curr' such that: * curr' = curr - str.len( str.++( nr ) ), and * curr' >= 0 * where the latter fact is determined by checkArithEntail. * * This function returns true if n1 is modified. * * For example: * * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)+1 ) * returns true * n1 is updated to { "bc", y } * nr is updated to { x, "a" } * curr is updated to 0 * * * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)-1 ) * returns false * * stripSymbolicLength( { y, "abc", x }, {}, 1, str.len(x)+1 ) * returns false * * stripSymbolicLength( { x, "abc", y }, {}, -1, 2*str.len(y)+4 ) * returns true * n1 is updated to { x } * nr is updated to { "abc", y } * curr is updated to str.len(y)+1 */ bool stripSymbolicLength(std::vector& n1, std::vector& nr, int dir, Node& curr, bool strict = false); /** component contains * This function is used when rewriting str.contains( t1, t2 ), where * n1 is the vector form of t1 * n2 is the vector form of t2 * * If this function returns n>=0 for some n, then * n1 = { x1...x{n-1} xn...x{n+s} x{n+s+1}...xm }, * n2 = { y1...ys }, * y1 is a suffix of xn, * y2...y{s-1} = x{n+1}...x{n+s-1}, and * ys is a prefix of x{n+s} * Otherwise it returns -1. * * This function may update n1 if computeRemainder = true. * We maintain the invariant that the resulting value n1' * of n1 after this function is such that: * n1 = str.++( nb, n1', ne ) * The vectors nb and ne have the following properties. * If computeRemainder = true, then * If remainderDir != -1, then * ne is { x{n+s}' x{n+s+1}...xm } * where x{n+s} = str.++( ys, x{n+s}' ). * If remainderDir != 1, then * nb is { x1, ..., x{n-1}, xn' } * where xn = str.++( xn', y1 ). * * For example: * * componentContains({ x, "abc", x }, { "b" }, {}, true, 0) * returns 1, * n1 is updated to { "b" }, * nb is updated to { x, "a" }, * ne is updated to { "c", x } * * componentContains({ x, "abc", x }, { "b" }, {}, true, 1) * returns 1, * n1 is updated to { x, "ab" }, * ne is updated to { "c", x } * * componentContains({ y, z, "abc", x, "def" }, { "c", x, "de" }, {}, true, 1) * returns 2, * n1 is updated to { y, z, "abc", x, "de" }, * ne is updated to { "f" } * * componentContains({ y, "abc", x, "def" }, { "c", x, "de" }, {}, true, -1) * returns 1, * n1 is updated to { "c", x, "def" }, * nb is updated to { y, "ab" } */ int componentContains(std::vector& n1, std::vector& n2, std::vector& nb, std::vector& ne, bool computeRemainder = false, int remainderDir = 0); /** strip constant endpoints * This function is used when rewriting str.contains( t1, t2 ), where * n1 is the vector form of t1 * n2 is the vector form of t2 * * It modifies n1 to a new vector n1' such that: * (1) str.contains( str.++( n1 ), str.++( n2 ) ) is equivalent to * str.contains( str.++( n1' ), str.++( n2 ) ) * (2) str.++( n1 ) = str.++( nb, n1', ne ) * * "dir" is the direction in which we can modify n1: * if dir = 1, then we allow dropping components from the front of n1, * if dir = -1, then we allow dropping components from the back of n1, * if dir = 0, then we allow dropping components from either. * * It returns true if n1 is modified. * * For example: * stripConstantEndpoints({ "ab", x, "de" }, { "c" }, {}, {}, 1) * returns true, * n1 is updated to { x, "de" } * nb is updated to { "ab" } * stripConstantEndpoints({ "ab", x, "de" }, { "bd" }, {}, {}, 0) * returns true, * n1 is updated to { "b", x, "d" } * nb is updated to { "a" } * ne is updated to { "e" } * stripConstantEndpoints({ "ad", substr("ccc",x,y) }, { "d" }, {}, {}, -1) * returns true, * n1 is updated to {"ad"} * ne is updated to { substr("ccc",x,y) } */ static bool stripConstantEndpoints(std::vector& n1, std::vector& n2, std::vector& nb, std::vector& ne, int dir = 0); /** * Checks whether a string term `a` is entailed to contain or not contain a * string term `b`. * * @param a The string that is checked whether it contains `b` * @param b The string that is checked whether it is contained in `a` * @param fullRewriter Determines whether the function can use the full * rewriter or only `rewriteContains()` (useful for avoiding loops) * @return true node if it can be shown that `a` contains `b`, false node if * it can be shown that `a` does not contain `b`, null node otherwise */ Node checkContains(Node a, Node b, bool fullRewriter = true); /** entail non-empty * * Checks whether string a is entailed to be non-empty. Is equivalent to * the call checkArithEntail( len( a ), true ). */ bool checkNonEmpty(Node a); /** * Checks whether string has at most/exactly length one. Length one strings * can be used for more aggressive rewriting because there is guaranteed that * it cannot be overlap multiple components in a string concatenation. * * @param s The string to check * @param strict If true, the string must have exactly length one, otherwise * at most length one * @return True if the string has at most/exactly length one, false otherwise */ bool checkLengthOne(Node s, bool strict = false); /** * Checks whether it is always true that `a` is a strict subset of `b` in the * multiset domain. * * Examples: * * a = (str.++ "A" x), b = (str.++ "A" x "B") ---> true * a = (str.++ "A" x), b = (str.++ "B" x "AA") ---> true * a = (str.++ "A" x), b = (str.++ "B" y "AA") ---> false * * @param a The term for which it should be checked if it is a strict subset * of `b` in the multiset domain * @param b The term for which it should be checked if it is a strict * superset of `a` in the multiset domain * @return True if it is always the case that `a` is a strict subset of `b`, * false otherwise. */ static bool checkMultisetSubset(Node a, Node b); /** * Returns a character `c` if it is always the case that str.in.re(a, c*), * i.e. if all possible values of `a` only consist of `c` characters, and the * null node otherwise. If `a` is the empty string, the function returns an * empty string. * * @param a The node to check for homogeneity * @return If `a` is homogeneous, the only character that it may contain, the * empty string if `a` is empty, and the null node otherwise */ static Node checkHomogeneousString(Node a); /** * Overapproximates the possible values of node n. This overapproximation * assumes that n can return a value x or the empty string and tries to find * the simplest x such that this holds. In the general case, x is the same as * the input n. This overapproximation can be used to sort terms with the * same possible values in string concatenation for example. * * Example: * * getStringOrEmpty( (str.replace "" x y) ) --> y because (str.replace "" x y) * either returns y or "" * * getStringOrEmpty( (str.substr "ABC" x y) ) --> (str.substr "ABC" x y) * because the function could not compute a simpler */ Node getStringOrEmpty(Node n); /** * Infers a conjunction of equalities that correspond to (str.contains x y) * if it can show that the length of y is greater or equal to the length of * x. If y is a concatentation, we get x = y1 ++ ... ++ yn, the conjunction * is of the form: * * (and (= x (str.++ y1' ... ym')) (= y1'' "") ... (= yk'' "")) * * where each yi'' are yi that must be empty for (= x y) to hold and yi' are * yi that the function could not infer anything about. Returns a null node * if the function cannot infer that str.len(y) >= str.len(x). Returns (= x * y) if the function can infer that str.len(y) >= str.len(x) but cannot * infer that any of the yi must be empty. */ Node inferEqsFromContains(Node x, Node y); private: /** component contains base * * This function is a helper for the above function. * * It returns true if n2 is contained in n1 with the following * restrictions: * If dir=1, then n2 must be a suffix of n1. * If dir=-1, then n2 must be a prefix of n1. * * If computeRemainder is true, then n1rb and n1re are * updated such that : * n1 = str.++( n1rb, n2, n1re ) * where a null value of n1rb and n1re indicates the * empty string. * * For example: * * componentContainsBase("cabe", "ab", n1rb, n1re, 1, false) * returns false. * * componentContainsBase("cabe", "ab", n1rb, n1re, 0, true) * returns true, * n1rb is set to "c", * n1re is set to "e". * * componentContainsBase(y, str.substr(y,0,5), n1rb, n1re, -1, true) * returns true, * n1re is set to str.substr(y,5,str.len(y)). * * * Notice that this function may return false when it cannot compute a * remainder when it otherwise would have returned true. For example: * * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, false) * returns true. * * Hence, we know that str.substr(y,x,z) is contained in y. However: * * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, true) * returns false. * * The reason is since computeRemainder=true, it must be that * y = str.++( n1rb, str.substr(y,x,z), n1re ) * for some n1rb, n1re. However, to construct such n1rb, n1re would require * e.g. the terms: * y = str.++( ite( x+z < 0 OR x < 0, "", str.substr(y,0,x) ), * str.substr(y,x,z), * ite( x+z < 0 OR x < 0, y, str.substr(y,x+z,len(y)) ) ) * * Since we do not wish to introduce ITE terms in the rewriter, we instead * return false, indicating that we cannot compute the remainder. */ bool componentContainsBase( Node n1, Node n2, Node& n1rb, Node& n1re, int dir, bool computeRemainder); /** * Simplifies a given node `a` s.t. the result is a concatenation of string * terms that can be interpreted as a multiset and which contains all * multisets that `a` could form. * * Examples: * * (str.substr "AA" 0 n) ---> "AA" * (str.replace "AAA" x "BB") ---> (str.++ "AAA" "BB") * * @param a The node to simplify * @return A concatenation that can be interpreted as a multiset */ static Node getMultisetApproximation(Node a); private: /** Pointer to the full rewriter */ Rewriter* d_rr; /** The arithmetic entailment module */ ArithEntail& d_arithEntail; /** * Reference to the sequences rewriter that owns this `StringsEntail` * instance. */ SequencesRewriter& d_rewriter; }; } // namespace strings } // namespace theory } // namespace cvc5 #endif /* CVC5__THEORY__STRINGS__STRING_ENTAIL_H */