src/theory/strings/strings_entail.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

/*********************                                                        */
/*! \file strings_entail.h
 ** \verbatim
 ** Top contributors (to current version):
 **   Andrew Reynolds, Andres Noetzli
 ** This file is part of the CVC4 project.
 ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
 ** in the top-level source directory) and their institutional affiliations.
 ** All rights reserved.  See the file COPYING in the top-level source
 ** directory for licensing information.\endverbatim
 **
 ** \brief Entailment tests involving strings
 **/

#include "cvc4_private.h"

#ifndef CVC4__THEORY__STRINGS__STRING_ENTAIL_H
#define CVC4__THEORY__STRINGS__STRING_ENTAIL_H

#include <vector>

#include "expr/node.h"

namespace CVC4 {
namespace theory {
namespace strings {

class SequencesRewriter;

/**
 * Entailment tests involving strings.
 * Some of these techniques are described in Reynolds et al, "High Level
 * Abstractions for Simplifying Extended String Constraints in SMT", CAV 2019.
 */
class StringsEntail
{
 public:
  StringsEntail(SequencesRewriter& rewriter);

  /** can constant contain list
   * return true if constant c can contain the list l in order
   * firstc/lastc store which indices in l were used to determine the return
   * value.
   *   (This is typically used when this function returns false, for minimizing
   * explanations)
   *
   * For example:
   *   canConstantContainList( "abc", { x, "c", y } ) returns true
   *     firstc/lastc are updated to 1/1
   *   canConstantContainList( "abc", { x, "d", y } ) returns false
   *     firstc/lastc are updated to 1/1
   *   canConstantContainList( "abcdef", { x, "b", y, "a", z, "c", w }
   *     returns false
   *     firstc/lastc are updated to 1/3
   *   canConstantContainList( "abcdef", { x, "b", y, "e", z, "c", w }
   *     returns false
   *     firstc/lastc are updated to 1/5
   */
  static bool canConstantContainList(Node c,
                                     std::vector<Node>& l,
                                     int& firstc,
                                     int& lastc);
  /** can constant contain concat
   * same as above but with n = str.++( l ) instead of l
   */
  static bool canConstantContainConcat(Node c, Node n, int& firstc, int& lastc);

  /** strip symbolic length
   *
   * This function strips off components of n1 whose length is less than
   * or equal to argument curr, and stores them in nr. The direction
   * dir determines whether the components are removed from the start
   * or end of n1.
   *
   * In detail, this function updates n1 to n1' such that:
   *   If dir=1,
   *     n1 = str.++( nr, n1' )
   *   If dir=-1
   *     n1 = str.++( n1', nr )
   * It updates curr to curr' such that:
   *   curr' = curr - str.len( str.++( nr ) ), and
   *   curr' >= 0
   * where the latter fact is determined by checkArithEntail.
   *
   * This function returns true if n1 is modified.
   *
   * For example:
   *
   *  stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)+1 )
   *    returns true
   *    n1 is updated to { "bc", y }
   *    nr is updated to { x, "a" }
   *    curr is updated to 0   *
   *
   * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)-1 )
   *    returns false
   *
   *  stripSymbolicLength( { y, "abc", x }, {}, 1, str.len(x)+1 )
   *    returns false
   *
   *  stripSymbolicLength( { x, "abc", y }, {}, -1, 2*str.len(y)+4 )
   *    returns true
   *    n1 is updated to { x }
   *    nr is updated to { "abc", y }
   *    curr is updated to str.len(y)+1
   */
  static bool stripSymbolicLength(std::vector<Node>& n1,
                                  std::vector<Node>& nr,
                                  int dir,
                                  Node& curr);
  /** component contains
   * This function is used when rewriting str.contains( t1, t2 ), where
   * n1 is the vector form of t1
   * n2 is the vector form of t2
   *
   * If this function returns n>=0 for some n, then
   *    n1 = { x1...x{n-1} xn...x{n+s} x{n+s+1}...xm },
   *    n2 = { y1...ys },
   *    y1 is a suffix of xn,
   *    y2...y{s-1} = x{n+1}...x{n+s-1}, and
   *    ys is a prefix of x{n+s}
   * Otherwise it returns -1.
   *
   * This function may update n1 if computeRemainder = true.
   *   We maintain the invariant that the resulting value n1'
   *   of n1 after this function is such that:
   *     n1 = str.++( nb, n1', ne )
   * The vectors nb and ne have the following properties.
   * If computeRemainder = true, then
   *   If remainderDir != -1, then
   *     ne is { x{n+s}' x{n+s+1}...xm }
   *     where x{n+s} = str.++( ys, x{n+s}' ).
   *   If remainderDir != 1, then
   *     nb is { x1, ..., x{n-1}, xn' }
   *     where xn = str.++( xn', y1 ).
   *
   * For example:
   *
   * componentContains({ x, "abc", x }, { "b" }, {}, true, 0)
   *   returns 1,
   *   n1 is updated to { "b" },
   *   nb is updated to { x, "a" },
   *   ne is updated to { "c", x }
   *
   * componentContains({ x, "abc", x }, { "b" }, {}, true, 1)
   *   returns 1,
   *   n1 is updated to { x, "ab" },
   *   ne is updated to { "c", x }
   *
   * componentContains({ y, z, "abc", x, "def" }, { "c", x, "de" }, {}, true, 1)
   *   returns 2,
   *   n1 is updated to { y, z, "abc", x, "de" },
   *   ne is updated to { "f" }
   *
   * componentContains({ y, "abc", x, "def" }, { "c", x, "de" }, {}, true, -1)
   *   returns 1,
   *   n1 is updated to { "c", x, "def" },
   *   nb is updated to { y, "ab" }
   */
  int componentContains(std::vector<Node>& n1,
                        std::vector<Node>& n2,
                        std::vector<Node>& nb,
                        std::vector<Node>& ne,
                        bool computeRemainder = false,
                        int remainderDir = 0);
  /** strip constant endpoints
   * This function is used when rewriting str.contains( t1, t2 ), where
   * n1 is the vector form of t1
   * n2 is the vector form of t2
   *
   * It modifies n1 to a new vector n1' such that:
   * (1) str.contains( str.++( n1 ), str.++( n2 ) ) is equivalent to
   * str.contains( str.++( n1' ), str.++( n2 ) )
   * (2) str.++( n1 ) = str.++( nb, n1', ne )
   *
   * "dir" is the direction in which we can modify n1:
   * if dir = 1, then we allow dropping components from the front of n1,
   * if dir = -1, then we allow dropping components from the back of n1,
   * if dir = 0, then we allow dropping components from either.
   *
   * It returns true if n1 is modified.
   *
   * For example:
   * stripConstantEndpoints({ "ab", x, "de" }, { "c" }, {}, {}, 1)
   *   returns true,
   *   n1 is updated to { x, "de" }
   *   nb is updated to { "ab" }
   * stripConstantEndpoints({ "ab", x, "de" }, { "bd" }, {}, {}, 0)
   *   returns true,
   *   n1 is updated to { "b", x, "d" }
   *   nb is updated to { "a" }
   *   ne is updated to { "e" }
   * stripConstantEndpoints({ "ad", substr("ccc",x,y) }, { "d" }, {}, {}, -1)
   *   returns true,
   *   n1 is updated to {"ad"}
   *   ne is updated to { substr("ccc",x,y) }
   */
  static bool stripConstantEndpoints(std::vector<Node>& n1,
                                     std::vector<Node>& n2,
                                     std::vector<Node>& nb,
                                     std::vector<Node>& ne,
                                     int dir = 0);

  /**
   * Checks whether a string term `a` is entailed to contain or not contain a
   * string term `b`.
   *
   * @param a The string that is checked whether it contains `b`
   * @param b The string that is checked whether it is contained in `a`
   * @param fullRewriter Determines whether the function can use the full
   * rewriter or only `rewriteContains()` (useful for avoiding loops)
   * @return true node if it can be shown that `a` contains `b`, false node if
   * it can be shown that `a` does not contain `b`, null node otherwise
   */
  Node checkContains(Node a, Node b, bool fullRewriter = true);

  /** entail non-empty
   *
   * Checks whether string a is entailed to be non-empty. Is equivalent to
   * the call checkArithEntail( len( a ), true ).
   */
  static bool checkNonEmpty(Node a);

  /**
   * Checks whether string has at most/exactly length one. Length one strings
   * can be used for more aggressive rewriting because there is guaranteed that
   * it cannot be overlap multiple components in a string concatenation.
   *
   * @param s The string to check
   * @param strict If true, the string must have exactly length one, otherwise
   * at most length one
   * @return True if the string has at most/exactly length one, false otherwise
   */
  static bool checkLengthOne(Node s, bool strict = false);

  /**
   * Checks whether it is always true that `a` is a strict subset of `b` in the
   * multiset domain.
   *
   * Examples:
   *
   * a = (str.++ "A" x), b = (str.++ "A" x "B") ---> true
   * a = (str.++ "A" x), b = (str.++ "B" x "AA") ---> true
   * a = (str.++ "A" x), b = (str.++ "B" y "AA") ---> false
   *
   * @param a The term for which it should be checked if it is a strict subset
   * of `b` in the multiset domain
   * @param b The term for which it should be checked if it is a strict
   * superset of `a` in the multiset domain
   * @return True if it is always the case that `a` is a strict subset of `b`,
   * false otherwise.
   */
  static bool checkMultisetSubset(Node a, Node b);

  /**
   * Returns a character `c` if it is always the case that str.in.re(a, c*),
   * i.e. if all possible values of `a` only consist of `c` characters, and the
   * null node otherwise. If `a` is the empty string, the function returns an
   * empty string.
   *
   * @param a The node to check for homogeneity
   * @return If `a` is homogeneous, the only character that it may contain, the
   * empty string if `a` is empty, and the null node otherwise
   */
  static Node checkHomogeneousString(Node a);

  /**
   * Overapproximates the possible values of node n. This overapproximation
   * assumes that n can return a value x or the empty string and tries to find
   * the simplest x such that this holds. In the general case, x is the same as
   * the input n. This overapproximation can be used to sort terms with the
   * same possible values in string concatenation for example.
   *
   * Example:
   *
   * getStringOrEmpty( (str.replace "" x y) ) --> y because (str.replace "" x y)
   * either returns y or ""
   *
   * getStringOrEmpty( (str.substr "ABC" x y) ) --> (str.substr "ABC" x y)
   * because the function could not compute a simpler
   */
  static Node getStringOrEmpty(Node n);

  /**
   * Infers a conjunction of equalities that correspond to (str.contains x y)
   * if it can show that the length of y is greater or equal to the length of
   * x. If y is a concatentation, we get x = y1 ++ ... ++ yn, the conjunction
   * is of the form:
   *
   * (and (= x (str.++ y1' ... ym')) (= y1'' "") ... (= yk'' ""))
   *
   * where each yi'' are yi that must be empty for (= x y) to hold and yi' are
   * yi that the function could not infer anything about.  Returns a null node
   * if the function cannot infer that str.len(y) >= str.len(x). Returns (= x
   * y) if the function can infer that str.len(y) >= str.len(x) but cannot
   * infer that any of the yi must be empty.
   */
  static Node inferEqsFromContains(Node x, Node y);

 private:
  /** component contains base
   *
   * This function is a helper for the above function.
   *
   * It returns true if n2 is contained in n1 with the following
   * restrictions:
   *   If dir=1, then n2 must be a suffix of n1.
   *   If dir=-1, then n2 must be a prefix of n1.
   *
   * If computeRemainder is true, then n1rb and n1re are
   * updated such that :
   *   n1 = str.++( n1rb, n2, n1re )
   * where a null value of n1rb and n1re indicates the
   * empty string.
   *
   * For example:
   *
   * componentContainsBase("cabe", "ab", n1rb, n1re, 1, false)
   *   returns false.
   *
   * componentContainsBase("cabe", "ab", n1rb, n1re, 0, true)
   *   returns true,
   *   n1rb is set to "c",
   *   n1re is set to "e".
   *
   * componentContainsBase(y, str.substr(y,0,5), n1rb, n1re, -1, true)
   *   returns true,
   *   n1re is set to str.substr(y,5,str.len(y)).
   *
   *
   * Notice that this function may return false when it cannot compute a
   * remainder when it otherwise would have returned true. For example:
   *
   * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, false)
   *   returns true.
   *
   * Hence, we know that str.substr(y,x,z) is contained in y. However:
   *
   * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, true)
   *   returns false.
   *
   * The reason is since computeRemainder=true, it must be that
   *   y = str.++( n1rb, str.substr(y,x,z), n1re )
   * for some n1rb, n1re. However, to construct such n1rb, n1re would require
   * e.g. the terms:
   *   y = str.++( ite( x+z < 0 OR x < 0, "", str.substr(y,0,x) ),
   *               str.substr(y,x,z),
   *               ite( x+z < 0 OR x < 0, y, str.substr(y,x+z,len(y)) ) )
   *
   * Since we do not wish to introduce ITE terms in the rewriter, we instead
   * return false, indicating that we cannot compute the remainder.
   */
  bool componentContainsBase(
      Node n1, Node n2, Node& n1rb, Node& n1re, int dir, bool computeRemainder);
  /**
   * Simplifies a given node `a` s.t. the result is a concatenation of string
   * terms that can be interpreted as a multiset and which contains all
   * multisets that `a` could form.
   *
   * Examples:
   *
   * (str.substr "AA" 0 n) ---> "AA"
   * (str.replace "AAA" x "BB") ---> (str.++ "AAA" "BB")
   *
   * @param a The node to simplify
   * @return A concatenation that can be interpreted as a multiset
   */
  static Node getMultisetApproximation(Node a);

 private:
  /**
   * Reference to the sequences rewriter that owns this `StringsEntail`
   * instance.
   */
  SequencesRewriter& d_rewriter;
};

}  // namespace strings
}  // namespace theory
}  // namespace CVC4

#endif /* CVC4__THEORY__STRINGS__STRING_ENTAIL_H */