summaryrefslogtreecommitdiff
path: root/src/util/string.h
blob: ca458232f832b9f244d26020fe9f21ce7956b34b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*********************                                                        */
/*! \file string.h
 ** \verbatim
 ** Top contributors (to current version):
 **   Andrew Reynolds, Tim King, Tianyi Liang
 ** This file is part of the CVC4 project.
 ** Copyright (c) 2009-2020 by the authors listed in the file AUTHORS
 ** in the top-level source directory) and their institutional affiliations.
 ** All rights reserved.  See the file COPYING in the top-level source
 ** directory for licensing information.\endverbatim
 **
 ** \brief The string data type.
 **/

#include "cvc4_public.h"

#ifndef CVC4__UTIL__STRING_H
#define CVC4__UTIL__STRING_H

#include <cstddef>
#include <functional>
#include <ostream>
#include <string>
#include <vector>
#include "util/rational.h"

namespace CVC4 {

/** The CVC4 string class
 *
 * This data structure is the domain of values for the string type. It can also
 * be used as a generic utility for representing strings.
 */
class CVC4_PUBLIC String {
 public:
  /**
   * This is the cardinality of the alphabet that is representable by this
   * class. Notice that this must be greater than or equal to the cardinality
   * of the alphabet that the string theory reasons about.
   *
   * This must be strictly less than std::numeric_limits<unsigned>::max().
   *
   * As per the SMT-LIB standard for strings, we support the first 3 planes of
   * Unicode characters, where 196608 = 3*16^4.
   */
  static inline unsigned num_codes() { return 196608; }
  /** constructors for String
   *
   * Internally, a CVC4::String is represented by a vector of unsigned
   * integers (d_str) representing the code points of the characters.
   *
   * To build a string from a C++ string, we may process escape sequences
   * according to the SMT-LIB standard. In particular, if useEscSequences is
   * true, we convert unicode escape sequences:
   *  \u d_3 d_2 d_1 d_0
   *  \u{d_0}
   *  \u{d_1 d_0}
   *  \u{d_2 d_1 d_0}
   *  \u{d_3 d_2 d_1 d_0}
   *  \u{d_4 d_3 d_2 d_1 d_0}
   * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
   *
   * If useEscSequences is false, then the characters of the constructed
   * CVC4::String correspond one-to-one with the input string.
   */
  String() = default;
  explicit String(const std::string& s, bool useEscSequences = false)
      : d_str(toInternal(s, useEscSequences))
  {
  }
  explicit String(const char* s, bool useEscSequences = false)
      : d_str(toInternal(std::string(s), useEscSequences))
  {
  }
  explicit String(const std::vector<unsigned>& s);

  String& operator=(const String& y) {
    if (this != &y) {
      d_str = y.d_str;
    }
    return *this;
  }

  String concat(const String& other) const;

  bool operator==(const String& y) const { return cmp(y) == 0; }
  bool operator!=(const String& y) const { return cmp(y) != 0; }
  bool operator<(const String& y) const { return cmp(y) < 0; }
  bool operator>(const String& y) const { return cmp(y) > 0; }
  bool operator<=(const String& y) const { return cmp(y) <= 0; }
  bool operator>=(const String& y) const { return cmp(y) >= 0; }

  /**
   * Returns true if this string is equal to y for their first n characters.
   * If n is larger than the length of this string or y, this method returns
   * true if and only if this string is equal to y.
   */
  bool strncmp(const String& y, std::size_t n) const;
  /**
   * Returns true if this string is equal to y for their last n characters.
   * Similar to strncmp, if n is larger than the length of this string or y,
   * this method returns true if and only if this string is equal to y.
   */
  bool rstrncmp(const String& y, std::size_t n) const;

  /* toString
   * Converts this string to a std::string.
   *
   * The unprintable characters are converted to unicode escape sequences as
   * described above.
   *
   * If useEscSequences is false, the string's printable characters are
   * printed as characters. Notice that for all std::string s having only
   * printable characters, we have that
   *    CVC4::String( s ).toString() = s.
   */
  std::string toString(bool useEscSequences = false) const;
  /** is this the empty string? */
  bool empty() const { return d_str.empty(); }
  /** is less than or equal to string y */
  bool isLeq(const String& y) const;
  /** Return the length of the string */
  std::size_t size() const { return d_str.size(); }

  bool isRepeated() const;
  bool tailcmp(const String& y, int& c) const;

  /**
   * Return the first position y occurs in this string, or std::string::npos
   * otherwise.
   */
  std::size_t find(const String& y, const std::size_t start = 0) const;
  /**
   * Return the first position y occurs in this string searching from the end,
   * or std::string::npos otherwise.
   */
  std::size_t rfind(const String& y, const std::size_t start = 0) const;
  /** Returns true if y is a prefix of this */
  bool hasPrefix(const String& y) const;
  /** Returns true if y is a suffix of this */
  bool hasSuffix(const String& y) const;
  /** Replace the character at index i in this string with t */
  String update(std::size_t i, const String& t) const;
  /** Replace the first occurrence of s in this string with t */
  String replace(const String& s, const String& t) const;
  /** Return the substring of this string starting at index i */
  String substr(std::size_t i) const;
  /** Return the substring of this string starting at index i with size at most
   * j */
  String substr(std::size_t i, std::size_t j) const;
  /** Return the prefix of this string of size at most i */
  String prefix(std::size_t i) const { return substr(0, i); }
  /** Return the suffix of this string of size at most i */
  String suffix(std::size_t i) const { return substr(size() - i, i); }

  /**
   * Checks if there is any overlap between this string and another string. This
   * corresponds to checking whether one string contains the other and whether a
   * substring of one is a prefix of the other and vice-versa.
   *
   * @param y The other string
   * @return True if there is an overlap, false otherwise
   */
  bool noOverlapWith(const String& y) const;

  /** string overlap
  *
  * if overlap returns m>0,
  * then the maximal suffix of this string that is a prefix of y is of length m.
  *
  * For example, if x is "abcdef", then:
  * x.overlap("defg") = 3
  * x.overlap("ab") = 0
  * x.overlap("d") = 0
  * x.overlap("bcdefdef") = 5
  */
  std::size_t overlap(const String& y) const;
  /** string reverse overlap
  *
  * if roverlap returns m>0,
  * then the maximal prefix of this string that is a suffix of y is of length m.
  *
  * For example, if x is "abcdef", then:
  * x.roverlap("aaabc") = 3
  * x.roverlap("def") = 0
  * x.roverlap("d") = 0
  * x.roverlap("defabcde") = 5
  *
  * Notice that x.overlap(y) = y.roverlap(x)
  */
  std::size_t roverlap(const String& y) const;

  /**
   * Returns true if this string corresponds in text to a number, for example
   * this returns true for strings "7", "12", "004", "0" and false for strings
   * "abc", "4a", "-4", "".
   */
  bool isNumber() const;
  /** Returns the corresponding rational for the text of this string. */
  Rational toNumber() const;
  /** Get the unsigned representation (code points) of this string */
  const std::vector<unsigned>& getVec() const { return d_str; }
  /**
   * Get the unsigned (code point) value of the first character in this string
   */
  unsigned front() const;
  /**
   * Get the unsigned (code point) value of the last character in this string
   */
  unsigned back() const;
  /** is the unsigned a digit?
   *
   * This is true for code points between 48 ('0') and 57 ('9').
   */
  static bool isDigit(unsigned character);
  /** is the unsigned a hexidecimal digit?
   *
   * This is true for code points between 48 ('0') and 57 ('9'), code points
   * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
   */
  static bool isHexDigit(unsigned character);
  /** is the unsigned a printable code point?
   *
   * This is true for Unicode 32 (' ') to 126 ('~').
   */
  static bool isPrintable(unsigned character);

  /**
   * Returns the maximum length of string representable by this class.
   * Corresponds to the maximum size of d_str.
   */
  static size_t maxSize();
 private:
  /**
   * Helper for toInternal: add character ch to vector vec, storing a string in
   * internal format. This throws an error if ch is not a printable character,
   * since non-printable characters must be escaped in SMT-LIB.
   */
  static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
  /**
   * Convert the string s to the internal format (vector of code points).
   * The argument useEscSequences is whether to process unicode escape
   * sequences.
   */
  static std::vector<unsigned> toInternal(const std::string& s,
                                          bool useEscSequences);

  /**
   * Returns a negative number if *this < y, 0 if *this and y are equal and a
   * positive number if *this > y.
   */
  int cmp(const String& y) const;

  std::vector<unsigned> d_str;
}; /* class String */

namespace strings {

struct CVC4_PUBLIC StringHashFunction {
  size_t operator()(const ::CVC4::String& s) const {
    return std::hash<std::string>()(s.toString());
  }
}; /* struct StringHashFunction */

}  // namespace strings

std::ostream& operator<<(std::ostream& os, const String& s) CVC4_PUBLIC;

}  // namespace CVC4

#endif /* CVC4__UTIL__STRING_H */
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback