summaryrefslogtreecommitdiff
path: root/src/util/regexp.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/regexp.h')
-rw-r--r--src/util/regexp.h138
1 files changed, 71 insertions, 67 deletions
diff --git a/src/util/regexp.h b/src/util/regexp.h
index 731736f72..56fb969a3 100644
--- a/src/util/regexp.h
+++ b/src/util/regexp.h
@@ -37,60 +37,44 @@ namespace CVC4 {
class CVC4_PUBLIC String {
public:
/**
- * The start ASCII code. In our string representation below, we represent
- * characters using a vector d_str of unsigned integers. We refer to this as
- * the "internal representation" for the string.
- *
- * We make unsigned integer 0 correspond to the 65th character ("A") in the
- * ASCII alphabet to make models intuitive. In particular, say if we have
- * a set of string variables that are distinct but otherwise unconstrained,
- * then the model may assign them "A", "B", "C", ...
- */
- static inline unsigned start_code() { return 65; }
- /**
* This is the cardinality of the alphabet that is representable by this
* class. Notice that this must be greater than or equal to the cardinality
* of the alphabet that the string theory reasons about.
*
* This must be strictly less than std::numeric_limits<unsigned>::max().
+ *
+ * As per the SMT-LIB standard for strings, we support the first 3 planes of
+ * Unicode characters, where 196608 = 3*16^4.
*/
- static inline unsigned num_codes() { return 256; }
- /**
- * Convert unsigned char to the unsigned used in the internal representation
- * in d_str below.
- */
- static unsigned convertCharToUnsignedInt(unsigned char c);
- /** Convert the internal unsigned to a unsigned char. */
- static unsigned char convertUnsignedIntToChar(unsigned i);
- /** Does the internal unsigned correspond to a printable character? */
- static bool isPrintable(unsigned i);
- /** get the internal unsigned for ASCII code c. */
- static unsigned convertCodeToUnsignedInt(unsigned c);
- /** get the ASCII code number that internal unsigned i corresponds to. */
- static unsigned convertUnsignedIntToCode(unsigned i);
-
+ static inline unsigned num_codes() { return 196608; }
/** constructors for String
- *
- * Internally, a CVC4::String is represented by a vector of unsigned
- * integers (d_str), where the correspondence between C++ characters
- * to and from unsigned integers is determined by
- * by convertCharToUnsignedInt and convertUnsignedIntToChar.
- *
- * If useEscSequences is true, then the escape sequences in the input
- * are converted to the corresponding character. This constructor may
- * throw an exception if the input contains unrecognized escape sequences.
- * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\,
- * \x[N] where N is a hexidecimal, and octal escape sequences of the
- * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7.
- *
- * If useEscSequences is false, then the characters of the constructed
- * CVC4::String correspond one-to-one with the input string.
- */
+ *
+ * Internally, a CVC4::String is represented by a vector of unsigned
+ * integers (d_str) representing the code points of the characters.
+ *
+ * To build a string from a C++ string, we may process escape sequences
+ * according to the SMT-LIB standard. In particular, if useEscSequences is
+ * true, we convert unicode escape sequences:
+ * \u d_3 d_2 d_1 d_0
+ * \u{d_0}
+ * \u{d_1 d_0}
+ * \u{d_2 d_1 d_0}
+ * \u{d_3 d_2 d_1 d_0}
+ * \u{d_4 d_3 d_2 d_1 d_0}
+ * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
+ *
+ * If useEscSequences is false, then the characters of the constructed
+ * CVC4::String correspond one-to-one with the input string.
+ */
String() = default;
explicit String(const std::string& s, bool useEscSequences = false)
- : d_str(toInternal(s, useEscSequences)) {}
+ : d_str(toInternal(s, useEscSequences))
+ {
+ }
explicit String(const char* s, bool useEscSequences = false)
- : d_str(toInternal(std::string(s), useEscSequences)) {}
+ : d_str(toInternal(std::string(s), useEscSequences))
+ {
+ }
explicit String(const std::vector<unsigned>& s);
String& operator=(const String& y) {
@@ -123,20 +107,16 @@ class CVC4_PUBLIC String {
bool rstrncmp(const String& y, std::size_t n) const;
/* toString
- * Converts this string to a std::string.
- *
- * If useEscSequences is true, then unprintable characters
- * are converted to escape sequences. The escape sequences
- * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way.
- * For all other unprintable characters, we print \x[N] where
- * [N] is the 2 digit hexidecimal corresponding to value of
- * the character.
- *
- * If useEscSequences is false, the returned std::string's characters
- * map one-to-one with the characters in this string.
- * Notice that for all std::string s, we have that
- * CVC4::String( s ).toString() = s.
- */
+ * Converts this string to a std::string.
+ *
+ * The unprintable characters are converted to unicode escape sequences as
+ * described above.
+ *
+ * If useEscSequences is false, the string's printable characters are
+ * printed as characters. Notice that for all std::string s having only
+ * printable characters, we have that
+ * CVC4::String( s ).toString() = s.
+ */
std::string toString(bool useEscSequences = false) const;
/** is this the empty string? */
bool empty() const { return d_str.empty(); }
@@ -221,16 +201,32 @@ class CVC4_PUBLIC String {
bool isNumber() const;
/** Returns the corresponding rational for the text of this string. */
Rational toNumber() const;
- /** get the internal unsigned representation of this string */
+ /** Get the unsigned representation (code points) of this string */
const std::vector<unsigned>& getVec() const { return d_str; }
- /** get the internal unsigned value of the first character in this string */
+ /**
+ * Get the unsigned (code point) value of the first character in this string
+ */
unsigned front() const;
- /** get the internal unsigned value of the last character in this string */
+ /**
+ * Get the unsigned (code point) value of the last character in this string
+ */
unsigned back() const;
/** is the unsigned a digit?
- * The input should be the same type as the element type of d_str
- */
+ *
+ * This is true for code points between 48 ('0') and 57 ('9').
+ */
static bool isDigit(unsigned character);
+ /** is the unsigned a hexidecimal digit?
+ *
+ * This is true for code points between 48 ('0') and 57 ('9'), code points
+ * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
+ */
+ static bool isHexDigit(unsigned character);
+ /** is the unsigned a printable code point?
+ *
+ * This is true for Unicode 32 (' ') to 126 ('~').
+ */
+ static bool isPrintable(unsigned character);
/**
* Returns the maximum length of string representable by this class.
@@ -238,11 +234,19 @@ class CVC4_PUBLIC String {
*/
static size_t maxSize();
private:
- // guarded
- static unsigned char hexToDec(unsigned char c);
-
+ /**
+ * Helper for toInternal: add character ch to vector vec, storing a string in
+ * internal format. This throws an error if ch is not a printable character,
+ * since non-printable characters must be escaped in SMT-LIB.
+ */
+ static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+ /**
+ * Convert the string s to the internal format (vector of code points).
+ * The argument useEscSequences is whether to process unicode escape
+ * sequences.
+ */
static std::vector<unsigned> toInternal(const std::string& s,
- bool useEscSequences = true);
+ bool useEscSequences);
/**
* Returns a negative number if *this < y, 0 if *this and y are equal and a
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback