diff options
Diffstat (limited to 'src/util')
-rw-r--r-- | src/util/regexp.cpp | 349 | ||||
-rw-r--r-- | src/util/regexp.h | 138 |
2 files changed, 232 insertions, 255 deletions
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp index 00066edb6..36ba7182b 100644 --- a/src/util/regexp.cpp +++ b/src/util/regexp.cpp @@ -32,38 +32,12 @@ namespace CVC4 { static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values."); -unsigned String::convertCharToUnsignedInt(unsigned char c) -{ - return convertCodeToUnsignedInt(static_cast<unsigned>(c)); -} -unsigned char String::convertUnsignedIntToChar(unsigned i) -{ - Assert(i < num_codes()); - return static_cast<unsigned char>(convertUnsignedIntToCode(i)); -} -bool String::isPrintable(unsigned i) -{ - Assert(i < num_codes()); - unsigned char c = convertUnsignedIntToChar(i); - return (c >= ' ' && c <= '~'); -} -unsigned String::convertCodeToUnsignedInt(unsigned c) -{ - Assert(c < num_codes()); - return (c < start_code() ? c + num_codes() : c) - start_code(); -} -unsigned String::convertUnsignedIntToCode(unsigned i) -{ - Assert(i < num_codes()); - return (i + start_code()) % num_codes(); -} - String::String(const std::vector<unsigned> &s) : d_str(s) { #ifdef CVC4_ASSERTIONS for (unsigned u : d_str) { - Assert(convertUnsignedIntToCode(u) < num_codes()); + Assert(u < num_codes()); } #endif } @@ -74,8 +48,8 @@ int String::cmp(const String &y) const { } for (unsigned int i = 0; i < size(); ++i) { if (d_str[i] != y.d_str[i]) { - unsigned cp = convertUnsignedIntToCode(d_str[i]); - unsigned cpy = convertUnsignedIntToCode(y.d_str[i]); + unsigned cp = d_str[i]; + unsigned cpy = y.d_str[i]; return cp < cpy ? -1 : 1; } } @@ -122,107 +96,143 @@ bool String::rstrncmp(const String& y, std::size_t n) const return true; } -std::vector<unsigned> String::toInternal(const std::string &s, - bool useEscSequences) { +void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str) +{ + // if not a printable character + if (ch > 127 || ch < 32) + { + std::stringstream serr; + serr << "Illegal string character: \"" << ch + << "\", must use escape sequence"; + throw CVC4::Exception(serr.str()); + } + else + { + str.push_back(static_cast<unsigned>(ch)); + } +} + +std::vector<unsigned> String::toInternal(const std::string& s, + bool useEscSequences) +{ std::vector<unsigned> str; unsigned i = 0; - while (i < s.size()) { - if (s[i] == '\\' && useEscSequences) { - i++; - if (i < s.size()) { - switch (s[i]) { - case 'n': { - str.push_back(convertCharToUnsignedInt('\n')); - i++; - } break; - case 't': { - str.push_back(convertCharToUnsignedInt('\t')); - i++; - } break; - case 'v': { - str.push_back(convertCharToUnsignedInt('\v')); - i++; - } break; - case 'b': { - str.push_back(convertCharToUnsignedInt('\b')); - i++; - } break; - case 'r': { - str.push_back(convertCharToUnsignedInt('\r')); - i++; - } break; - case 'f': { - str.push_back(convertCharToUnsignedInt('\f')); - i++; - } break; - case 'a': { - str.push_back(convertCharToUnsignedInt('\a')); - i++; - } break; - case '\\': { - str.push_back(convertCharToUnsignedInt('\\')); - i++; - } break; - case 'x': { - if (i + 2 < s.size()) { - if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) { - str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 + - hexToDec(s[i + 2]))); - i += 3; - } else { - throw CVC4::Exception("Illegal String Literal: \"" + s + "\""); - } - } else { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must have two digits after \\x"); - } - } break; - default: { - if (isdigit(s[i])) { - // octal escape sequences TODO : revisit (issue #1251). - int num = (int)s[i] - (int)'0'; - bool flag = num < 4; - if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) && - s[i + 1] < '8') { - num = num * 8 + (int)s[i + 1] - (int)'0'; - if (flag && i + 2 < s.size() && isdigit(s[i + 2]) && - s[i + 2] < '8') { - num = num * 8 + (int)s[i + 2] - (int)'0'; - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i += 3; - } else { - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i += 2; - } - } else { - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i++; - } - } else if ((unsigned)s[i] > 127) { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must use escaped sequence"); - } else { - str.push_back(convertCharToUnsignedInt(s[i])); - i++; - } + while (i < s.size()) + { + // get the current character + char si = s[i]; + if (si != '\\' || !useEscSequences) + { + addCharToInternal(si, str); + ++i; + continue; + } + // the vector of characters, in case we fail to read an escape sequence + std::vector<unsigned> nonEscCache; + // process the '\' + addCharToInternal(si, nonEscCache); + ++i; + // are we an escape sequence? + bool isEscapeSequence = true; + // the string corresponding to the hexidecimal code point + std::stringstream hexString; + // is the slash followed by a 'u'? Could be last character. + if (i >= s.size() || s[i] != 'u') + { + isEscapeSequence = false; + } + else + { + // process the 'u' + addCharToInternal(s[i], nonEscCache); + ++i; + bool isStart = true; + bool isEnd = false; + bool hasBrace = false; + while (i < s.size()) + { + // add the next character + si = s[i]; + if (isStart) + { + isStart = false; + // possibly read '{' + if (si == '{') + { + hasBrace = true; + addCharToInternal(si, nonEscCache); + ++i; + continue; } } - } else { - throw CVC4::Exception("should be handled by lexer: \"" + s + "\""); - // str.push_back( convertCharToUnsignedInt('\\') ); + else if (si == '}') + { + // can only end if we had an open brace and read at least one digit + isEscapeSequence = hasBrace && !hexString.str().empty(); + isEnd = true; + addCharToInternal(si, nonEscCache); + ++i; + break; + } + // must be a hex digit at this point + if (!isHexDigit(static_cast<unsigned>(si))) + { + isEscapeSequence = false; + break; + } + hexString << si; + addCharToInternal(si, nonEscCache); + ++i; + if (!hasBrace && hexString.str().size() == 4) + { + // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens + isEnd = true; + break; + } + else if (hasBrace && hexString.str().size() > 5) + { + // too many digits enclosed in brace, not an escape sequence + isEscapeSequence = false; + break; + } + } + if (!isEnd) + { + // if we were interupted before ending, then this is not a valid + // escape sequence + isEscapeSequence = false; + } + } + if (isEscapeSequence) + { + Assert(!hexString.str().empty() && hexString.str().size() <= 5); + // Otherwise, we add the escaped character. + // This is guaranteed not to overflow due to the length of hstr. + uint32_t val; + hexString >> std::hex >> val; + if (val > num_codes()) + { + // Failed due to being out of range. This can happen for strings of + // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not + // in the range [0-2]. + isEscapeSequence = false; + } + else + { + str.push_back(val); } - } else if ((unsigned)s[i] > 127 && useEscSequences) { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must use escaped sequence"); - } else { - str.push_back(convertCharToUnsignedInt(s[i])); - i++; + } + // if we did not successfully parse an escape sequence, we add back all + // characters that we cached + if (!isEscapeSequence) + { + str.insert(str.end(), nonEscCache.begin(), nonEscCache.end()); } } #ifdef CVC4_ASSERTIONS for (unsigned u : str) { - Assert(convertUnsignedIntToCode(u) < num_codes()); + Assert(u < num_codes()); } #endif return str; @@ -265,62 +275,23 @@ std::size_t String::roverlap(const String &y) const { } std::string String::toString(bool useEscSequences) const { - std::string str; + std::stringstream str; for (unsigned int i = 0; i < size(); ++i) { - unsigned char c = convertUnsignedIntToChar(d_str[i]); - if (!useEscSequences) { - str += c; - } else if (isprint(c)) { - if (c == '\\') { - str += "\\\\"; - } - // else if(c == '\"') { - // str += "\\\""; - //} - else { - str += c; - } - } else { - std::string s; - switch (c) { - case '\a': - s = "\\a"; - break; - case '\b': - s = "\\b"; - break; - case '\t': - s = "\\t"; - break; - case '\r': - s = "\\r"; - break; - case '\v': - s = "\\v"; - break; - case '\f': - s = "\\f"; - break; - case '\n': - s = "\\n"; - break; - case '\e': - s = "\\e"; - break; - default: { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c); - std::string t = ss.str(); - t = t.substr(t.size() - 2, 2); - s = "\\x" + t; - // std::string s2 = static_cast<std::ostringstream*>( - // &(std::ostringstream() << (int)c) )->str(); - } - } - str += s; + // we always print forward slash as a code point so that it cannot + // be interpreted as specifying part of a code point, e.g. the string + // '\' + 'u' + '0' of length three. + if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences) + { + str << static_cast<char>(d_str[i]); + } + else + { + std::stringstream ss; + ss << std::hex << d_str[i]; + str << "\\u{" << ss.str() << "}"; } } - return str; + return str.str(); } bool String::isLeq(const String &y) const @@ -331,8 +302,8 @@ bool String::isLeq(const String &y) const { return false; } - unsigned ci = convertUnsignedIntToCode(d_str[i]); - unsigned cyi = convertUnsignedIntToCode(y.d_str[i]); + unsigned ci = d_str[i]; + unsigned cyi = y.d_str[i]; if (ci > cyi) { return false; @@ -484,8 +455,21 @@ bool String::isNumber() const { bool String::isDigit(unsigned character) { - unsigned char c = convertUnsignedIntToChar(character); - return c >= '0' && c <= '9'; + // '0' to '9' + return 48 <= character && character <= 57; +} + +bool String::isHexDigit(unsigned character) +{ + // '0' to '9' or 'A' to 'F' or 'a' to 'f' + return isDigit(character) || (65 <= character && character <= 70) + || (97 <= character && character <= 102); +} + +bool String::isPrintable(unsigned character) +{ + // Unicode 0x00020 (' ') to 0x0007E ('~') + return 32 <= character && character <= 126; } size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); } @@ -497,17 +481,6 @@ Rational String::toNumber() const return Rational(toString()); } -unsigned char String::hexToDec(unsigned char c) { - if (c >= '0' && c <= '9') { - return c - '0'; - } else if (c >= 'a' && c <= 'f') { - return c - 'a' + 10; - } else { - Assert(c >= 'A' && c <= 'F'); - return c - 'A' + 10; - } -} - std::ostream &operator<<(std::ostream &os, const String &s) { return os << "\"" << s.toString(true) << "\""; } diff --git a/src/util/regexp.h b/src/util/regexp.h index 731736f72..56fb969a3 100644 --- a/src/util/regexp.h +++ b/src/util/regexp.h @@ -37,60 +37,44 @@ namespace CVC4 { class CVC4_PUBLIC String { public: /** - * The start ASCII code. In our string representation below, we represent - * characters using a vector d_str of unsigned integers. We refer to this as - * the "internal representation" for the string. - * - * We make unsigned integer 0 correspond to the 65th character ("A") in the - * ASCII alphabet to make models intuitive. In particular, say if we have - * a set of string variables that are distinct but otherwise unconstrained, - * then the model may assign them "A", "B", "C", ... - */ - static inline unsigned start_code() { return 65; } - /** * This is the cardinality of the alphabet that is representable by this * class. Notice that this must be greater than or equal to the cardinality * of the alphabet that the string theory reasons about. * * This must be strictly less than std::numeric_limits<unsigned>::max(). + * + * As per the SMT-LIB standard for strings, we support the first 3 planes of + * Unicode characters, where 196608 = 3*16^4. */ - static inline unsigned num_codes() { return 256; } - /** - * Convert unsigned char to the unsigned used in the internal representation - * in d_str below. - */ - static unsigned convertCharToUnsignedInt(unsigned char c); - /** Convert the internal unsigned to a unsigned char. */ - static unsigned char convertUnsignedIntToChar(unsigned i); - /** Does the internal unsigned correspond to a printable character? */ - static bool isPrintable(unsigned i); - /** get the internal unsigned for ASCII code c. */ - static unsigned convertCodeToUnsignedInt(unsigned c); - /** get the ASCII code number that internal unsigned i corresponds to. */ - static unsigned convertUnsignedIntToCode(unsigned i); - + static inline unsigned num_codes() { return 196608; } /** constructors for String - * - * Internally, a CVC4::String is represented by a vector of unsigned - * integers (d_str), where the correspondence between C++ characters - * to and from unsigned integers is determined by - * by convertCharToUnsignedInt and convertUnsignedIntToChar. - * - * If useEscSequences is true, then the escape sequences in the input - * are converted to the corresponding character. This constructor may - * throw an exception if the input contains unrecognized escape sequences. - * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\, - * \x[N] where N is a hexidecimal, and octal escape sequences of the - * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7. - * - * If useEscSequences is false, then the characters of the constructed - * CVC4::String correspond one-to-one with the input string. - */ + * + * Internally, a CVC4::String is represented by a vector of unsigned + * integers (d_str) representing the code points of the characters. + * + * To build a string from a C++ string, we may process escape sequences + * according to the SMT-LIB standard. In particular, if useEscSequences is + * true, we convert unicode escape sequences: + * \u d_3 d_2 d_1 d_0 + * \u{d_0} + * \u{d_1 d_0} + * \u{d_2 d_1 d_0} + * \u{d_3 d_2 d_1 d_0} + * \u{d_4 d_3 d_2 d_1 d_0} + * where d_0 ... d_4 are hexidecimal digits, to the appropriate character. + * + * If useEscSequences is false, then the characters of the constructed + * CVC4::String correspond one-to-one with the input string. + */ String() = default; explicit String(const std::string& s, bool useEscSequences = false) - : d_str(toInternal(s, useEscSequences)) {} + : d_str(toInternal(s, useEscSequences)) + { + } explicit String(const char* s, bool useEscSequences = false) - : d_str(toInternal(std::string(s), useEscSequences)) {} + : d_str(toInternal(std::string(s), useEscSequences)) + { + } explicit String(const std::vector<unsigned>& s); String& operator=(const String& y) { @@ -123,20 +107,16 @@ class CVC4_PUBLIC String { bool rstrncmp(const String& y, std::size_t n) const; /* toString - * Converts this string to a std::string. - * - * If useEscSequences is true, then unprintable characters - * are converted to escape sequences. The escape sequences - * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way. - * For all other unprintable characters, we print \x[N] where - * [N] is the 2 digit hexidecimal corresponding to value of - * the character. - * - * If useEscSequences is false, the returned std::string's characters - * map one-to-one with the characters in this string. - * Notice that for all std::string s, we have that - * CVC4::String( s ).toString() = s. - */ + * Converts this string to a std::string. + * + * The unprintable characters are converted to unicode escape sequences as + * described above. + * + * If useEscSequences is false, the string's printable characters are + * printed as characters. Notice that for all std::string s having only + * printable characters, we have that + * CVC4::String( s ).toString() = s. + */ std::string toString(bool useEscSequences = false) const; /** is this the empty string? */ bool empty() const { return d_str.empty(); } @@ -221,16 +201,32 @@ class CVC4_PUBLIC String { bool isNumber() const; /** Returns the corresponding rational for the text of this string. */ Rational toNumber() const; - /** get the internal unsigned representation of this string */ + /** Get the unsigned representation (code points) of this string */ const std::vector<unsigned>& getVec() const { return d_str; } - /** get the internal unsigned value of the first character in this string */ + /** + * Get the unsigned (code point) value of the first character in this string + */ unsigned front() const; - /** get the internal unsigned value of the last character in this string */ + /** + * Get the unsigned (code point) value of the last character in this string + */ unsigned back() const; /** is the unsigned a digit? - * The input should be the same type as the element type of d_str - */ + * + * This is true for code points between 48 ('0') and 57 ('9'). + */ static bool isDigit(unsigned character); + /** is the unsigned a hexidecimal digit? + * + * This is true for code points between 48 ('0') and 57 ('9'), code points + * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f). + */ + static bool isHexDigit(unsigned character); + /** is the unsigned a printable code point? + * + * This is true for Unicode 32 (' ') to 126 ('~'). + */ + static bool isPrintable(unsigned character); /** * Returns the maximum length of string representable by this class. @@ -238,11 +234,19 @@ class CVC4_PUBLIC String { */ static size_t maxSize(); private: - // guarded - static unsigned char hexToDec(unsigned char c); - + /** + * Helper for toInternal: add character ch to vector vec, storing a string in + * internal format. This throws an error if ch is not a printable character, + * since non-printable characters must be escaped in SMT-LIB. + */ + static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec); + /** + * Convert the string s to the internal format (vector of code points). + * The argument useEscSequences is whether to process unicode escape + * sequences. + */ static std::vector<unsigned> toInternal(const std::string& s, - bool useEscSequences = true); + bool useEscSequences); /** * Returns a negative number if *this < y, 0 if *this and y are equal and a |