2 files changed, 232 insertions, 255 deletions
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp
index 00066edb6..36ba7182b 100644
--- a/src/util/regexp.cpp
+++ b/src/util/regexp.cpp
@@ -32,38 +32,12 @@ namespace CVC4 {
 
 static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
 
-unsigned String::convertCharToUnsignedInt(unsigned char c)
-{
-  return convertCodeToUnsignedInt(static_cast<unsigned>(c));
-}
-unsigned char String::convertUnsignedIntToChar(unsigned i)
-{
-  Assert(i < num_codes());
-  return static_cast<unsigned char>(convertUnsignedIntToCode(i));
-}
-bool String::isPrintable(unsigned i)
-{
-  Assert(i < num_codes());
-  unsigned char c = convertUnsignedIntToChar(i);
-  return (c >= ' ' && c <= '~');
-}
-unsigned String::convertCodeToUnsignedInt(unsigned c)
-{
-  Assert(c < num_codes());
-  return (c < start_code() ? c + num_codes() : c) - start_code();
-}
-unsigned String::convertUnsignedIntToCode(unsigned i)
-{
-  Assert(i < num_codes());
-  return (i + start_code()) % num_codes();
-}
-
 String::String(const std::vector<unsigned> &s) : d_str(s)
 {
 #ifdef CVC4_ASSERTIONS
   for (unsigned u : d_str)
   {
-    Assert(convertUnsignedIntToCode(u) < num_codes());
+    Assert(u < num_codes());
   }
 #endif
 }
@@ -74,8 +48,8 @@ int String::cmp(const String &y) const {
   }
   for (unsigned int i = 0; i < size(); ++i) {
     if (d_str[i] != y.d_str[i]) {
-      unsigned cp = convertUnsignedIntToCode(d_str[i]);
-      unsigned cpy = convertUnsignedIntToCode(y.d_str[i]);
+      unsigned cp = d_str[i];
+      unsigned cpy = y.d_str[i];
       return cp < cpy ? -1 : 1;
     }
   }
@@ -122,107 +96,143 @@ bool String::rstrncmp(const String& y, std::size_t n) const
   return true;
 }
 
-std::vector<unsigned> String::toInternal(const std::string &s,
-                                         bool useEscSequences) {
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+  // if not a printable character
+  if (ch > 127 || ch < 32)
+  {
+    std::stringstream serr;
+    serr << "Illegal string character: \"" << ch
+         << "\", must use escape sequence";
+    throw CVC4::Exception(serr.str());
+  }
+  else
+  {
+    str.push_back(static_cast<unsigned>(ch));
+  }
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+                                         bool useEscSequences)
+{
   std::vector<unsigned> str;
   unsigned i = 0;
-  while (i < s.size()) {
-    if (s[i] == '\\' && useEscSequences) {
-      i++;
-      if (i < s.size()) {
-        switch (s[i]) {
-          case 'n': {
-            str.push_back(convertCharToUnsignedInt('\n'));
-            i++;
-          } break;
-          case 't': {
-            str.push_back(convertCharToUnsignedInt('\t'));
-            i++;
-          } break;
-          case 'v': {
-            str.push_back(convertCharToUnsignedInt('\v'));
-            i++;
-          } break;
-          case 'b': {
-            str.push_back(convertCharToUnsignedInt('\b'));
-            i++;
-          } break;
-          case 'r': {
-            str.push_back(convertCharToUnsignedInt('\r'));
-            i++;
-          } break;
-          case 'f': {
-            str.push_back(convertCharToUnsignedInt('\f'));
-            i++;
-          } break;
-          case 'a': {
-            str.push_back(convertCharToUnsignedInt('\a'));
-            i++;
-          } break;
-          case '\\': {
-            str.push_back(convertCharToUnsignedInt('\\'));
-            i++;
-          } break;
-          case 'x': {
-            if (i + 2 < s.size()) {
-              if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) {
-                str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 +
-                                                       hexToDec(s[i + 2])));
-                i += 3;
-              } else {
-                throw CVC4::Exception("Illegal String Literal: \"" + s + "\"");
-              }
-            } else {
-              throw CVC4::Exception("Illegal String Literal: \"" + s +
-                                    "\", must have two digits after \\x");
-            }
-          } break;
-          default: {
-            if (isdigit(s[i])) {
-              // octal escape sequences  TODO : revisit (issue #1251).
-              int num = (int)s[i] - (int)'0';
-              bool flag = num < 4;
-              if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) &&
-                  s[i + 1] < '8') {
-                num = num * 8 + (int)s[i + 1] - (int)'0';
-                if (flag && i + 2 < s.size() && isdigit(s[i + 2]) &&
-                    s[i + 2] < '8') {
-                  num = num * 8 + (int)s[i + 2] - (int)'0';
-                  str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                  i += 3;
-                } else {
-                  str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                  i += 2;
-                }
-              } else {
-                str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                i++;
-              }
-            } else if ((unsigned)s[i] > 127) {
-              throw CVC4::Exception("Illegal String Literal: \"" + s +
-                                    "\", must use escaped sequence");
-            } else {
-              str.push_back(convertCharToUnsignedInt(s[i]));
-              i++;
-            }
+  while (i < s.size())
+  {
+    // get the current character
+    char si = s[i];
+    if (si != '\\' || !useEscSequences)
+    {
+      addCharToInternal(si, str);
+      ++i;
+      continue;
+    }
+    // the vector of characters, in case we fail to read an escape sequence
+    std::vector<unsigned> nonEscCache;
+    // process the '\'
+    addCharToInternal(si, nonEscCache);
+    ++i;
+    // are we an escape sequence?
+    bool isEscapeSequence = true;
+    // the string corresponding to the hexidecimal code point
+    std::stringstream hexString;
+    // is the slash followed by a 'u'? Could be last character.
+    if (i >= s.size() || s[i] != 'u')
+    {
+      isEscapeSequence = false;
+    }
+    else
+    {
+      // process the 'u'
+      addCharToInternal(s[i], nonEscCache);
+      ++i;
+      bool isStart = true;
+      bool isEnd = false;
+      bool hasBrace = false;
+      while (i < s.size())
+      {
+        // add the next character
+        si = s[i];
+        if (isStart)
+        {
+          isStart = false;
+          // possibly read '{'
+          if (si == '{')
+          {
+            hasBrace = true;
+            addCharToInternal(si, nonEscCache);
+            ++i;
+            continue;
           }
         }
-      } else {
-        throw CVC4::Exception("should be handled by lexer: \"" + s + "\"");
-        // str.push_back( convertCharToUnsignedInt('\\') );
+        else if (si == '}')
+        {
+          // can only end if we had an open brace and read at least one digit
+          isEscapeSequence = hasBrace && !hexString.str().empty();
+          isEnd = true;
+          addCharToInternal(si, nonEscCache);
+          ++i;
+          break;
+        }
+        // must be a hex digit at this point
+        if (!isHexDigit(static_cast<unsigned>(si)))
+        {
+          isEscapeSequence = false;
+          break;
+        }
+        hexString << si;
+        addCharToInternal(si, nonEscCache);
+        ++i;
+        if (!hasBrace && hexString.str().size() == 4)
+        {
+          // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
+          isEnd = true;
+          break;
+        }
+        else if (hasBrace && hexString.str().size() > 5)
+        {
+          // too many digits enclosed in brace, not an escape sequence
+          isEscapeSequence = false;
+          break;
+        }
+      }
+      if (!isEnd)
+      {
+        // if we were interupted before ending, then this is not a valid
+        // escape sequence
+        isEscapeSequence = false;
+      }
+    }
+    if (isEscapeSequence)
+    {
+      Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+      // Otherwise, we add the escaped character.
+      // This is guaranteed not to overflow due to the length of hstr.
+      uint32_t val;
+      hexString >> std::hex >> val;
+      if (val > num_codes())
+      {
+        // Failed due to being out of range. This can happen for strings of
+        // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
+        // in the range [0-2].
+        isEscapeSequence = false;
+      }
+      else
+      {
+        str.push_back(val);
       }
-    } else if ((unsigned)s[i] > 127 && useEscSequences) {
-      throw CVC4::Exception("Illegal String Literal: \"" + s +
-                            "\", must use escaped sequence");
-    } else {
-      str.push_back(convertCharToUnsignedInt(s[i]));
-      i++;
+    }
+    // if we did not successfully parse an escape sequence, we add back all
+    // characters that we cached
+    if (!isEscapeSequence)
+    {
+      str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
     }
   }
 #ifdef CVC4_ASSERTIONS
   for (unsigned u : str)
   {
-    Assert(convertUnsignedIntToCode(u) < num_codes());
+    Assert(u < num_codes());
   }
 #endif
   return str;
@@ -265,62 +275,23 @@ std::size_t String::roverlap(const String &y) const {
 }
 
 std::string String::toString(bool useEscSequences) const {
-  std::string str;
+  std::stringstream str;
   for (unsigned int i = 0; i < size(); ++i) {
-    unsigned char c = convertUnsignedIntToChar(d_str[i]);
-    if (!useEscSequences) {
-      str += c;
-    } else if (isprint(c)) {
-      if (c == '\\') {
-        str += "\\\\";
-      }
-      // else if(c == '\"') {
-      //  str += "\\\"";
-      //}
-      else {
-        str += c;
-      }
-    } else {
-      std::string s;
-      switch (c) {
-        case '\a':
-          s = "\\a";
-          break;
-        case '\b':
-          s = "\\b";
-          break;
-        case '\t':
-          s = "\\t";
-          break;
-        case '\r':
-          s = "\\r";
-          break;
-        case '\v':
-          s = "\\v";
-          break;
-        case '\f':
-          s = "\\f";
-          break;
-        case '\n':
-          s = "\\n";
-          break;
-        case '\e':
-          s = "\\e";
-          break;
-        default: {
-          std::stringstream ss;
-          ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c);
-          std::string t = ss.str();
-          t = t.substr(t.size() - 2, 2);
-          s = "\\x" + t;
-          // std::string s2 = static_cast<std::ostringstream*>(
-          // &(std::ostringstream() << (int)c) )->str();
-        }
-      }
-      str += s;
+    // we always print forward slash as a code point so that it cannot
+    // be interpreted as specifying part of a code point, e.g. the string
+    // '\' + 'u' + '0' of length three.
+    if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+    {
+      str << static_cast<char>(d_str[i]);
+    }
+    else
+    {
+      std::stringstream ss;
+      ss << std::hex << d_str[i];
+      str << "\\u{" << ss.str() << "}";
     }
   }
-  return str;
+  return str.str();
 }
 
 bool String::isLeq(const String &y) const
@@ -331,8 +302,8 @@ bool String::isLeq(const String &y) const
     {
       return false;
     }
-    unsigned ci = convertUnsignedIntToCode(d_str[i]);
-    unsigned cyi = convertUnsignedIntToCode(y.d_str[i]);
+    unsigned ci = d_str[i];
+    unsigned cyi = y.d_str[i];
     if (ci > cyi)
     {
       return false;
@@ -484,8 +455,21 @@ bool String::isNumber() const {
 
 bool String::isDigit(unsigned character)
 {
-  unsigned char c = convertUnsignedIntToChar(character);
-  return c >= '0' && c <= '9';
+  // '0' to '9'
+  return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+  // '0' to '9' or 'A' to 'F' or 'a' to 'f'
+  return isDigit(character) || (65 <= character && character <= 70)
+         || (97 <= character && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+  // Unicode 0x00020 (' ') to 0x0007E ('~')
+  return 32 <= character && character <= 126;
 }
 
 size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
@@ -497,17 +481,6 @@ Rational String::toNumber() const
   return Rational(toString());
 }
 
-unsigned char String::hexToDec(unsigned char c) {
-  if (c >= '0' && c <= '9') {
-    return c - '0';
-  } else if (c >= 'a' && c <= 'f') {
-    return c - 'a' + 10;
-  } else {
-    Assert(c >= 'A' && c <= 'F');
-    return c - 'A' + 10;
-  }
-}
-
 std::ostream &operator<<(std::ostream &os, const String &s) {
   return os << "\"" << s.toString(true) << "\"";
 }
diff --git a/src/util/regexp.h b/src/util/regexp.h
index 731736f72..56fb969a3 100644
--- a/src/util/regexp.h
+++ b/src/util/regexp.h
@@ -37,60 +37,44 @@ namespace CVC4 {
 class CVC4_PUBLIC String {
  public:
   /**
-   * The start ASCII code. In our string representation below, we represent
-   * characters using a vector d_str of unsigned integers. We refer to this as
-   * the "internal representation" for the string.
-   *
-   * We make unsigned integer 0 correspond to the 65th character ("A") in the
-   * ASCII alphabet to make models intuitive. In particular, say if we have
-   * a set of string variables that are distinct but otherwise unconstrained,
-   * then the model may assign them "A", "B", "C", ...
-   */
-  static inline unsigned start_code() { return 65; }
-  /**
    * This is the cardinality of the alphabet that is representable by this
    * class. Notice that this must be greater than or equal to the cardinality
    * of the alphabet that the string theory reasons about.
    *
    * This must be strictly less than std::numeric_limits<unsigned>::max().
+   *
+   * As per the SMT-LIB standard for strings, we support the first 3 planes of
+   * Unicode characters, where 196608 = 3*16^4.
    */
-  static inline unsigned num_codes() { return 256; }
-  /**
-   * Convert unsigned char to the unsigned used in the internal representation
-   * in d_str below.
-   */
-  static unsigned convertCharToUnsignedInt(unsigned char c);
-  /** Convert the internal unsigned to a unsigned char. */
-  static unsigned char convertUnsignedIntToChar(unsigned i);
-  /** Does the internal unsigned correspond to a printable character? */
-  static bool isPrintable(unsigned i);
-  /** get the internal unsigned for ASCII code c. */
-  static unsigned convertCodeToUnsignedInt(unsigned c);
-  /** get the ASCII code number that internal unsigned i corresponds to. */
-  static unsigned convertUnsignedIntToCode(unsigned i);
-
+  static inline unsigned num_codes() { return 196608; }
   /** constructors for String
-  *
-  * Internally, a CVC4::String is represented by a vector of unsigned
-  * integers (d_str), where the correspondence between C++ characters
-  * to and from unsigned integers is determined by
-  * by convertCharToUnsignedInt and convertUnsignedIntToChar.
-  *
-  * If useEscSequences is true, then the escape sequences in the input
-  * are converted to the corresponding character. This constructor may
-  * throw an exception if the input contains unrecognized escape sequences.
-  * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\,
-  * \x[N] where N is a hexidecimal, and octal escape sequences of the
-  * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7.
-  *
-  * If useEscSequences is false, then the characters of the constructed
-  * CVC4::String correspond one-to-one with the input string.
-  */
+   *
+   * Internally, a CVC4::String is represented by a vector of unsigned
+   * integers (d_str) representing the code points of the characters.
+   *
+   * To build a string from a C++ string, we may process escape sequences
+   * according to the SMT-LIB standard. In particular, if useEscSequences is
+   * true, we convert unicode escape sequences:
+   *  \u d_3 d_2 d_1 d_0
+   *  \u{d_0}
+   *  \u{d_1 d_0}
+   *  \u{d_2 d_1 d_0}
+   *  \u{d_3 d_2 d_1 d_0}
+   *  \u{d_4 d_3 d_2 d_1 d_0}
+   * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
+   *
+   * If useEscSequences is false, then the characters of the constructed
+   * CVC4::String correspond one-to-one with the input string.
+   */
   String() = default;
   explicit String(const std::string& s, bool useEscSequences = false)
-      : d_str(toInternal(s, useEscSequences)) {}
+      : d_str(toInternal(s, useEscSequences))
+  {
+  }
   explicit String(const char* s, bool useEscSequences = false)
-      : d_str(toInternal(std::string(s), useEscSequences)) {}
+      : d_str(toInternal(std::string(s), useEscSequences))
+  {
+  }
   explicit String(const std::vector<unsigned>& s);
 
   String& operator=(const String& y) {
@@ -123,20 +107,16 @@ class CVC4_PUBLIC String {
   bool rstrncmp(const String& y, std::size_t n) const;
 
   /* toString
-  * Converts this string to a std::string.
-  *
-  * If useEscSequences is true, then unprintable characters
-  * are converted to escape sequences. The escape sequences
-  * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way.
-  * For all other unprintable characters, we print \x[N] where
-  * [N] is the 2 digit hexidecimal corresponding to value of
-  * the character.
-  *
-  * If useEscSequences is false, the returned std::string's characters
-  * map one-to-one with the characters in this string.
-  * Notice that for all std::string s, we have that
-  *    CVC4::String( s ).toString() = s.
-  */
+   * Converts this string to a std::string.
+   *
+   * The unprintable characters are converted to unicode escape sequences as
+   * described above.
+   *
+   * If useEscSequences is false, the string's printable characters are
+   * printed as characters. Notice that for all std::string s having only
+   * printable characters, we have that
+   *    CVC4::String( s ).toString() = s.
+   */
   std::string toString(bool useEscSequences = false) const;
   /** is this the empty string? */
   bool empty() const { return d_str.empty(); }
@@ -221,16 +201,32 @@ class CVC4_PUBLIC String {
   bool isNumber() const;
   /** Returns the corresponding rational for the text of this string. */
   Rational toNumber() const;
-  /** get the internal unsigned representation of this string */
+  /** Get the unsigned representation (code points) of this string */
   const std::vector<unsigned>& getVec() const { return d_str; }
-  /** get the internal unsigned value of the first character in this string */
+  /**
+   * Get the unsigned (code point) value of the first character in this string
+   */
   unsigned front() const;
-  /** get the internal unsigned value of the last character in this string */
+  /**
+   * Get the unsigned (code point) value of the last character in this string
+   */
   unsigned back() const;
   /** is the unsigned a digit?
-  * The input should be the same type as the element type of d_str
-  */
+   *
+   * This is true for code points between 48 ('0') and 57 ('9').
+   */
   static bool isDigit(unsigned character);
+  /** is the unsigned a hexidecimal digit?
+   *
+   * This is true for code points between 48 ('0') and 57 ('9'), code points
+   * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
+   */
+  static bool isHexDigit(unsigned character);
+  /** is the unsigned a printable code point?
+   *
+   * This is true for Unicode 32 (' ') to 126 ('~').
+   */
+  static bool isPrintable(unsigned character);
 
   /**
    * Returns the maximum length of string representable by this class.
@@ -238,11 +234,19 @@ class CVC4_PUBLIC String {
    */
   static size_t maxSize();
  private:
-  // guarded
-  static unsigned char hexToDec(unsigned char c);
-
+  /**
+   * Helper for toInternal: add character ch to vector vec, storing a string in
+   * internal format. This throws an error if ch is not a printable character,
+   * since non-printable characters must be escaped in SMT-LIB.
+   */
+  static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+  /**
+   * Convert the string s to the internal format (vector of code points).
+   * The argument useEscSequences is whether to process unicode escape
+   * sequences.
+   */
   static std::vector<unsigned> toInternal(const std::string& s,
-                                          bool useEscSequences = true);
+                                          bool useEscSequences);
 
   /**
    * Returns a negative number if *this < y, 0 if *this and y are equal and a