Support unicode internal representation and escape sequences (#3852)

Work towards support for the strings standard. This updates the string solver and parser such that: (1) the internal representation of strings is vectors of code points; (2) generation of the previous internal representation of strings has been relegated to the type enumerator, which is the code that ensures that "A" is the first character chosen for values of strings in models; (3) the previous ad-hoc escape sequence handling is moved from the String class to the parser, where it will live for at least one version of CVC4, until we no longer support non-SMT-LIB-compliant escape sequences or non-printable characters in strings; (4) Unicode escape sequences are handled according to the SMT-LIB standard in String; (5) a number of calls to String utility functions are simplified, since the conversion between the previous internal format and code points is now unnecessary; (6) a bug in the handling of TO_CODE is fixed: it should be based on the alphabet cardinality, not the number of internal code points.
author: Andrew Reynolds <andrew.j.reynolds@gmail.com> 2020-03-27 09:01:38 -0500
committer: GitHub <noreply@github.com> 2020-03-27 09:01:38 -0500
commit: 27ac2ce712b0bcfdef83e2d44dd210f667ab7959 (patch)
tree: a64febad63c37b641eaaacf4ad79007888aa43f9
parent: fa2ba76ef83497108942ebb91cdb07fdfeed505b (diff)
24 files changed, 549 insertions, 285 deletions
diff --git a/src/parser/cvc/Cvc.g b/src/parser/cvc/Cvc.g
index 82c0581ce..033389610 100644
--- a/src/parser/cvc/Cvc.g
+++ b/src/parser/cvc/Cvc.g
@@ -2083,7 +2083,7 @@ stringTerm[CVC4::api::Term& f]
 
     /* string literal */
   | str[s]
-    { f = SOLVER->mkString(s, true); }
+    { f = PARSER_STATE->mkStringConstant(s); }
 
   | setsTerm[f]
   ;
diff --git a/src/parser/parser.cpp b/src/parser/parser.cpp
index b36f36a93..5dca92370 100644
--- a/src/parser/parser.cpp
+++ b/src/parser/parser.cpp
@@ -757,5 +757,153 @@ void Parser::attributeNotSupported(const std::string& attr) {
   }
 }
 
+std::vector<unsigned> Parser::processAdHocStringEsc(const std::string& s)
+{
+  // Reads the byte at position pos as a code point in [0, 255]. The detour
+  // through unsigned char matters: plain char may be signed, in which case a
+  // byte >= 0x80 would sign-extend to a value close to 2^32 and would also be
+  // an invalid argument for the <cctype> classification functions.
+  auto byteAt = [&s](size_t pos) {
+    return static_cast<unsigned>(static_cast<unsigned char>(s[pos]));
+  };
+  // Whether position pos exists in s and holds an octal digit.
+  auto isOctalAt = [&s](size_t pos) {
+    return pos < s.size() && '0' <= s[pos] && s[pos] <= '7';
+  };
+  // the decoded code points; every value added below is at most 255
+  std::vector<unsigned> str;
+  // index of the next unread character of s
+  size_t i = 0;
+  while (i < s.size())
+  {
+    // get the current character
+    if (s[i] != '\\')
+    {
+      // don't worry about printable here
+      str.push_back(byteAt(i));
+      ++i;
+      continue;
+    }
+    // slash is always escaped
+    ++i;
+    if (i >= s.size())
+    {
+      // slash cannot be the last character if we are parsing escape sequences
+      std::stringstream serr;
+      serr << "Escape sequence at the end of string: \"" << s
+           << "\" should be handled by lexer";
+      parseError(serr.str());
+    }
+    // s[i] is the character after the backslash; each case below appends the
+    // decoded code point and moves i past the characters it consumed
+    switch (s[i])
+    {
+      // single-character escapes, with the same meaning as in C++ literals
+      case 'n':
+        str.push_back(static_cast<unsigned>('\n'));
+        ++i;
+        break;
+      case 't':
+        str.push_back(static_cast<unsigned>('\t'));
+        ++i;
+        break;
+      case 'v':
+        str.push_back(static_cast<unsigned>('\v'));
+        ++i;
+        break;
+      case 'b':
+        str.push_back(static_cast<unsigned>('\b'));
+        ++i;
+        break;
+      case 'r':
+        str.push_back(static_cast<unsigned>('\r'));
+        ++i;
+        break;
+      case 'f':
+        str.push_back(static_cast<unsigned>('\f'));
+        ++i;
+        break;
+      case 'a':
+        str.push_back(static_cast<unsigned>('\a'));
+        ++i;
+        break;
+      // an escaped backslash stands for a single backslash
+      case '\\':
+        str.push_back(static_cast<unsigned>('\\'));
+        ++i;
+        break;
+      case 'x':
+      {
+        // \x must be followed by two hexadecimal digits
+        if (i + 2 < s.size() && std::isxdigit(byteAt(i + 1))
+            && std::isxdigit(byteAt(i + 2)))
+        {
+          // both characters are hexadecimal digits, so the extraction of val
+          // from the stream cannot fail
+          std::stringstream shex;
+          shex << s[i + 1] << s[i + 2];
+          unsigned val = 0;
+          shex >> std::hex >> val;
+          str.push_back(val);
+          i += 3;
+        }
+        else
+        {
+          // too few characters left, or a non-hexadecimal character
+          std::stringstream serr;
+          serr << "Illegal String Literal: \"" << s
+               << "\", must have two digits after \\x";
+          parseError(serr.str());
+        }
+      }
+      break;
+      default:
+      {
+        // either an octal escape sequence or an unrecognized escape
+        if (std::isdigit(byteAt(i)))
+        {
+          // octal escape sequences  TODO : revisit (issue #1251).
+          // Note that a lone \8 or \9 is accepted too and yields code point
+          // 8 or 9, since only the digits after the first are range-checked.
+          unsigned num = byteAt(i) - 48;
+          // a third digit is read only if the first is below 4, which keeps
+          // the decoded value below 256
+          const bool allowThird = num < 4;
+          size_t len = 1;
+          if (num < 8 && isOctalAt(i + 1))
+          {
+            num = num * 8 + (byteAt(i + 1) - 48);
+            len = 2;
+            if (allowThird && isOctalAt(i + 2))
+            {
+              num = num * 8 + (byteAt(i + 2) - 48);
+              len = 3;
+            }
+          }
+          str.push_back(num);
+          i += len;
+        }
+        // Otherwise the backslash is dropped and the character following it
+        // is added as an ordinary character by the next loop iteration.
+      }
+      break;
+    }
+  }
+  return str;
+}
+
+Expr Parser::mkStringConstant(const std::string& s)
+{
+  // SMT-LIB 2.6.1 input is handed to the solver as it is (with the second
+  // argument of mkString set to true)
+  const bool isStandard =
+      d_solver->getExprManager()->getOptions().getInputLanguage()
+      == language::input::LANG_SMTLIB_V2_6_1;
+  // for every other language the parser decodes the ad-hoc escapes itself
+  return isStandard
+             ? d_solver->mkString(s, true).getExpr()
+             : d_solver->mkString(processAdHocStringEsc(s)).getExpr();
+}
+
 } /* CVC4::parser namespace */
 } /* CVC4 namespace */
diff --git a/src/parser/parser.h b/src/parser/parser.h
index ecea4d3bd..d6c0e0e15 100644
--- a/src/parser/parser.h
+++ b/src/parser/parser.h
@@ -889,6 +889,28 @@ public:
         name, api::sortVectorToTypes(argTypes));
   }
   //------------------------ end operator overloading
+  /**
+   * Make string constant
+   *
+   * Builds the string constant denoted by the literal s. If the input
+   * language is SMT-LIB 2.6.1, s is passed to the solver directly; for any
+   * other input language the ad-hoc escape sequences in s are decoded by
+   * processAdHocStringEsc first.
+   */
+  Expr mkStringConstant(const std::string& s);
+
+ private:
+  /** ad-hoc string escaping
+   *
+   * Returns the (internal) vector of code points obtained by processing the
+   * escape sequences in string s. This is to support string inputs that do
+   * not comply with the SMT-LIB standard.
+   *
+   * Handled escape sequences are \n, \t, \v, \b, \r, \f, \a, \\, \xHH with
+   * two hexadecimal digits HH, and octal sequences \[c1]([c2]([c3])?)? with
+   * octal digits c1, c2, c3, where c3 is only read when c1 is at most 3.
+   */
+  std::vector<unsigned> processAdHocStringEsc(const std::string& s);
 };/* class Parser */
 
 }/* CVC4::parser namespace */
diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g
index ec1eae7da..69f21acb7 100644
--- a/src/parser/smt2/Smt2.g
+++ b/src/parser/smt2/Smt2.g
@@ -2091,7 +2091,7 @@ termAtomic[CVC4::api::Term& atomTerm]
     }
 
   // String constant
-  | str[s,false] { atomTerm = SOLVER->mkString(s, true); }
+  | str[s,false] { atomTerm = PARSER_STATE->mkStringConstant(s); }
 
   // NOTE: Theory constants go here
 
diff --git a/src/preprocessing/passes/synth_rew_rules.cpp b/src/preprocessing/passes/synth_rew_rules.cpp
index 7b8e61359..f1e9e39c5 100644
--- a/src/preprocessing/passes/synth_rew_rules.cpp
+++ b/src/preprocessing/passes/synth_rew_rules.cpp
@@ -169,7 +169,7 @@ PreprocessingPassResult SynthRewRulesPass::applyInternal(
       std::stringstream ssv;
       if (varCounter < 26)
       {
-        ssv << String::convertUnsignedIntToChar(varCounter + 32);
+        ssv << static_cast<char>(varCounter + 61);
       }
       else
       {
diff --git a/src/printer/cvc/cvc_printer.cpp b/src/printer/cvc/cvc_printer.cpp
index cad3c4640..1178c7299 100644
--- a/src/printer/cvc/cvc_printer.cpp
+++ b/src/printer/cvc/cvc_printer.cpp
@@ -160,6 +160,11 @@ void CvcPrinter::toStream(
       toStreamRational(out, n, false);
       break;
     }
+    case kind::CONST_STRING:
+    {
+      out << '"' << n.getConst<String>().toString() << '"';
+      break;
+    }
     case kind::TYPE_CONSTANT:
       switch(TypeConstant tc = n.getConst<TypeConstant>()) {
       case REAL_TYPE:
diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp
index 541827f89..6e4fcb63a 100644
--- a/src/printer/smt2/smt2_printer.cpp
+++ b/src/printer/smt2/smt2_printer.cpp
@@ -202,7 +202,7 @@ void Smt2Printer::toStream(std::ostream& out,
     }
 
     case kind::CONST_STRING: {
-      std::string s = n.getConst<String>().toString(true);
+      std::string s = n.getConst<String>().toString();
       out << '"';
       for(size_t i = 0; i < s.size(); ++i) {
         char c = s[i];
diff --git a/src/theory/evaluator.cpp b/src/theory/evaluator.cpp
index b827912d5..646f903f5 100644
--- a/src/theory/evaluator.cpp
+++ b/src/theory/evaluator.cpp
@@ -626,8 +626,7 @@ EvalResult Evaluator::evalInternal(
           const String& s = results[currNode[0]].d_str;
           if (s.size() == 1)
           {
-            results[currNode] = EvalResult(
-                Rational(String::convertUnsignedIntToCode(s.getVec()[0])));
+            results[currNode] = EvalResult(Rational(s.getVec()[0]));
           }
           else
           {
diff --git a/src/theory/quantifiers/sygus_sampler.cpp b/src/theory/quantifiers/sygus_sampler.cpp
index 28cfa69df..e9c858814 100644
--- a/src/theory/quantifiers/sygus_sampler.cpp
+++ b/src/theory/quantifiers/sygus_sampler.cpp
@@ -560,8 +560,7 @@ Node SygusSampler::getRandomValue(TypeNode tn)
       for (unsigned ch : alphas)
       {
         d_rstring_alphabet.push_back(ch);
-        Trace("sygus-sample-str-alpha")
-            << " \"" << String::convertUnsignedIntToChar(ch) << "\"";
+        Trace("sygus-sample-str-alpha") << " \\u" << ch;
       }
       Trace("sygus-sample-str-alpha") << std::endl;
     }
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp
index d5105a489..9a2091eac 100644
--- a/src/theory/strings/regexp_operation.cpp
+++ b/src/theory/strings/regexp_operation.cpp
@@ -739,9 +739,7 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
       }
       case kind::REGEXP_RANGE: {
         unsigned a = r[0].getConst<String>().front();
-        a = String::convertUnsignedIntToCode(a);
         unsigned b = r[1].getConst<String>().front();
-        b = String::convertUnsignedIntToCode(b);
         Assert(a < b);
         Assert(b < std::numeric_limits<unsigned>::max());
         for (unsigned c = a; c <= b; c++)
@@ -756,7 +754,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
           String s = st.getConst<String>();
           if(s.size() != 0) {
             unsigned sc = s.front();
-            sc = String::convertUnsignedIntToCode(sc);
             cset.insert(sc);
           }
         }
@@ -765,7 +762,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
           if(st[0].isConst()) {
             String s = st[0].getConst<String>();
             unsigned sc = s.front();
-            sc = String::convertUnsignedIntToCode(sc);
             cset.insert(sc);
           } else {
             vset.insert( st[0] );
@@ -887,13 +883,11 @@ void RegExpOpr::simplifyNRegExp( Node s, Node r, std::vector< Node > &new_nodes
       case kind::REGEXP_RANGE: {
         std::vector< Node > vec;
         unsigned a = r[0].getConst<String>().front();
-        a = String::convertUnsignedIntToCode(a);
         unsigned b = r[1].getConst<String>().front();
-        b = String::convertUnsignedIntToCode(b);
         for (unsigned c = a; c <= b; c++)
         {
           std::vector<unsigned> tmpVec;
-          tmpVec.push_back(String::convertCodeToUnsignedInt(c));
+          tmpVec.push_back(c);
           Node tmp = s.eqNode(nm->mkConst(String(tmpVec))).negate();
           vec.push_back( tmp );
         }
@@ -1522,7 +1516,7 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node >
              ++it)
         {
           std::vector<unsigned> cvec;
-          cvec.push_back(String::convertCodeToUnsignedInt(*it));
+          cvec.push_back(*it);
           String c(cvec);
           Trace("regexp-int-debug") << "Try character " << c << " ... " << std::endl;
           Node r1l = derivativeSingle(r1, c);
diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp
index 716634d5f..b0940b7e1 100644
--- a/src/theory/strings/sequences_rewriter.cpp
+++ b/src/theory/strings/sequences_rewriter.cpp
@@ -1421,11 +1421,8 @@ bool SequencesRewriter::testConstStringInRegExp(CVC4::String& s,
       if (s.size() == index_start + 1)
       {
         unsigned a = r[0].getConst<String>().front();
-        a = String::convertUnsignedIntToCode(a);
         unsigned b = r[1].getConst<String>().front();
-        b = String::convertUnsignedIntToCode(b);
         unsigned c = s.back();
-        c = String::convertUnsignedIntToCode(c);
         return (a <= c && c <= b);
       }
       else
diff --git a/src/theory/strings/strings_rewriter.cpp b/src/theory/strings/strings_rewriter.cpp
index 75dfe7432..c7676d049 100644
--- a/src/theory/strings/strings_rewriter.cpp
+++ b/src/theory/strings/strings_rewriter.cpp
@@ -93,7 +93,7 @@ Node StringsRewriter::rewriteStrConvert(Node node)
     std::vector<unsigned> nvec = node[0].getConst<String>().getVec();
     for (unsigned i = 0, nvsize = nvec.size(); i < nvsize; i++)
     {
-      unsigned newChar = String::convertUnsignedIntToCode(nvec[i]);
+      unsigned newChar = nvec[i];
       // transform it
       // upper 65 ... 90
       // lower 97 ... 122
@@ -111,7 +111,6 @@ Node StringsRewriter::rewriteStrConvert(Node node)
           newChar = newChar + 32;
         }
       }
-      newChar = String::convertCodeToUnsignedInt(newChar);
       nvec[i] = newChar;
     }
     Node retNode = nm->mkConst(String(nvec));
@@ -231,7 +230,7 @@ Node StringsRewriter::rewriteStringToCode(Node n)
     {
       std::vector<unsigned> vec = s.getVec();
       Assert(vec.size() == 1);
-      ret = nm->mkConst(Rational(String::convertUnsignedIntToCode(vec[0])));
+      ret = nm->mkConst(Rational(vec[0]));
     }
     else
     {
diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp
index 16183abdd..a81c96318 100644
--- a/src/theory/strings/theory_strings.cpp
+++ b/src/theory/strings/theory_strings.cpp
@@ -382,7 +382,7 @@ bool TheoryStrings::collectModelInfoType(
                   ctv.getConst<Rational>().getNumerator().toUnsignedInt();
               Trace("strings-model") << "(code: " << cvalue << ") ";
               std::vector<unsigned> vec;
-              vec.push_back(String::convertCodeToUnsignedInt(cvalue));
+              vec.push_back(cvalue);
               Node mv = nm->mkConst(String(vec));
               pure_eq_assign[eqc] = mv;
               m->getEqualityEngine()->addTerm(mv);
@@ -1099,13 +1099,14 @@ void TheoryStrings::registerTerm(Node n, int effort)
   else if (n.getKind() == STRING_TO_CODE)
   {
     d_has_str_code = true;
-    // ite( str.len(s)==1, 0 <= str.code(s) < num_codes, str.code(s)=-1 )
+    // ite( str.len(s)==1, 0 <= str.code(s) < |A|, str.code(s)=-1 )
     Node code_len = utils::mkNLength(n[0]).eqNode(d_one);
     Node code_eq_neg1 = n.eqNode(d_neg_one);
     Node code_range = nm->mkNode(
         AND,
         nm->mkNode(GEQ, n, d_zero),
-        nm->mkNode(LT, n, nm->mkConst(Rational(CVC4::String::num_codes()))));
+        nm->mkNode(
+            LT, n, nm->mkConst(Rational(utils::getAlphabetCardinality()))));
     regTermLem = nm->mkNode(ITE, code_len, code_range, code_eq_neg1);
   }
   else if (n.getKind() == STRING_STRIDOF)
diff --git a/src/theory/strings/theory_strings_type_rules.h b/src/theory/strings/theory_strings_type_rules.h
index 7ef31a92a..93a32f26e 100644
--- a/src/theory/strings/theory_strings_type_rules.h
+++ b/src/theory/strings/theory_strings_type_rules.h
@@ -299,7 +299,7 @@ public:
           throw TypeCheckingExceptionPrivate(n, "expecting a single constant string term in regexp range");
         }
         unsigned ci = (*it).getConst<String>().front();
-        ch[i] = String::convertUnsignedIntToCode(ci);
+        ch[i] = ci;
         ++it;
       }
       if(ch[0] > ch[1]) {
diff --git a/src/theory/strings/type_enumerator.cpp b/src/theory/strings/type_enumerator.cpp
index 12cf899b4..7352ae5de 100644
--- a/src/theory/strings/type_enumerator.cpp
+++ b/src/theory/strings/type_enumerator.cpp
@@ -21,6 +21,44 @@ namespace CVC4 {
 namespace theory {
 namespace strings {
 
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+                               uint32_t cardinality)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  // an alphabet that does not contain all of the printable characters is
+  // left as it is
+  if (cardinality < 255)
+  {
+    return nm->mkConst(String(vec));
+  }
+  std::vector<unsigned> codePoints;
+  for (const unsigned index : vec)
+  {
+    Assert(index < cardinality);
+    unsigned mapped;
+    if (index <= 61)
+    {
+      // indices 0..61 become the 62 printable characters \u{65}-\u{126}:
+      // 'A', 'B', 'C', ...
+      mapped = index + 65;
+    }
+    else if (index <= 94)
+    {
+      // indices 62..94 become the 33 printable characters \u{32}-\u{64}:
+      // ' ', '!', '"', ...
+      mapped = index - 30;
+    }
+    else
+    {
+      // everything else starts at \u{127} and wraps around at the
+      // cardinality, which puts the first 32 non-printable characters last
+      mapped = (index + 32) % cardinality;
+    }
+    codePoints.push_back(mapped);
+  }
+  return nm->mkConst(String(codePoints));
+}
+
 WordIter::WordIter(uint32_t startLength) : d_hasEndLength(false), d_endLength(0)
 {
   for (uint32_t i = 0; i < startLength; i++)
@@ -117,7 +155,7 @@ bool StringEnumLen::increment()
 
 void StringEnumLen::mkCurr()
 {
-  d_curr = NodeManager::currentNM()->mkConst(String(d_witer->getData()));
+  d_curr = makeStandardModelConstant(d_witer->getData(), d_cardinality);
 }
 
 StringEnumerator::StringEnumerator(TypeNode type, TypeEnumeratorProperties* tep)
diff --git a/src/theory/strings/type_enumerator.h b/src/theory/strings/type_enumerator.h
index 2061628a5..b379ce5c3 100644
--- a/src/theory/strings/type_enumerator.h
+++ b/src/theory/strings/type_enumerator.h
@@ -28,6 +28,26 @@ namespace theory {
 namespace strings {
 
 /**
+ * Make standard model constant
+ *
+ * In our string representation, a string is a vector of unsigned integers
+ * giving the code points of its characters.
+ *
+ * To make models user-friendly, this function maps the value 0 in vec to
+ * code point 65 ("A"), the value 1 to "B", and so on through the alphabet.
+ * In particular, if we have a set of string variables that are distinct but
+ * otherwise unconstrained, then the model may assign them "A", "B", "C", ...
+ * The mapping is only applied when cardinality is at least 255.
+ *
+ * @param vec The values chosen for the characters of the string in a model,
+ * @param cardinality The cardinality of the alphabet,
+ * @return The string constant whose code points are obtained from vec by the
+ * standard model construction described above.
+ */
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+                               uint32_t cardinality);
+
+/**
  * Generic iteration over vectors of indices of a given start/end length.
  */
 class WordIter
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp
index 00066edb6..36ba7182b 100644
--- a/src/util/regexp.cpp
+++ b/src/util/regexp.cpp
@@ -32,38 +32,12 @@ namespace CVC4 {
 
 static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
 
-unsigned String::convertCharToUnsignedInt(unsigned char c)
-{
-  return convertCodeToUnsignedInt(static_cast<unsigned>(c));
-}
-unsigned char String::convertUnsignedIntToChar(unsigned i)
-{
-  Assert(i < num_codes());
-  return static_cast<unsigned char>(convertUnsignedIntToCode(i));
-}
-bool String::isPrintable(unsigned i)
-{
-  Assert(i < num_codes());
-  unsigned char c = convertUnsignedIntToChar(i);
-  return (c >= ' ' && c <= '~');
-}
-unsigned String::convertCodeToUnsignedInt(unsigned c)
-{
-  Assert(c < num_codes());
-  return (c < start_code() ? c + num_codes() : c) - start_code();
-}
-unsigned String::convertUnsignedIntToCode(unsigned i)
-{
-  Assert(i < num_codes());
-  return (i + start_code()) % num_codes();
-}
-
 String::String(const std::vector<unsigned> &s) : d_str(s)
 {
 #ifdef CVC4_ASSERTIONS
   for (unsigned u : d_str)
   {
-    Assert(convertUnsignedIntToCode(u) < num_codes());
+    Assert(u < num_codes());
   }
 #endif
 }
@@ -74,8 +48,8 @@ int String::cmp(const String &y) const {
   }
   for (unsigned int i = 0; i < size(); ++i) {
     if (d_str[i] != y.d_str[i]) {
-      unsigned cp = convertUnsignedIntToCode(d_str[i]);
-      unsigned cpy = convertUnsignedIntToCode(y.d_str[i]);
+      unsigned cp = d_str[i];
+      unsigned cpy = y.d_str[i];
       return cp < cpy ? -1 : 1;
     }
   }
@@ -122,107 +96,143 @@ bool String::rstrncmp(const String& y, std::size_t n) const
   return true;
 }
 
-std::vector<unsigned> String::toInternal(const std::string &s,
-                                         bool useEscSequences) {
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+  // Characters with code 32..127 are stored as they are; anything else must
+  // be written as an escape sequence. NOTE(review): 127 (DEL) passes this
+  // check although isPrintable() stops at 126 -- confirm this is intended.
+  if (32 <= ch && ch <= 127)
+  {
+    str.push_back(static_cast<unsigned>(ch));
+    return;
+  }
+  std::stringstream msg;
+  msg << "Illegal string character: \"" << ch
+      << "\", must use escape sequence";
+  throw CVC4::Exception(msg.str());
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+                                         bool useEscSequences)
+{
   std::vector<unsigned> str;
   unsigned i = 0;
-  while (i < s.size()) {
-    if (s[i] == '\\' && useEscSequences) {
-      i++;
-      if (i < s.size()) {
-        switch (s[i]) {
-          case 'n': {
-            str.push_back(convertCharToUnsignedInt('\n'));
-            i++;
-          } break;
-          case 't': {
-            str.push_back(convertCharToUnsignedInt('\t'));
-            i++;
-          } break;
-          case 'v': {
-            str.push_back(convertCharToUnsignedInt('\v'));
-            i++;
-          } break;
-          case 'b': {
-            str.push_back(convertCharToUnsignedInt('\b'));
-            i++;
-          } break;
-          case 'r': {
-            str.push_back(convertCharToUnsignedInt('\r'));
-            i++;
-          } break;
-          case 'f': {
-            str.push_back(convertCharToUnsignedInt('\f'));
-            i++;
-          } break;
-          case 'a': {
-            str.push_back(convertCharToUnsignedInt('\a'));
-            i++;
-          } break;
-          case '\\': {
-            str.push_back(convertCharToUnsignedInt('\\'));
-            i++;
-          } break;
-          case 'x': {
-            if (i + 2 < s.size()) {
-              if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) {
-                str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 +
-                                                       hexToDec(s[i + 2])));
-                i += 3;
-              } else {
-                throw CVC4::Exception("Illegal String Literal: \"" + s + "\"");
-              }
-            } else {
-              throw CVC4::Exception("Illegal String Literal: \"" + s +
-                                    "\", must have two digits after \\x");
-            }
-          } break;
-          default: {
-            if (isdigit(s[i])) {
-              // octal escape sequences  TODO : revisit (issue #1251).
-              int num = (int)s[i] - (int)'0';
-              bool flag = num < 4;
-              if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) &&
-                  s[i + 1] < '8') {
-                num = num * 8 + (int)s[i + 1] - (int)'0';
-                if (flag && i + 2 < s.size() && isdigit(s[i + 2]) &&
-                    s[i + 2] < '8') {
-                  num = num * 8 + (int)s[i + 2] - (int)'0';
-                  str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                  i += 3;
-                } else {
-                  str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                  i += 2;
-                }
-              } else {
-                str.push_back(convertCharToUnsignedInt((unsigned char)num));
-                i++;
-              }
-            } else if ((unsigned)s[i] > 127) {
-              throw CVC4::Exception("Illegal String Literal: \"" + s +
-                                    "\", must use escaped sequence");
-            } else {
-              str.push_back(convertCharToUnsignedInt(s[i]));
-              i++;
-            }
+  while (i < s.size())
+  {
+    // get the current character
+    char si = s[i];
+    if (si != '\\' || !useEscSequences)
+    {
+      addCharToInternal(si, str);
+      ++i;
+      continue;
+    }
+    // the vector of characters, in case we fail to read an escape sequence
+    std::vector<unsigned> nonEscCache;
+    // process the '\'
+    addCharToInternal(si, nonEscCache);
+    ++i;
+    // are we an escape sequence?
+    bool isEscapeSequence = true;
+    // the string corresponding to the hexadecimal code point
+    std::stringstream hexString;
+    // is the slash followed by a 'u'? Could be last character.
+    if (i >= s.size() || s[i] != 'u')
+    {
+      isEscapeSequence = false;
+    }
+    else
+    {
+      // process the 'u'
+      addCharToInternal(s[i], nonEscCache);
+      ++i;
+      bool isStart = true;
+      bool isEnd = false;
+      bool hasBrace = false;
+      while (i < s.size())
+      {
+        // add the next character
+        si = s[i];
+        if (isStart)
+        {
+          isStart = false;
+          // possibly read '{'
+          if (si == '{')
+          {
+            hasBrace = true;
+            addCharToInternal(si, nonEscCache);
+            ++i;
+            continue;
           }
         }
-      } else {
-        throw CVC4::Exception("should be handled by lexer: \"" + s + "\"");
-        // str.push_back( convertCharToUnsignedInt('\\') );
+        else if (si == '}')
+        {
+          // can only end if we had an open brace and read at least one digit
+          isEscapeSequence = hasBrace && !hexString.str().empty();
+          isEnd = true;
+          addCharToInternal(si, nonEscCache);
+          ++i;
+          break;
+        }
+        // must be a hex digit at this point
+        if (!isHexDigit(static_cast<unsigned>(si)))
+        {
+          isEscapeSequence = false;
+          break;
+        }
+        hexString << si;
+        addCharToInternal(si, nonEscCache);
+        ++i;
+        if (!hasBrace && hexString.str().size() == 4)
+        {
+          // will be finished reading \ u d_3 d_2 d_1 d_0 with no braces
+          isEnd = true;
+          break;
+        }
+        else if (hasBrace && hexString.str().size() > 5)
+        {
+          // too many digits enclosed in brace, not an escape sequence
+          isEscapeSequence = false;
+          break;
+        }
+      }
+      if (!isEnd)
+      {
+        // if we were interrupted before ending, then this is not a valid
+        // escape sequence
+        isEscapeSequence = false;
+      }
+    }
+    if (isEscapeSequence)
+    {
+      Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+      // Otherwise, we add the escaped character.
+      // This is guaranteed not to overflow due to the length of hexString.
+      uint32_t val;
+      hexString >> std::hex >> val;
+      if (val >= num_codes())
+      {
+        // Failed due to being out of range: valid code points are strictly
+        // less than num_codes(). This happens for \ u { d_4 d_3 d_2 d_1 d_0 }
+        // where d_4 is a hexadecimal digit not in the range [0-2].
+        isEscapeSequence = false;
+      }
+      else
+      {
+        str.push_back(val);
       }
-    } else if ((unsigned)s[i] > 127 && useEscSequences) {
-      throw CVC4::Exception("Illegal String Literal: \"" + s +
-                            "\", must use escaped sequence");
-    } else {
-      str.push_back(convertCharToUnsignedInt(s[i]));
-      i++;
+    }
+    // if we did not successfully parse an escape sequence, we add back all
+    // characters that we cached
+    if (!isEscapeSequence)
+    {
+      str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
     }
   }
 #ifdef CVC4_ASSERTIONS
   for (unsigned u : str)
   {
-    Assert(convertUnsignedIntToCode(u) < num_codes());
+    Assert(u < num_codes());
   }
 #endif
   return str;
@@ -265,62 +275,23 @@ std::size_t String::roverlap(const String &y) const {
 }
 
 std::string String::toString(bool useEscSequences) const {
-  std::string str;
+  std::stringstream str;
   for (unsigned int i = 0; i < size(); ++i) {
-    unsigned char c = convertUnsignedIntToChar(d_str[i]);
-    if (!useEscSequences) {
-      str += c;
-    } else if (isprint(c)) {
-      if (c == '\\') {
-        str += "\\\\";
-      }
-      // else if(c == '\"') {
-      //  str += "\\\"";
-      //}
-      else {
-        str += c;
-      }
-    } else {
-      std::string s;
-      switch (c) {
-        case '\a':
-          s = "\\a";
-          break;
-        case '\b':
-          s = "\\b";
-          break;
-        case '\t':
-          s = "\\t";
-          break;
-        case '\r':
-          s = "\\r";
-          break;
-        case '\v':
-          s = "\\v";
-          break;
-        case '\f':
-          s = "\\f";
-          break;
-        case '\n':
-          s = "\\n";
-          break;
-        case '\e':
-          s = "\\e";
-          break;
-        default: {
-          std::stringstream ss;
-          ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c);
-          std::string t = ss.str();
-          t = t.substr(t.size() - 2, 2);
-          s = "\\x" + t;
-          // std::string s2 = static_cast<std::ostringstream*>(
-          // &(std::ostringstream() << (int)c) )->str();
-        }
-      }
-      str += s;
+    // we always print backslash as a code point so that it cannot
+    // be interpreted as specifying part of a code point, e.g. the string
+    // '\' + 'u' + '0' of length three.
+    if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+    {
+      str << static_cast<char>(d_str[i]);
+    }
+    else
+    {
+      std::stringstream ss;
+      ss << std::hex << d_str[i];
+      str << "\\u{" << ss.str() << "}";
     }
   }
-  return str;
+  return str.str();
 }
 
 bool String::isLeq(const String &y) const
@@ -331,8 +302,8 @@ bool String::isLeq(const String &y) const
     {
       return false;
     }
-    unsigned ci = convertUnsignedIntToCode(d_str[i]);
-    unsigned cyi = convertUnsignedIntToCode(y.d_str[i]);
+    unsigned ci = d_str[i];
+    unsigned cyi = y.d_str[i];
     if (ci > cyi)
     {
       return false;
@@ -484,8 +455,21 @@ bool String::isNumber() const {
 
 bool String::isDigit(unsigned character)
 {
-  unsigned char c = convertUnsignedIntToChar(character);
-  return c >= '0' && c <= '9';
+  // code points 48 to 57 are the characters '0' to '9'
+  return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+  // a decimal digit, or a letter in 'A'-'F' (65-70) or 'a'-'f' (97-102)
+  const bool isUpper = character >= 65 && character <= 70;
+  return isDigit(character) || isUpper || (character >= 97 && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+  // printable means at least U+0020 (' ') and at most U+007E ('~')
+  return !(character < 0x20 || character > 0x7E);
 }
 
 size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
@@ -497,17 +481,6 @@ Rational String::toNumber() const
   return Rational(toString());
 }
 
-unsigned char String::hexToDec(unsigned char c) {
-  if (c >= '0' && c <= '9') {
-    return c - '0';
-  } else if (c >= 'a' && c <= 'f') {
-    return c - 'a' + 10;
-  } else {
-    Assert(c >= 'A' && c <= 'F');
-    return c - 'A' + 10;
-  }
-}
-
 std::ostream &operator<<(std::ostream &os, const String &s) {
   return os << "\"" << s.toString(true) << "\"";
 }
diff --git a/src/util/regexp.h b/src/util/regexp.h
index 731736f72..56fb969a3 100644
--- a/src/util/regexp.h
+++ b/src/util/regexp.h
@@ -37,60 +37,44 @@ namespace CVC4 {
 class CVC4_PUBLIC String {
  public:
   /**
-   * The start ASCII code. In our string representation below, we represent
-   * characters using a vector d_str of unsigned integers. We refer to this as
-   * the "internal representation" for the string.
-   *
-   * We make unsigned integer 0 correspond to the 65th character ("A") in the
-   * ASCII alphabet to make models intuitive. In particular, say if we have
-   * a set of string variables that are distinct but otherwise unconstrained,
-   * then the model may assign them "A", "B", "C", ...
-   */
-  static inline unsigned start_code() { return 65; }
-  /**
    * This is the cardinality of the alphabet that is representable by this
    * class. Notice that this must be greater than or equal to the cardinality
    * of the alphabet that the string theory reasons about.
    *
    * This must be strictly less than std::numeric_limits<unsigned>::max().
+   *
+   * As per the SMT-LIB standard for strings, we support the first 3 planes of
+   * Unicode characters, where 196608 = 3*16^4.
    */
-  static inline unsigned num_codes() { return 256; }
-  /**
-   * Convert unsigned char to the unsigned used in the internal representation
-   * in d_str below.
-   */
-  static unsigned convertCharToUnsignedInt(unsigned char c);
-  /** Convert the internal unsigned to a unsigned char. */
-  static unsigned char convertUnsignedIntToChar(unsigned i);
-  /** Does the internal unsigned correspond to a printable character? */
-  static bool isPrintable(unsigned i);
-  /** get the internal unsigned for ASCII code c. */
-  static unsigned convertCodeToUnsignedInt(unsigned c);
-  /** get the ASCII code number that internal unsigned i corresponds to. */
-  static unsigned convertUnsignedIntToCode(unsigned i);
-
+  static inline unsigned num_codes() { return 196608; }
   /** constructors for String
-  *
-  * Internally, a CVC4::String is represented by a vector of unsigned
-  * integers (d_str), where the correspondence between C++ characters
-  * to and from unsigned integers is determined by
-  * by convertCharToUnsignedInt and convertUnsignedIntToChar.
-  *
-  * If useEscSequences is true, then the escape sequences in the input
-  * are converted to the corresponding character. This constructor may
-  * throw an exception if the input contains unrecognized escape sequences.
-  * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\,
-  * \x[N] where N is a hexidecimal, and octal escape sequences of the
-  * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7.
-  *
-  * If useEscSequences is false, then the characters of the constructed
-  * CVC4::String correspond one-to-one with the input string.
-  */
+   *
+   * Internally, a CVC4::String is represented by a vector of unsigned
+   * integers (d_str) representing the code points of the characters.
+   *
+   * To build a string from a C++ string, we may process escape sequences
+   * according to the SMT-LIB standard. In particular, if useEscSequences is
+   * true, we convert unicode escape sequences:
+   *  \u d_3 d_2 d_1 d_0
+   *  \u{d_0}
+   *  \u{d_1 d_0}
+   *  \u{d_2 d_1 d_0}
+   *  \u{d_3 d_2 d_1 d_0}
+   *  \u{d_4 d_3 d_2 d_1 d_0}
+   * where d_0 ... d_4 are hexadecimal digits, to the appropriate character.
+   *
+   * If useEscSequences is false, then the characters of the constructed
+   * CVC4::String correspond one-to-one with the input string.
+   */
   String() = default;
   explicit String(const std::string& s, bool useEscSequences = false)
-      : d_str(toInternal(s, useEscSequences)) {}
+      : d_str(toInternal(s, useEscSequences))
+  {
+  }
   explicit String(const char* s, bool useEscSequences = false)
-      : d_str(toInternal(std::string(s), useEscSequences)) {}
+      : d_str(toInternal(std::string(s), useEscSequences))
+  {
+  }
   explicit String(const std::vector<unsigned>& s);
 
   String& operator=(const String& y) {
@@ -123,20 +107,16 @@ class CVC4_PUBLIC String {
   bool rstrncmp(const String& y, std::size_t n) const;
 
   /* toString
-  * Converts this string to a std::string.
-  *
-  * If useEscSequences is true, then unprintable characters
-  * are converted to escape sequences. The escape sequences
-  * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way.
-  * For all other unprintable characters, we print \x[N] where
-  * [N] is the 2 digit hexidecimal corresponding to value of
-  * the character.
-  *
-  * If useEscSequences is false, the returned std::string's characters
-  * map one-to-one with the characters in this string.
-  * Notice that for all std::string s, we have that
-  *    CVC4::String( s ).toString() = s.
-  */
+   * Converts this string to a std::string.
+   *
+   * Unprintable characters, as well as the backslash character, are always
+   * converted to unicode escape sequences as described above.
+   *
+   * If useEscSequences is false, all other printable characters are printed
+   * as characters; if it is true, every character is printed as an escape
+   * sequence. Notice that for all std::string s having only printable
+   * characters other than backslash, CVC4::String( s ).toString() = s.
+   */
   std::string toString(bool useEscSequences = false) const;
   /** is this the empty string? */
   bool empty() const { return d_str.empty(); }
@@ -221,16 +201,32 @@ class CVC4_PUBLIC String {
   bool isNumber() const;
   /** Returns the corresponding rational for the text of this string. */
   Rational toNumber() const;
-  /** get the internal unsigned representation of this string */
+  /** Get the unsigned representation (code points) of this string */
   const std::vector<unsigned>& getVec() const { return d_str; }
-  /** get the internal unsigned value of the first character in this string */
+  /**
+   * Get the unsigned (code point) value of the first character in this string
+   */
   unsigned front() const;
-  /** get the internal unsigned value of the last character in this string */
+  /**
+   * Get the unsigned (code point) value of the last character in this string
+   */
   unsigned back() const;
   /** is the unsigned a digit?
-  * The input should be the same type as the element type of d_str
-  */
+   *
+   * This is true for code points between 48 ('0') and 57 ('9').
+   */
   static bool isDigit(unsigned character);
+  /** is the unsigned a hexadecimal digit?
+   *
+   * This is true for code points between 48 ('0') and 57 ('9'), between
+   * 65 ('A') and 70 ('F'), and between 97 ('a') and 102 ('f').
+   */
+  static bool isHexDigit(unsigned character);
+  /** is the unsigned a printable code point?
+   *
+   * This is true for Unicode 32 (' ') to 126 ('~').
+   */
+  static bool isPrintable(unsigned character);
 
   /**
    * Returns the maximum length of string representable by this class.
@@ -238,11 +234,19 @@ class CVC4_PUBLIC String {
    */
   static size_t maxSize();
  private:
-  // guarded
-  static unsigned char hexToDec(unsigned char c);
-
+  /**
+   * Helper for toInternal: add character ch to vector vec, storing a string in
+   * internal format. This throws an error if ch is not a printable character,
+   * since non-printable characters must be escaped in SMT-LIB.
+   */
+  static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+  /**
+   * Convert the string s to the internal format (vector of code points).
+   * The argument useEscSequences is whether to process unicode escape
+   * sequences.
+   */
   static std::vector<unsigned> toInternal(const std::string& s,
-                                          bool useEscSequences = true);
+                                          bool useEscSequences);
 
   /**
    * Returns a negative number if *this < y, 0 if *this and y are equal and a
diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt
index 8fab16b44..8382e40fc 100644
--- a/test/regress/CMakeLists.txt
+++ b/test/regress/CMakeLists.txt
@@ -922,6 +922,7 @@ set(regress_0_tests
   regress0/strings/escchar.smt2
   regress0/strings/escchar_25.smt2
   regress0/strings/from_code.smt2
+  regress0/strings/gen-esc-seq.smt2
   regress0/strings/hconst-092618.smt2
   regress0/strings/idof-rewrites.smt2
   regress0/strings/idof-sem.smt2
@@ -939,6 +940,8 @@ set(regress_0_tests
   regress0/strings/leadingzero001.smt2
   regress0/strings/loop001.smt2
   regress0/strings/model001.smt2
+  regress0/strings/model-code-point.smt2
+  regress0/strings/model-friendly.smt2
   regress0/strings/ncontrib-rewrites.smt2
   regress0/strings/norn-31.smt2
   regress0/strings/norn-simp-rew.smt2
@@ -967,6 +970,7 @@ set(regress_0_tests
   regress0/strings/tolower-rrs.smt2
   regress0/strings/tolower-simple.smt2
   regress0/strings/type001.smt2
+  regress0/strings/unicode-esc.smt2
   regress0/strings/unsound-0908.smt2
   regress0/strings/unsound-repl-rewrite.smt2
   regress0/sygus/General_plus10.sy
diff --git a/test/regress/regress0/strings/gen-esc-seq.smt2 b/test/regress/regress0/strings/gen-esc-seq.smt2
new file mode 100644
index 000000000..59f66046f
--- /dev/null
+++ b/test/regress/regress0/strings/gen-esc-seq.smt2
@@ -0,0 +1,9 @@
+; COMMAND-LINE: --produce-models --lang=smt2.6.1
+; EXPECT: sat
+; EXPECT: ((x "\u{5c}u1000"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-const x String)
+(assert (= x (str.++ "\u" "1000")))
+(check-sat)
+(get-value (x))
diff --git a/test/regress/regress0/strings/model-code-point.smt2 b/test/regress/regress0/strings/model-code-point.smt2
new file mode 100644
index 000000000..1200ae704
--- /dev/null
+++ b/test/regress/regress0/strings/model-code-point.smt2
@@ -0,0 +1,13 @@
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "\u{a}"))
+; EXPECT: ((y "\u{7f}"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(declare-fun y () String)
+(assert (= (str.to_code x) 10))
+(assert (= (str.to_code y) 127))
+(check-sat)
+(get-value (x))
+(get-value (y))
diff --git a/test/regress/regress0/strings/model-friendly.smt2 b/test/regress/regress0/strings/model-friendly.smt2
new file mode 100644
index 000000000..985ffaa62
--- /dev/null
+++ b/test/regress/regress0/strings/model-friendly.smt2
@@ -0,0 +1,9 @@
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "AAAAA"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(assert (= (str.len x) 5))
+(check-sat)
+(get-value (x))
diff --git a/test/regress/regress0/strings/unicode-esc.smt2 b/test/regress/regress0/strings/unicode-esc.smt2
new file mode 100644
index 000000000..01f5f30ab
--- /dev/null
+++ b/test/regress/regress0/strings/unicode-esc.smt2
@@ -0,0 +1,30 @@
+; COMMAND-LINE: --strings-exp --lang=smt2.6.1
+; EXPECT: sat
+(set-logic ALL)
+
+(assert (= "\u{14}" "\u0014"))
+(assert (= "\u{00}" "\u{0}"))
+(assert (= "\u0000" "\u{0}"))
+(assert (= (str.len "\u1234") 1))
+(assert (= (str.len "\u{1}") 1))
+(assert (= (str.len "\u{99}") 1))
+(assert (= (str.len "\u{779}") 1))
+(assert (= (str.len "\u{0779}") 1))
+(assert (= (str.len "\u{01779}") 1))
+(assert (= (str.len "\u{001779}") 10))
+(assert (= (str.len "\u{0vv79}") 9))
+(assert (= (str.len "\u{11\u1234}") 7))
+(assert (= (str.len "\u12345") 2))
+(assert (= (str.len "\uu") 3))
+(assert (= (str.len "\u{123}\u{567}") 2))
+(assert (= (str.len "\u{0017") 7))
+(assert (= (str.len "\\u00178") 3))
+(assert (= (str.len "2\u{}") 5))
+(assert (= (str.len "\uaaaa") 1))
+(assert (= (str.len "\uAAAA") 1))
+(assert (= (str.len "\u{0AbC}") 1))
+(assert (= (str.len "\u{E}") 1))
+(assert (= (str.len "\u{44444}") 9))
+(assert (= (str.len "\u") 2))
+
+(check-sat)
diff --git a/test/unit/api/solver_black.h b/test/unit/api/solver_black.h
index 27f5aca12..0eefde700 100644
--- a/test/unit/api/solver_black.h
+++ b/test/unit/api/solver_black.h
@@ -556,9 +556,9 @@ void SolverBlack::testMkString()
   TS_ASSERT_THROWS_NOTHING(d_solver->mkString(""));
   TS_ASSERT_THROWS_NOTHING(d_solver->mkString("asdfasdf"));
   TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf").toString(),
-                   "\"asdf\\\\nasdf\"");
-  TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf", true).toString(),
-                   "\"asdf\\nasdf\"");
+                   "\"asdf\\u{5c}nasdf\"");
+  TS_ASSERT_EQUALS(d_solver->mkString("asdf\\u{005c}nasdf", true).toString(),
+                    "\"asdf\\u{5c}nasdf\"");
 }
 
 void SolverBlack::testMkTerm()
author	Andrew Reynolds <andrew.j.reynolds@gmail.com>	2020-03-27 09:01:38 -0500
committer	GitHub <noreply@github.com>	2020-03-27 09:01:38 -0500
commit	27ac2ce712b0bcfdef83e2d44dd210f667ab7959 (patch)
tree	a64febad63c37b641eaaacf4ad79007888aa43f9
parent	fa2ba76ef83497108942ebb91cdb07fdfeed505b (diff)