summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/parser/cvc/Cvc.g2
-rw-r--r--src/parser/parser.cpp148
-rw-r--r--src/parser/parser.h22
-rw-r--r--src/parser/smt2/Smt2.g2
-rw-r--r--src/preprocessing/passes/synth_rew_rules.cpp2
-rw-r--r--src/printer/cvc/cvc_printer.cpp5
-rw-r--r--src/printer/smt2/smt2_printer.cpp2
-rw-r--r--src/theory/evaluator.cpp3
-rw-r--r--src/theory/quantifiers/sygus_sampler.cpp3
-rw-r--r--src/theory/strings/regexp_operation.cpp10
-rw-r--r--src/theory/strings/sequences_rewriter.cpp3
-rw-r--r--src/theory/strings/strings_rewriter.cpp5
-rw-r--r--src/theory/strings/theory_strings.cpp7
-rw-r--r--src/theory/strings/theory_strings_type_rules.h2
-rw-r--r--src/theory/strings/type_enumerator.cpp40
-rw-r--r--src/theory/strings/type_enumerator.h20
-rw-r--r--src/util/regexp.cpp349
-rw-r--r--src/util/regexp.h138
-rw-r--r--test/regress/CMakeLists.txt4
-rw-r--r--test/regress/regress0/strings/gen-esc-seq.smt29
-rw-r--r--test/regress/regress0/strings/model-code-point.smt213
-rw-r--r--test/regress/regress0/strings/model-friendly.smt29
-rw-r--r--test/regress/regress0/strings/unicode-esc.smt230
-rw-r--r--test/unit/api/solver_black.h6
24 files changed, 549 insertions, 285 deletions
diff --git a/src/parser/cvc/Cvc.g b/src/parser/cvc/Cvc.g
index 82c0581ce..033389610 100644
--- a/src/parser/cvc/Cvc.g
+++ b/src/parser/cvc/Cvc.g
@@ -2083,7 +2083,7 @@ stringTerm[CVC4::api::Term& f]
/* string literal */
| str[s]
- { f = SOLVER->mkString(s, true); }
+ { f = PARSER_STATE->mkStringConstant(s); }
| setsTerm[f]
;
diff --git a/src/parser/parser.cpp b/src/parser/parser.cpp
index b36f36a93..5dca92370 100644
--- a/src/parser/parser.cpp
+++ b/src/parser/parser.cpp
@@ -757,5 +757,153 @@ void Parser::attributeNotSupported(const std::string& attr) {
}
}
+std::vector<unsigned> Parser::processAdHocStringEsc(const std::string& s)
+{
+ std::vector<unsigned> str;
+ unsigned i = 0;
+ while (i < s.size())
+ {
+ // get the current character
+ if (s[i] != '\\')
+ {
+ // don't worry about printable here
+ str.push_back(static_cast<unsigned>(s[i]));
+ ++i;
+ continue;
+ }
+ // slash is always escaped
+ ++i;
+ if (i >= s.size())
+ {
+ // slash cannot be the last character if we are parsing escape sequences
+ std::stringstream serr;
+ serr << "Escape sequence at the end of string: \"" << s
+ << "\" should be handled by lexer";
+ parseError(serr.str());
+ }
+ switch (s[i])
+ {
+ case 'n':
+ {
+ str.push_back(static_cast<unsigned>('\n'));
+ i++;
+ }
+ break;
+ case 't':
+ {
+ str.push_back(static_cast<unsigned>('\t'));
+ i++;
+ }
+ break;
+ case 'v':
+ {
+ str.push_back(static_cast<unsigned>('\v'));
+ i++;
+ }
+ break;
+ case 'b':
+ {
+ str.push_back(static_cast<unsigned>('\b'));
+ i++;
+ }
+ break;
+ case 'r':
+ {
+ str.push_back(static_cast<unsigned>('\r'));
+ i++;
+ }
+ break;
+ case 'f':
+ {
+ str.push_back(static_cast<unsigned>('\f'));
+ i++;
+ }
+ break;
+ case 'a':
+ {
+ str.push_back(static_cast<unsigned>('\a'));
+ i++;
+ }
+ break;
+ case '\\':
+ {
+ str.push_back(static_cast<unsigned>('\\'));
+ i++;
+ }
+ break;
+ case 'x':
+ {
+ bool isValid = false;
+ if (i + 2 < s.size())
+ {
+ if (std::isxdigit(s[i + 1]) && std::isxdigit(s[i + 2]))
+ {
+ std::stringstream shex;
+ shex << s[i + 1] << s[i + 2];
+ unsigned val;
+ shex >> std::hex >> val;
+ str.push_back(val);
+ i += 3;
+ isValid = true;
+ }
+ }
+ if (!isValid)
+ {
+ std::stringstream serr;
+ serr << "Illegal String Literal: \"" << s
+ << "\", must have two digits after \\x";
+ parseError(serr.str());
+ }
+ }
+ break;
+ default:
+ {
+ if (std::isdigit(s[i]))
+ {
+ // octal escape sequences TODO : revisit (issue #1251).
+ unsigned num = static_cast<unsigned>(s[i]) - 48;
+ bool flag = num < 4;
+ if (i + 1 < s.size() && num < 8 && std::isdigit(s[i + 1])
+ && s[i + 1] < '8')
+ {
+ num = num * 8 + static_cast<unsigned>(s[i + 1]) - 48;
+ if (flag && i + 2 < s.size() && std::isdigit(s[i + 2])
+ && s[i + 2] < '8')
+ {
+ num = num * 8 + static_cast<unsigned>(s[i + 2]) - 48;
+ str.push_back(num);
+ i += 3;
+ }
+ else
+ {
+ str.push_back(num);
+ i += 2;
+ }
+ }
+ else
+ {
+ str.push_back(num);
+ i++;
+ }
+ }
+ }
+ }
+ }
+ return str;
+}
+
+Expr Parser::mkStringConstant(const std::string& s)
+{
+ ExprManager* em = d_solver->getExprManager();
+ if (em->getOptions().getInputLanguage()
+ == language::input::LANG_SMTLIB_V2_6_1)
+ {
+ return d_solver->mkString(s, true).getExpr();
+ }
+ // otherwise, we must process ad-hoc escape sequences
+ std::vector<unsigned> str = processAdHocStringEsc(s);
+ return d_solver->mkString(str).getExpr();
+}
+
} /* CVC4::parser namespace */
} /* CVC4 namespace */
diff --git a/src/parser/parser.h b/src/parser/parser.h
index ecea4d3bd..d6c0e0e15 100644
--- a/src/parser/parser.h
+++ b/src/parser/parser.h
@@ -889,6 +889,28 @@ public:
name, api::sortVectorToTypes(argTypes));
}
//------------------------ end operator overloading
+ /**
+ * Make string constant
+ *
+ * This makes the string constant based on the string s. This may involve
+ * processing ad-hoc escape sequences (if the language is not
+ * SMT-LIB 2.6.1 or higher), or otherwise calling the solver to construct
+ * the string.
+ */
+ Expr mkStringConstant(const std::string& s);
+
+ private:
+ /** ad-hoc string escaping
+ *
+ * Returns the (internal) vector of code points corresponding to processing
+ * the escape sequences in string s. This is to support string inputs that
+ * do no comply with the SMT-LIB standard.
+ *
+ * This method handles escape sequences, including \n, \t, \v, \b, \r, \f, \a,
+ * \\, \x[N] and octal escape sequences of the form \[c1]([c2]([c3])?)? where
+ * c1, c2, c3 are digits from 0 to 7.
+ */
+ std::vector<unsigned> processAdHocStringEsc(const std::string& s);
};/* class Parser */
}/* CVC4::parser namespace */
diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g
index ec1eae7da..69f21acb7 100644
--- a/src/parser/smt2/Smt2.g
+++ b/src/parser/smt2/Smt2.g
@@ -2091,7 +2091,7 @@ termAtomic[CVC4::api::Term& atomTerm]
}
// String constant
- | str[s,false] { atomTerm = SOLVER->mkString(s, true); }
+ | str[s,false] { atomTerm = PARSER_STATE->mkStringConstant(s); }
// NOTE: Theory constants go here
diff --git a/src/preprocessing/passes/synth_rew_rules.cpp b/src/preprocessing/passes/synth_rew_rules.cpp
index 7b8e61359..f1e9e39c5 100644
--- a/src/preprocessing/passes/synth_rew_rules.cpp
+++ b/src/preprocessing/passes/synth_rew_rules.cpp
@@ -169,7 +169,7 @@ PreprocessingPassResult SynthRewRulesPass::applyInternal(
std::stringstream ssv;
if (varCounter < 26)
{
- ssv << String::convertUnsignedIntToChar(varCounter + 32);
+ ssv << static_cast<char>(varCounter + 61);
}
else
{
diff --git a/src/printer/cvc/cvc_printer.cpp b/src/printer/cvc/cvc_printer.cpp
index cad3c4640..1178c7299 100644
--- a/src/printer/cvc/cvc_printer.cpp
+++ b/src/printer/cvc/cvc_printer.cpp
@@ -160,6 +160,11 @@ void CvcPrinter::toStream(
toStreamRational(out, n, false);
break;
}
+ case kind::CONST_STRING:
+ {
+ out << '"' << n.getConst<String>().toString() << '"';
+ break;
+ }
case kind::TYPE_CONSTANT:
switch(TypeConstant tc = n.getConst<TypeConstant>()) {
case REAL_TYPE:
diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp
index 541827f89..6e4fcb63a 100644
--- a/src/printer/smt2/smt2_printer.cpp
+++ b/src/printer/smt2/smt2_printer.cpp
@@ -202,7 +202,7 @@ void Smt2Printer::toStream(std::ostream& out,
}
case kind::CONST_STRING: {
- std::string s = n.getConst<String>().toString(true);
+ std::string s = n.getConst<String>().toString();
out << '"';
for(size_t i = 0; i < s.size(); ++i) {
char c = s[i];
diff --git a/src/theory/evaluator.cpp b/src/theory/evaluator.cpp
index b827912d5..646f903f5 100644
--- a/src/theory/evaluator.cpp
+++ b/src/theory/evaluator.cpp
@@ -626,8 +626,7 @@ EvalResult Evaluator::evalInternal(
const String& s = results[currNode[0]].d_str;
if (s.size() == 1)
{
- results[currNode] = EvalResult(
- Rational(String::convertUnsignedIntToCode(s.getVec()[0])));
+ results[currNode] = EvalResult(Rational(s.getVec()[0]));
}
else
{
diff --git a/src/theory/quantifiers/sygus_sampler.cpp b/src/theory/quantifiers/sygus_sampler.cpp
index 28cfa69df..e9c858814 100644
--- a/src/theory/quantifiers/sygus_sampler.cpp
+++ b/src/theory/quantifiers/sygus_sampler.cpp
@@ -560,8 +560,7 @@ Node SygusSampler::getRandomValue(TypeNode tn)
for (unsigned ch : alphas)
{
d_rstring_alphabet.push_back(ch);
- Trace("sygus-sample-str-alpha")
- << " \"" << String::convertUnsignedIntToChar(ch) << "\"";
+ Trace("sygus-sample-str-alpha") << " \\u" << ch;
}
Trace("sygus-sample-str-alpha") << std::endl;
}
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp
index d5105a489..9a2091eac 100644
--- a/src/theory/strings/regexp_operation.cpp
+++ b/src/theory/strings/regexp_operation.cpp
@@ -739,9 +739,7 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
}
case kind::REGEXP_RANGE: {
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
Assert(a < b);
Assert(b < std::numeric_limits<unsigned>::max());
for (unsigned c = a; c <= b; c++)
@@ -756,7 +754,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
String s = st.getConst<String>();
if(s.size() != 0) {
unsigned sc = s.front();
- sc = String::convertUnsignedIntToCode(sc);
cset.insert(sc);
}
}
@@ -765,7 +762,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset)
if(st[0].isConst()) {
String s = st[0].getConst<String>();
unsigned sc = s.front();
- sc = String::convertUnsignedIntToCode(sc);
cset.insert(sc);
} else {
vset.insert( st[0] );
@@ -887,13 +883,11 @@ void RegExpOpr::simplifyNRegExp( Node s, Node r, std::vector< Node > &new_nodes
case kind::REGEXP_RANGE: {
std::vector< Node > vec;
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
for (unsigned c = a; c <= b; c++)
{
std::vector<unsigned> tmpVec;
- tmpVec.push_back(String::convertCodeToUnsignedInt(c));
+ tmpVec.push_back(c);
Node tmp = s.eqNode(nm->mkConst(String(tmpVec))).negate();
vec.push_back( tmp );
}
@@ -1522,7 +1516,7 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node >
++it)
{
std::vector<unsigned> cvec;
- cvec.push_back(String::convertCodeToUnsignedInt(*it));
+ cvec.push_back(*it);
String c(cvec);
Trace("regexp-int-debug") << "Try character " << c << " ... " << std::endl;
Node r1l = derivativeSingle(r1, c);
diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp
index 716634d5f..b0940b7e1 100644
--- a/src/theory/strings/sequences_rewriter.cpp
+++ b/src/theory/strings/sequences_rewriter.cpp
@@ -1421,11 +1421,8 @@ bool SequencesRewriter::testConstStringInRegExp(CVC4::String& s,
if (s.size() == index_start + 1)
{
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
unsigned c = s.back();
- c = String::convertUnsignedIntToCode(c);
return (a <= c && c <= b);
}
else
diff --git a/src/theory/strings/strings_rewriter.cpp b/src/theory/strings/strings_rewriter.cpp
index 75dfe7432..c7676d049 100644
--- a/src/theory/strings/strings_rewriter.cpp
+++ b/src/theory/strings/strings_rewriter.cpp
@@ -93,7 +93,7 @@ Node StringsRewriter::rewriteStrConvert(Node node)
std::vector<unsigned> nvec = node[0].getConst<String>().getVec();
for (unsigned i = 0, nvsize = nvec.size(); i < nvsize; i++)
{
- unsigned newChar = String::convertUnsignedIntToCode(nvec[i]);
+ unsigned newChar = nvec[i];
// transform it
// upper 65 ... 90
// lower 97 ... 122
@@ -111,7 +111,6 @@ Node StringsRewriter::rewriteStrConvert(Node node)
newChar = newChar + 32;
}
}
- newChar = String::convertCodeToUnsignedInt(newChar);
nvec[i] = newChar;
}
Node retNode = nm->mkConst(String(nvec));
@@ -231,7 +230,7 @@ Node StringsRewriter::rewriteStringToCode(Node n)
{
std::vector<unsigned> vec = s.getVec();
Assert(vec.size() == 1);
- ret = nm->mkConst(Rational(String::convertUnsignedIntToCode(vec[0])));
+ ret = nm->mkConst(Rational(vec[0]));
}
else
{
diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp
index 16183abdd..a81c96318 100644
--- a/src/theory/strings/theory_strings.cpp
+++ b/src/theory/strings/theory_strings.cpp
@@ -382,7 +382,7 @@ bool TheoryStrings::collectModelInfoType(
ctv.getConst<Rational>().getNumerator().toUnsignedInt();
Trace("strings-model") << "(code: " << cvalue << ") ";
std::vector<unsigned> vec;
- vec.push_back(String::convertCodeToUnsignedInt(cvalue));
+ vec.push_back(cvalue);
Node mv = nm->mkConst(String(vec));
pure_eq_assign[eqc] = mv;
m->getEqualityEngine()->addTerm(mv);
@@ -1099,13 +1099,14 @@ void TheoryStrings::registerTerm(Node n, int effort)
else if (n.getKind() == STRING_TO_CODE)
{
d_has_str_code = true;
- // ite( str.len(s)==1, 0 <= str.code(s) < num_codes, str.code(s)=-1 )
+ // ite( str.len(s)==1, 0 <= str.code(s) < |A|, str.code(s)=-1 )
Node code_len = utils::mkNLength(n[0]).eqNode(d_one);
Node code_eq_neg1 = n.eqNode(d_neg_one);
Node code_range = nm->mkNode(
AND,
nm->mkNode(GEQ, n, d_zero),
- nm->mkNode(LT, n, nm->mkConst(Rational(CVC4::String::num_codes()))));
+ nm->mkNode(
+ LT, n, nm->mkConst(Rational(utils::getAlphabetCardinality()))));
regTermLem = nm->mkNode(ITE, code_len, code_range, code_eq_neg1);
}
else if (n.getKind() == STRING_STRIDOF)
diff --git a/src/theory/strings/theory_strings_type_rules.h b/src/theory/strings/theory_strings_type_rules.h
index 7ef31a92a..93a32f26e 100644
--- a/src/theory/strings/theory_strings_type_rules.h
+++ b/src/theory/strings/theory_strings_type_rules.h
@@ -299,7 +299,7 @@ public:
throw TypeCheckingExceptionPrivate(n, "expecting a single constant string term in regexp range");
}
unsigned ci = (*it).getConst<String>().front();
- ch[i] = String::convertUnsignedIntToCode(ci);
+ ch[i] = ci;
++it;
}
if(ch[0] > ch[1]) {
diff --git a/src/theory/strings/type_enumerator.cpp b/src/theory/strings/type_enumerator.cpp
index 12cf899b4..7352ae5de 100644
--- a/src/theory/strings/type_enumerator.cpp
+++ b/src/theory/strings/type_enumerator.cpp
@@ -21,6 +21,44 @@ namespace CVC4 {
namespace theory {
namespace strings {
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+ uint32_t cardinality)
+{
+ std::vector<unsigned> mvec;
+ // if we contain all of the printable characters
+ if (cardinality >= 255)
+ {
+ for (unsigned i = 0, vsize = vec.size(); i < vsize; i++)
+ {
+ unsigned curr = vec[i];
+ // convert
+ Assert(vec[i] < cardinality);
+ if (vec[i] <= 61)
+ {
+ // first 62 printable characters [\u{65}-\u{126}]: 'A', 'B', 'C', ...
+ curr = vec[i] + 65;
+ }
+ else if (vec[i] <= 94)
+ {
+ // remaining 33 printable characters [\u{32}-\u{64}]: ' ', '!', '"', ...
+ curr = vec[i] - 30;
+ }
+ else
+ {
+ // the remaining characters, starting with \u{127} and wrapping around
+ // the first 32 non-printable characters.
+ curr = (vec[i] + 32) % cardinality;
+ }
+ mvec.push_back(curr);
+ }
+ }
+ else
+ {
+ mvec = vec;
+ }
+ return NodeManager::currentNM()->mkConst(String(mvec));
+}
+
WordIter::WordIter(uint32_t startLength) : d_hasEndLength(false), d_endLength(0)
{
for (uint32_t i = 0; i < startLength; i++)
@@ -117,7 +155,7 @@ bool StringEnumLen::increment()
void StringEnumLen::mkCurr()
{
- d_curr = NodeManager::currentNM()->mkConst(String(d_witer->getData()));
+ d_curr = makeStandardModelConstant(d_witer->getData(), d_cardinality);
}
StringEnumerator::StringEnumerator(TypeNode type, TypeEnumeratorProperties* tep)
diff --git a/src/theory/strings/type_enumerator.h b/src/theory/strings/type_enumerator.h
index 2061628a5..b379ce5c3 100644
--- a/src/theory/strings/type_enumerator.h
+++ b/src/theory/strings/type_enumerator.h
@@ -28,6 +28,26 @@ namespace theory {
namespace strings {
/**
+ * Make standard model constant
+ *
+ * In our string representation, we represent characters using vectors
+ * of unsigned integers indicating code points for the characters of that
+ * string.
+ *
+ * To make models user-friendly, we make unsigned integer 0 correspond to the
+ * 65th character ("A") in the ASCII alphabet to make models intuitive. In
+ * particular, say if we have a set of string variables that are distinct but
+ * otherwise unconstrained, then the model may assign them "A", "B", "C", ...
+ *
+ * @param vec The code points of the string in a given model,
+ * @param cardinality The cardinality of the alphabet,
+ * @return A string whose characters have the code points corresponding
+ * to vec in the standard model construction described above.
+ */
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+ uint32_t cardinality);
+
+/**
* Generic iteration over vectors of indices of a given start/end length.
*/
class WordIter
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp
index 00066edb6..36ba7182b 100644
--- a/src/util/regexp.cpp
+++ b/src/util/regexp.cpp
@@ -32,38 +32,12 @@ namespace CVC4 {
static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
-unsigned String::convertCharToUnsignedInt(unsigned char c)
-{
- return convertCodeToUnsignedInt(static_cast<unsigned>(c));
-}
-unsigned char String::convertUnsignedIntToChar(unsigned i)
-{
- Assert(i < num_codes());
- return static_cast<unsigned char>(convertUnsignedIntToCode(i));
-}
-bool String::isPrintable(unsigned i)
-{
- Assert(i < num_codes());
- unsigned char c = convertUnsignedIntToChar(i);
- return (c >= ' ' && c <= '~');
-}
-unsigned String::convertCodeToUnsignedInt(unsigned c)
-{
- Assert(c < num_codes());
- return (c < start_code() ? c + num_codes() : c) - start_code();
-}
-unsigned String::convertUnsignedIntToCode(unsigned i)
-{
- Assert(i < num_codes());
- return (i + start_code()) % num_codes();
-}
-
String::String(const std::vector<unsigned> &s) : d_str(s)
{
#ifdef CVC4_ASSERTIONS
for (unsigned u : d_str)
{
- Assert(convertUnsignedIntToCode(u) < num_codes());
+ Assert(u < num_codes());
}
#endif
}
@@ -74,8 +48,8 @@ int String::cmp(const String &y) const {
}
for (unsigned int i = 0; i < size(); ++i) {
if (d_str[i] != y.d_str[i]) {
- unsigned cp = convertUnsignedIntToCode(d_str[i]);
- unsigned cpy = convertUnsignedIntToCode(y.d_str[i]);
+ unsigned cp = d_str[i];
+ unsigned cpy = y.d_str[i];
return cp < cpy ? -1 : 1;
}
}
@@ -122,107 +96,143 @@ bool String::rstrncmp(const String& y, std::size_t n) const
return true;
}
-std::vector<unsigned> String::toInternal(const std::string &s,
- bool useEscSequences) {
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+ // if not a printable character
+ if (ch > 127 || ch < 32)
+ {
+ std::stringstream serr;
+ serr << "Illegal string character: \"" << ch
+ << "\", must use escape sequence";
+ throw CVC4::Exception(serr.str());
+ }
+ else
+ {
+ str.push_back(static_cast<unsigned>(ch));
+ }
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+ bool useEscSequences)
+{
std::vector<unsigned> str;
unsigned i = 0;
- while (i < s.size()) {
- if (s[i] == '\\' && useEscSequences) {
- i++;
- if (i < s.size()) {
- switch (s[i]) {
- case 'n': {
- str.push_back(convertCharToUnsignedInt('\n'));
- i++;
- } break;
- case 't': {
- str.push_back(convertCharToUnsignedInt('\t'));
- i++;
- } break;
- case 'v': {
- str.push_back(convertCharToUnsignedInt('\v'));
- i++;
- } break;
- case 'b': {
- str.push_back(convertCharToUnsignedInt('\b'));
- i++;
- } break;
- case 'r': {
- str.push_back(convertCharToUnsignedInt('\r'));
- i++;
- } break;
- case 'f': {
- str.push_back(convertCharToUnsignedInt('\f'));
- i++;
- } break;
- case 'a': {
- str.push_back(convertCharToUnsignedInt('\a'));
- i++;
- } break;
- case '\\': {
- str.push_back(convertCharToUnsignedInt('\\'));
- i++;
- } break;
- case 'x': {
- if (i + 2 < s.size()) {
- if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) {
- str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 +
- hexToDec(s[i + 2])));
- i += 3;
- } else {
- throw CVC4::Exception("Illegal String Literal: \"" + s + "\"");
- }
- } else {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must have two digits after \\x");
- }
- } break;
- default: {
- if (isdigit(s[i])) {
- // octal escape sequences TODO : revisit (issue #1251).
- int num = (int)s[i] - (int)'0';
- bool flag = num < 4;
- if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) &&
- s[i + 1] < '8') {
- num = num * 8 + (int)s[i + 1] - (int)'0';
- if (flag && i + 2 < s.size() && isdigit(s[i + 2]) &&
- s[i + 2] < '8') {
- num = num * 8 + (int)s[i + 2] - (int)'0';
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i += 3;
- } else {
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i += 2;
- }
- } else {
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i++;
- }
- } else if ((unsigned)s[i] > 127) {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must use escaped sequence");
- } else {
- str.push_back(convertCharToUnsignedInt(s[i]));
- i++;
- }
+ while (i < s.size())
+ {
+ // get the current character
+ char si = s[i];
+ if (si != '\\' || !useEscSequences)
+ {
+ addCharToInternal(si, str);
+ ++i;
+ continue;
+ }
+ // the vector of characters, in case we fail to read an escape sequence
+ std::vector<unsigned> nonEscCache;
+ // process the '\'
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ // are we an escape sequence?
+ bool isEscapeSequence = true;
+ // the string corresponding to the hexidecimal code point
+ std::stringstream hexString;
+ // is the slash followed by a 'u'? Could be last character.
+ if (i >= s.size() || s[i] != 'u')
+ {
+ isEscapeSequence = false;
+ }
+ else
+ {
+ // process the 'u'
+ addCharToInternal(s[i], nonEscCache);
+ ++i;
+ bool isStart = true;
+ bool isEnd = false;
+ bool hasBrace = false;
+ while (i < s.size())
+ {
+ // add the next character
+ si = s[i];
+ if (isStart)
+ {
+ isStart = false;
+ // possibly read '{'
+ if (si == '{')
+ {
+ hasBrace = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ continue;
}
}
- } else {
- throw CVC4::Exception("should be handled by lexer: \"" + s + "\"");
- // str.push_back( convertCharToUnsignedInt('\\') );
+ else if (si == '}')
+ {
+ // can only end if we had an open brace and read at least one digit
+ isEscapeSequence = hasBrace && !hexString.str().empty();
+ isEnd = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ break;
+ }
+ // must be a hex digit at this point
+ if (!isHexDigit(static_cast<unsigned>(si)))
+ {
+ isEscapeSequence = false;
+ break;
+ }
+ hexString << si;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ if (!hasBrace && hexString.str().size() == 4)
+ {
+ // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
+ isEnd = true;
+ break;
+ }
+ else if (hasBrace && hexString.str().size() > 5)
+ {
+ // too many digits enclosed in brace, not an escape sequence
+ isEscapeSequence = false;
+ break;
+ }
+ }
+ if (!isEnd)
+ {
+ // if we were interupted before ending, then this is not a valid
+ // escape sequence
+ isEscapeSequence = false;
+ }
+ }
+ if (isEscapeSequence)
+ {
+ Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+ // Otherwise, we add the escaped character.
+ // This is guaranteed not to overflow due to the length of hstr.
+ uint32_t val;
+ hexString >> std::hex >> val;
+ if (val > num_codes())
+ {
+ // Failed due to being out of range. This can happen for strings of
+ // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
+ // in the range [0-2].
+ isEscapeSequence = false;
+ }
+ else
+ {
+ str.push_back(val);
}
- } else if ((unsigned)s[i] > 127 && useEscSequences) {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must use escaped sequence");
- } else {
- str.push_back(convertCharToUnsignedInt(s[i]));
- i++;
+ }
+ // if we did not successfully parse an escape sequence, we add back all
+ // characters that we cached
+ if (!isEscapeSequence)
+ {
+ str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
}
}
#ifdef CVC4_ASSERTIONS
for (unsigned u : str)
{
- Assert(convertUnsignedIntToCode(u) < num_codes());
+ Assert(u < num_codes());
}
#endif
return str;
@@ -265,62 +275,23 @@ std::size_t String::roverlap(const String &y) const {
}
std::string String::toString(bool useEscSequences) const {
- std::string str;
+ std::stringstream str;
for (unsigned int i = 0; i < size(); ++i) {
- unsigned char c = convertUnsignedIntToChar(d_str[i]);
- if (!useEscSequences) {
- str += c;
- } else if (isprint(c)) {
- if (c == '\\') {
- str += "\\\\";
- }
- // else if(c == '\"') {
- // str += "\\\"";
- //}
- else {
- str += c;
- }
- } else {
- std::string s;
- switch (c) {
- case '\a':
- s = "\\a";
- break;
- case '\b':
- s = "\\b";
- break;
- case '\t':
- s = "\\t";
- break;
- case '\r':
- s = "\\r";
- break;
- case '\v':
- s = "\\v";
- break;
- case '\f':
- s = "\\f";
- break;
- case '\n':
- s = "\\n";
- break;
- case '\e':
- s = "\\e";
- break;
- default: {
- std::stringstream ss;
- ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c);
- std::string t = ss.str();
- t = t.substr(t.size() - 2, 2);
- s = "\\x" + t;
- // std::string s2 = static_cast<std::ostringstream*>(
- // &(std::ostringstream() << (int)c) )->str();
- }
- }
- str += s;
+ // we always print forward slash as a code point so that it cannot
+ // be interpreted as specifying part of a code point, e.g. the string
+ // '\' + 'u' + '0' of length three.
+ if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+ {
+ str << static_cast<char>(d_str[i]);
+ }
+ else
+ {
+ std::stringstream ss;
+ ss << std::hex << d_str[i];
+ str << "\\u{" << ss.str() << "}";
}
}
- return str;
+ return str.str();
}
bool String::isLeq(const String &y) const
@@ -331,8 +302,8 @@ bool String::isLeq(const String &y) const
{
return false;
}
- unsigned ci = convertUnsignedIntToCode(d_str[i]);
- unsigned cyi = convertUnsignedIntToCode(y.d_str[i]);
+ unsigned ci = d_str[i];
+ unsigned cyi = y.d_str[i];
if (ci > cyi)
{
return false;
@@ -484,8 +455,21 @@ bool String::isNumber() const {
bool String::isDigit(unsigned character)
{
- unsigned char c = convertUnsignedIntToChar(character);
- return c >= '0' && c <= '9';
+ // '0' to '9'
+ return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+ // '0' to '9' or 'A' to 'F' or 'a' to 'f'
+ return isDigit(character) || (65 <= character && character <= 70)
+ || (97 <= character && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+ // Unicode 0x00020 (' ') to 0x0007E ('~')
+ return 32 <= character && character <= 126;
}
size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
@@ -497,17 +481,6 @@ Rational String::toNumber() const
return Rational(toString());
}
-unsigned char String::hexToDec(unsigned char c) {
- if (c >= '0' && c <= '9') {
- return c - '0';
- } else if (c >= 'a' && c <= 'f') {
- return c - 'a' + 10;
- } else {
- Assert(c >= 'A' && c <= 'F');
- return c - 'A' + 10;
- }
-}
-
std::ostream &operator<<(std::ostream &os, const String &s) {
return os << "\"" << s.toString(true) << "\"";
}
diff --git a/src/util/regexp.h b/src/util/regexp.h
index 731736f72..56fb969a3 100644
--- a/src/util/regexp.h
+++ b/src/util/regexp.h
@@ -37,60 +37,44 @@ namespace CVC4 {
class CVC4_PUBLIC String {
public:
/**
- * The start ASCII code. In our string representation below, we represent
- * characters using a vector d_str of unsigned integers. We refer to this as
- * the "internal representation" for the string.
- *
- * We make unsigned integer 0 correspond to the 65th character ("A") in the
- * ASCII alphabet to make models intuitive. In particular, say if we have
- * a set of string variables that are distinct but otherwise unconstrained,
- * then the model may assign them "A", "B", "C", ...
- */
- static inline unsigned start_code() { return 65; }
- /**
* This is the cardinality of the alphabet that is representable by this
* class. Notice that this must be greater than or equal to the cardinality
* of the alphabet that the string theory reasons about.
*
* This must be strictly less than std::numeric_limits<unsigned>::max().
+ *
+ * As per the SMT-LIB standard for strings, we support the first 3 planes of
+ * Unicode characters, where 196608 = 3*16^4.
*/
- static inline unsigned num_codes() { return 256; }
- /**
- * Convert unsigned char to the unsigned used in the internal representation
- * in d_str below.
- */
- static unsigned convertCharToUnsignedInt(unsigned char c);
- /** Convert the internal unsigned to a unsigned char. */
- static unsigned char convertUnsignedIntToChar(unsigned i);
- /** Does the internal unsigned correspond to a printable character? */
- static bool isPrintable(unsigned i);
- /** get the internal unsigned for ASCII code c. */
- static unsigned convertCodeToUnsignedInt(unsigned c);
- /** get the ASCII code number that internal unsigned i corresponds to. */
- static unsigned convertUnsignedIntToCode(unsigned i);
-
+ static inline unsigned num_codes() { return 196608; }
/** constructors for String
- *
- * Internally, a CVC4::String is represented by a vector of unsigned
- * integers (d_str), where the correspondence between C++ characters
- * to and from unsigned integers is determined by
- * by convertCharToUnsignedInt and convertUnsignedIntToChar.
- *
- * If useEscSequences is true, then the escape sequences in the input
- * are converted to the corresponding character. This constructor may
- * throw an exception if the input contains unrecognized escape sequences.
- * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\,
- * \x[N] where N is a hexidecimal, and octal escape sequences of the
- * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7.
- *
- * If useEscSequences is false, then the characters of the constructed
- * CVC4::String correspond one-to-one with the input string.
- */
+ *
+ * Internally, a CVC4::String is represented by a vector of unsigned
+ * integers (d_str) representing the code points of the characters.
+ *
+ * To build a string from a C++ string, we may process escape sequences
+ * according to the SMT-LIB standard. In particular, if useEscSequences is
+ * true, we convert unicode escape sequences:
+ * \u d_3 d_2 d_1 d_0
+ * \u{d_0}
+ * \u{d_1 d_0}
+ * \u{d_2 d_1 d_0}
+ * \u{d_3 d_2 d_1 d_0}
+ * \u{d_4 d_3 d_2 d_1 d_0}
+ * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
+ *
+ * If useEscSequences is false, then the characters of the constructed
+ * CVC4::String correspond one-to-one with the input string.
+ */
String() = default;
explicit String(const std::string& s, bool useEscSequences = false)
- : d_str(toInternal(s, useEscSequences)) {}
+ : d_str(toInternal(s, useEscSequences))
+ {
+ }
explicit String(const char* s, bool useEscSequences = false)
- : d_str(toInternal(std::string(s), useEscSequences)) {}
+ : d_str(toInternal(std::string(s), useEscSequences))
+ {
+ }
explicit String(const std::vector<unsigned>& s);
String& operator=(const String& y) {
@@ -123,20 +107,16 @@ class CVC4_PUBLIC String {
bool rstrncmp(const String& y, std::size_t n) const;
/* toString
- * Converts this string to a std::string.
- *
- * If useEscSequences is true, then unprintable characters
- * are converted to escape sequences. The escape sequences
- * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way.
- * For all other unprintable characters, we print \x[N] where
- * [N] is the 2 digit hexidecimal corresponding to value of
- * the character.
- *
- * If useEscSequences is false, the returned std::string's characters
- * map one-to-one with the characters in this string.
- * Notice that for all std::string s, we have that
- * CVC4::String( s ).toString() = s.
- */
+ * Converts this string to a std::string.
+ *
+ * The unprintable characters are converted to unicode escape sequences as
+ * described above.
+ *
+ * If useEscSequences is false, the string's printable characters are
+ * printed as characters. Notice that for all std::string s having only
+ * printable characters, we have that
+ * CVC4::String( s ).toString() = s.
+ */
std::string toString(bool useEscSequences = false) const;
/** is this the empty string? */
bool empty() const { return d_str.empty(); }
@@ -221,16 +201,32 @@ class CVC4_PUBLIC String {
bool isNumber() const;
/** Returns the corresponding rational for the text of this string. */
Rational toNumber() const;
- /** get the internal unsigned representation of this string */
+ /** Get the unsigned representation (code points) of this string */
const std::vector<unsigned>& getVec() const { return d_str; }
- /** get the internal unsigned value of the first character in this string */
+ /**
+ * Get the unsigned (code point) value of the first character in this string
+ */
unsigned front() const;
- /** get the internal unsigned value of the last character in this string */
+ /**
+ * Get the unsigned (code point) value of the last character in this string
+ */
unsigned back() const;
/** is the unsigned a digit?
- * The input should be the same type as the element type of d_str
- */
+ *
+ * This is true for code points between 48 ('0') and 57 ('9').
+ */
static bool isDigit(unsigned character);
+ /** is the unsigned a hexidecimal digit?
+ *
+ * This is true for code points between 48 ('0') and 57 ('9'), code points
+ * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
+ */
+ static bool isHexDigit(unsigned character);
+ /** is the unsigned a printable code point?
+ *
+ * This is true for Unicode 32 (' ') to 126 ('~').
+ */
+ static bool isPrintable(unsigned character);
/**
* Returns the maximum length of string representable by this class.
@@ -238,11 +234,19 @@ class CVC4_PUBLIC String {
*/
static size_t maxSize();
private:
- // guarded
- static unsigned char hexToDec(unsigned char c);
-
+ /**
+ * Helper for toInternal: add character ch to vector vec, storing a string in
+ * internal format. This throws an error if ch is not a printable character,
+ * since non-printable characters must be escaped in SMT-LIB.
+ */
+ static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+ /**
+ * Convert the string s to the internal format (vector of code points).
+ * The argument useEscSequences is whether to process unicode escape
+ * sequences.
+ */
static std::vector<unsigned> toInternal(const std::string& s,
- bool useEscSequences = true);
+ bool useEscSequences);
/**
* Returns a negative number if *this < y, 0 if *this and y are equal and a
diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt
index 8fab16b44..8382e40fc 100644
--- a/test/regress/CMakeLists.txt
+++ b/test/regress/CMakeLists.txt
@@ -922,6 +922,7 @@ set(regress_0_tests
regress0/strings/escchar.smt2
regress0/strings/escchar_25.smt2
regress0/strings/from_code.smt2
+ regress0/strings/gen-esc-seq.smt2
regress0/strings/hconst-092618.smt2
regress0/strings/idof-rewrites.smt2
regress0/strings/idof-sem.smt2
@@ -939,6 +940,8 @@ set(regress_0_tests
regress0/strings/leadingzero001.smt2
regress0/strings/loop001.smt2
regress0/strings/model001.smt2
+ regress0/strings/model-code-point.smt2
+ regress0/strings/model-friendly.smt2
regress0/strings/ncontrib-rewrites.smt2
regress0/strings/norn-31.smt2
regress0/strings/norn-simp-rew.smt2
@@ -967,6 +970,7 @@ set(regress_0_tests
regress0/strings/tolower-rrs.smt2
regress0/strings/tolower-simple.smt2
regress0/strings/type001.smt2
+ regress0/strings/unicode-esc.smt2
regress0/strings/unsound-0908.smt2
regress0/strings/unsound-repl-rewrite.smt2
regress0/sygus/General_plus10.sy
diff --git a/test/regress/regress0/strings/gen-esc-seq.smt2 b/test/regress/regress0/strings/gen-esc-seq.smt2
new file mode 100644
index 000000000..59f66046f
--- /dev/null
+++ b/test/regress/regress0/strings/gen-esc-seq.smt2
@@ -0,0 +1,9 @@
+; COMMAND-LINE: --produce-models --lang=smt2.6.1
+; EXPECT: sat
+; EXPECT: ((x "\u{5c}u1000"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-const x String)
+(assert (= x (str.++ "\u" "1000")))
+(check-sat)
+(get-value (x))
diff --git a/test/regress/regress0/strings/model-code-point.smt2 b/test/regress/regress0/strings/model-code-point.smt2
new file mode 100644
index 000000000..1200ae704
--- /dev/null
+++ b/test/regress/regress0/strings/model-code-point.smt2
@@ -0,0 +1,13 @@
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "\u{a}"))
+; EXPECT: ((y "\u{7f}"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(declare-fun y () String)
+(assert (= (str.to_code x) 10))
+(assert (= (str.to_code y) 127))
+(check-sat)
+(get-value (x))
+(get-value (y))
diff --git a/test/regress/regress0/strings/model-friendly.smt2 b/test/regress/regress0/strings/model-friendly.smt2
new file mode 100644
index 000000000..985ffaa62
--- /dev/null
+++ b/test/regress/regress0/strings/model-friendly.smt2
@@ -0,0 +1,9 @@
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "AAAAA"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(assert (= (str.len x) 5))
+(check-sat)
+(get-value (x))
diff --git a/test/regress/regress0/strings/unicode-esc.smt2 b/test/regress/regress0/strings/unicode-esc.smt2
new file mode 100644
index 000000000..01f5f30ab
--- /dev/null
+++ b/test/regress/regress0/strings/unicode-esc.smt2
@@ -0,0 +1,30 @@
+; COMMAND-LINE: --strings-exp --lang=smt2.6.1
+; EXPECT: sat
+(set-logic ALL)
+
+(assert (= "\u{14}" "\u0014"))
+(assert (= "\u{00}" "\u{0}"))
+(assert (= "\u0000" "\u{0}"))
+(assert (= (str.len "\u1234") 1))
+(assert (= (str.len "\u{1}") 1))
+(assert (= (str.len "\u{99}") 1))
+(assert (= (str.len "\u{779}") 1))
+(assert (= (str.len "\u{0779}") 1))
+(assert (= (str.len "\u{01779}") 1))
+(assert (= (str.len "\u{001779}") 10))
+(assert (= (str.len "\u{0vv79}") 9))
+(assert (= (str.len "\u{11\u1234}") 7))
+(assert (= (str.len "\u12345") 2))
+(assert (= (str.len "\uu") 3))
+(assert (= (str.len "\u{123}\u{567}") 2))
+(assert (= (str.len "\u{0017") 7))
+(assert (= (str.len "\\u00178") 3))
+(assert (= (str.len "2\u{}") 5))
+(assert (= (str.len "\uaaaa") 1))
+(assert (= (str.len "\uAAAA") 1))
+(assert (= (str.len "\u{0AbC}") 1))
+(assert (= (str.len "\u{E}") 1))
+(assert (= (str.len "\u{44444}") 9))
+(assert (= (str.len "\u") 2))
+
+(check-sat)
diff --git a/test/unit/api/solver_black.h b/test/unit/api/solver_black.h
index 27f5aca12..0eefde700 100644
--- a/test/unit/api/solver_black.h
+++ b/test/unit/api/solver_black.h
@@ -556,9 +556,9 @@ void SolverBlack::testMkString()
TS_ASSERT_THROWS_NOTHING(d_solver->mkString(""));
TS_ASSERT_THROWS_NOTHING(d_solver->mkString("asdfasdf"));
TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf").toString(),
- "\"asdf\\\\nasdf\"");
- TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf", true).toString(),
- "\"asdf\\nasdf\"");
+ "\"asdf\\u{5c}nasdf\"");
+ TS_ASSERT_EQUALS(d_solver->mkString("asdf\\u{005c}nasdf", true).toString(),
+ "\"asdf\\u{5c}nasdf\"");
}
void SolverBlack::testMkTerm()
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback