diff options
24 files changed, 549 insertions, 285 deletions
diff --git a/src/parser/cvc/Cvc.g b/src/parser/cvc/Cvc.g index 82c0581ce..033389610 100644 --- a/src/parser/cvc/Cvc.g +++ b/src/parser/cvc/Cvc.g @@ -2083,7 +2083,7 @@ stringTerm[CVC4::api::Term& f] /* string literal */ | str[s] - { f = SOLVER->mkString(s, true); } + { f = PARSER_STATE->mkStringConstant(s); } | setsTerm[f] ; diff --git a/src/parser/parser.cpp b/src/parser/parser.cpp index b36f36a93..5dca92370 100644 --- a/src/parser/parser.cpp +++ b/src/parser/parser.cpp @@ -757,5 +757,153 @@ void Parser::attributeNotSupported(const std::string& attr) { } } +std::vector<unsigned> Parser::processAdHocStringEsc(const std::string& s) +{ + std::vector<unsigned> str; + unsigned i = 0; + while (i < s.size()) + { + // get the current character + if (s[i] != '\\') + { + // don't worry about printable here + str.push_back(static_cast<unsigned>(s[i])); + ++i; + continue; + } + // slash is always escaped + ++i; + if (i >= s.size()) + { + // slash cannot be the last character if we are parsing escape sequences + std::stringstream serr; + serr << "Escape sequence at the end of string: \"" << s + << "\" should be handled by lexer"; + parseError(serr.str()); + } + switch (s[i]) + { + case 'n': + { + str.push_back(static_cast<unsigned>('\n')); + i++; + } + break; + case 't': + { + str.push_back(static_cast<unsigned>('\t')); + i++; + } + break; + case 'v': + { + str.push_back(static_cast<unsigned>('\v')); + i++; + } + break; + case 'b': + { + str.push_back(static_cast<unsigned>('\b')); + i++; + } + break; + case 'r': + { + str.push_back(static_cast<unsigned>('\r')); + i++; + } + break; + case 'f': + { + str.push_back(static_cast<unsigned>('\f')); + i++; + } + break; + case 'a': + { + str.push_back(static_cast<unsigned>('\a')); + i++; + } + break; + case '\\': + { + str.push_back(static_cast<unsigned>('\\')); + i++; + } + break; + case 'x': + { + bool isValid = false; + if (i + 2 < s.size()) + { + if (std::isxdigit(s[i + 1]) && std::isxdigit(s[i + 2])) + { + std::stringstream shex; + shex << s[i + 1] << s[i + 2]; + unsigned val; + shex >> std::hex >> val; + str.push_back(val); + i += 3; + isValid = true; + } + } + if (!isValid) + { + std::stringstream serr; + serr << "Illegal String Literal: \"" << s + << "\", must have two digits after \\x"; + parseError(serr.str()); + } + } + break; + default: + { + if (std::isdigit(s[i])) + { + // octal escape sequences TODO : revisit (issue #1251). + unsigned num = static_cast<unsigned>(s[i]) - 48; + bool flag = num < 4; + if (i + 1 < s.size() && num < 8 && std::isdigit(s[i + 1]) + && s[i + 1] < '8') + { + num = num * 8 + static_cast<unsigned>(s[i + 1]) - 48; + if (flag && i + 2 < s.size() && std::isdigit(s[i + 2]) + && s[i + 2] < '8') + { + num = num * 8 + static_cast<unsigned>(s[i + 2]) - 48; + str.push_back(num); + i += 3; + } + else + { + str.push_back(num); + i += 2; + } + } + else + { + str.push_back(num); + i++; + } + } + } + } + } + return str; +} + +Expr Parser::mkStringConstant(const std::string& s) +{ + ExprManager* em = d_solver->getExprManager(); + if (em->getOptions().getInputLanguage() + == language::input::LANG_SMTLIB_V2_6_1) + { + return d_solver->mkString(s, true).getExpr(); + } + // otherwise, we must process ad-hoc escape sequences + std::vector<unsigned> str = processAdHocStringEsc(s); + return d_solver->mkString(str).getExpr(); +} + } /* CVC4::parser namespace */ } /* CVC4 namespace */ diff --git a/src/parser/parser.h b/src/parser/parser.h index ecea4d3bd..d6c0e0e15 100644 --- a/src/parser/parser.h +++ b/src/parser/parser.h @@ -889,6 +889,28 @@ public: name, api::sortVectorToTypes(argTypes)); } //------------------------ end operator overloading + /** + * Make string constant + * + * This makes the string constant based on the string s. This may involve + * processing ad-hoc escape sequences (if the language is not + * SMT-LIB 2.6.1 or higher), or otherwise calling the solver to construct + * the string. + */ + Expr mkStringConstant(const std::string& s); + + private: + /** ad-hoc string escaping + * + * Returns the (internal) vector of code points corresponding to processing + * the escape sequences in string s. This is to support string inputs that + * do no comply with the SMT-LIB standard. + * + * This method handles escape sequences, including \n, \t, \v, \b, \r, \f, \a, + * \\, \x[N] and octal escape sequences of the form \[c1]([c2]([c3])?)? where + * c1, c2, c3 are digits from 0 to 7. + */ + std::vector<unsigned> processAdHocStringEsc(const std::string& s); };/* class Parser */ }/* CVC4::parser namespace */ diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g index ec1eae7da..69f21acb7 100644 --- a/src/parser/smt2/Smt2.g +++ b/src/parser/smt2/Smt2.g @@ -2091,7 +2091,7 @@ termAtomic[CVC4::api::Term& atomTerm] } // String constant - | str[s,false] { atomTerm = SOLVER->mkString(s, true); } + | str[s,false] { atomTerm = PARSER_STATE->mkStringConstant(s); } // NOTE: Theory constants go here diff --git a/src/preprocessing/passes/synth_rew_rules.cpp b/src/preprocessing/passes/synth_rew_rules.cpp index 7b8e61359..f1e9e39c5 100644 --- a/src/preprocessing/passes/synth_rew_rules.cpp +++ b/src/preprocessing/passes/synth_rew_rules.cpp @@ -169,7 +169,7 @@ PreprocessingPassResult SynthRewRulesPass::applyInternal( std::stringstream ssv; if (varCounter < 26) { - ssv << String::convertUnsignedIntToChar(varCounter + 32); + ssv << static_cast<char>(varCounter + 61); } else { diff --git a/src/printer/cvc/cvc_printer.cpp b/src/printer/cvc/cvc_printer.cpp index cad3c4640..1178c7299 100644 --- a/src/printer/cvc/cvc_printer.cpp +++ b/src/printer/cvc/cvc_printer.cpp @@ -160,6 +160,11 @@ void CvcPrinter::toStream( toStreamRational(out, n, false); break; } + case kind::CONST_STRING: + { + out << '"' << n.getConst<String>().toString() << '"'; + break; + } case kind::TYPE_CONSTANT: switch(TypeConstant tc = n.getConst<TypeConstant>()) { case REAL_TYPE: diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp index 541827f89..6e4fcb63a 100644 --- a/src/printer/smt2/smt2_printer.cpp +++ b/src/printer/smt2/smt2_printer.cpp @@ -202,7 +202,7 @@ void Smt2Printer::toStream(std::ostream& out, } case kind::CONST_STRING: { - std::string s = n.getConst<String>().toString(true); + std::string s = n.getConst<String>().toString(); out << '"'; for(size_t i = 0; i < s.size(); ++i) { char c = s[i]; diff --git a/src/theory/evaluator.cpp b/src/theory/evaluator.cpp index b827912d5..646f903f5 100644 --- a/src/theory/evaluator.cpp +++ b/src/theory/evaluator.cpp @@ -626,8 +626,7 @@ EvalResult Evaluator::evalInternal( const String& s = results[currNode[0]].d_str; if (s.size() == 1) { - results[currNode] = EvalResult( - Rational(String::convertUnsignedIntToCode(s.getVec()[0]))); + results[currNode] = EvalResult(Rational(s.getVec()[0])); } else { diff --git a/src/theory/quantifiers/sygus_sampler.cpp b/src/theory/quantifiers/sygus_sampler.cpp index 28cfa69df..e9c858814 100644 --- a/src/theory/quantifiers/sygus_sampler.cpp +++ b/src/theory/quantifiers/sygus_sampler.cpp @@ -560,8 +560,7 @@ Node SygusSampler::getRandomValue(TypeNode tn) for (unsigned ch : alphas) { d_rstring_alphabet.push_back(ch); - Trace("sygus-sample-str-alpha") - << " \"" << String::convertUnsignedIntToChar(ch) << "\""; + Trace("sygus-sample-str-alpha") << " \\u" << ch; } Trace("sygus-sample-str-alpha") << std::endl; } diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index d5105a489..9a2091eac 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -739,9 +739,7 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset) } case kind::REGEXP_RANGE: { unsigned a = r[0].getConst<String>().front(); - a = String::convertUnsignedIntToCode(a); unsigned b = r[1].getConst<String>().front(); - b = String::convertUnsignedIntToCode(b); Assert(a < b); Assert(b < std::numeric_limits<unsigned>::max()); for (unsigned c = a; c <= b; c++) @@ -756,7 +754,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset) String s = st.getConst<String>(); if(s.size() != 0) { unsigned sc = s.front(); - sc = String::convertUnsignedIntToCode(sc); cset.insert(sc); } } @@ -765,7 +762,6 @@ void RegExpOpr::firstChars(Node r, std::set<unsigned> &pcset, SetNodes &pvset) if(st[0].isConst()) { String s = st[0].getConst<String>(); unsigned sc = s.front(); - sc = String::convertUnsignedIntToCode(sc); cset.insert(sc); } else { vset.insert( st[0] ); @@ -887,13 +883,11 @@ void RegExpOpr::simplifyNRegExp( Node s, Node r, std::vector< Node > &new_nodes case kind::REGEXP_RANGE: { std::vector< Node > vec; unsigned a = r[0].getConst<String>().front(); - a = String::convertUnsignedIntToCode(a); unsigned b = r[1].getConst<String>().front(); - b = String::convertUnsignedIntToCode(b); for (unsigned c = a; c <= b; c++) { std::vector<unsigned> tmpVec; - tmpVec.push_back(String::convertCodeToUnsignedInt(c)); + tmpVec.push_back(c); Node tmp = s.eqNode(nm->mkConst(String(tmpVec))).negate(); vec.push_back( tmp ); } @@ -1522,7 +1516,7 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node > ++it) { std::vector<unsigned> cvec; - cvec.push_back(String::convertCodeToUnsignedInt(*it)); + cvec.push_back(*it); String c(cvec); Trace("regexp-int-debug") << "Try character " << c << " ... " << std::endl; Node r1l = derivativeSingle(r1, c); diff --git a/src/theory/strings/sequences_rewriter.cpp b/src/theory/strings/sequences_rewriter.cpp index 716634d5f..b0940b7e1 100644 --- a/src/theory/strings/sequences_rewriter.cpp +++ b/src/theory/strings/sequences_rewriter.cpp @@ -1421,11 +1421,8 @@ bool SequencesRewriter::testConstStringInRegExp(CVC4::String& s, if (s.size() == index_start + 1) { unsigned a = r[0].getConst<String>().front(); - a = String::convertUnsignedIntToCode(a); unsigned b = r[1].getConst<String>().front(); - b = String::convertUnsignedIntToCode(b); unsigned c = s.back(); - c = String::convertUnsignedIntToCode(c); return (a <= c && c <= b); } else diff --git a/src/theory/strings/strings_rewriter.cpp b/src/theory/strings/strings_rewriter.cpp index 75dfe7432..c7676d049 100644 --- a/src/theory/strings/strings_rewriter.cpp +++ b/src/theory/strings/strings_rewriter.cpp @@ -93,7 +93,7 @@ Node StringsRewriter::rewriteStrConvert(Node node) std::vector<unsigned> nvec = node[0].getConst<String>().getVec(); for (unsigned i = 0, nvsize = nvec.size(); i < nvsize; i++) { - unsigned newChar = String::convertUnsignedIntToCode(nvec[i]); + unsigned newChar = nvec[i]; // transform it // upper 65 ... 90 // lower 97 ... 122 @@ -111,7 +111,6 @@ Node StringsRewriter::rewriteStrConvert(Node node) newChar = newChar + 32; } } - newChar = String::convertCodeToUnsignedInt(newChar); nvec[i] = newChar; } Node retNode = nm->mkConst(String(nvec)); @@ -231,7 +230,7 @@ Node StringsRewriter::rewriteStringToCode(Node n) { std::vector<unsigned> vec = s.getVec(); Assert(vec.size() == 1); - ret = nm->mkConst(Rational(String::convertUnsignedIntToCode(vec[0]))); + ret = nm->mkConst(Rational(vec[0])); } else { diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp index 16183abdd..a81c96318 100644 --- a/src/theory/strings/theory_strings.cpp +++ b/src/theory/strings/theory_strings.cpp @@ -382,7 +382,7 @@ bool TheoryStrings::collectModelInfoType( ctv.getConst<Rational>().getNumerator().toUnsignedInt(); Trace("strings-model") << "(code: " << cvalue << ") "; std::vector<unsigned> vec; - vec.push_back(String::convertCodeToUnsignedInt(cvalue)); + vec.push_back(cvalue); Node mv = nm->mkConst(String(vec)); pure_eq_assign[eqc] = mv; m->getEqualityEngine()->addTerm(mv); @@ -1099,13 +1099,14 @@ void TheoryStrings::registerTerm(Node n, int effort) else if (n.getKind() == STRING_TO_CODE) { d_has_str_code = true; - // ite( str.len(s)==1, 0 <= str.code(s) < num_codes, str.code(s)=-1 ) + // ite( str.len(s)==1, 0 <= str.code(s) < |A|, str.code(s)=-1 ) Node code_len = utils::mkNLength(n[0]).eqNode(d_one); Node code_eq_neg1 = n.eqNode(d_neg_one); Node code_range = nm->mkNode( AND, nm->mkNode(GEQ, n, d_zero), - nm->mkNode(LT, n, nm->mkConst(Rational(CVC4::String::num_codes())))); + nm->mkNode( + LT, n, nm->mkConst(Rational(utils::getAlphabetCardinality())))); regTermLem = nm->mkNode(ITE, code_len, code_range, code_eq_neg1); } else if (n.getKind() == STRING_STRIDOF) diff --git a/src/theory/strings/theory_strings_type_rules.h b/src/theory/strings/theory_strings_type_rules.h index 7ef31a92a..93a32f26e 100644 --- a/src/theory/strings/theory_strings_type_rules.h +++ b/src/theory/strings/theory_strings_type_rules.h @@ -299,7 +299,7 @@ public: throw TypeCheckingExceptionPrivate(n, "expecting a single constant string term in regexp range"); } unsigned ci = (*it).getConst<String>().front(); - ch[i] = String::convertUnsignedIntToCode(ci); + ch[i] = ci; ++it; } if(ch[0] > ch[1]) { diff --git a/src/theory/strings/type_enumerator.cpp b/src/theory/strings/type_enumerator.cpp index 12cf899b4..7352ae5de 100644 --- a/src/theory/strings/type_enumerator.cpp +++ b/src/theory/strings/type_enumerator.cpp @@ -21,6 +21,44 @@ namespace CVC4 { namespace theory { namespace strings { +Node makeStandardModelConstant(const std::vector<unsigned>& vec, + uint32_t cardinality) +{ + std::vector<unsigned> mvec; + // if we contain all of the printable characters + if (cardinality >= 255) + { + for (unsigned i = 0, vsize = vec.size(); i < vsize; i++) + { + unsigned curr = vec[i]; + // convert + Assert(vec[i] < cardinality); + if (vec[i] <= 61) + { + // first 62 printable characters [\u{65}-\u{126}]: 'A', 'B', 'C', ... + curr = vec[i] + 65; + } + else if (vec[i] <= 94) + { + // remaining 33 printable characters [\u{32}-\u{64}]: ' ', '!', '"', ... + curr = vec[i] - 30; + } + else + { + // the remaining characters, starting with \u{127} and wrapping around + // the first 32 non-printable characters. + curr = (vec[i] + 32) % cardinality; + } + mvec.push_back(curr); + } + } + else + { + mvec = vec; + } + return NodeManager::currentNM()->mkConst(String(mvec)); +} + WordIter::WordIter(uint32_t startLength) : d_hasEndLength(false), d_endLength(0) { for (uint32_t i = 0; i < startLength; i++) @@ -117,7 +155,7 @@ bool StringEnumLen::increment() void StringEnumLen::mkCurr() { - d_curr = NodeManager::currentNM()->mkConst(String(d_witer->getData())); + d_curr = makeStandardModelConstant(d_witer->getData(), d_cardinality); } StringEnumerator::StringEnumerator(TypeNode type, TypeEnumeratorProperties* tep) diff --git a/src/theory/strings/type_enumerator.h b/src/theory/strings/type_enumerator.h index 2061628a5..b379ce5c3 100644 --- a/src/theory/strings/type_enumerator.h +++ b/src/theory/strings/type_enumerator.h @@ -28,6 +28,26 @@ namespace theory { namespace strings { /** + * Make standard model constant + * + * In our string representation, we represent characters using vectors + * of unsigned integers indicating code points for the characters of that + * string. + * + * To make models user-friendly, we make unsigned integer 0 correspond to the + * 65th character ("A") in the ASCII alphabet to make models intuitive. In + * particular, say if we have a set of string variables that are distinct but + * otherwise unconstrained, then the model may assign them "A", "B", "C", ... + * + * @param vec The code points of the string in a given model, + * @param cardinality The cardinality of the alphabet, + * @return A string whose characters have the code points corresponding + * to vec in the standard model construction described above. + */ +Node makeStandardModelConstant(const std::vector<unsigned>& vec, + uint32_t cardinality); + +/** * Generic iteration over vectors of indices of a given start/end length. */ class WordIter diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp index 00066edb6..36ba7182b 100644 --- a/src/util/regexp.cpp +++ b/src/util/regexp.cpp @@ -32,38 +32,12 @@ namespace CVC4 { static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values."); -unsigned String::convertCharToUnsignedInt(unsigned char c) -{ - return convertCodeToUnsignedInt(static_cast<unsigned>(c)); -} -unsigned char String::convertUnsignedIntToChar(unsigned i) -{ - Assert(i < num_codes()); - return static_cast<unsigned char>(convertUnsignedIntToCode(i)); -} -bool String::isPrintable(unsigned i) -{ - Assert(i < num_codes()); - unsigned char c = convertUnsignedIntToChar(i); - return (c >= ' ' && c <= '~'); -} -unsigned String::convertCodeToUnsignedInt(unsigned c) -{ - Assert(c < num_codes()); - return (c < start_code() ? c + num_codes() : c) - start_code(); -} -unsigned String::convertUnsignedIntToCode(unsigned i) -{ - Assert(i < num_codes()); - return (i + start_code()) % num_codes(); -} - String::String(const std::vector<unsigned> &s) : d_str(s) { #ifdef CVC4_ASSERTIONS for (unsigned u : d_str) { - Assert(convertUnsignedIntToCode(u) < num_codes()); + Assert(u < num_codes()); } #endif } @@ -74,8 +48,8 @@ int String::cmp(const String &y) const { } for (unsigned int i = 0; i < size(); ++i) { if (d_str[i] != y.d_str[i]) { - unsigned cp = convertUnsignedIntToCode(d_str[i]); - unsigned cpy = convertUnsignedIntToCode(y.d_str[i]); + unsigned cp = d_str[i]; + unsigned cpy = y.d_str[i]; return cp < cpy ? -1 : 1; } } @@ -122,107 +96,143 @@ bool String::rstrncmp(const String& y, std::size_t n) const return true; } -std::vector<unsigned> String::toInternal(const std::string &s, - bool useEscSequences) { +void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str) +{ + // if not a printable character + if (ch > 127 || ch < 32) + { + std::stringstream serr; + serr << "Illegal string character: \"" << ch + << "\", must use escape sequence"; + throw CVC4::Exception(serr.str()); + } + else + { + str.push_back(static_cast<unsigned>(ch)); + } +} + +std::vector<unsigned> String::toInternal(const std::string& s, + bool useEscSequences) +{ std::vector<unsigned> str; unsigned i = 0; - while (i < s.size()) { - if (s[i] == '\\' && useEscSequences) { - i++; - if (i < s.size()) { - switch (s[i]) { - case 'n': { - str.push_back(convertCharToUnsignedInt('\n')); - i++; - } break; - case 't': { - str.push_back(convertCharToUnsignedInt('\t')); - i++; - } break; - case 'v': { - str.push_back(convertCharToUnsignedInt('\v')); - i++; - } break; - case 'b': { - str.push_back(convertCharToUnsignedInt('\b')); - i++; - } break; - case 'r': { - str.push_back(convertCharToUnsignedInt('\r')); - i++; - } break; - case 'f': { - str.push_back(convertCharToUnsignedInt('\f')); - i++; - } break; - case 'a': { - str.push_back(convertCharToUnsignedInt('\a')); - i++; - } break; - case '\\': { - str.push_back(convertCharToUnsignedInt('\\')); - i++; - } break; - case 'x': { - if (i + 2 < s.size()) { - if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) { - str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 + - hexToDec(s[i + 2]))); - i += 3; - } else { - throw CVC4::Exception("Illegal String Literal: \"" + s + "\""); - } - } else { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must have two digits after \\x"); - } - } break; - default: { - if (isdigit(s[i])) { - // octal escape sequences TODO : revisit (issue #1251). - int num = (int)s[i] - (int)'0'; - bool flag = num < 4; - if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) && - s[i + 1] < '8') { - num = num * 8 + (int)s[i + 1] - (int)'0'; - if (flag && i + 2 < s.size() && isdigit(s[i + 2]) && - s[i + 2] < '8') { - num = num * 8 + (int)s[i + 2] - (int)'0'; - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i += 3; - } else { - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i += 2; - } - } else { - str.push_back(convertCharToUnsignedInt((unsigned char)num)); - i++; - } - } else if ((unsigned)s[i] > 127) { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must use escaped sequence"); - } else { - str.push_back(convertCharToUnsignedInt(s[i])); - i++; - } + while (i < s.size()) + { + // get the current character + char si = s[i]; + if (si != '\\' || !useEscSequences) + { + addCharToInternal(si, str); + ++i; + continue; + } + // the vector of characters, in case we fail to read an escape sequence + std::vector<unsigned> nonEscCache; + // process the '\' + addCharToInternal(si, nonEscCache); + ++i; + // are we an escape sequence? + bool isEscapeSequence = true; + // the string corresponding to the hexidecimal code point + std::stringstream hexString; + // is the slash followed by a 'u'? Could be last character. + if (i >= s.size() || s[i] != 'u') + { + isEscapeSequence = false; + } + else + { + // process the 'u' + addCharToInternal(s[i], nonEscCache); + ++i; + bool isStart = true; + bool isEnd = false; + bool hasBrace = false; + while (i < s.size()) + { + // add the next character + si = s[i]; + if (isStart) + { + isStart = false; + // possibly read '{' + if (si == '{') + { + hasBrace = true; + addCharToInternal(si, nonEscCache); + ++i; + continue; } } - } else { - throw CVC4::Exception("should be handled by lexer: \"" + s + "\""); - // str.push_back( convertCharToUnsignedInt('\\') ); + else if (si == '}') + { + // can only end if we had an open brace and read at least one digit + isEscapeSequence = hasBrace && !hexString.str().empty(); + isEnd = true; + addCharToInternal(si, nonEscCache); + ++i; + break; + } + // must be a hex digit at this point + if (!isHexDigit(static_cast<unsigned>(si))) + { + isEscapeSequence = false; + break; + } + hexString << si; + addCharToInternal(si, nonEscCache); + ++i; + if (!hasBrace && hexString.str().size() == 4) + { + // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens + isEnd = true; + break; + } + else if (hasBrace && hexString.str().size() > 5) + { + // too many digits enclosed in brace, not an escape sequence + isEscapeSequence = false; + break; + } + } + if (!isEnd) + { + // if we were interupted before ending, then this is not a valid + // escape sequence + isEscapeSequence = false; + } + } + if (isEscapeSequence) + { + Assert(!hexString.str().empty() && hexString.str().size() <= 5); + // Otherwise, we add the escaped character. + // This is guaranteed not to overflow due to the length of hstr. + uint32_t val; + hexString >> std::hex >> val; + if (val > num_codes()) + { + // Failed due to being out of range. This can happen for strings of + // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not + // in the range [0-2]. + isEscapeSequence = false; + } + else + { + str.push_back(val); } - } else if ((unsigned)s[i] > 127 && useEscSequences) { - throw CVC4::Exception("Illegal String Literal: \"" + s + - "\", must use escaped sequence"); - } else { - str.push_back(convertCharToUnsignedInt(s[i])); - i++; + } + // if we did not successfully parse an escape sequence, we add back all + // characters that we cached + if (!isEscapeSequence) + { + str.insert(str.end(), nonEscCache.begin(), nonEscCache.end()); } } #ifdef CVC4_ASSERTIONS for (unsigned u : str) { - Assert(convertUnsignedIntToCode(u) < num_codes()); + Assert(u < num_codes()); } #endif return str; @@ -265,62 +275,23 @@ std::size_t String::roverlap(const String &y) const { } std::string String::toString(bool useEscSequences) const { - std::string str; + std::stringstream str; for (unsigned int i = 0; i < size(); ++i) { - unsigned char c = convertUnsignedIntToChar(d_str[i]); - if (!useEscSequences) { - str += c; - } else if (isprint(c)) { - if (c == '\\') { - str += "\\\\"; - } - // else if(c == '\"') { - // str += "\\\""; - //} - else { - str += c; - } - } else { - std::string s; - switch (c) { - case '\a': - s = "\\a"; - break; - case '\b': - s = "\\b"; - break; - case '\t': - s = "\\t"; - break; - case '\r': - s = "\\r"; - break; - case '\v': - s = "\\v"; - break; - case '\f': - s = "\\f"; - break; - case '\n': - s = "\\n"; - break; - case '\e': - s = "\\e"; - break; - default: { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c); - std::string t = ss.str(); - t = t.substr(t.size() - 2, 2); - s = "\\x" + t; - // std::string s2 = static_cast<std::ostringstream*>( - // &(std::ostringstream() << (int)c) )->str(); - } - } - str += s; + // we always print forward slash as a code point so that it cannot + // be interpreted as specifying part of a code point, e.g. the string + // '\' + 'u' + '0' of length three. + if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences) + { + str << static_cast<char>(d_str[i]); + } + else + { + std::stringstream ss; + ss << std::hex << d_str[i]; + str << "\\u{" << ss.str() << "}"; } } - return str; + return str.str(); } bool String::isLeq(const String &y) const @@ -331,8 +302,8 @@ bool String::isLeq(const String &y) const { return false; } - unsigned ci = convertUnsignedIntToCode(d_str[i]); - unsigned cyi = convertUnsignedIntToCode(y.d_str[i]); + unsigned ci = d_str[i]; + unsigned cyi = y.d_str[i]; if (ci > cyi) { return false; @@ -484,8 +455,21 @@ bool String::isNumber() const { bool String::isDigit(unsigned character) { - unsigned char c = convertUnsignedIntToChar(character); - return c >= '0' && c <= '9'; + // '0' to '9' + return 48 <= character && character <= 57; +} + +bool String::isHexDigit(unsigned character) +{ + // '0' to '9' or 'A' to 'F' or 'a' to 'f' + return isDigit(character) || (65 <= character && character <= 70) + || (97 <= character && character <= 102); +} + +bool String::isPrintable(unsigned character) +{ + // Unicode 0x00020 (' ') to 0x0007E ('~') + return 32 <= character && character <= 126; } size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); } @@ -497,17 +481,6 @@ Rational String::toNumber() const return Rational(toString()); } -unsigned char String::hexToDec(unsigned char c) { - if (c >= '0' && c <= '9') { - return c - '0'; - } else if (c >= 'a' && c <= 'f') { - return c - 'a' + 10; - } else { - Assert(c >= 'A' && c <= 'F'); - return c - 'A' + 10; - } -} - std::ostream &operator<<(std::ostream &os, const String &s) { return os << "\"" << s.toString(true) << "\""; } diff --git a/src/util/regexp.h b/src/util/regexp.h index 731736f72..56fb969a3 100644 --- a/src/util/regexp.h +++ b/src/util/regexp.h @@ -37,60 +37,44 @@ namespace CVC4 { class CVC4_PUBLIC String { public: /** - * The start ASCII code. In our string representation below, we represent - * characters using a vector d_str of unsigned integers. We refer to this as - * the "internal representation" for the string. - * - * We make unsigned integer 0 correspond to the 65th character ("A") in the - * ASCII alphabet to make models intuitive. In particular, say if we have - * a set of string variables that are distinct but otherwise unconstrained, - * then the model may assign them "A", "B", "C", ... - */ - static inline unsigned start_code() { return 65; } - /** * This is the cardinality of the alphabet that is representable by this * class. Notice that this must be greater than or equal to the cardinality * of the alphabet that the string theory reasons about. * * This must be strictly less than std::numeric_limits<unsigned>::max(). + * + * As per the SMT-LIB standard for strings, we support the first 3 planes of + * Unicode characters, where 196608 = 3*16^4. */ - static inline unsigned num_codes() { return 256; } - /** - * Convert unsigned char to the unsigned used in the internal representation - * in d_str below. - */ - static unsigned convertCharToUnsignedInt(unsigned char c); - /** Convert the internal unsigned to a unsigned char. */ - static unsigned char convertUnsignedIntToChar(unsigned i); - /** Does the internal unsigned correspond to a printable character? */ - static bool isPrintable(unsigned i); - /** get the internal unsigned for ASCII code c. */ - static unsigned convertCodeToUnsignedInt(unsigned c); - /** get the ASCII code number that internal unsigned i corresponds to. */ - static unsigned convertUnsignedIntToCode(unsigned i); - + static inline unsigned num_codes() { return 196608; } /** constructors for String - * - * Internally, a CVC4::String is represented by a vector of unsigned - * integers (d_str), where the correspondence between C++ characters - * to and from unsigned integers is determined by - * by convertCharToUnsignedInt and convertUnsignedIntToChar. - * - * If useEscSequences is true, then the escape sequences in the input - * are converted to the corresponding character. This constructor may - * throw an exception if the input contains unrecognized escape sequences. - * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\, - * \x[N] where N is a hexidecimal, and octal escape sequences of the - * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7. - * - * If useEscSequences is false, then the characters of the constructed - * CVC4::String correspond one-to-one with the input string. - */ + * + * Internally, a CVC4::String is represented by a vector of unsigned + * integers (d_str) representing the code points of the characters. + * + * To build a string from a C++ string, we may process escape sequences + * according to the SMT-LIB standard. In particular, if useEscSequences is + * true, we convert unicode escape sequences: + * \u d_3 d_2 d_1 d_0 + * \u{d_0} + * \u{d_1 d_0} + * \u{d_2 d_1 d_0} + * \u{d_3 d_2 d_1 d_0} + * \u{d_4 d_3 d_2 d_1 d_0} + * where d_0 ... d_4 are hexidecimal digits, to the appropriate character. + * + * If useEscSequences is false, then the characters of the constructed + * CVC4::String correspond one-to-one with the input string. + */ String() = default; explicit String(const std::string& s, bool useEscSequences = false) - : d_str(toInternal(s, useEscSequences)) {} + : d_str(toInternal(s, useEscSequences)) + { + } explicit String(const char* s, bool useEscSequences = false) - : d_str(toInternal(std::string(s), useEscSequences)) {} + : d_str(toInternal(std::string(s), useEscSequences)) + { + } explicit String(const std::vector<unsigned>& s); String& operator=(const String& y) { @@ -123,20 +107,16 @@ class CVC4_PUBLIC String { bool rstrncmp(const String& y, std::size_t n) const; /* toString - * Converts this string to a std::string. - * - * If useEscSequences is true, then unprintable characters - * are converted to escape sequences. The escape sequences - * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way. - * For all other unprintable characters, we print \x[N] where - * [N] is the 2 digit hexidecimal corresponding to value of - * the character. - * - * If useEscSequences is false, the returned std::string's characters - * map one-to-one with the characters in this string. - * Notice that for all std::string s, we have that - * CVC4::String( s ).toString() = s. - */ + * Converts this string to a std::string. + * + * The unprintable characters are converted to unicode escape sequences as + * described above. + * + * If useEscSequences is false, the string's printable characters are + * printed as characters. Notice that for all std::string s having only + * printable characters, we have that + * CVC4::String( s ).toString() = s. + */ std::string toString(bool useEscSequences = false) const; /** is this the empty string? */ bool empty() const { return d_str.empty(); } @@ -221,16 +201,32 @@ class CVC4_PUBLIC String { bool isNumber() const; /** Returns the corresponding rational for the text of this string. */ Rational toNumber() const; - /** get the internal unsigned representation of this string */ + /** Get the unsigned representation (code points) of this string */ const std::vector<unsigned>& getVec() const { return d_str; } - /** get the internal unsigned value of the first character in this string */ + /** + * Get the unsigned (code point) value of the first character in this string + */ unsigned front() const; - /** get the internal unsigned value of the last character in this string */ + /** + * Get the unsigned (code point) value of the last character in this string + */ unsigned back() const; /** is the unsigned a digit? - * The input should be the same type as the element type of d_str - */ + * + * This is true for code points between 48 ('0') and 57 ('9'). + */ static bool isDigit(unsigned character); + /** is the unsigned a hexidecimal digit? + * + * This is true for code points between 48 ('0') and 57 ('9'), code points + * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f). + */ + static bool isHexDigit(unsigned character); + /** is the unsigned a printable code point? + * + * This is true for Unicode 32 (' ') to 126 ('~'). + */ + static bool isPrintable(unsigned character); /** * Returns the maximum length of string representable by this class. @@ -238,11 +234,19 @@ class CVC4_PUBLIC String { */ static size_t maxSize(); private: - // guarded - static unsigned char hexToDec(unsigned char c); - + /** + * Helper for toInternal: add character ch to vector vec, storing a string in + * internal format. This throws an error if ch is not a printable character, + * since non-printable characters must be escaped in SMT-LIB. + */ + static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec); + /** + * Convert the string s to the internal format (vector of code points). + * The argument useEscSequences is whether to process unicode escape + * sequences. + */ static std::vector<unsigned> toInternal(const std::string& s, - bool useEscSequences = true); + bool useEscSequences); /** * Returns a negative number if *this < y, 0 if *this and y are equal and a diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt index 8fab16b44..8382e40fc 100644 --- a/test/regress/CMakeLists.txt +++ b/test/regress/CMakeLists.txt @@ -922,6 +922,7 @@ set(regress_0_tests regress0/strings/escchar.smt2 regress0/strings/escchar_25.smt2 regress0/strings/from_code.smt2 + regress0/strings/gen-esc-seq.smt2 regress0/strings/hconst-092618.smt2 regress0/strings/idof-rewrites.smt2 regress0/strings/idof-sem.smt2 @@ -939,6 +940,8 @@ set(regress_0_tests regress0/strings/leadingzero001.smt2 regress0/strings/loop001.smt2 regress0/strings/model001.smt2 + regress0/strings/model-code-point.smt2 + regress0/strings/model-friendly.smt2 regress0/strings/ncontrib-rewrites.smt2 regress0/strings/norn-31.smt2 regress0/strings/norn-simp-rew.smt2 @@ -967,6 +970,7 @@ set(regress_0_tests regress0/strings/tolower-rrs.smt2 regress0/strings/tolower-simple.smt2 regress0/strings/type001.smt2 + regress0/strings/unicode-esc.smt2 regress0/strings/unsound-0908.smt2 regress0/strings/unsound-repl-rewrite.smt2 regress0/sygus/General_plus10.sy diff --git a/test/regress/regress0/strings/gen-esc-seq.smt2 b/test/regress/regress0/strings/gen-esc-seq.smt2 new file mode 100644 index 000000000..59f66046f --- /dev/null +++ b/test/regress/regress0/strings/gen-esc-seq.smt2 @@ -0,0 +1,9 @@ +; COMMAND-LINE: --produce-models --lang=smt2.6.1 +; EXPECT: sat +; EXPECT: ((x "\u{5c}u1000")) +(set-logic ALL) +(set-info :status sat) +(declare-const x String) +(assert (= x (str.++ "\u" "1000"))) +(check-sat) +(get-value (x)) diff --git a/test/regress/regress0/strings/model-code-point.smt2 b/test/regress/regress0/strings/model-code-point.smt2 new file mode 100644 index 000000000..1200ae704 --- /dev/null +++ b/test/regress/regress0/strings/model-code-point.smt2 @@ -0,0 +1,13 @@ +; COMMAND-LINE: --lang=smt2.6.1 --produce-models +; EXPECT: sat +; EXPECT: ((x "\u{a}")) +; EXPECT: ((y "\u{7f}")) +(set-logic ALL) +(set-info :status sat) +(declare-fun x () String) +(declare-fun y () String) +(assert (= (str.to_code x) 10)) +(assert (= (str.to_code y) 127)) +(check-sat) +(get-value (x)) +(get-value (y)) diff --git a/test/regress/regress0/strings/model-friendly.smt2 b/test/regress/regress0/strings/model-friendly.smt2 new file mode 100644 index 000000000..985ffaa62 --- /dev/null +++ b/test/regress/regress0/strings/model-friendly.smt2 @@ -0,0 +1,9 @@ +; COMMAND-LINE: --lang=smt2.6.1 --produce-models +; EXPECT: sat +; EXPECT: ((x "AAAAA")) +(set-logic ALL) +(set-info :status sat) +(declare-fun x () String) +(assert (= (str.len x) 5)) +(check-sat) +(get-value (x)) diff --git a/test/regress/regress0/strings/unicode-esc.smt2 b/test/regress/regress0/strings/unicode-esc.smt2 new file mode 100644 index 000000000..01f5f30ab --- /dev/null +++ b/test/regress/regress0/strings/unicode-esc.smt2 @@ -0,0 +1,30 @@ +; COMMAND-LINE: --strings-exp --lang=smt2.6.1 +; EXPECT: sat +(set-logic ALL) + +(assert (= "\u{14}" "\u0014")) +(assert (= "\u{00}" "\u{0}")) +(assert (= "\u0000" "\u{0}")) +(assert (= (str.len "\u1234") 1)) +(assert (= (str.len "\u{1}") 1)) +(assert (= (str.len "\u{99}") 1)) +(assert (= (str.len "\u{779}") 1)) +(assert (= (str.len "\u{0779}") 1)) +(assert (= (str.len "\u{01779}") 1)) +(assert (= (str.len "\u{001779}") 10)) +(assert (= (str.len "\u{0vv79}") 9)) +(assert (= (str.len "\u{11\u1234}") 7)) +(assert (= (str.len "\u12345") 2)) +(assert (= (str.len "\uu") 3)) +(assert (= (str.len "\u{123}\u{567}") 2)) +(assert (= (str.len "\u{0017") 7)) +(assert (= (str.len "\\u00178") 3)) +(assert (= (str.len "2\u{}") 5)) +(assert (= (str.len "\uaaaa") 1)) +(assert (= (str.len "\uAAAA") 1)) +(assert (= (str.len "\u{0AbC}") 1)) +(assert (= (str.len "\u{E}") 1)) +(assert (= (str.len "\u{44444}") 9)) +(assert (= (str.len "\u") 2)) + +(check-sat) diff --git a/test/unit/api/solver_black.h b/test/unit/api/solver_black.h index 27f5aca12..0eefde700 100644 --- a/test/unit/api/solver_black.h +++ b/test/unit/api/solver_black.h @@ -556,9 +556,9 @@ void SolverBlack::testMkString() TS_ASSERT_THROWS_NOTHING(d_solver->mkString("")); TS_ASSERT_THROWS_NOTHING(d_solver->mkString("asdfasdf")); TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf").toString(), - "\"asdf\\\\nasdf\""); - TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf", true).toString(), - "\"asdf\\nasdf\""); + "\"asdf\\u{5c}nasdf\""); + TS_ASSERT_EQUALS(d_solver->mkString("asdf\\u{005c}nasdf", true).toString(), + "\"asdf\\u{5c}nasdf\""); } void SolverBlack::testMkTerm() |