summaryrefslogtreecommitdiff
path: root/src/util/string.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/string.cpp')
-rw-r--r--src/util/string.cpp485
1 files changed, 485 insertions, 0 deletions
diff --git a/src/util/string.cpp b/src/util/string.cpp
new file mode 100644
index 000000000..ff522ba7b
--- /dev/null
+++ b/src/util/string.cpp
@@ -0,0 +1,485 @@
+/********************* */
+/*! \file string.cpp
+ ** \verbatim
+ ** Top contributors (to current version):
+ ** Tim King, Tianyi Liang, Andrew Reynolds
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved. See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief Implementation of the string data type.
+ **/
+
+#include "util/string.h"
+
+#include <algorithm>
+#include <climits>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "base/check.h"
+#include "base/exception.h"
+
+using namespace std;
+
+namespace CVC4 {
+
+static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
+
+String::String(const std::vector<unsigned> &s) : d_str(s)
+{
+#ifdef CVC4_ASSERTIONS
+ for (unsigned u : d_str)
+ {
+ Assert(u < num_codes());
+ }
+#endif
+}
+
+int String::cmp(const String &y) const {
+ if (size() != y.size()) {
+ return size() < y.size() ? -1 : 1;
+ }
+ for (unsigned int i = 0; i < size(); ++i) {
+ if (d_str[i] != y.d_str[i]) {
+ unsigned cp = d_str[i];
+ unsigned cpy = y.d_str[i];
+ return cp < cpy ? -1 : 1;
+ }
+ }
+ return 0;
+}
+
+String String::concat(const String &other) const {
+ std::vector<unsigned int> ret_vec(d_str);
+ ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end());
+ return String(ret_vec);
+}
+
+bool String::strncmp(const String& y, std::size_t n) const
+{
+ std::size_t b = (size() >= y.size()) ? size() : y.size();
+ std::size_t s = (size() <= y.size()) ? size() : y.size();
+ if (n > s) {
+ if (b == s) {
+ n = s;
+ } else {
+ return false;
+ }
+ }
+ for (std::size_t i = 0; i < n; ++i) {
+ if (d_str[i] != y.d_str[i]) return false;
+ }
+ return true;
+}
+
+bool String::rstrncmp(const String& y, std::size_t n) const
+{
+ std::size_t b = (size() >= y.size()) ? size() : y.size();
+ std::size_t s = (size() <= y.size()) ? size() : y.size();
+ if (n > s) {
+ if (b == s) {
+ n = s;
+ } else {
+ return false;
+ }
+ }
+ for (std::size_t i = 0; i < n; ++i) {
+ if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false;
+ }
+ return true;
+}
+
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+ // if not a printable character
+ if (ch > 127 || ch < 32)
+ {
+ std::stringstream serr;
+ serr << "Illegal string character: \"" << ch
+ << "\", must use escape sequence";
+ throw CVC4::Exception(serr.str());
+ }
+ else
+ {
+ str.push_back(static_cast<unsigned>(ch));
+ }
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+ bool useEscSequences)
+{
+ std::vector<unsigned> str;
+ unsigned i = 0;
+ while (i < s.size())
+ {
+ // get the current character
+ char si = s[i];
+ if (si != '\\' || !useEscSequences)
+ {
+ addCharToInternal(si, str);
+ ++i;
+ continue;
+ }
+ // the vector of characters, in case we fail to read an escape sequence
+ std::vector<unsigned> nonEscCache;
+ // process the '\'
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ // are we an escape sequence?
+ bool isEscapeSequence = true;
+ // the string corresponding to the hexidecimal code point
+ std::stringstream hexString;
+ // is the slash followed by a 'u'? Could be last character.
+ if (i >= s.size() || s[i] != 'u')
+ {
+ isEscapeSequence = false;
+ }
+ else
+ {
+ // process the 'u'
+ addCharToInternal(s[i], nonEscCache);
+ ++i;
+ bool isStart = true;
+ bool isEnd = false;
+ bool hasBrace = false;
+ while (i < s.size())
+ {
+ // add the next character
+ si = s[i];
+ if (isStart)
+ {
+ isStart = false;
+ // possibly read '{'
+ if (si == '{')
+ {
+ hasBrace = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ continue;
+ }
+ }
+ else if (si == '}')
+ {
+ // can only end if we had an open brace and read at least one digit
+ isEscapeSequence = hasBrace && !hexString.str().empty();
+ isEnd = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ break;
+ }
+ // must be a hex digit at this point
+ if (!isHexDigit(static_cast<unsigned>(si)))
+ {
+ isEscapeSequence = false;
+ break;
+ }
+ hexString << si;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ if (!hasBrace && hexString.str().size() == 4)
+ {
+ // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
+ isEnd = true;
+ break;
+ }
+ else if (hasBrace && hexString.str().size() > 5)
+ {
+ // too many digits enclosed in brace, not an escape sequence
+ isEscapeSequence = false;
+ break;
+ }
+ }
+ if (!isEnd)
+ {
+ // if we were interupted before ending, then this is not a valid
+ // escape sequence
+ isEscapeSequence = false;
+ }
+ }
+ if (isEscapeSequence)
+ {
+ Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+ // Otherwise, we add the escaped character.
+ // This is guaranteed not to overflow due to the length of hstr.
+ uint32_t val;
+ hexString >> std::hex >> val;
+ if (val > num_codes())
+ {
+ // Failed due to being out of range. This can happen for strings of
+ // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
+ // in the range [0-2].
+ isEscapeSequence = false;
+ }
+ else
+ {
+ str.push_back(val);
+ }
+ }
+ // if we did not successfully parse an escape sequence, we add back all
+ // characters that we cached
+ if (!isEscapeSequence)
+ {
+ str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
+ }
+ }
+#ifdef CVC4_ASSERTIONS
+ for (unsigned u : str)
+ {
+ Assert(u < num_codes());
+ }
+#endif
+ return str;
+}
+
+unsigned String::front() const
+{
+ Assert(!d_str.empty());
+ return d_str.front();
+}
+
+unsigned String::back() const
+{
+ Assert(!d_str.empty());
+ return d_str.back();
+}
+
+std::size_t String::overlap(const String &y) const {
+ std::size_t i = size() < y.size() ? size() : y.size();
+ for (; i > 0; i--) {
+ String s = suffix(i);
+ String p = y.prefix(i);
+ if (s == p) {
+ return i;
+ }
+ }
+ return i;
+}
+
+std::size_t String::roverlap(const String &y) const {
+ std::size_t i = size() < y.size() ? size() : y.size();
+ for (; i > 0; i--) {
+ String s = prefix(i);
+ String p = y.suffix(i);
+ if (s == p) {
+ return i;
+ }
+ }
+ return i;
+}
+
+std::string String::toString(bool useEscSequences) const {
+ std::stringstream str;
+ for (unsigned int i = 0; i < size(); ++i) {
+ // we always print forward slash as a code point so that it cannot
+ // be interpreted as specifying part of a code point, e.g. the string
+ // '\' + 'u' + '0' of length three.
+ if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+ {
+ str << static_cast<char>(d_str[i]);
+ }
+ else
+ {
+ std::stringstream ss;
+ ss << std::hex << d_str[i];
+ str << "\\u{" << ss.str() << "}";
+ }
+ }
+ return str.str();
+}
+
+bool String::isLeq(const String &y) const
+{
+ for (unsigned i = 0; i < size(); ++i)
+ {
+ if (i >= y.size())
+ {
+ return false;
+ }
+ unsigned ci = d_str[i];
+ unsigned cyi = y.d_str[i];
+ if (ci > cyi)
+ {
+ return false;
+ }
+ if (ci < cyi)
+ {
+ return true;
+ }
+ }
+ return true;
+}
+
+bool String::isRepeated() const {
+ if (size() > 1) {
+ unsigned int f = d_str[0];
+ for (unsigned i = 1; i < size(); ++i) {
+ if (f != d_str[i]) return false;
+ }
+ }
+ return true;
+}
+
+bool String::tailcmp(const String &y, int &c) const {
+ int id_x = size() - 1;
+ int id_y = y.size() - 1;
+ while (id_x >= 0 && id_y >= 0) {
+ if (d_str[id_x] != y.d_str[id_y]) {
+ c = id_x;
+ return false;
+ }
+ --id_x;
+ --id_y;
+ }
+ c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1);
+ return true;
+}
+
+std::size_t String::find(const String &y, const std::size_t start) const {
+ if (size() < y.size() + start) return std::string::npos;
+ if (y.empty()) return start;
+ if (empty()) return std::string::npos;
+
+ std::vector<unsigned>::const_iterator itr = std::search(
+ d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end());
+ if (itr != d_str.end()) {
+ return itr - d_str.begin();
+ }
+ return std::string::npos;
+}
+
+std::size_t String::rfind(const String &y, const std::size_t start) const {
+ if (size() < y.size() + start) return std::string::npos;
+ if (y.empty()) return start;
+ if (empty()) return std::string::npos;
+
+ std::vector<unsigned>::const_reverse_iterator itr = std::search(
+ d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend());
+ if (itr != d_str.rend()) {
+ return itr - d_str.rbegin();
+ }
+ return std::string::npos;
+}
+
+bool String::hasPrefix(const String& y) const
+{
+ size_t s = size();
+ size_t ys = y.size();
+ if (ys > s)
+ {
+ return false;
+ }
+ for (size_t i = 0; i < ys; i++)
+ {
+ if (d_str[i] != y.d_str[i])
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool String::hasSuffix(const String& y) const
+{
+ size_t s = size();
+ size_t ys = y.size();
+ if (ys > s)
+ {
+ return false;
+ }
+ size_t idiff = s - ys;
+ for (size_t i = 0; i < ys; i++)
+ {
+ if (d_str[i + idiff] != y.d_str[i])
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+String String::replace(const String &s, const String &t) const {
+ std::size_t ret = find(s);
+ if (ret != std::string::npos) {
+ std::vector<unsigned int> vec;
+ vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret);
+ vec.insert(vec.end(), t.d_str.begin(), t.d_str.end());
+ vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end());
+ return String(vec);
+ } else {
+ return *this;
+ }
+}
+
+String String::substr(std::size_t i) const {
+ Assert(i <= size());
+ std::vector<unsigned int> ret_vec;
+ std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
+ ret_vec.insert(ret_vec.end(), itr, d_str.end());
+ return String(ret_vec);
+}
+
+String String::substr(std::size_t i, std::size_t j) const {
+ Assert(i + j <= size());
+ std::vector<unsigned int> ret_vec;
+ std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
+ ret_vec.insert(ret_vec.end(), itr, itr + j);
+ return String(ret_vec);
+}
+
+bool String::noOverlapWith(const String& y) const
+{
+ return y.find(*this) == std::string::npos
+ && this->find(y) == std::string::npos && this->overlap(y) == 0
+ && y.overlap(*this) == 0;
+}
+
+bool String::isNumber() const {
+ if (d_str.empty()) {
+ return false;
+ }
+ for (unsigned character : d_str) {
+ if (!isDigit(character))
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool String::isDigit(unsigned character)
+{
+ // '0' to '9'
+ return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+ // '0' to '9' or 'A' to 'F' or 'a' to 'f'
+ return isDigit(character) || (65 <= character && character <= 70)
+ || (97 <= character && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+ // Unicode 0x00020 (' ') to 0x0007E ('~')
+ return 32 <= character && character <= 126;
+}
+
+size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
+
+Rational String::toNumber() const
+{
+ // when smt2 standard for strings is set, this may change, based on the
+ // semantics of str.from.int for leading zeros
+ return Rational(toString());
+}
+
+std::ostream &operator<<(std::ostream &os, const String &s) {
+ return os << "\"" << s.toString(true) << "\"";
+}
+
+} // namespace CVC4
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback