From cb7dd2ebdd96e6791cb60e7d995244511f56819f Mon Sep 17 00:00:00 2001 From: Andres Noetzli Date: Mon, 26 Nov 2018 20:10:05 -0800 Subject: Reduce lookahead when parsing string literals Fixes issue #2720. Between versions 2.0 and 2.5, the SMT-LIB standard changed the way that string literals esacpe double quotes (") within strings from \" to "". As a consequence, we have two different ways of lexing string literals depending on the language used. The code produced by ANTLR first uses a Deterministic Finite Automata (DFA) to predict the next token. Because the DFA is static, it considers both versions of the string literal. If we are parsing a string without esacpe sequences, e.g. "foo", it ends up reading three characters past the closing double quote (I am not sure whether this is by design or a bug). This is problematic in interactive mode because the third character may be on the next line, which may not be available yet. Consider the example from the issue mentioned above: ``` import subprocess p = subprocess.Popen(['cvc4', '--lang', 'smt'], stdout=subprocess.PIPE, stdin=subprocess.PIPE) p.stdin.write(b'(echo "foo")\n') p.stdin.flush() print(p.stdout.readline()) ``` In this example, "foo" is a valid string for both types of string literals and ANTLR reads ", \n, and then tries to access the character on the next line, which leads to a deadlock because CVC4 is waiting for the next line on stdin and the Python program is waiting for the output of CVC4. Note: this is why writing `(echo "foo")\n\n` or `(echo "foo" )\n` both do not lead to a deadlock. The solution this commit implements is to reorganize the way the string literals are parsed: we now have a single token `STRING_LITERAL` for both types of string literals, which is actually a bit simpler because it allows us to share common code for processing the literal. Additionally, the commit removes `SYGUS_QUOTED_LITERAL`, which was unused and was also had overlapping matches with the string literals. As to why it worked with older versions: we allocated memory for the input in increments of 1024 bytes and in our `myConsume` method, we did not actually check whether we were reading a character from the input or just from the buffer that had not been filled in yet---so I am assuming that we got lucky. --- src/parser/smt2/Smt2.g | 105 ++++++++++++++++++------------------------------- src/parser/smt2/smt2.h | 7 ++++ 2 files changed, 46 insertions(+), 66 deletions(-) diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g index d9b0f622b..7143824d6 100644 --- a/src/parser/smt2/Smt2.g +++ b/src/parser/smt2/Smt2.g @@ -2611,65 +2611,50 @@ termList[std::vector& formulas, CVC4::Expr& expr] * Matches a string, and strips off the quotes. */ str[std::string& s, bool fsmtlib] - : STRING_LITERAL_2_0 - { s = AntlrInput::tokenText($STRING_LITERAL_2_0); + : STRING_LITERAL + { + s = AntlrInput::tokenText($STRING_LITERAL); /* strip off the quotes */ s = s.substr(1, s.size() - 2); - for(size_t i=0; i 127 && !isprint(s[i])) { - PARSER_STATE->parseError("Extended/unprintable characters are not " - "part of SMT-LIB, and they must be encoded " - "as escape sequences"); + for (size_t i = 0; i < s.size(); i++) + { + if ((unsigned)s[i] > 127 && !isprint(s[i])) + { + PARSER_STATE->parseError( + "Extended/unprintable characters are not " + "part of SMT-LIB, and they must be encoded " + "as escape sequences"); } } - if(fsmtlib) { - /* handle SMT-LIB standard escapes '\\' and '\"' */ + if (fsmtlib || PARSER_STATE->escapeDupDblQuote()) + { char* p_orig = strdup(s.c_str()); char *p = p_orig, *q = p_orig; - while(*q != '\0') { - if(*q == '\\') { + while (*q != '\0') + { + if (PARSER_STATE->escapeDupDblQuote() && *q == '"') + { + // Handle SMT-LIB >=2.5 standard escape '""'. + ++q; + assert(*q == '"'); + } + else if (!PARSER_STATE->escapeDupDblQuote() && *q == '\\') + { ++q; - if(*q == '\\' || *q == '"') { - *p++ = *q++; - } else { + // Handle SMT-LIB 2.0 standard escapes '\\' and '\"'. + if (*q != '\\' && *q != '"') + { assert(*q != '\0'); *p++ = '\\'; - *p++ = *q++; } - } else { - *p++ = *q++; } + *p++ = *q++; } *p = '\0'; s = p_orig; free(p_orig); } } - | STRING_LITERAL_2_5 - { s = AntlrInput::tokenText($STRING_LITERAL_2_5); - /* strip off the quotes */ - s = s.substr(1, s.size() - 2); - for(size_t i=0; i 127 && !isprint(s[i])) { - PARSER_STATE->parseError("Extended/unprintable characters are not " - "part of SMT-LIB, and they must be encoded " - "as escape sequences"); - } - } - // In the 2.5 version, always handle escapes (regardless of fsmtlib flag). - char* p_orig = strdup(s.c_str()); - char *p = p_orig, *q = p_orig; - while(*q != '\0') { - if(*q == '"') { - ++q; - assert(*q == '"'); - } - *p++ = *q++; - } - *p = '\0'; - s = p_orig; - free(p_orig); - } ; /** @@ -3276,37 +3261,25 @@ BINARY_LITERAL ; /** - * Matches a double-quoted string literal from SMT-LIB 2.0. - * Escaping is supported, and * escape character '\' has to be escaped. + * Matches a double-quoted string literal. Depending on the language that is + * being parsed, different escape sequences are supported: * - * You shouldn't generally use this in parser rules, as the quotes - * will be part of the token text. Use the str[] parser rule instead. - */ -STRING_LITERAL_2_0 - : { PARSER_STATE->v2_0() }?=> - '"' ('\\' . | ~('\\' | '"'))* '"' - ; - -/** - * Matches a double-quoted string literal from SMT-LIB 2.5. - * You escape a double-quote inside the string with "", e.g., - * "This is a string literal with "" a single, embedded double quote." + * For SMT-LIB 2.0 the sequence \" is interpreted as a double quote (") and the + * sequence \\ is interpreted as a backslash (\). + * + * For SMT-LIB >=2.5 and SyGuS a double-quote inside a string is escaped with + * "", e.g., "This is a string literal with "" a single, embedded double + * quote." * * You shouldn't generally use this in parser rules, as the quotes * will be part of the token text. Use the str[] parser rule instead. */ -STRING_LITERAL_2_5 - : { PARSER_STATE->v2_5() || PARSER_STATE->sygus() }?=> +STRING_LITERAL + : { !PARSER_STATE->escapeDupDblQuote() }?=> + '"' ('\\' . | ~('\\' | '"'))* '"' + | { PARSER_STATE->escapeDupDblQuote() }?=> '"' (~('"') | '""')* '"' ; - -/** - * Matches sygus quoted literal - */ -SYGUS_QUOTED_LITERAL - : { PARSER_STATE->sygus() }?=> - '"' (ALPHA|DIGIT)* '"' - ; /** * Matches the comments and ignores them diff --git a/src/parser/smt2/smt2.h b/src/parser/smt2/smt2.h index 2d57332af..7a3dbb9db 100644 --- a/src/parser/smt2/smt2.h +++ b/src/parser/smt2/smt2.h @@ -183,6 +183,13 @@ private: bool sygus() const { return getLanguage() == language::input::LANG_SYGUS; } + /** + * Returns true if the language that we are parsing (SMT-LIB version >=2.5 + * and SyGuS) treats duplicate double quotes ("") as an escape sequence + * denoting a single double quote ("). + */ + bool escapeDupDblQuote() const { return v2_5() || sygus(); } + void setInfo(const std::string& flag, const SExpr& sexpr); void setOption(const std::string& flag, const SExpr& sexpr); -- cgit v1.2.3