From cb7dd2ebdd96e6791cb60e7d995244511f56819f Mon Sep 17 00:00:00 2001
From: Andres Noetzli <andres.noetzli@gmail.com>
Date: Mon, 26 Nov 2018 20:10:05 -0800
Subject: Reduce lookahead when parsing string literals

Fixes issue #2720. Between versions 2.0 and 2.5, the SMT-LIB standard changed
the way that string literals esacpe double quotes (") within strings from \" to
"". As a consequence, we have two different ways of lexing string literals
depending on the language used. The code produced by ANTLR first uses a
Deterministic Finite Automata (DFA) to predict the next token. Because the DFA
is static, it considers both versions of the string literal. If we are parsing
a string without esacpe sequences, e.g. "foo", it ends up reading three
characters past the closing double quote (I am not sure whether this is by
design or a bug). This is problematic in interactive mode because the third
character may be on the next line, which may not be available yet. Consider the
example from the issue mentioned above:

```
import subprocess

p = subprocess.Popen(['cvc4', '--lang', 'smt'], stdout=subprocess.PIPE, stdin=subprocess.PIPE)
p.stdin.write(b'(echo "foo")\n')
p.stdin.flush()
print(p.stdout.readline())
```

In this example, "foo" is a valid string for both types of string literals and
ANTLR reads ", \n, and then tries to access the character on the next line,
which leads to a deadlock because CVC4 is waiting for the next line on stdin
and the Python program is waiting for the output of CVC4. Note: this is why
writing `(echo "foo")\n\n` or `(echo "foo" )\n` both do not lead to a deadlock.

The solution this commit implements is to reorganize the way the string
literals are parsed: we now have a single token `STRING_LITERAL` for both types
of string literals, which is actually a bit simpler because it allows us to
share common code for processing the literal. Additionally, the commit removes
`SYGUS_QUOTED_LITERAL`, which was unused and was also had overlapping matches
with the string literals.

As to why it worked with older versions: we allocated memory for the input in
increments of 1024 bytes and in our `myConsume` method, we did not actually
check whether we were reading a character from the input or just from the
buffer that had not been filled in yet---so I am assuming that we got lucky.
---
 src/parser/smt2/Smt2.g | 105 ++++++++++++++++++-------------------------------
 src/parser/smt2/smt2.h |   7 ++++
 2 files changed, 46 insertions(+), 66 deletions(-)

diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g
index d9b0f622b..7143824d6 100644
--- a/src/parser/smt2/Smt2.g
+++ b/src/parser/smt2/Smt2.g
@@ -2611,65 +2611,50 @@ termList[std::vector<CVC4::Expr>& formulas, CVC4::Expr& expr]
  * Matches a string, and strips off the quotes.
  */
 str[std::string& s, bool fsmtlib]
-  : STRING_LITERAL_2_0
-    { s = AntlrInput::tokenText($STRING_LITERAL_2_0);
+  : STRING_LITERAL
+    {
+      s = AntlrInput::tokenText($STRING_LITERAL);
       /* strip off the quotes */
       s = s.substr(1, s.size() - 2);
-      for(size_t i=0; i<s.size(); i++) {
-        if((unsigned)s[i] > 127 && !isprint(s[i])) {
-          PARSER_STATE->parseError("Extended/unprintable characters are not "
-                                   "part of SMT-LIB, and they must be encoded "
-                                   "as escape sequences");
+      for (size_t i = 0; i < s.size(); i++)
+      {
+        if ((unsigned)s[i] > 127 && !isprint(s[i]))
+        {
+          PARSER_STATE->parseError(
+              "Extended/unprintable characters are not "
+              "part of SMT-LIB, and they must be encoded "
+              "as escape sequences");
         }
       }
-      if(fsmtlib) {
-        /* handle SMT-LIB standard escapes '\\' and '\"' */
+      if (fsmtlib || PARSER_STATE->escapeDupDblQuote())
+      {
         char* p_orig = strdup(s.c_str());
         char *p = p_orig, *q = p_orig;
-        while(*q != '\0') {
-          if(*q == '\\') {
+        while (*q != '\0')
+        {
+          if (PARSER_STATE->escapeDupDblQuote() && *q == '"')
+          {
+            // Handle SMT-LIB >=2.5 standard escape '""'.
+            ++q;
+            assert(*q == '"');
+          }
+          else if (!PARSER_STATE->escapeDupDblQuote() && *q == '\\')
+          {
             ++q;
-            if(*q == '\\' || *q == '"') {
-              *p++ = *q++;
-            } else {
+            // Handle SMT-LIB 2.0 standard escapes '\\' and '\"'.
+            if (*q != '\\' && *q != '"')
+            {
               assert(*q != '\0');
               *p++ = '\\';
-              *p++ = *q++;
             }
-          } else {
-            *p++ = *q++;
           }
+          *p++ = *q++;
         }
         *p = '\0';
         s = p_orig;
         free(p_orig);
       }
     }
-  | STRING_LITERAL_2_5
-    { s = AntlrInput::tokenText($STRING_LITERAL_2_5);
-      /* strip off the quotes */
-      s = s.substr(1, s.size() - 2);
-      for(size_t i=0; i<s.size(); i++) {
-        if((unsigned)s[i] > 127 && !isprint(s[i])) {
-          PARSER_STATE->parseError("Extended/unprintable characters are not "
-                                   "part of SMT-LIB, and they must be encoded "
-                                   "as escape sequences");
-        }
-      }
-      // In the 2.5 version, always handle escapes (regardless of fsmtlib flag).
-      char* p_orig = strdup(s.c_str());
-      char *p = p_orig, *q = p_orig;
-      while(*q != '\0') {
-        if(*q == '"') {
-          ++q;
-          assert(*q == '"');
-        }
-        *p++ = *q++;
-      }
-      *p = '\0';
-      s = p_orig;
-      free(p_orig);
-    }
   ;
 
 /**
@@ -3276,37 +3261,25 @@ BINARY_LITERAL
   ;
 
 /**
- * Matches a double-quoted string literal from SMT-LIB 2.0.
- * Escaping is supported, and * escape character '\' has to be escaped.
+ * Matches a double-quoted string literal. Depending on the language that is
+ * being parsed, different escape sequences are supported:
  *
- * You shouldn't generally use this in parser rules, as the quotes
- * will be part of the token text.  Use the str[] parser rule instead.
- */
-STRING_LITERAL_2_0
-  : { PARSER_STATE->v2_0() }?=>
-    '"' ('\\' . | ~('\\' | '"'))* '"'
-  ;
-
-/**
- * Matches a double-quoted string literal from SMT-LIB 2.5.
- * You escape a double-quote inside the string with "", e.g.,
- * "This is a string literal with "" a single, embedded double quote."
+ * For SMT-LIB 2.0 the sequence \" is interpreted as a double quote (") and the
+ * sequence \\ is interpreted as a backslash (\).
+ *
+ * For SMT-LIB >=2.5 and SyGuS a double-quote inside a string is escaped with
+ * "", e.g., "This is a string literal with "" a single, embedded double
+ * quote."
  *
  * You shouldn't generally use this in parser rules, as the quotes
  * will be part of the token text.  Use the str[] parser rule instead.
  */
-STRING_LITERAL_2_5
-  : { PARSER_STATE->v2_5() || PARSER_STATE->sygus() }?=>
+STRING_LITERAL
+  : { !PARSER_STATE->escapeDupDblQuote() }?=>
+    '"' ('\\' . | ~('\\' | '"'))* '"'
+  | { PARSER_STATE->escapeDupDblQuote() }?=>
     '"' (~('"') | '""')* '"'
   ;
-
-/**
- * Matches sygus quoted literal 
- */
-SYGUS_QUOTED_LITERAL
- : { PARSER_STATE->sygus() }?=>
-   '"' (ALPHA|DIGIT)* '"'
-  ;
   
 /**
  * Matches the comments and ignores them
diff --git a/src/parser/smt2/smt2.h b/src/parser/smt2/smt2.h
index 2d57332af..7a3dbb9db 100644
--- a/src/parser/smt2/smt2.h
+++ b/src/parser/smt2/smt2.h
@@ -183,6 +183,13 @@ private:
 
   bool sygus() const { return getLanguage() == language::input::LANG_SYGUS; }
 
+  /**
+   * Returns true if the language that we are parsing (SMT-LIB version >=2.5
+   * and SyGuS) treats duplicate double quotes ("") as an escape sequence
+   * denoting a single double quote (").
+   */
+  bool escapeDupDblQuote() const { return v2_5() || sygus(); }
+
   void setInfo(const std::string& flag, const SExpr& sexpr);
 
   void setOption(const std::string& flag, const SExpr& sexpr);
-- 
cgit v1.2.3