Reduce lookahead when parsing string literals (#2721)

author: Andres Noetzli <andres.noetzli@gmail.com> 2018-11-27 08:10:36 -0800
committer: Andrew Reynolds <andrew.j.reynolds@gmail.com> 2018-11-27 10:10:36 -0600
commit: 99a1da848889776586436f7f9aec9a1b088703c1 (patch)
tree: 1870aa917134196557f43cdc03d8dd36b667a2e3 /src
parent: 72f1d72852213f46d77c85216c9250bb0f0e3eae (diff)
2 files changed, 46 insertions, 66 deletions
diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g
index d9b0f622b..7143824d6 100644
--- a/src/parser/smt2/Smt2.g
+++ b/src/parser/smt2/Smt2.g
@@ -2611,65 +2611,50 @@ termList[std::vector<CVC4::Expr>& formulas, CVC4::Expr& expr]
  * Matches a string, and strips off the quotes.
  */
 str[std::string& s, bool fsmtlib]
-  : STRING_LITERAL_2_0
-    { s = AntlrInput::tokenText($STRING_LITERAL_2_0);
+  : STRING_LITERAL
+    {
+      s = AntlrInput::tokenText($STRING_LITERAL);
       /* strip off the quotes */
       s = s.substr(1, s.size() - 2);
-      for(size_t i=0; i<s.size(); i++) {
-        if((unsigned)s[i] > 127 && !isprint(s[i])) {
-          PARSER_STATE->parseError("Extended/unprintable characters are not "
-                                   "part of SMT-LIB, and they must be encoded "
-                                   "as escape sequences");
+      for (size_t i = 0; i < s.size(); i++)
+      {
+        if ((unsigned)s[i] > 127 && !isprint(s[i]))
+        {
+          PARSER_STATE->parseError(
+              "Extended/unprintable characters are not "
+              "part of SMT-LIB, and they must be encoded "
+              "as escape sequences");
         }
       }
-      if(fsmtlib) {
-        /* handle SMT-LIB standard escapes '\\' and '\"' */
+      if (fsmtlib || PARSER_STATE->escapeDupDblQuote())
+      {
         char* p_orig = strdup(s.c_str());
         char *p = p_orig, *q = p_orig;
-        while(*q != '\0') {
-          if(*q == '\\') {
+        while (*q != '\0')
+        {
+          if (PARSER_STATE->escapeDupDblQuote() && *q == '"')
+          {
+            // Handle SMT-LIB >=2.5 standard escape '""'.
+            ++q;
+            assert(*q == '"');
+          }
+          else if (!PARSER_STATE->escapeDupDblQuote() && *q == '\\')
+          {
             ++q;
-            if(*q == '\\' || *q == '"') {
-              *p++ = *q++;
-            } else {
+            // Handle SMT-LIB 2.0 standard escapes '\\' and '\"'.
+            if (*q != '\\' && *q != '"')
+            {
               assert(*q != '\0');
               *p++ = '\\';
-              *p++ = *q++;
             }
-          } else {
-            *p++ = *q++;
           }
+          *p++ = *q++;
         }
         *p = '\0';
         s = p_orig;
         free(p_orig);
       }
     }
-  | STRING_LITERAL_2_5
-    { s = AntlrInput::tokenText($STRING_LITERAL_2_5);
-      /* strip off the quotes */
-      s = s.substr(1, s.size() - 2);
-      for(size_t i=0; i<s.size(); i++) {
-        if((unsigned)s[i] > 127 && !isprint(s[i])) {
-          PARSER_STATE->parseError("Extended/unprintable characters are not "
-                                   "part of SMT-LIB, and they must be encoded "
-                                   "as escape sequences");
-        }
-      }
-      // In the 2.5 version, always handle escapes (regardless of fsmtlib flag).
-      char* p_orig = strdup(s.c_str());
-      char *p = p_orig, *q = p_orig;
-      while(*q != '\0') {
-        if(*q == '"') {
-          ++q;
-          assert(*q == '"');
-        }
-        *p++ = *q++;
-      }
-      *p = '\0';
-      s = p_orig;
-      free(p_orig);
-    }
   ;
 
 /**
@@ -3276,37 +3261,25 @@ BINARY_LITERAL
   ;
 
 /**
- * Matches a double-quoted string literal from SMT-LIB 2.0.
- * Escaping is supported, and * escape character '\' has to be escaped.
+ * Matches a double-quoted string literal. Depending on the language that is
+ * being parsed, different escape sequences are supported:
  *
- * You shouldn't generally use this in parser rules, as the quotes
- * will be part of the token text.  Use the str[] parser rule instead.
- */
-STRING_LITERAL_2_0
-  : { PARSER_STATE->v2_0() }?=>
-    '"' ('\\' . | ~('\\' | '"'))* '"'
-  ;
-
-/**
- * Matches a double-quoted string literal from SMT-LIB 2.5.
- * You escape a double-quote inside the string with "", e.g.,
- * "This is a string literal with "" a single, embedded double quote."
+ * For SMT-LIB 2.0 the sequence \" is interpreted as a double quote (") and the
+ * sequence \\ is interpreted as a backslash (\).
+ *
+ * For SMT-LIB >=2.5 and SyGuS a double-quote inside a string is escaped with
+ * "", e.g., "This is a string literal with "" a single, embedded double
+ * quote."
  *
  * You shouldn't generally use this in parser rules, as the quotes
  * will be part of the token text.  Use the str[] parser rule instead.
  */
-STRING_LITERAL_2_5
-  : { PARSER_STATE->v2_5() || PARSER_STATE->sygus() }?=>
+STRING_LITERAL
+  : { !PARSER_STATE->escapeDupDblQuote() }?=>
+    '"' ('\\' . | ~('\\' | '"'))* '"'
+  | { PARSER_STATE->escapeDupDblQuote() }?=>
     '"' (~('"') | '""')* '"'
   ;
-
-/**
- * Matches sygus quoted literal 
- */
-SYGUS_QUOTED_LITERAL
- : { PARSER_STATE->sygus() }?=>
-   '"' (ALPHA|DIGIT)* '"'
-  ;
   
 /**
  * Matches the comments and ignores them
diff --git a/src/parser/smt2/smt2.h b/src/parser/smt2/smt2.h
index 2d57332af..7a3dbb9db 100644
--- a/src/parser/smt2/smt2.h
+++ b/src/parser/smt2/smt2.h
@@ -183,6 +183,13 @@ private:
 
   bool sygus() const { return getLanguage() == language::input::LANG_SYGUS; }
 
+  /**
+   * Returns true if the language that we are parsing (SMT-LIB version >=2.5
+   * and SyGuS) treats duplicate double quotes ("") as an escape sequence
+   * denoting a single double quote (").
+   */
+  bool escapeDupDblQuote() const { return v2_5() || sygus(); }
+
   void setInfo(const std::string& flag, const SExpr& sexpr);
 
   void setOption(const std::string& flag, const SExpr& sexpr);
author	Andres Noetzli <andres.noetzli@gmail.com>	2018-11-27 08:10:36 -0800
committer	Andrew Reynolds <andrew.j.reynolds@gmail.com>	2018-11-27 10:10:36 -0600
commit	99a1da848889776586436f7f9aec9a1b088703c1 (patch)
tree	1870aa917134196557f43cdc03d8dd36b667a2e3 /src
parent	72f1d72852213f46d77c85216c9250bb0f0e3eae (diff)