From 24093bf30356975b5017613d94bf2927521b666f Mon Sep 17 00:00:00 2001
From: Kshitij Bansal <kshitij@cs.nyu.edu>
Date: Thu, 24 Apr 2014 21:35:03 -0400
Subject: attempt to improve CVC4's "parse error" message

---
 src/parser/antlr_input.cpp | 155 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 151 insertions(+), 4 deletions(-)

(limited to 'src/parser')

diff --git a/src/parser/antlr_input.cpp b/src/parser/antlr_input.cpp
index d498d3c54..88b43eb0e 100644
--- a/src/parser/antlr_input.cpp
+++ b/src/parser/antlr_input.cpp
@@ -3,9 +3,9 @@
  ** \verbatim
  ** Original author: Christopher L. Conway
  ** Major contributors: Morgan Deters
- ** Minor contributors (to current version): Francois Bobot
+ ** Minor contributors (to current version): Francois Bobot, Kshitij Bansal
  ** This file is part of the CVC4 project.
- ** Copyright (c) 2009-2013  New York University and The University of Iowa
+ ** Copyright (c) 2009-2014  New York University and The University of Iowa
  ** See the file COPYING in the top-level source directory for licensing
  ** information.\endverbatim
  **
@@ -284,20 +284,167 @@ void AntlrInput::warning(const std::string& message) {
   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 }
 
+
+/**
+ * characters considered part of a simple symbol in SMTLIB.
+ *
+ * TODO: Ideally this code shouldn't be smtlib specific (should work
+ * with CVC language too), but this per-language specialization has
+ * been left for a later point.
+ */
+inline bool isSimpleChar(char ch) {
+  return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
+}
+
+/**
+ * Gets part of original input and tries to visually hint where the
+ * error might be.
+ *
+ * Something like:
+ *
+ *   ...nd (= alpha beta) (= beta delta))
+ *                                ^
+ *
+ * Implementation (as on 2014/04/24):
+ *
+ * > if suggested pointer by lexer is under a "simple char", move to
+ *   start of the word and print pointer there.
+ *
+ * > in the other case, it tries to find the nearest word in the error
+ *   message passed along. if it can't find it, we don't add this
+ *   visual hint, as experimentally position suggested by lexer was
+ *   found to be totally unhelpful. (TODO: fix this upstream to
+ *   improve)
+ */
+std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
+{
+  // Is it a multi-line message
+  bool multilineMessage = (message.find('\n') != string::npos);
+  // Useful only if it is a multi-line message
+  int firstLineEnd = message.find('\n');
+
+  std::ostringstream ss, slicess;
+
+  // Keep first line of message
+  if(multilineMessage) {
+    ss << message.substr(0, firstLineEnd) << endl << endl;
+  } else {
+    ss << message << endl << endl;
+  }
+
+  int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
+  int posSliceEnd = posSliceStart + 70;
+  int caretPos = 0;
+  int caretPosExtra = 0; // for inital intendation, epilipses etc.
+
+  ss << "  "; caretPosExtra += 2;
+  if(posSliceStart > 0) {
+    ss << "..."; caretPosExtra += 3;
+  }
+
+  for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
+    if(i == posSliceEnd) {
+      ss << "...";
+      break;
+    }
+    if(i < charPositionInLine) { caretPos++; }
+
+    if(!isprint(lineStart[i])) {
+      // non-printable character, something wrong, bail out
+      return message;
+    }
+
+    ss << (lineStart[i]);
+    slicess << (lineStart[i]);
+  }
+
+  // adjust position of caret, based on slice and message
+  {
+    int caretPosOrig = caretPos;
+    string slice = slicess.str();
+    if(isSimpleChar(slice[caretPos])) {
+      // if alphanumeric, try to go to beginning of word/number
+      while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
+      if(caretPos == 0 && posSliceStart > 0) {
+        // reached start and this is not really the start? bail out
+        return message;
+      } else {
+        // likely it is also in original message? if so, very likely
+        // we found the right place
+        string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
+        int messagePosSt = message.find(word);
+        int messagePosEn = messagePosSt + (caretPosOrig - caretPos);
+        if( messagePosSt < string::npos &&
+            (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
+            (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
+          // ^the complicated if statement is just 'whole-word' match
+          Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
+        }
+      }
+    } else {
+      // go to nearest alphanumeric string (before current position),
+      // see if that word can be found in original message. If so,
+      // point to that, else keep pointer where it was.
+      int nearestWordEn = caretPos - 1;
+      while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
+        --nearestWordEn;
+      }
+      if(isSimpleChar(slice[nearestWordEn])) {
+        int nearestWordSt = nearestWordEn;
+        while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
+          --nearestWordSt;
+        }
+        string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
+        Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
+        int messagePosSt = message.find(word);
+        int messagePosEn = messagePosSt + (nearestWordEn - nearestWordSt + 1);
+        if( messagePosSt < string::npos &&
+            (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
+            (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
+          // ^the complicated if statement is just 'whole-word' match
+          Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
+                                  << nearestWordSt << std::endl;
+          caretPos = nearestWordSt;
+        } else {
+          // this doesn't look good. caret generally getting printed
+          // at unhelpful positions. improve upstream?
+          return message;
+        }
+      }
+    }
+    caretPos += caretPosExtra;
+  }// end of caret position computation/heuristics
+
+  ss << std::endl;
+  while( caretPos-- > 0 ) {
+    ss << ' ';
+  }
+  ss << '^' << endl;
+  if(multilineMessage) {
+     ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
+  }
+  return ss.str();
+}
+
 void AntlrInput::parseError(const std::string& message, bool eofException)
   throw (ParserException) {
+
+  string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
+                                           d_lexer->getCharPositionInLine(d_lexer),
+                                           message);
+
   Debug("parser") << "Throwing exception: "
       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
       << d_lexer->getLine(d_lexer) << "."
       << d_lexer->getCharPositionInLine(d_lexer) << ": "
-      << message << endl;
+      << updatedMessage << endl;
   if(eofException) {
     throw ParserEndOfFileException(message,
                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
                                    d_lexer->getLine(d_lexer),
                                    d_lexer->getCharPositionInLine(d_lexer));
   } else {
-    throw ParserException(message,
+    throw ParserException(updatedMessage,
                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
                           d_lexer->getLine(d_lexer),
                           d_lexer->getCharPositionInLine(d_lexer));
-- 
cgit v1.2.3