summaryrefslogtreecommitdiff
path: root/src/parser/antlr_line_buffered_input.cpp
diff options
context:
space:
mode:
author: Andres Nötzli <andres.noetzli@gmail.com> 2017-06-16 17:42:22 -0700
committer: GitHub <noreply@github.com> 2017-06-16 17:42:22 -0700
commit: 0da095c3f5be79a85c085b4b46d7a0a0513ecdd6 (patch)
tree: b651858c19ac8afc6f8cebf69f958dfe97781b86 /src/parser/antlr_line_buffered_input.cpp
parent: b3e1f7ade822d061f276a9477f1ea67fcb1f3a50 (diff)
Fix stream parsing
This commit fixes bug 811. Bug 811 occurred because tokens referred to a buffer that was reallocated, so their pointers were no longer valid. Background: The buffered input stream avoids copying the whole input stream before handing it to ANTLR (in contrast to the non-buffered input stream that first copies everything into a buffer). This enables interactivity (e.g. with kind2) and may save memory. CVC4 uses it when reading from stdin in competition mode for the application track (the incremental benchmarks) and in non-competition mode. To set the CVC4_SMTCOMP_APPLICATION_TRACK flag, the {C,CXX}FLAGS have to be modified at configure time. Solution: This commit fixes the issue by changing how a stream gets buffered. Instead of storing the stream into a single buffer, CVC4 now stores each line in a separate buffer, making sure that they do not have to move, keeping tokens valid. The commit adds the LineBuffer class for managing those buffers. It further modifies CVC4's LA and consume functions to use line number and position within a line to index into the line buffer. This allows us to use the standard mark()/etc. functions because they automatically store and restore that state. The solution also (arguably) simplifies the code. Disadvantages: Tokens split across lines would cause problems (seems reasonable to me). One allocation per line. Alternatives considered: Pull request 162 by Tim was a first attempt to solve the problem. The issues with this solution are: memory usage (old versions of the buffer do not get deleted), tokens split across buffers would be problematic, and mark()/rewind()/etc. would have to be overridden for the approach to work. I had a partially working fix that used indexes into the stream instead of pointers to memory. The solution stored the content of the stream into a segmented buffer (lines were not guaranteed to be consecutive in memory).
This approach worked for basic use cases but had the following issues: ugly casting (the solution requires casting the index to a pointer and storing it in the input stream's nextChar because that's where ANTLR is taking the location information from when creating a token), more modifications (not only would this solution require overriding more functions of the input stream such as substr, it also requires changes to the use of GETCHARINDEX() in the Smt2 parser and AntlrInput::tokenText() for example), more complex code.
Diffstat (limited to 'src/parser/antlr_line_buffered_input.cpp')
-rw-r--r-- src/parser/antlr_line_buffered_input.cpp 319
1 files changed, 181 insertions, 138 deletions
diff --git a/src/parser/antlr_line_buffered_input.cpp b/src/parser/antlr_line_buffered_input.cpp
index 22bbaf1db..e65125ad9 100644
--- a/src/parser/antlr_line_buffered_input.cpp
+++ b/src/parser/antlr_line_buffered_input.cpp
@@ -2,17 +2,30 @@
/*! \file antlr_line_buffered_input.cpp
** \verbatim
** Top contributors (to current version):
- ** Morgan Deters, Tim King
+ ** Morgan Deters, Tim King, Andres Noetzli
** This file is part of the CVC4 project.
** Copyright (c) 2009-2016 by the authors listed in the file AUTHORS
** in the top-level source directory) and their institutional affiliations.
** All rights reserved. See the file COPYING in the top-level source
** directory for licensing information.\endverbatim
**
- ** \brief [[ Add one-line brief description here ]]
+ ** \brief A custom ANTLR input stream that reads from the input stream lazily
**
- ** [[ Add lengthier description here ]]
- ** \todo document this file
+ ** WARNING: edits to this and related files should be done carefully due to the
+ *interaction with ANTLR internals.
+ **
+ ** This overwrites the _LA and the consume functions of the ANTLR input stream
+ ** to use a LineBuffer instead of accessing a buffer. The lines are kept in
+ ** memory to make sure that existing tokens remain valid (tokens store pointers
+ ** to the corresponding input). We do not overwrite mark(), etc.
+ *because
+ ** we can use the line number and the position within that line to index into
+ *the
+ ** line buffer and the default markers already store and restore that
+ ** information. The line buffer guarantees that lines are consecutive in
+ ** memory, so ANTLR3_INPUT_STREAM::getLineBuf() should work as intended and
+ ** tokens themselves are consecutive in memory (we are assuming that tokens
+ ** are not split across multiple lines).
**/
// We rely on the inclusion of #include <antlr3.h> in
@@ -32,7 +45,8 @@
namespace CVC4 {
namespace parser {
-static pANTLR3_INPUT_STREAM antlr3CreateLineBufferedStream(std::istream& in);
+static pANTLR3_INPUT_STREAM antlr3CreateLineBufferedStream(
+ std::istream& in, LineBuffer* line_buffer);
static void
setupInputStream(pANTLR3_INPUT_STREAM input)
@@ -206,111 +220,133 @@ setupInputStream(pANTLR3_INPUT_STREAM input)
#endif /* 0 */
}
-static ANTLR3_UCHAR
-myLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) {
- pANTLR3_INPUT_STREAM input;
+static ANTLR3_UCHAR bufferedInputLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la) {
+ pANTLR3_INPUT_STREAM input = ((pANTLR3_INPUT_STREAM)(is->super));
+ CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM line_buffered_input =
+ (CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM)input;
+ uint8_t* result = line_buffered_input->line_buffer->getPtrWithOffset(
+ input->line, input->charPositionInLine, la - 1);
+ return (result != NULL) ? *result : ANTLR3_CHARSTREAM_EOF;
+}
- input = ((pANTLR3_INPUT_STREAM) (is->super));
+static void bufferedInputRewind(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark) {
+ // This function is essentially the same as the original
+ // antlr38BitRewind() but does not do any seek. The seek in the
+ // original function does not do anything and also calls
+ // antlr38BitSeek() instead of the overloaded seek() function, which
+ // leads to subtle bugs.
+ pANTLR3_LEX_STATE state;
+ pANTLR3_INPUT_STREAM input;
+
+ input = ((pANTLR3_INPUT_STREAM)is->super);
+
+ // Perform any clean up of the marks
+ input->istream->release(input->istream, mark);
+
+ // Find the supplied mark state
+ state = (pANTLR3_LEX_STATE)input->markers->get(input->markers,
+ (ANTLR3_UINT32)(mark - 1));
+ if (state == NULL) {
+ return;
+ }
+
+ // Reset the information in the mark
+ input->charPositionInLine = state->charPositionInLine;
+ input->currentLine = state->currentLine;
+ input->line = state->line;
+ input->nextChar = state->nextChar;
+}
- Debug("pipe") << "LA" << std::endl;
- if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
- {
- std::istream& in = *((CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->in;
- //MGD
- // in.clear();
- if(!in) {
- Debug("pipe") << "EOF" << std::endl;
- return ANTLR3_CHARSTREAM_EOF;
- }
- Debug("pipe") << "READ" << std::endl;
- if(input->data == NULL) {
- Debug("pipe") << "ALLOC" << std::endl;
- input->data = malloc(1024);
- input->nextChar = input->data;
- } else {
- Debug("pipe") << "REALLOC" << std::endl;
- size_t pos = (char*)input->nextChar - (char*)input->data;
- input->data = realloc(input->data, input->sizeBuf + 1024);
- input->nextChar = (char*)input->data + pos;
- }
- in.getline((((char*)input->data) + input->sizeBuf), 1024);
- while(in.fail() && !in.eof()) {
- Debug("pipe") << "input string too long, reallocating" << std::endl;
- input->sizeBuf += strlen(((char*)input->data) + input->sizeBuf);
- size_t pos = (char*)input->nextChar - (char*)input->data;
- input->data = realloc(input->data, input->sizeBuf + 1024);
- input->nextChar = (char*)input->data + pos;
- in.clear();
- in.getline((((char*)input->data) + input->sizeBuf), 1024);
- }
- input->sizeBuf += strlen(((char*)input->data) + input->sizeBuf);
- assert(*(((char*)input->data) + input->sizeBuf) == '\0');
- Debug("pipe") << "SIZEBUF now " << input->sizeBuf << std::endl;
- *(((char*)input->data) + input->sizeBuf) = '\n';
- ++input->sizeBuf;
+static void bufferedInputConsume(pANTLR3_INT_STREAM is) {
+ pANTLR3_INPUT_STREAM input = ((pANTLR3_INPUT_STREAM)(is->super));
+ CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM line_buffered_input =
+ (CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM)input;
+
+ uint8_t* current = line_buffered_input->line_buffer->getPtr(
+ input->line, input->charPositionInLine);
+ if (current != NULL) {
+ input->charPositionInLine++;
+
+ if (*current == LineBuffer::NewLineChar) {
+ // Reset for start of a new line of input
+ input->line++;
+ input->charPositionInLine = 0;
+ input->currentLine = line_buffered_input->line_buffer->getPtr(
+ input->line, input->charPositionInLine);
+ Debug("pipe") << "-- newline!" << std::endl;
}
- Debug("pipe") << "READ POINTER[" << la << "] AT: >>" << std::string(((char*)input->nextChar), input->sizeBuf - (((char*)input->nextChar) - (char*)input->data)) << "<< returning '" << (char)(*((pANTLR3_UINT8)input->nextChar + la - 1)) << "' (" << (unsigned)(*((pANTLR3_UINT8)input->nextChar + la - 1)) << ")" << std::endl;
- return (ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
+ input->nextChar = line_buffered_input->line_buffer->getPtr(
+ input->line, input->charPositionInLine);
+ }
}
-
-static void
-myConsume(pANTLR3_INT_STREAM is)
-{
- pANTLR3_INPUT_STREAM input;
-
- input = ((pANTLR3_INPUT_STREAM) (is->super));
-
- Debug("pipe") << "consume! '" << *(char*)input->nextChar << "' (" << (unsigned)*(char*)input->nextChar << ")" << std::endl;
- if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
- {
- /* Indicate one more character in this line
- */
- input->charPositionInLine++;
-
- if ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
- {
- /* Reset for start of a new line of input
- */
- input->line++;
- input->charPositionInLine = 0;
- input->currentLine = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
- Debug("pipe") << "-- newline!" << std::endl;
- }
-
- /* Increment to next character position
- */
- input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
- Debug("pipe") << "-- advance nextChar! looking at '" << *(char*)input->nextChar << "' (" << (unsigned)*(char*)input->nextChar << ")" << std::endl;
- } else Debug("pipe") << "-- nothing!" << std::endl;
+static void bufferedInputSeek(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint) {
+ // In contrast to the original antlr38BitSeek() function, we only
+ // support seeking forward (seeking backwards is only supported for
+ // rewinding in the original code, which we do not do when rewinding,
+ // so this should be fine).
+ pANTLR3_INPUT_STREAM input = ((pANTLR3_INPUT_STREAM)(is->super));
+ pANTLR3_LINE_BUFFERED_INPUT_STREAM line_buffered_input =
+ (CVC4::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM)input;
+
+ // Check that we are not seeking backwards.
+ assert(!line_buffered_input->line_buffer->isPtrBefore(
+ (uint8_t*)seekPoint, input->line, input->charPositionInLine));
+
+ ssize_t count = (ssize_t)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
+ while (count > 0) {
+ is->consume(is);
+ count--;
+ }
}
-pANTLR3_INPUT_STREAM
-antlr3LineBufferedStreamNew(std::istream& in, ANTLR3_UINT32 encoding, pANTLR3_UINT8 name)
-{
- pANTLR3_INPUT_STREAM input;
-
- if(!in) {
- return NULL;
- }
+static ANTLR3_UINT32 bufferedInputSize(pANTLR3_INPUT_STREAM input) {
+ // Not supported for this type of stream
+ assert(false);
+ return 0;
+}
- // First order of business is to set up the stream and install the data pointer.
- // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
- // default 8Bit stream accordingly.
- //
- input = antlr3CreateLineBufferedStream(in);
- if (input == NULL)
- {
- return NULL;
- }
+static void bufferedInputSetNewLineChar(pANTLR3_INPUT_STREAM input,
+ ANTLR3_UINT32 newlineChar) {
+ // Not supported for this type of stream
+ assert(false);
+}
- // Size (in bytes) of the given 'string'
- //
- input->sizeBuf = 0;
+static void bufferedInputSetUcaseLA(pANTLR3_INPUT_STREAM input,
+ ANTLR3_BOOLEAN flag) {
+ // Not supported for this type of stream
+ assert(false);
+}
- input->istream->_LA = myLA;
- input->istream->consume = myConsume;
+pANTLR3_INPUT_STREAM antlr3LineBufferedStreamNew(std::istream& in,
+ ANTLR3_UINT32 encoding,
+ pANTLR3_UINT8 name,
+ LineBuffer* line_buffer) {
+ pANTLR3_INPUT_STREAM input;
+
+ if (!in) {
+ return NULL;
+ }
+
+ // First order of business is to set up the stream and install the data
+ // pointer.
+ // Then we will work out the encoding and byte order and adjust the API
+ // functions that are installed for the
+ // default 8Bit stream accordingly.
+ //
+ input = antlr3CreateLineBufferedStream(in, line_buffer);
+ if (input == NULL) {
+ return NULL;
+ }
+
+ input->istream->_LA = bufferedInputLA;
+ input->istream->consume = bufferedInputConsume;
+ input->istream->seek = bufferedInputSeek;
+ input->istream->rewind = bufferedInputRewind;
+ input->size = bufferedInputSize;
+ input->SetNewLineChar = bufferedInputSetNewLineChar;
+ input->setUcaseLA = bufferedInputSetUcaseLA;
#ifndef CVC4_ANTLR3_OLD_INPUT_STREAM
// We have the data in memory now so we can deal with it according to
@@ -326,53 +362,60 @@ antlr3LineBufferedStreamNew(std::istream& in, ANTLR3_UINT32 encoding, pANTLR3_UI
// Now we can set up the file name
//
- input->istream->streamName = input->strFactory->newStr8(input->strFactory, name);
- input->fileName = input->istream->streamName;
+ input->istream->streamName =
+ input->strFactory->newStr8(input->strFactory, name);
+ input->fileName = input->istream->streamName;
return input;
}
-static pANTLR3_INPUT_STREAM
-antlr3CreateLineBufferedStream(std::istream& in)
-{
- // Pointer to the input stream we are going to create
- //
- pANTLR3_INPUT_STREAM input;
-
- if (!in)
- {
- return NULL;
- }
-
- // Allocate memory for the input stream structure
- //
- input = (pANTLR3_INPUT_STREAM)
- ANTLR3_CALLOC(1, sizeof(ANTLR3_LINE_BUFFERED_INPUT_STREAM));
-
- if (input == NULL)
- {
- return NULL;
- }
-
- // Structure was allocated correctly, now we can install the pointer
- //
- input->data = malloc(1024);
- input->isAllocated = ANTLR3_FALSE;
-
- ((pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->in = &in;
- // Call the common 8 bit input stream handler
- // initialization.
- //
+static pANTLR3_INPUT_STREAM antlr3CreateLineBufferedStream(
+ std::istream& in, LineBuffer* line_buffer) {
+ // Pointer to the input stream we are going to create
+ //
+ pANTLR3_INPUT_STREAM input;
+
+ if (!in) {
+ return NULL;
+ }
+
+ // Allocate memory for the input stream structure
+ //
+ input = (pANTLR3_INPUT_STREAM)ANTLR3_CALLOC(
+ 1, sizeof(ANTLR3_LINE_BUFFERED_INPUT_STREAM));
+
+ if (input == NULL) {
+ return NULL;
+ }
+
+ // Structure was allocated correctly, now we can install the pointer
+ //
+ input->data = NULL;
+ input->isAllocated = ANTLR3_FALSE;
+
+ ((pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->in = &in;
+ ((pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->line_buffer = line_buffer;
+// Call the common 8 bit input stream handler
+// initialization.
+//
#ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
- antlr3AsciiSetupStream(input, ANTLR3_CHARSTREAM);
+ antlr3AsciiSetupStream(input, ANTLR3_CHARSTREAM);
#else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
- antlr38BitSetupStream(input);
- // In some libantlr3c 3.4-beta versions, this call is not included in the above.
- // This is probably an erroneously-deleted line in the libantlr3c source since 3.2.
- antlr3GenericSetupStream(input);
+ antlr38BitSetupStream(input);
+ // In some libantlr3c 3.4-beta versions, this call is not included in the
+ // above.
+ // This is probably an erroneously-deleted line in the libantlr3c source since
+ // 3.2.
+ antlr3GenericSetupStream(input);
#endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
- return input;
+ input->sizeBuf = 0;
+ input->newlineChar = LineBuffer::NewLineChar;
+ input->charPositionInLine = 0;
+ input->line = 0;
+ input->nextChar = line_buffer->getPtr(0, 0);
+ input->currentLine = line_buffer->getPtr(0, 0);
+ return input;
}
}/* CVC4::parser namespace */
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback