author     Andres Noetzli <andres.noetzli@gmail.com>    2018-04-05 21:45:17 -0700
committer  GitHub <noreply@github.com>                  2018-04-05 21:45:17 -0700
commit     efc6163629c6c5de446eccfe81777c93829995d5 (patch)
tree       8681a332d409e94fc4ab7259513bc7935e5568b5 /test/regress/run_regression.py
parent     cf73e3ceb3f93fef830349bd44afec99404b5038 (diff)
Python regression script (#1662)
Until now, we have been using a bash script for running the
regressions. That script had several issues: for example, it created
many temporary files and called itself recursively, which made it
difficult to maintain. This commit replaces it with a Python script.
The Python script uses the TAP protocol, which allows us to report the
results of multiple tests per file (e.g. with --check-models and
without) separately and in more detail. It also prints a line whenever
it starts running a test, which makes it easier to find "stuck"
regression tests on Travis. As recommended in the automake
documentation [0], the commit copies the TAP driver into the source
tree to enable TAP support for the test harness.
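For illustration, a benchmark that is run both with and without
--check-models would produce a TAP stream roughly like the following
(the exact flag lists depend on the benchmark and configuration):

    1..2
    # Starting
    ok - Flags: []
    ok - Flags: ['--check-models']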
Note: the commit also changes some of the regressions slightly because
we were using "%" to mark the test directives in SMT files, even though
the comment character in those files is ";". Using the comment
character simplifies the handling and makes it easier for users to
reproduce a failure (they can simply run CVC4 directly on the
regression file).
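For example, a hypothetical .smt2 regression can now carry its
directives in ordinary SMT-LIB comments, so the file stays directly
runnable with CVC4 (the flag and benchmark below are made up for
illustration):

    ; COMMAND-LINE: --incremental
    ; EXPECT: sat
    (set-logic QF_UF)
    (check-sat)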
[0]
https://www.gnu.org/software/automake/manual/html_node/Use-TAP-with-the-Automake-test-harness.html
Diffstat (limited to 'test/regress/run_regression.py')
-rwxr-xr-x  test/regress/run_regression.py  287
1 file changed, 287 insertions, 0 deletions
diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py
new file mode 100755
index 000000000..c861b0d71
--- /dev/null
+++ b/test/regress/run_regression.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""
+Usage:
+
+    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
+        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]
+
+Runs benchmark and checks for correct exit status and output.
+"""
+
+import argparse
+import difflib
+import os
+import re
+import shlex
+import subprocess
+import sys
+
+SCRUBBER = 'SCRUBBER: '
+ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
+EXPECT = 'EXPECT: '
+EXPECT_ERROR = 'EXPECT-ERROR: '
+EXIT = 'EXIT: '
+COMMAND_LINE = 'COMMAND-LINE: '
+
+
+def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+                  command_line, benchmark_dir, benchmark_filename):
+    """Runs CVC4 on the file `benchmark_filename` in the directory
+    `benchmark_dir` using the binary `cvc4_binary` with the command line
+    options `command_line`. The output is scrubbed using `scrubber` and
+    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
+    function first uses CVC4 to read in and dump the benchmark file and then
+    uses that as input."""
+
+    bin_args = wrapper[:]
+    bin_args.append(cvc4_binary)
+
+    output = None
+    error = None
+    exit_status = None
+    if dump:
+        dump_args = [
+            '--preprocess-only', '--dump', 'raw-benchmark',
+            '--output-lang=smt2', '-qq'
+        ]
+        dump_process = subprocess.Popen(
+            bin_args + command_line + dump_args + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        dump_output, _ = dump_process.communicate()
+        process = subprocess.Popen(
+            bin_args + command_line + ['--lang=smt2', '-'],
+            cwd=benchmark_dir,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate(input=dump_output)
+        exit_status = process.returncode
+    else:
+        process = subprocess.Popen(
+            bin_args + command_line + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate()
+        exit_status = process.returncode
+
+    # If a scrubber command has been specified then apply it to the output.
+    if scrubber:
+        scrubber_process = subprocess.Popen(
+            shlex.split(scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, _ = scrubber_process.communicate(input=output)
+    if error_scrubber:
+        error_scrubber_process = subprocess.Popen(
+            shlex.split(error_scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        error, _ = error_scrubber_process.communicate(input=error)
+
+    # Popen in Python 3 returns a bytes object instead of a string for
+    # stdout/stderr.
+    if isinstance(output, bytes):
+        output = output.decode()
+    if isinstance(error, bytes):
+        error = error.decode()
+    return (output.strip(), error.strip(), exit_status)
+
+
+def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path):
+    """Determines the expected output for a benchmark, runs CVC4 on it and then
+    checks whether the output corresponds to the expected output. Optionally
+    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
+    dumps a benchmark and uses that as the input (if dump is true)."""
+
+    if not os.access(cvc4_binary, os.X_OK):
+        sys.exit(
+            '"{}" does not exist or is not executable'.format(cvc4_binary))
+    if not os.path.isfile(benchmark_path):
+        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))
+
+    basic_command_line_args = []
+
+    benchmark_basename = os.path.basename(benchmark_path)
+    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
+    benchmark_dir = os.path.dirname(benchmark_path)
+    comment_char = '%'
+    status_regex = None
+    status_to_output = lambda s: s
+    if benchmark_ext == '.smt':
+        status_regex = r':status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.smt2':
+        status_regex = r'set-info\s*:status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.cvc':
+        pass
+    elif benchmark_ext == '.p':
+        basic_command_line_args.append('--finite-model-find')
+        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
+        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
+    elif benchmark_ext == '.sy':
+        comment_char = ';'
+        # Do not use proofs/unsat-cores with .sy files
+        proof = False
+    else:
+        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
+            benchmark_basename))
+
+    # If there is an ".expect" file for the benchmark, read the metadata
+    # from there, otherwise from the benchmark file.
+    metadata_filename = benchmark_path + '.expect'
+    if os.path.isfile(metadata_filename):
+        comment_char = '%'
+    else:
+        metadata_filename = benchmark_path
+
+    metadata_lines = None
+    with open(metadata_filename, 'r') as metadata_file:
+        metadata_lines = metadata_file.readlines()
+    metadata_content = ''.join(metadata_lines)
+
+    # Extract the metadata for the benchmark.
+    scrubber = None
+    error_scrubber = None
+    expected_output = ''
+    expected_error = ''
+    expected_exit_status = None
+    command_line = ''
+    for line in metadata_lines:
+        # Skip lines that do not start with "%"
+        if line[0] != comment_char:
+            continue
+        line = line[2:]
+
+        if line.startswith(SCRUBBER):
+            scrubber = line[len(SCRUBBER):]
+        elif line.startswith(ERROR_SCRUBBER):
+            error_scrubber = line[len(ERROR_SCRUBBER):]
+        elif line.startswith(EXPECT):
+            expected_output += line[len(EXPECT):]
+        elif line.startswith(EXPECT_ERROR):
+            expected_error += line[len(EXPECT_ERROR):]
+        elif line.startswith(EXIT):
+            expected_exit_status = int(line[len(EXIT):])
+        elif line.startswith(COMMAND_LINE):
+            command_line += line[len(COMMAND_LINE):]
+    expected_output = expected_output.strip()
+    expected_error = expected_error.strip()
+
+    # Expected output/expected error has not been defined in the metadata for
+    # the benchmark. Try to extract the information from the benchmark itself.
+    if expected_output == '' and expected_error == '':
+        match = None
+        if status_regex:
+            match = re.search(status_regex, metadata_content)
+
+        if match:
+            expected_output = status_to_output(match.group(1))
+        elif expected_exit_status is None:
+            # If there is no expected output/error and the exit status has not
+            # been set explicitly, the benchmark is invalid.
+            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
+
+    if not proof and ('(get-unsat-cores)' in metadata_content
+                      or '(get-unsat-assumptions)' in metadata_content):
+        print(
+            '1..0 # Skipped: unsat cores not supported without proof support')
+        return
+
+    if expected_exit_status is None:
+        expected_exit_status = 0
+
+    if 'CVC4_REGRESSION_ARGS' in os.environ:
+        basic_command_line_args += shlex.split(
+            os.environ['CVC4_REGRESSION_ARGS'])
+    basic_command_line_args += shlex.split(command_line)
+    command_line_args_configs = [basic_command_line_args]
+
+    extra_command_line_args = []
+    if benchmark_ext == '.sy' and \
+            '--no-check-synth-sol' not in basic_command_line_args and \
+            '--check-synth-sol' not in basic_command_line_args:
+        extra_command_line_args = ['--check-synth-sol']
+    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
+            '--no-check-models' not in basic_command_line_args:
+        extra_command_line_args = ['--check-models']
+    if proof and re.search(r'^(sat|valid)$', expected_output):
+        if '--no-check-proofs' not in basic_command_line_args and \
+                '--incremental' not in basic_command_line_args and \
+                '--unconstrained-simp' not in basic_command_line_args and \
+                not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+        if '--no-check-unsat-cores' not in basic_command_line_args and \
+                '--incremental' not in basic_command_line_args and \
+                '--unconstrained-simp' not in basic_command_line_args and \
+                not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+    if extra_command_line_args:
+        command_line_args_configs.append(
+            basic_command_line_args + extra_command_line_args)
+
+    # Run CVC4 on the benchmark with the different option sets and check
+    # whether the exit status, stdout output, stderr output are as expected.
+    print('1..{}'.format(len(command_line_args_configs)))
+    print('# Starting')
+    for command_line_args in command_line_args_configs:
+        output, error, exit_status = run_benchmark(
+            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+            command_line_args, benchmark_dir, benchmark_basename)
+        if output != expected_output:
+            print(
+                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(output.splitlines(),
+                                             expected_output.splitlines()):
+                print(line)
+        elif error != expected_error:
+            print(
+                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(error.splitlines(),
+                                             expected_error.splitlines()):
+                print(line)
+        elif expected_exit_status != exit_status:
+            print(
+                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
+                format(expected_exit_status, exit_status, command_line_args))
+        else:
+            print('ok - Flags: {}'.format(command_line_args))
+
+
+def main():
+    """Parses the command line arguments and then calls the core of the
+    script."""
+
+    parser = argparse.ArgumentParser(
+        description=
+        'Runs benchmark and checks for correct exit status and output.')
+    parser.add_argument('--proof', action='store_true')
+    parser.add_argument('--dump', action='store_true')
+    parser.add_argument('wrapper', nargs='*')
+    parser.add_argument('cvc4_binary')
+    parser.add_argument('benchmark')
+    args = parser.parse_args()
+    cvc4_binary = os.path.abspath(args.cvc4_binary)
+
+    wrapper = args.wrapper
+    if os.environ.get('VALGRIND') == '1' and not wrapper:
+        wrapper = ['libtool', '--mode=execute', 'valgrind']
+
+    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark)
+
+
+if __name__ == "__main__":
+    main()
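As a usage sketch (the paths and benchmark name below are placeholders,
not taken from the commit), the script is invoked directly on a
regression file, matching the usage string in its docstring:

    ./run_regression.py --proof /path/to/build/cvc4 regress0/example.smt2

Setting VALGRIND=1 in the environment makes the script wrap the CVC4
binary in "libtool --mode=execute valgrind", as handled in main().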