author     Andres Noetzli <andres.noetzli@gmail.com>    2018-04-05 21:45:17 -0700
committer  GitHub <noreply@github.com>                  2018-04-05 21:45:17 -0700
commit     efc6163629c6c5de446eccfe81777c93829995d5 (patch)
tree       8681a332d409e94fc4ab7259513bc7935e5568b5 /test/regress/run_regression.py
parent     cf73e3ceb3f93fef830349bd44afec99404b5038 (diff)
Python regression script (#1662)
Until now, we have been using a bash script for running the
regressions. That script had several issues: for example, it created
many temporary files and called itself recursively, which made it
difficult to maintain. This commit replaces it with a Python script.
The Python script uses the TAP protocol, which allows us to report the
results of multiple tests per file (e.g. with --check-models and
without) separately and in more detail. It also prints a line whenever
it starts running a test, which makes it easier to find "stuck"
regression tests on Travis. As recommended in the automake
documentation [0], the commit copies the TAP driver into the source
tree to enable TAP support for the test harness.
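For illustration, a benchmark that is run both with and without
--check-models would produce a TAP stream roughly like the following
(the exact flag lists depend on the benchmark and configuration):

    1..2
    # Starting
    ok - Flags: []
    ok - Flags: ['--check-models']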
Note: the commit also changes some of the regressions slightly because
we were using "%" to mark the test directives in SMT files, even though
the comment character in those files is ";". Using the comment
character simplifies the handling and makes it easier for users to
reproduce a failure (they can simply run CVC4 directly on the
regression file).
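For example, a hypothetical .smt2 regression can now carry its
directives in ordinary SMT-LIB comments, so the file stays directly
runnable with CVC4 (the flag and benchmark below are made up for
illustration):

    ; COMMAND-LINE: --incremental
    ; EXPECT: sat
    (set-logic QF_UF)
    (check-sat)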
[0]
https://www.gnu.org/software/automake/manual/html_node/Use-TAP-with-the-Automake-test-harness.html
Diffstat (limited to 'test/regress/run_regression.py')
-rwxr-xr-x  test/regress/run_regression.py  287
1 file changed, 287 insertions, 0 deletions
diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py
new file mode 100755
index 000000000..c861b0d71
--- /dev/null
+++ b/test/regress/run_regression.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""
+Usage:
+
+    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
+        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]
+
+Runs benchmark and checks for correct exit status and output.
+"""
+
+import argparse
+import difflib
+import os
+import re
+import shlex
+import subprocess
+import sys
+
+SCRUBBER = 'SCRUBBER: '
+ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
+EXPECT = 'EXPECT: '
+EXPECT_ERROR = 'EXPECT-ERROR: '
+EXIT = 'EXIT: '
+COMMAND_LINE = 'COMMAND-LINE: '
+
+
+def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+                  command_line, benchmark_dir, benchmark_filename):
+    """Runs CVC4 on the file `benchmark_filename` in the directory
+    `benchmark_dir` using the binary `cvc4_binary` with the command line
+    options `command_line`. The output is scrubbed using `scrubber` and
+    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
+    function first uses CVC4 to read in and dump the benchmark file and then
+    uses that as input."""
+
+    bin_args = wrapper[:]
+    bin_args.append(cvc4_binary)
+
+    output = None
+    error = None
+    exit_status = None
+    if dump:
+        dump_args = [
+            '--preprocess-only', '--dump', 'raw-benchmark',
+            '--output-lang=smt2', '-qq'
+        ]
+        dump_process = subprocess.Popen(
+            bin_args + command_line + dump_args + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        dump_output, _ = dump_process.communicate()
+        process = subprocess.Popen(
+            bin_args + command_line + ['--lang=smt2', '-'],
+            cwd=benchmark_dir,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate(input=dump_output)
+        exit_status = process.returncode
+    else:
+        process = subprocess.Popen(
+            bin_args + command_line + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate()
+        exit_status = process.returncode
+
+    # If a scrubber command has been specified then apply it to the output.
+    if scrubber:
+        scrubber_process = subprocess.Popen(
+            shlex.split(scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, _ = scrubber_process.communicate(input=output)
+    if error_scrubber:
+        error_scrubber_process = subprocess.Popen(
+            shlex.split(error_scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        error, _ = error_scrubber_process.communicate(input=error)
+
+    # Popen in Python 3 returns a bytes object instead of a string for
+    # stdout/stderr.
+    if isinstance(output, bytes):
+        output = output.decode()
+    if isinstance(error, bytes):
+        error = error.decode()
+    return (output.strip(), error.strip(), exit_status)
+
+
+def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path):
+    """Determines the expected output for a benchmark, runs CVC4 on it and then
+    checks whether the output corresponds to the expected output. Optionally
+    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
+    dumps a benchmark and uses that as the input (if dump is true)."""
+
+    if not os.access(cvc4_binary, os.X_OK):
+        sys.exit(
+            '"{}" does not exist or is not executable'.format(cvc4_binary))
+    if not os.path.isfile(benchmark_path):
+        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))
+
+    basic_command_line_args = []
+
+    benchmark_basename = os.path.basename(benchmark_path)
+    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
+    benchmark_dir = os.path.dirname(benchmark_path)
+    comment_char = '%'
+    status_regex = None
+    status_to_output = lambda s: s
+    if benchmark_ext == '.smt':
+        status_regex = r':status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.smt2':
+        status_regex = r'set-info\s*:status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.cvc':
+        pass
+    elif benchmark_ext == '.p':
+        basic_command_line_args.append('--finite-model-find')
+        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
+        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
+    elif benchmark_ext == '.sy':
+        comment_char = ';'
+        # Do not use proofs/unsat-cores with .sy files
+        proof = False
+    else:
+        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
+            benchmark_basename))
+
+    # If there is an ".expect" file for the benchmark, read the metadata
+    # from there, otherwise from the benchmark file.
+    metadata_filename = benchmark_path + '.expect'
+    if os.path.isfile(metadata_filename):
+        comment_char = '%'
+    else:
+        metadata_filename = benchmark_path
+
+    metadata_lines = None
+    with open(metadata_filename, 'r') as metadata_file:
+        metadata_lines = metadata_file.readlines()
+    metadata_content = ''.join(metadata_lines)
+
+    # Extract the metadata for the benchmark.
+    scrubber = None
+    error_scrubber = None
+    expected_output = ''
+    expected_error = ''
+    expected_exit_status = None
+    command_line = ''
+    for line in metadata_lines:
+        # Skip lines that do not start with "%"
+        if line[0] != comment_char:
+            continue
+        line = line[2:]
+
+        if line.startswith(SCRUBBER):
+            scrubber = line[len(SCRUBBER):]
+        elif line.startswith(ERROR_SCRUBBER):
+            error_scrubber = line[len(ERROR_SCRUBBER):]
+        elif line.startswith(EXPECT):
+            expected_output += line[len(EXPECT):]
+        elif line.startswith(EXPECT_ERROR):
+            expected_error += line[len(EXPECT_ERROR):]
+        elif line.startswith(EXIT):
+            expected_exit_status = int(line[len(EXIT):])
+        elif line.startswith(COMMAND_LINE):
+            command_line += line[len(COMMAND_LINE):]
+    expected_output = expected_output.strip()
+    expected_error = expected_error.strip()
+
+    # Expected output/expected error has not been defined in the metadata for
+    # the benchmark. Try to extract the information from the benchmark itself.
+    if expected_output == '' and expected_error == '':
+        match = None
+        if status_regex:
+            match = re.search(status_regex, metadata_content)
+
+        if match:
+            expected_output = status_to_output(match.group(1))
+        elif expected_exit_status is None:
+            # If there is no expected output/error and the exit status has not
+            # been set explicitly, the benchmark is invalid.
+            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
+
+    if not proof and ('(get-unsat-cores)' in metadata_content
+                      or '(get-unsat-assumptions)' in metadata_content):
+        print(
+            '1..0 # Skipped: unsat cores not supported without proof support')
+        return
+
+    if expected_exit_status is None:
+        expected_exit_status = 0
+
+    if 'CVC4_REGRESSION_ARGS' in os.environ:
+        basic_command_line_args += shlex.split(
+            os.environ['CVC4_REGRESSION_ARGS'])
+    basic_command_line_args += shlex.split(command_line)
+    command_line_args_configs = [basic_command_line_args]
+
+    extra_command_line_args = []
+    if benchmark_ext == '.sy' and \
+            '--no-check-synth-sol' not in basic_command_line_args and \
+            '--check-synth-sol' not in basic_command_line_args:
+        extra_command_line_args = ['--check-synth-sol']
+    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
+            '--no-check-models' not in basic_command_line_args:
+        extra_command_line_args = ['--check-models']
+    if proof and re.search(r'^(sat|valid)$', expected_output):
+        if '--no-check-proofs' not in basic_command_line_args and \
+                '--incremental' not in basic_command_line_args and \
+                '--unconstrained-simp' not in basic_command_line_args and \
+                not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+        if '--no-check-unsat-cores' not in basic_command_line_args and \
+                '--incremental' not in basic_command_line_args and \
+                '--unconstrained-simp' not in basic_command_line_args and \
+                not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+    if extra_command_line_args:
+        command_line_args_configs.append(
+            basic_command_line_args + extra_command_line_args)
+
+    # Run CVC4 on the benchmark with the different option sets and check
+    # whether the exit status, stdout output, stderr output are as expected.
+    print('1..{}'.format(len(command_line_args_configs)))
+    print('# Starting')
+    for command_line_args in command_line_args_configs:
+        output, error, exit_status = run_benchmark(
+            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+            command_line_args, benchmark_dir, benchmark_basename)
+        if output != expected_output:
+            print(
+                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(output.splitlines(),
+                                             expected_output.splitlines()):
+                print(line)
+        elif error != expected_error:
+            print(
+                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(error.splitlines(),
+                                             expected_error.splitlines()):
+                print(line)
+        elif expected_exit_status != exit_status:
+            print(
+                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
+                format(expected_exit_status, exit_status, command_line_args))
+        else:
+            print('ok - Flags: {}'.format(command_line_args))
+
+
+def main():
+    """Parses the command line arguments and then calls the core of the
+    script."""
+
+    parser = argparse.ArgumentParser(
+        description=
+        'Runs benchmark and checks for correct exit status and output.')
+    parser.add_argument('--proof', action='store_true')
+    parser.add_argument('--dump', action='store_true')
+    parser.add_argument('wrapper', nargs='*')
+    parser.add_argument('cvc4_binary')
+    parser.add_argument('benchmark')
+    args = parser.parse_args()
+    cvc4_binary = os.path.abspath(args.cvc4_binary)
+
+    wrapper = args.wrapper
+    if os.environ.get('VALGRIND') == '1' and not wrapper:
+        wrapper = ['libtool', '--mode=execute', 'valgrind']
+
+    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark)
+
+
+if __name__ == "__main__":
+    main()
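As a usage sketch (the paths and benchmark name below are placeholders,
not taken from the commit), the script is invoked directly on a
regression file, matching the usage string in its docstring:

    ./run_regression.py --proof /path/to/build/cvc4 regress0/example.smt2

Setting VALGRIND=1 in the environment makes the script wrap the CVC4
binary in "libtool --mode=execute valgrind", as handled in main().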