diff options
author | Andres Noetzli <andres.noetzli@gmail.com> | 2018-04-05 21:45:17 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-05 21:45:17 -0700 |
commit | efc6163629c6c5de446eccfe81777c93829995d5 (patch) | |
tree | 8681a332d409e94fc4ab7259513bc7935e5568b5 /test | |
parent | cf73e3ceb3f93fef830349bd44afec99404b5038 (diff) |
Python regression script (#1662)
Until now, we have been using a bash script for running the
regressions. That script had several issues, e.g. it was creating many
temporary files and it was calling itself recursively, making it
difficult to maintain. This commit replaces the script with a Python
script. The Python script uses the TAP protocol, which allows us to
display the results of multiple tests per file (e.g. with --check-models
and without) separately and with more details. It also outputs whenever it starts running
a test, which allows finding "stuck" regression tests on Travis. As recommended in the
automake documentation [0], the commit copies the TAP driver to enable
TAP support for the test driver.
Note: the commit also changes some of the regressions slightly because
we were using "%" for the test directives in SMT files which does not
correspond to the comment character ";". Using the comment character
simplifies the handling and makes it easier for users to reproduce a
failure (they can simply run CVC4 on the regression file).
[0]
https://www.gnu.org/software/automake/manual/html_node/Use-TAP-with-the-Automake-test-harness.html
Diffstat (limited to 'test')
-rw-r--r-- | test/regress/Makefile.am | 8 | ||||
-rw-r--r-- | test/regress/regress0/arrays/incorrect10.smt | 4 | ||||
-rw-r--r-- | test/regress/regress0/expect/scrub.02.smt | 10 | ||||
-rw-r--r-- | test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt | 4 | ||||
-rw-r--r-- | test/regress/regress0/fmf/QEpres-uf.855035.smt | 4 | ||||
-rw-r--r-- | test/regress/regress1/fmf/Hoare-z3.931718.smt | 4 | ||||
-rwxr-xr-x | test/regress/run_regression | 424 | ||||
-rwxr-xr-x | test/regress/run_regression.py | 287 |
8 files changed, 305 insertions, 440 deletions
diff --git a/test/regress/Makefile.am b/test/regress/Makefile.am index f068ce9ad..19cbbae4a 100644 --- a/test/regress/Makefile.am +++ b/test/regress/Makefile.am @@ -9,7 +9,9 @@ TESTS = $(REG0_TESTS) $(REG1_TESTS) $(REG2_TESTS) $(REG3_TESTS) $(REG4_TESTS) @mk_empty@BINARY = cvc4 end@mk_if@ -LOG_COMPILER = $(top_srcdir)/test/regress/run_regression +LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ + $(top_srcdir)/config/tap-driver.sh --comments +LOG_COMPILER = @abs_top_srcdir@/test/regress/run_regression.py AM_LOG_FLAGS = $(RUN_REGRESSION_ARGS) @abs_top_builddir@/src/main/$(BINARY)$(EXEEXT) if AUTOMAKE_1_11 @@ -55,5 +57,5 @@ EXTRA_DIST = \ regress0/tptp/Axioms/SYN000_0.ax \ Makefile.levels \ Makefile.tests \ - run_regression \ - README.md + run_regression.py \ + README.md
\ No newline at end of file diff --git a/test/regress/regress0/arrays/incorrect10.smt b/test/regress/regress0/arrays/incorrect10.smt index 6fbdb7ee9..6aacc37c0 100644 --- a/test/regress/regress0/arrays/incorrect10.smt +++ b/test/regress/regress0/arrays/incorrect10.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --no-check-proofs -% EXPECT: unsat +; COMMAND-LINE: --no-check-proofs +; EXPECT: unsat (benchmark fuzzsmt :logic QF_AX :status unsat diff --git a/test/regress/regress0/expect/scrub.02.smt b/test/regress/regress0/expect/scrub.02.smt index 145801619..65c61ecd0 100644 --- a/test/regress/regress0/expect/scrub.02.smt +++ b/test/regress/regress0/expect/scrub.02.smt @@ -1,8 +1,8 @@ -% SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' -% EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. -% EXPECT: The fact in question: TERM -% EXPECT: ") -% EXIT: 1 +; SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' +; EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. 
+; EXPECT: The fact in question: TERM +; EXPECT: ") +; EXIT: 1 (benchmark reject_nonlinear :logic QF_LRA :extrafuns ((n Real)) diff --git a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt index f62f057e4..e8c7949dc 100644 --- a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt +++ b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: unsat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: unsat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress0/fmf/QEpres-uf.855035.smt b/test/regress/regress0/fmf/QEpres-uf.855035.smt index 2945c8f4d..4fe592638 100644 --- a/test/regress/regress0/fmf/QEpres-uf.855035.smt +++ b/test/regress/regress0/fmf/QEpres-uf.855035.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: sat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress1/fmf/Hoare-z3.931718.smt b/test/regress/regress1/fmf/Hoare-z3.931718.smt index 754e19e88..4ab90756c 100644 --- a/test/regress/regress1/fmf/Hoare-z3.931718.smt +++ b/test/regress/regress1/fmf/Hoare-z3.931718.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find -% EXPECT: sat +; COMMAND-LINE: --finite-model-find +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/run_regression b/test/regress/run_regression deleted file mode 100755 index 4ae013911..000000000 --- a/test/regress/run_regression +++ /dev/null @@ -1,424 +0,0 @@ -#!/bin/bash -# -# Morgan Deters <mdeters@cs.nyu.edu> -# for the CVC4 project -# -# usage: -# -# run_regression [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] -# -# Runs benchmark and checks for correct exit status and output. 
-# -# The TEST_GROUP and TEST_GROUPS environment variables can be used to split the -# benchmarks into groups and only run one of the groups. The benchmarks are -# associated with a group based on a hash of their name. TEST_GROUPS controls -# how many groups are created while TEST_GROUP specifies the group to run. -# Currently, the primary use case for this is to split the benchmarks across -# multiple Travis jobs. - -# ulimit -t 1 # For detecting long running regressions -ulimit -s 65000 # Needed for some (esp. portfolio and model-building) - -prog=`basename "$0"` - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -proof=no -dump=no -if [ x"$1" = x--proof ]; then - proof=yes - shift -elif [ x"$1" = x--dump ]; then - dump=yes - shift -fi - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -wrapper= -while [ $# -gt 2 ]; do - wrapper="$wrapper$1 " - shift -done - -[[ "$VALGRIND" = "1" ]] && { - wrapper="libtool --mode=execute valgrind $wrapper" -} - -cvc4=$1 -benchmark_orig=$2 -benchmark="$benchmark_orig" - -if [ -n "$TEST_GROUP" ]; then - # Use the checksum of the benchmark name to determine the group of the - # benchmark - benchmark_group=$(( $(echo "$benchmark" | cksum | cut -d " " -f1) % $TEST_GROUPS )) - if (( $benchmark_group != $TEST_GROUP )); then - # Skip the benchmark if it is not in the expected test group - exit 77 - fi -fi - -function error { - echo "$prog: error: $*" - exit 1 -} - -if ! [ -x "$cvc4" ]; then - error "\`$cvc4' doesn't exist or isn't executable" >&2 -fi -if ! 
[ -r "$benchmark" ]; then - error "\`$benchmark' doesn't exist or isn't readable" >&2 -fi - -# gettemp() and its associated tempfiles[] array are intended to never -# allow a temporary file to leak---the trap ensures that when this script -# exits, whether via a regular exit or an -INT or other signal, the -# temp files are deleted. -declare -a tempfiles -trap -- 'test ${#tempfiles[@]} -gt 0 && rm -f "${tempfiles[@]}"' EXIT -function gettemp { - local temp="`mktemp -t "$2"`" - tempfiles[${#tempfiles[@]}]="$temp" - eval "$1"="$temp" -} - -tmpbenchmark= -if expr "$benchmark" : '.*\.smt$' &>/dev/null; then - lang=smt1 - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - # old mktemp from coreutils 7.x is broken, can't do XXXX in the middle - # 
this frustrates our auto-language-detection - gettemp tmpbenchmark cvc4_benchmark.smt.$$.XXXXXXXXXX - grep -v '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" >"$tmpbenchmark" - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - benchmark=$tmpbenchmark - elif grep '^ *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - expected_exit_status=0 - command_line= - elif grep '^ *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - expected_exit_status=0 - command_line= - else - error "cannot determine status of \`$benchmark'" - fi -elif expr "$benchmark" : '.*\.smt2$' &>/dev/null; then - lang=smt2 - - # If this test case requires unsat cores but CVC4 is not built with proof - # support, skip it. Note: checking $CVC4_REGRESSION_ARGS instead of $proof - # here because $proof is not set for the second run. - requires_proof=`grep '(get-unsat-core)\|(get-unsat-assumptions)' "$benchmark"` - if [[ ! 
"$CVC4_REGRESSION_ARGS" = *"--proof"* ]] && [ -n "$requires_proof" ]; then - exit 77 - fi - - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - fi - - # If expected output/error was not given, try to determine the status from - # the commands. - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep '^ *( *set-info *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - elif grep '^ *( *set-info *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - fi - fi - - # A valid test case needs at least an expected output (explicit or through - # SMT2 commands) or an expected exit status. 
- if [ -z "$expected_output" -a -z "$expected_error" -a -z "$expected_exit_status" ]; then - error "cannot determine status of \`$benchmark'" - fi - - # If no explicit expected exit status was given, we assume that the solver is - # supposed to succeed. - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi -elif expr "$benchmark" : '.*\.cvc$' &>/dev/null; then - lang=cvc4 - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` -elif expr "$benchmark" : '.*\.p$' &>/dev/null; then - lang=tptp - command_line=--finite-model-find - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep -q '^% Status *: ' "$benchmark"; then - expected_output="$(grep '^% *Status *: ' "$benchmark" | head -1 | awk '{print$NF}')" - case "$expected_output" in - Theorem|Unsatisfiable) expected_exit_status=0 ;; - CounterSatisfiable|Satisfiable) expected_exit_status=0 ;; - GaveUp) expected_exit_status=0 ;; - esac 
- expected_output="% SZS status $expected_output for $(basename "$benchmark" | sed 's,\.p$,,')" - else - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - else - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - fi - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - if grep -q '^% COMMAND-LINE: ' "$benchmark"; then - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - fi -elif expr "$benchmark" : '.*\.sy$' &>/dev/null; then - lang=sygus - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - 
if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - else - error "cannot determine expected output of \`$benchmark'" - fi -else - error "benchmark \`$benchmark' must be *.cvc or *.smt or *.smt2 or *.p or *.sy" -fi - -command_line="--lang=$lang ${command_line:+$command_line }" - -gettemp expoutfile cvc4_expect_stdout.$$.XXXXXXXXXX -gettemp experrfile cvc4_expect_stderr.$$.XXXXXXXXXX -gettemp outfile cvc4_stdout.$$.XXXXXXXXXX -gettemp errfile cvc4_stderr.$$.XXXXXXXXXX -gettemp errfilefix cvc4_stderr.$$.XXXXXXXXXX -gettemp exitstatusfile cvc4_exitstatus.$$.XXXXXXXXXX - -if [ -z "$expected_output" ]; then - # in case expected stdout output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$expoutfile" - touch "$expoutfile" -else - echo "$expected_output" >"$expoutfile" -fi - -if [ "$lang" = sygus ]; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-synth-sol' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-synth-sol' &>/dev/null; then - # later on, we'll run another test with --check-models on - command_line="$command_line --check-synth-sol" - fi -fi -check_models=false -if grep '^sat$' "$expoutfile" &>/dev/null || grep '^invalid$' "$expoutfile" &>/dev/null || grep '^unknown$' "$expoptfile" &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-models' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-models' &>/dev/null; then - # later on, we'll run another test with --check-models on - check_models=true - fi -fi -check_proofs=false -check_unsat_cores=false -if [ "$proof" = yes ]; then - # proofs not currently supported in incremental mode, turn it off - if grep '^unsat$' "$expoutfile" &>/dev/null || grep '^valid$' "$expoutfile" &>/dev/null &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-proofs' &>/dev/null && - ! 
expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-proofs' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-proofs on - check_proofs=true - fi - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-unsat-cores on - check_unsat_cores=true - fi - fi -fi - -if [ -z "$expected_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$experrfile" -else - echo "$expected_error" >"$experrfile" -fi - -cvc4dir=`dirname "$cvc4"` -cvc4dirfull=`cd "$cvc4dir" && pwd` -if [ -z "$cvc4dirfull" ]; then - error "getting directory of \`$cvc4 !?" -fi -cvc4base=`basename "$cvc4"` -cvc4full="$cvc4dirfull/$cvc4base" -if [ $dump = no ]; then - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"` [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"`; - echo $? 
>"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -else - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` \| $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --lang=smt2 - [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` | $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --lang=smt2 -; - echo $? >"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -fi - -# we have to actual error file same treatment as other files. differences in -# versions of echo/bash were causing failure on some platforms and not on others -# (also grep out valgrind output, if 0 errors reported by valgrind) -actual_error=$(cat $errfile) -if [[ "$VALGRIND" = "1" ]]; then - #valgrind_output=$(cat $errfile|grep -E "^==[0-9]+== "|) - valgrind_num_errors=$(cat $errfile|grep -E "^==[0-9]+== "|tail -n1|awk '{print $4}') - echo "valgrind errors (not suppressed): $valgrind_num_errors" 1>&2 - - ((valgrind_num_errors == 0)) && actual_error=$(echo "$actual_error"|grep -vE "^==[0-9]+== ") -fi -if [ -z "$actual_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$errfilefix" -else - echo "$actual_error" >"$errfilefix" -fi - -# Scrub the output file if a scrubber has been specified. -if [ -n "$scrubber" ]; then - echo "About to scrub $outfile using $scrubber" - mv "$outfile" "$outfile.prescrub" - cat "$outfile.prescrub" | eval $scrubber >"$outfile" -else - echo "not scrubbing" -fi - -# Scrub the error file if a scrubber has been specified. 
-if [ -n "$errscrubber" ]; then - echo "About to scrub $errfilefix using $errscrubber" - mv "$errfilefix" "$errfilefix.prescrub" - cat "$errfilefix.prescrub" | eval $errscrubber >"$errfilefix" -else - echo "not scrubbing error file" -fi - -diffs=`diff -u --strip-trailing-cr "$expoutfile" "$outfile"` -diffexit=$? -diffserr=`diff -u --strip-trailing-cr "$experrfile" "$errfilefix"` -diffexiterr=$? -exit_status=`cat "$exitstatusfile"` - -exitcode=0 -if [ $diffexit -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stdout" - echo "$diffs" - exitcode=1 -fi -if [ $diffexiterr -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stderr" - echo "$diffserr" - exitcode=1 -fi - -if [ "$exit_status" != "$expected_exit_status" ]; then - echo "$prog: error: expected exit status \`$expected_exit_status' but got \`$exit_status'" - exitcode=1 -fi - -if $check_models || $check_proofs || $check_unsat_cores; then - check_cmdline= - if $check_models; then - check_cmdline="$check_cmdline --check-models" - fi - if $check_proofs; then - # taking: Make the extra flags part of --check-proofs. - check_cmdline="$check_cmdline --check-proofs --no-bv-eq --no-bv-ineq --no-bv-algebraic" - fi - if $check_unsat_cores; then - check_cmdline="$check_cmdline --check-unsat-cores" - fi - # run an extra model/proof/core-checking pass - if ! CVC4_REGRESSION_ARGS="$CVC4_REGRESSION_ARGS$check_cmdline" "$0" $wrapper "$cvc4" "$benchmark_orig"; then - exitcode=1 - fi -fi - - -exit $exitcode diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py new file mode 100755 index 000000000..c861b0d71 --- /dev/null +++ b/test/regress/run_regression.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +""" +Usage: + + run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary + [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] + +Runs benchmark and checks for correct exit status and output. 
+""" + +import argparse +import difflib +import os +import re +import shlex +import subprocess +import sys + +SCRUBBER = 'SCRUBBER: ' +ERROR_SCRUBBER = 'ERROR-SCRUBBER: ' +EXPECT = 'EXPECT: ' +EXPECT_ERROR = 'EXPECT-ERROR: ' +EXIT = 'EXIT: ' +COMMAND_LINE = 'COMMAND-LINE: ' + + +def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line, benchmark_dir, benchmark_filename): + """Runs CVC4 on the file `benchmark_filename` in the directory + `benchmark_dir` using the binary `cvc4_binary` with the command line + options `command_line`. The output is scrubbed using `scrubber` and + `error_scrubber` for stdout and stderr, respectively. If dump is true, the + function first uses CVC4 to read in and dump the benchmark file and then + uses that as input.""" + + bin_args = wrapper[:] + bin_args.append(cvc4_binary) + + output = None + error = None + exit_status = None + if dump: + dump_args = [ + '--preprocess-only', '--dump', 'raw-benchmark', + '--output-lang=smt2', '-qq' + ] + dump_process = subprocess.Popen( + bin_args + command_line + dump_args + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + dump_output, _ = dump_process.communicate() + process = subprocess.Popen( + bin_args + command_line + ['--lang=smt2', '-'], + cwd=benchmark_dir, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate(input=dump_output) + exit_status = process.returncode + else: + process = subprocess.Popen( + bin_args + command_line + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate() + exit_status = process.returncode + + # If a scrubber command has been specified then apply it to the output. 
+ if scrubber: + scrubber_process = subprocess.Popen( + shlex.split(scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, _ = scrubber_process.communicate(input=output) + if error_scrubber: + error_scrubber_process = subprocess.Popen( + shlex.split(error_scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + error, _ = error_scrubber_process.communicate(input=error) + + # Popen in Python 3 returns a bytes object instead of a string for + # stdout/stderr. + if isinstance(output, bytes): + output = output.decode() + if isinstance(error, bytes): + error = error.decode() + return (output.strip(), error.strip(), exit_status) + + +def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path): + """Determines the expected output for a benchmark, runs CVC4 on it and then + checks whether the output corresponds to the expected output. Optionally + uses a wrapper `wrapper`, tests proof generation (if proof is true), or + dumps a benchmark and uses that as the input (if dump is true).""" + + if not os.access(cvc4_binary, os.X_OK): + sys.exit( + '"{}" does not exist or is not executable'.format(cvc4_binary)) + if not os.path.isfile(benchmark_path): + sys.exit('"{}" does not exist or is not a file'.format(benchmark_path)) + + basic_command_line_args = [] + + benchmark_basename = os.path.basename(benchmark_path) + benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename) + benchmark_dir = os.path.dirname(benchmark_path) + comment_char = '%' + status_regex = None + status_to_output = lambda s: s + if benchmark_ext == '.smt': + status_regex = r':status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.smt2': + status_regex = r'set-info\s*:status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.cvc': + pass + elif benchmark_ext == '.p': + basic_command_line_args.append('--finite-model-find') + status_regex = r'% 
Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)' + status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename) + elif benchmark_ext == '.sy': + comment_char = ';' + # Do not use proofs/unsat-cores with .sy files + proof = False + else: + sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format( + benchmark_basename)) + + # If there is an ".expect" file for the benchmark, read the metadata + # from there, otherwise from the benchmark file. + metadata_filename = benchmark_path + '.expect' + if os.path.isfile(metadata_filename): + comment_char = '%' + else: + metadata_filename = benchmark_path + + metadata_lines = None + with open(metadata_filename, 'r') as metadata_file: + metadata_lines = metadata_file.readlines() + metadata_content = ''.join(metadata_lines) + + # Extract the metadata for the benchmark. + scrubber = None + error_scrubber = None + expected_output = '' + expected_error = '' + expected_exit_status = None + command_line = '' + for line in metadata_lines: + # Skip lines that do not start with "%" + if line[0] != comment_char: + continue + line = line[2:] + + if line.startswith(SCRUBBER): + scrubber = line[len(SCRUBBER):] + elif line.startswith(ERROR_SCRUBBER): + error_scrubber = line[len(ERROR_SCRUBBER):] + elif line.startswith(EXPECT): + expected_output += line[len(EXPECT):] + elif line.startswith(EXPECT_ERROR): + expected_error += line[len(EXPECT_ERROR):] + elif line.startswith(EXIT): + expected_exit_status = int(line[len(EXIT):]) + elif line.startswith(COMMAND_LINE): + command_line += line[len(COMMAND_LINE):] + expected_output = expected_output.strip() + expected_error = expected_error.strip() + + # Expected output/expected error has not been defined in the metadata for + # the benchmark. Try to extract the information from the benchmark itself. 
+ if expected_output == '' and expected_error == '': + match = None + if status_regex: + match = re.search(status_regex, metadata_content) + + if match: + expected_output = status_to_output(match.group(1)) + elif expected_exit_status is None: + # If there is no expected output/error and the exit status has not + # been set explicitly, the benchmark is invalid. + sys.exit('Cannot determine status of "{}"'.format(benchmark_path)) + + if not proof and ('(get-unsat-cores)' in metadata_content + or '(get-unsat-assumptions)' in metadata_content): + print( + '1..0 # Skipped: unsat cores not supported without proof support') + return + + if expected_exit_status is None: + expected_exit_status = 0 + + if 'CVC4_REGRESSION_ARGS' in os.environ: + basic_command_line_args += shlex.split( + os.environ['CVC4_REGRESSION_ARGS']) + basic_command_line_args += shlex.split(command_line) + command_line_args_configs = [basic_command_line_args] + + extra_command_line_args = [] + if benchmark_ext == '.sy' and \ + '--no-check-synth-sol' not in basic_command_line_args and \ + '--check-synth-sol' not in basic_command_line_args: + extra_command_line_args = ['--check-synth-sol'] + if re.search(r'^(sat|invalid|unknown)$', expected_output) and \ + '--no-check-models' not in basic_command_line_args: + extra_command_line_args = ['--check-models'] + if proof and re.search(r'^(sat|valid)$', expected_output): + if '--no-check-proofs' not in basic_command_line_args and \ + '--incremental' not in basic_command_line_args and \ + '--unconstrained-simp' not in basic_command_line_args and \ + not cvc4_binary.endswith('pcvc4'): + extra_command_line_args = [ + '--check-proofs', '--no-bv-eq', '--no-bv-ineq', + '--no-bv-algebraic' + ] + if '--no-check-unsat-cores' not in basic_command_line_args and \ + '--incremental' not in basic_command_line_args and \ + '--unconstrained-simp' not in basic_command_line_args and \ + not cvc4_binary.endswith('pcvc4'): + extra_command_line_args = [ + '--check-proofs', 
'--no-bv-eq', '--no-bv-ineq', + '--no-bv-algebraic' + ] + if extra_command_line_args: + command_line_args_configs.append( + basic_command_line_args + extra_command_line_args) + + # Run CVC4 on the benchmark with the different option sets and check + # whether the exit status, stdout output, stderr output are as expected. + print('1..{}'.format(len(command_line_args_configs))) + print('# Starting') + for command_line_args in command_line_args_configs: + output, error, exit_status = run_benchmark( + dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line_args, benchmark_dir, benchmark_basename) + if output != expected_output: + print( + 'not ok - Differences between expected and actual output on stdout - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(output.splitlines(), + expected_output.splitlines()): + print(line) + elif error != expected_error: + print( + 'not ok - Differences between expected and actual output on stderr - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(error.splitlines(), + expected_error.splitlines()): + print(line) + elif expected_exit_status != exit_status: + print( + 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'. 
+ format(expected_exit_status, exit_status, command_line_args)) + else: + print('ok - Flags: {}'.format(command_line_args)) + + +def main(): + """Parses the command line arguments and then calls the core of the + script.""" + + parser = argparse.ArgumentParser( + description= + 'Runs benchmark and checks for correct exit status and output.') + parser.add_argument('--proof', action='store_true') + parser.add_argument('--dump', action='store_true') + parser.add_argument('wrapper', nargs='*') + parser.add_argument('cvc4_binary') + parser.add_argument('benchmark') + args = parser.parse_args() + cvc4_binary = os.path.abspath(args.cvc4_binary) + + wrapper = args.wrapper + if os.environ.get('VALGRIND') == '1' and not wrapper: + wrapper = ['libtool', '--mode=execute', 'valgrind'] + + run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark) + + +if __name__ == "__main__": + main() |