diff options
-rwxr-xr-x | config/tap-driver.sh | 651 | ||||
-rw-r--r-- | configure.ac | 1 | ||||
-rw-r--r-- | test/regress/Makefile.am | 8 | ||||
-rw-r--r-- | test/regress/regress0/arrays/incorrect10.smt | 4 | ||||
-rw-r--r-- | test/regress/regress0/expect/scrub.02.smt | 10 | ||||
-rw-r--r-- | test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt | 4 | ||||
-rw-r--r-- | test/regress/regress0/fmf/QEpres-uf.855035.smt | 4 | ||||
-rw-r--r-- | test/regress/regress1/fmf/Hoare-z3.931718.smt | 4 | ||||
-rwxr-xr-x | test/regress/run_regression | 424 | ||||
-rwxr-xr-x | test/regress/run_regression.py | 287 |
10 files changed, 957 insertions, 440 deletions
diff --git a/config/tap-driver.sh b/config/tap-driver.sh new file mode 100755 index 000000000..d3e68bdc1 --- /dev/null +++ b/config/tap-driver.sh @@ -0,0 +1,651 @@ +#! /bin/sh +# Copyright (C) 2011-2018 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to <bug-automake@gnu.org> or send patches to +# <automake-patches@gnu.org>. + +scriptversion=2013-12-23.17; # UTC + +# Make unconditional expansion of undefined variables an error. This +# helps a lot in preventing typo-related bugs. +set -u + +me=tap-driver.sh + +fatal () +{ + echo "$me: fatal: $*" >&2 + exit 1 +} + +usage_error () +{ + echo "$me: $*" >&2 + print_usage >&2 + exit 2 +} + +print_usage () +{ + cat <<END +Usage: + tap-driver.sh --test-name=NAME --log-file=PATH --trs-file=PATH + [--expect-failure={yes|no}] [--color-tests={yes|no}] + [--enable-hard-errors={yes|no}] [--ignore-exit] + [--diagnostic-string=STRING] [--merge|--no-merge] + [--comments|--no-comments] [--] TEST-COMMAND +The '--test-name', '-log-file' and '--trs-file' options are mandatory. +END +} + +# TODO: better error handling in option parsing (in particular, ensure +# TODO: $log_file, $trs_file and $test_name are defined). +test_name= # Used for reporting. +log_file= # Where to save the result and output of the test script. +trs_file= # Where to save the metadata of the test run. +expect_failure=0 +color_tests=0 +merge=0 +ignore_exit=0 +comments=0 +diag_string='#' +while test $# -gt 0; do + case $1 in + --help) print_usage; exit $?;; + --version) echo "$me $scriptversion"; exit $?;; + --test-name) test_name=$2; shift;; + --log-file) log_file=$2; shift;; + --trs-file) trs_file=$2; shift;; + --color-tests) color_tests=$2; shift;; + --expect-failure) expect_failure=$2; shift;; + --enable-hard-errors) shift;; # No-op. + --merge) merge=1;; + --no-merge) merge=0;; + --ignore-exit) ignore_exit=1;; + --comments) comments=1;; + --no-comments) comments=0;; + --diagnostic-string) diag_string=$2; shift;; + --) shift; break;; + -*) usage_error "invalid option: '$1'";; + esac + shift +done + +test $# -gt 0 || usage_error "missing test command" + +case $expect_failure in + yes) expect_failure=1;; + *) expect_failure=0;; +esac + +if test $color_tests = yes; then + init_colors=' + color_map["red"]="[0;31m" # Red. + color_map["grn"]="[0;32m" # Green. + color_map["lgn"]="[1;32m" # Light green. + color_map["blu"]="[1;34m" # Blue. + color_map["mgn"]="[0;35m" # Magenta. + color_map["std"]="[m" # No color. + color_for_result["ERROR"] = "mgn" + color_for_result["PASS"] = "grn" + color_for_result["XPASS"] = "red" + color_for_result["FAIL"] = "red" + color_for_result["XFAIL"] = "lgn" + color_for_result["SKIP"] = "blu"' +else + init_colors='' +fi + +# :; is there to work around a bug in bash 3.2 (and earlier) which +# does not always set '$?' properly on redirection failure. +# See the Autoconf manual for more details. +:;{ + ( + # Ignore common signals (in this subshell only!), to avoid potential + # problems with Korn shells. Some Korn shells are known to propagate + # to themselves signals that have killed a child process they were + # waiting for; this is done at least for SIGINT (and usually only for + # it, in truth). Without the `trap' below, such a behaviour could + # cause a premature exit in the current subshell, e.g., in case the + # test command it runs gets terminated by a SIGINT. Thus, the awk + # script we are piping into would never seen the exit status it + # expects on its last input line (which is displayed below by the + # last `echo $?' statement), and would thus die reporting an internal + # error. + # For more information, see the Autoconf manual and the threads: + # <https://lists.gnu.org/archive/html/bug-autoconf/2011-09/msg00004.html> + # <http://mail.opensolaris.org/pipermail/ksh93-integration-discuss/2009-February/004121.html> + trap : 1 3 2 13 15 + if test $merge -gt 0; then + exec 2>&1 + else + exec 2>&3 + fi + "$@" + echo $? + ) | LC_ALL=C ${AM_TAP_AWK-awk} \ + -v me="$me" \ + -v test_script_name="$test_name" \ + -v log_file="$log_file" \ + -v trs_file="$trs_file" \ + -v expect_failure="$expect_failure" \ + -v merge="$merge" \ + -v ignore_exit="$ignore_exit" \ + -v comments="$comments" \ + -v diag_string="$diag_string" \ +' +# TODO: the usages of "cat >&3" below could be optimized when using +# GNU awk, and/on on systems that supports /dev/fd/. + +# Implementation note: in what follows, `result_obj` will be an +# associative array that (partly) simulates a TAP result object +# from the `TAP::Parser` perl module. + +## ----------- ## +## FUNCTIONS ## +## ----------- ## + +function fatal(msg) +{ + print me ": " msg | "cat >&2" + exit 1 +} + +function abort(where) +{ + fatal("internal error " where) +} + +# Convert a boolean to a "yes"/"no" string. +function yn(bool) +{ + return bool ? "yes" : "no"; +} + +function add_test_result(result) +{ + if (!test_results_index) + test_results_index = 0 + test_results_list[test_results_index] = result + test_results_index += 1 + test_results_seen[result] = 1; +} + +# Whether the test script should be re-run by "make recheck". +function must_recheck() +{ + for (k in test_results_seen) + if (k != "XFAIL" && k != "PASS" && k != "SKIP") + return 1 + return 0 +} + +# Whether the content of the log file associated to this test should +# be copied into the "global" test-suite.log. +function copy_in_global_log() +{ + for (k in test_results_seen) + if (k != "PASS") + return 1 + return 0 +} + +function get_global_test_result() +{ + if ("ERROR" in test_results_seen) + return "ERROR" + if ("FAIL" in test_results_seen || "XPASS" in test_results_seen) + return "FAIL" + all_skipped = 1 + for (k in test_results_seen) + if (k != "SKIP") + all_skipped = 0 + if (all_skipped) + return "SKIP" + return "PASS"; +} + +function stringify_result_obj(result_obj) +{ + if (result_obj["is_unplanned"] || result_obj["number"] != testno) + return "ERROR" + + if (plan_seen == LATE_PLAN) + return "ERROR" + + if (result_obj["directive"] == "TODO") + return result_obj["is_ok"] ? "XPASS" : "XFAIL" + + if (result_obj["directive"] == "SKIP") + return result_obj["is_ok"] ? "SKIP" : COOKED_FAIL; + + if (length(result_obj["directive"])) + abort("in function stringify_result_obj()") + + return result_obj["is_ok"] ? COOKED_PASS : COOKED_FAIL +} + +function decorate_result(result) +{ + color_name = color_for_result[result] + if (color_name) + return color_map[color_name] "" result "" color_map["std"] + # If we are not using colorized output, or if we do not know how + # to colorize the given result, we should return it unchanged. + return result +} + +function report(result, details) +{ + if (result ~ /^(X?(PASS|FAIL)|SKIP|ERROR)/) + { + msg = ": " test_script_name + add_test_result(result) + } + else if (result == "#") + { + msg = " " test_script_name ":" + } + else + { + abort("in function report()") + } + if (length(details)) + msg = msg " " details + # Output on console might be colorized. + print decorate_result(result) msg + # Log the result in the log file too, to help debugging (this is + # especially true when said result is a TAP error or "Bail out!"). + print result msg | "cat >&3"; +} + +function testsuite_error(error_message) +{ + report("ERROR", "- " error_message) +} + +function handle_tap_result() +{ + details = result_obj["number"]; + if (length(result_obj["description"])) + details = details " " result_obj["description"] + + if (plan_seen == LATE_PLAN) + { + details = details " # AFTER LATE PLAN"; + } + else if (result_obj["is_unplanned"]) + { + details = details " # UNPLANNED"; + } + else if (result_obj["number"] != testno) + { + details = sprintf("%s # OUT-OF-ORDER (expecting %d)", + details, testno); + } + else if (result_obj["directive"]) + { + details = details " # " result_obj["directive"]; + if (length(result_obj["explanation"])) + details = details " " result_obj["explanation"] + } + + report(stringify_result_obj(result_obj), details) +} + +# `skip_reason` should be empty whenever planned > 0. +function handle_tap_plan(planned, skip_reason) +{ + planned += 0 # Avoid getting confused if, say, `planned` is "00" + if (length(skip_reason) && planned > 0) + abort("in function handle_tap_plan()") + if (plan_seen) + { + # Error, only one plan per stream is acceptable. + testsuite_error("multiple test plans") + return; + } + planned_tests = planned + # The TAP plan can come before or after *all* the TAP results; we speak + # respectively of an "early" or a "late" plan. If we see the plan line + # after at least one TAP result has been seen, assume we have a late + # plan; in this case, any further test result seen after the plan will + # be flagged as an error. + plan_seen = (testno >= 1 ? LATE_PLAN : EARLY_PLAN) + # If testno > 0, we have an error ("too many tests run") that will be + # automatically dealt with later, so do not worry about it here. If + # $plan_seen is true, we have an error due to a repeated plan, and that + # has already been dealt with above. Otherwise, we have a valid "plan + # with SKIP" specification, and should report it as a particular kind + # of SKIP result. + if (planned == 0 && testno == 0) + { + if (length(skip_reason)) + skip_reason = "- " skip_reason; + report("SKIP", skip_reason); + } +} + +function extract_tap_comment(line) +{ + if (index(line, diag_string) == 1) + { + # Strip leading `diag_string` from `line`. + line = substr(line, length(diag_string) + 1) + # And strip any leading and trailing whitespace left. + sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + # Return what is left (if any). + return line; + } + return ""; +} + +# When this function is called, we know that line is a TAP result line, +# so that it matches the (perl) RE "^(not )?ok\b". +function setup_result_obj(line) +{ + # Get the result, and remove it from the line. + result_obj["is_ok"] = (substr(line, 1, 2) == "ok" ? 1 : 0) + sub("^(not )?ok[ \t]*", "", line) + + # If the result has an explicit number, get it and strip it; otherwise, + # automatically assing the next progresive number to it. + if (line ~ /^[0-9]+$/ || line ~ /^[0-9]+[^a-zA-Z0-9_]/) + { + match(line, "^[0-9]+") + # The final `+ 0` is to normalize numbers with leading zeros. + result_obj["number"] = substr(line, 1, RLENGTH) + 0 + line = substr(line, RLENGTH + 1) + } + else + { + result_obj["number"] = testno + } + + if (plan_seen == LATE_PLAN) + # No further test results are acceptable after a "late" TAP plan + # has been seen. + result_obj["is_unplanned"] = 1 + else if (plan_seen && testno > planned_tests) + result_obj["is_unplanned"] = 1 + else + result_obj["is_unplanned"] = 0 + + # Strip trailing and leading whitespace. + sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + + # This will have to be corrected if we have a "TODO"/"SKIP" directive. + result_obj["description"] = line + result_obj["directive"] = "" + result_obj["explanation"] = "" + + if (index(line, "#") == 0) + return # No possible directive, nothing more to do. + + # Directives are case-insensitive. + rx = "[ \t]*#[ \t]*([tT][oO][dD][oO]|[sS][kK][iI][pP])[ \t]*" + + # See whether we have the directive, and if yes, where. + pos = match(line, rx "$") + if (!pos) + pos = match(line, rx "[^a-zA-Z0-9_]") + + # If there was no TAP directive, we have nothing more to do. + if (!pos) + return + + # Let`s now see if the TAP directive has been escaped. For example: + # escaped: ok \# SKIP + # not escaped: ok \\# SKIP + # escaped: ok \\\\\# SKIP + # not escaped: ok \ # SKIP + if (substr(line, pos, 1) == "#") + { + bslash_count = 0 + for (i = pos; i > 1 && substr(line, i - 1, 1) == "\\"; i--) + bslash_count += 1 + if (bslash_count % 2) + return # Directive was escaped. + } + + # Strip the directive and its explanation (if any) from the test + # description. + result_obj["description"] = substr(line, 1, pos - 1) + # Now remove the test description from the line, that has been dealt + # with already. + line = substr(line, pos) + # Strip the directive, and save its value (normalized to upper case). + sub("^[ \t]*#[ \t]*", "", line) + result_obj["directive"] = toupper(substr(line, 1, 4)) + line = substr(line, 5) + # Now get the explanation for the directive (if any), with leading + # and trailing whitespace removed. + sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + result_obj["explanation"] = line +} + +function get_test_exit_message(status) +{ + if (status == 0) + return "" + if (status !~ /^[1-9][0-9]*$/) + abort("getting exit status") + if (status < 127) + exit_details = "" + else if (status == 127) + exit_details = " (command not found?)" + else if (status >= 128 && status <= 255) + exit_details = sprintf(" (terminated by signal %d?)", status - 128) + else if (status > 256 && status <= 384) + # We used to report an "abnormal termination" here, but some Korn + # shells, when a child process die due to signal number n, can leave + # in $? an exit status of 256+n instead of the more standard 128+n. + # Apparently, both behaviours are allowed by POSIX (2008), so be + # prepared to handle them both. See also Austing Group report ID + # 0000051 <http://www.austingroupbugs.net/view.php?id=51> + exit_details = sprintf(" (terminated by signal %d?)", status - 256) + else + # Never seen in practice. + exit_details = " (abnormal termination)" + return sprintf("exited with status %d%s", status, exit_details) +} + +function write_test_results() +{ + print ":global-test-result: " get_global_test_result() > trs_file + print ":recheck: " yn(must_recheck()) > trs_file + print ":copy-in-global-log: " yn(copy_in_global_log()) > trs_file + for (i = 0; i < test_results_index; i += 1) + print ":test-result: " test_results_list[i] > trs_file + close(trs_file); +} + +BEGIN { + +## ------- ## +## SETUP ## +## ------- ## + +'"$init_colors"' + +# Properly initialized once the TAP plan is seen. +planned_tests = 0 + +COOKED_PASS = expect_failure ? "XPASS": "PASS"; +COOKED_FAIL = expect_failure ? "XFAIL": "FAIL"; + +# Enumeration-like constants to remember which kind of plan (if any) +# has been seen. It is important that NO_PLAN evaluates "false" as +# a boolean. +NO_PLAN = 0 +EARLY_PLAN = 1 +LATE_PLAN = 2 + +testno = 0 # Number of test results seen so far. +bailed_out = 0 # Whether a "Bail out!" directive has been seen. + +# Whether the TAP plan has been seen or not, and if yes, which kind +# it is ("early" is seen before any test result, "late" otherwise). +plan_seen = NO_PLAN + +## --------- ## +## PARSING ## +## --------- ## + +is_first_read = 1 + +while (1) + { + # Involutions required so that we are able to read the exit status + # from the last input line. + st = getline + if (st < 0) # I/O error. + fatal("I/O error while reading from input stream") + else if (st == 0) # End-of-input + { + if (is_first_read) + abort("in input loop: only one input line") + break + } + if (is_first_read) + { + is_first_read = 0 + nextline = $0 + continue + } + else + { + curline = nextline + nextline = $0 + $0 = curline + } + # Copy any input line verbatim into the log file. + print | "cat >&3" + # Parsing of TAP input should stop after a "Bail out!" directive. + if (bailed_out) + continue + + # TAP test result. + if ($0 ~ /^(not )?ok$/ || $0 ~ /^(not )?ok[^a-zA-Z0-9_]/) + { + testno += 1 + setup_result_obj($0) + handle_tap_result() + } + # TAP plan (normal or "SKIP" without explanation). + else if ($0 ~ /^1\.\.[0-9]+[ \t]*$/) + { + # The next two lines will put the number of planned tests in $0. + sub("^1\\.\\.", "") + sub("[^0-9]*$", "") + handle_tap_plan($0, "") + continue + } + # TAP "SKIP" plan, with an explanation. + else if ($0 ~ /^1\.\.0+[ \t]*#/) + { + # The next lines will put the skip explanation in $0, stripping + # any leading and trailing whitespace. This is a little more + # tricky in truth, since we want to also strip a potential leading + # "SKIP" string from the message. + sub("^[^#]*#[ \t]*(SKIP[: \t][ \t]*)?", "") + sub("[ \t]*$", ""); + handle_tap_plan(0, $0) + } + # "Bail out!" magic. + # Older versions of prove and TAP::Harness (e.g., 3.17) did not + # recognize a "Bail out!" directive when preceded by leading + # whitespace, but more modern versions (e.g., 3.23) do. So we + # emulate the latter, "more modern" behaviour. + else if ($0 ~ /^[ \t]*Bail out!/) + { + bailed_out = 1 + # Get the bailout message (if any), with leading and trailing + # whitespace stripped. The message remains stored in `$0`. + sub("^[ \t]*Bail out![ \t]*", ""); + sub("[ \t]*$", ""); + # Format the error message for the + bailout_message = "Bail out!" + if (length($0)) + bailout_message = bailout_message " " $0 + testsuite_error(bailout_message) + } + # Maybe we have too look for dianogtic comments too. + else if (comments != 0) + { + comment = extract_tap_comment($0); + if (length(comment)) + report("#", comment); + } + } + +## -------- ## +## FINISH ## +## -------- ## + +# A "Bail out!" directive should cause us to ignore any following TAP +# error, as well as a non-zero exit status from the TAP producer. +if (!bailed_out) + { + if (!plan_seen) + { + testsuite_error("missing test plan") + } + else if (planned_tests != testno) + { + bad_amount = testno > planned_tests ? "many" : "few" + testsuite_error(sprintf("too %s tests run (expected %d, got %d)", + bad_amount, planned_tests, testno)) + } + if (!ignore_exit) + { + # Fetch exit status from the last line. + exit_message = get_test_exit_message(nextline) + if (exit_message) + testsuite_error(exit_message) + } + } + +write_test_results() + +exit 0 + +} # End of "BEGIN" block. +' + +# TODO: document that we consume the file descriptor 3 :-( +} 3>"$log_file" + +test $? -eq 0 || fatal "I/O or internal error" + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC0" +# time-stamp-end: "; # UTC" +# End: diff --git a/configure.ac b/configure.ac index fb225b688..92053daf6 100644 --- a/configure.ac +++ b/configure.ac @@ -14,6 +14,7 @@ AC_PREREQ([2.61]) AC_INIT([cvc4], _CVC4_RELEASE_STRING, [cvc4-bugs@cs.stanford.edu]) AC_CONFIG_SRCDIR([src/include/cvc4_public.h]) AC_CONFIG_AUX_DIR([config]) +AC_REQUIRE_AUX_FILE([tap-driver.sh]) AC_CONFIG_MACRO_DIR([config]) AC_CONFIG_LIBOBJ_DIR([src/lib]) diff --git a/test/regress/Makefile.am b/test/regress/Makefile.am index f068ce9ad..19cbbae4a 100644 --- a/test/regress/Makefile.am +++ b/test/regress/Makefile.am @@ -9,7 +9,9 @@ TESTS = $(REG0_TESTS) $(REG1_TESTS) $(REG2_TESTS) $(REG3_TESTS) $(REG4_TESTS) @mk_empty@BINARY = cvc4 end@mk_if@ -LOG_COMPILER = $(top_srcdir)/test/regress/run_regression +LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ + $(top_srcdir)/config/tap-driver.sh --comments +LOG_COMPILER = @abs_top_srcdir@/test/regress/run_regression.py AM_LOG_FLAGS = $(RUN_REGRESSION_ARGS) @abs_top_builddir@/src/main/$(BINARY)$(EXEEXT) if AUTOMAKE_1_11 @@ -55,5 +57,5 @@ EXTRA_DIST = \ regress0/tptp/Axioms/SYN000_0.ax \ Makefile.levels \ Makefile.tests \ - run_regression \ - README.md + run_regression.py \ + README.md
\ No newline at end of file diff --git a/test/regress/regress0/arrays/incorrect10.smt b/test/regress/regress0/arrays/incorrect10.smt index 6fbdb7ee9..6aacc37c0 100644 --- a/test/regress/regress0/arrays/incorrect10.smt +++ b/test/regress/regress0/arrays/incorrect10.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --no-check-proofs -% EXPECT: unsat +; COMMAND-LINE: --no-check-proofs +; EXPECT: unsat (benchmark fuzzsmt :logic QF_AX :status unsat diff --git a/test/regress/regress0/expect/scrub.02.smt b/test/regress/regress0/expect/scrub.02.smt index 145801619..65c61ecd0 100644 --- a/test/regress/regress0/expect/scrub.02.smt +++ b/test/regress/regress0/expect/scrub.02.smt @@ -1,8 +1,8 @@ -% SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' -% EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. -% EXPECT: The fact in question: TERM -% EXPECT: ") -% EXIT: 1 +; SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' +; EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. +; EXPECT: The fact in question: TERM +; EXPECT: ") +; EXIT: 1 (benchmark reject_nonlinear :logic QF_LRA :extrafuns ((n Real)) diff --git a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt index f62f057e4..e8c7949dc 100644 --- a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt +++ b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: unsat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: unsat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress0/fmf/QEpres-uf.855035.smt b/test/regress/regress0/fmf/QEpres-uf.855035.smt index 2945c8f4d..4fe592638 100644 --- a/test/regress/regress0/fmf/QEpres-uf.855035.smt +++ b/test/regress/regress0/fmf/QEpres-uf.855035.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: sat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress1/fmf/Hoare-z3.931718.smt b/test/regress/regress1/fmf/Hoare-z3.931718.smt index 754e19e88..4ab90756c 100644 --- a/test/regress/regress1/fmf/Hoare-z3.931718.smt +++ b/test/regress/regress1/fmf/Hoare-z3.931718.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find -% EXPECT: sat +; COMMAND-LINE: --finite-model-find +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/run_regression b/test/regress/run_regression deleted file mode 100755 index 4ae013911..000000000 --- a/test/regress/run_regression +++ /dev/null @@ -1,424 +0,0 @@ -#!/bin/bash -# -# Morgan Deters <mdeters@cs.nyu.edu> -# for the CVC4 project -# -# usage: -# -# run_regression [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] -# -# Runs benchmark and checks for correct exit status and output. -# -# The TEST_GROUP and TEST_GROUPS environment variables can be used to split the -# benchmarks into groups and only run one of the groups. The benchmarks are -# associated with a group based on a hash of their name. TEST_GROUPS controls -# how many groups are created while TEST_GROUP specifies the group to run. -# Currently, the primary use case for this is to split the benchmarks across -# multiple Travis jobs. - -# ulimit -t 1 # For detecting long running regressions -ulimit -s 65000 # Needed for some (esp. portfolio and model-building) - -prog=`basename "$0"` - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -proof=no -dump=no -if [ x"$1" = x--proof ]; then - proof=yes - shift -elif [ x"$1" = x--dump ]; then - dump=yes - shift -fi - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -wrapper= -while [ $# -gt 2 ]; do - wrapper="$wrapper$1 " - shift -done - -[[ "$VALGRIND" = "1" ]] && { - wrapper="libtool --mode=execute valgrind $wrapper" -} - -cvc4=$1 -benchmark_orig=$2 -benchmark="$benchmark_orig" - -if [ -n "$TEST_GROUP" ]; then - # Use the checksum of the benchmark name to determine the group of the - # benchmark - benchmark_group=$(( $(echo "$benchmark" | cksum | cut -d " " -f1) % $TEST_GROUPS )) - if (( $benchmark_group != $TEST_GROUP )); then - # Skip the benchmark if it is not in the expected test group - exit 77 - fi -fi - -function error { - echo "$prog: error: $*" - exit 1 -} - -if ! [ -x "$cvc4" ]; then - error "\`$cvc4' doesn't exist or isn't executable" >&2 -fi -if ! [ -r "$benchmark" ]; then - error "\`$benchmark' doesn't exist or isn't readable" >&2 -fi - -# gettemp() and its associated tempfiles[] array are intended to never -# allow a temporary file to leak---the trap ensures that when this script -# exits, whether via a regular exit or an -INT or other signal, the -# temp files are deleted. -declare -a tempfiles -trap -- 'test ${#tempfiles[@]} -gt 0 && rm -f "${tempfiles[@]}"' EXIT -function gettemp { - local temp="`mktemp -t "$2"`" - tempfiles[${#tempfiles[@]}]="$temp" - eval "$1"="$temp" -} - -tmpbenchmark= -if expr "$benchmark" : '.*\.smt$' &>/dev/null; then - lang=smt1 - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - # old mktemp from coreutils 7.x is broken, can't do XXXX in the middle - # this frustrates our auto-language-detection - gettemp tmpbenchmark cvc4_benchmark.smt.$$.XXXXXXXXXX - grep -v '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" >"$tmpbenchmark" - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - benchmark=$tmpbenchmark - elif grep '^ *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - expected_exit_status=0 - command_line= - elif grep '^ *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - expected_exit_status=0 - command_line= - else - error "cannot determine status of \`$benchmark'" - fi -elif expr "$benchmark" : '.*\.smt2$' &>/dev/null; then - lang=smt2 - - # If this test case requires unsat cores but CVC4 is not built with proof - # support, skip it. Note: checking $CVC4_REGRESSION_ARGS instead of $proof - # here because $proof is not set for the second run. - requires_proof=`grep '(get-unsat-core)\|(get-unsat-assumptions)' "$benchmark"` - if [[ ! "$CVC4_REGRESSION_ARGS" = *"--proof"* ]] && [ -n "$requires_proof" ]; then - exit 77 - fi - - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - fi - - # If expected output/error was not given, try to determine the status from - # the commands. - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep '^ *( *set-info *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - elif grep '^ *( *set-info *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - fi - fi - - # A valid test case needs at least an expected output (explicit or through - # SMT2 commands) or an expected exit status. - if [ -z "$expected_output" -a -z "$expected_error" -a -z "$expected_exit_status" ]; then - error "cannot determine status of \`$benchmark'" - fi - - # If no explicit expected exit status was given, we assume that the solver is - # supposed to succeed. - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi -elif expr "$benchmark" : '.*\.cvc$' &>/dev/null; then - lang=cvc4 - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` -elif expr "$benchmark" : '.*\.p$' &>/dev/null; then - lang=tptp - command_line=--finite-model-find - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep -q '^% Status *: ' "$benchmark"; then - expected_output="$(grep '^% *Status *: ' "$benchmark" | head -1 | awk '{print$NF}')" - case "$expected_output" in - Theorem|Unsatisfiable) expected_exit_status=0 ;; - CounterSatisfiable|Satisfiable) expected_exit_status=0 ;; - GaveUp) expected_exit_status=0 ;; - esac - expected_output="% SZS status $expected_output for $(basename "$benchmark" | sed 's,\.p$,,')" - else - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - else - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - fi - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - if grep -q '^% COMMAND-LINE: ' "$benchmark"; then - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - fi -elif expr "$benchmark" : '.*\.sy$' &>/dev/null; then - lang=sygus - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - else - error "cannot determine expected output of \`$benchmark'" - fi -else - error "benchmark \`$benchmark' must be *.cvc or *.smt or *.smt2 or *.p or *.sy" -fi - -command_line="--lang=$lang ${command_line:+$command_line }" - -gettemp expoutfile cvc4_expect_stdout.$$.XXXXXXXXXX -gettemp experrfile cvc4_expect_stderr.$$.XXXXXXXXXX -gettemp outfile cvc4_stdout.$$.XXXXXXXXXX -gettemp errfile cvc4_stderr.$$.XXXXXXXXXX -gettemp errfilefix cvc4_stderr.$$.XXXXXXXXXX -gettemp exitstatusfile cvc4_exitstatus.$$.XXXXXXXXXX - -if [ -z "$expected_output" ]; then - # in case expected stdout output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$expoutfile" - touch "$expoutfile" -else - echo "$expected_output" >"$expoutfile" -fi - -if [ "$lang" = sygus ]; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-synth-sol' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-synth-sol' &>/dev/null; then - # later on, we'll run another test with --check-models on - command_line="$command_line --check-synth-sol" - fi -fi -check_models=false -if grep '^sat$' "$expoutfile" &>/dev/null || grep '^invalid$' "$expoutfile" &>/dev/null || grep '^unknown$' "$expoptfile" &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-models' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-models' &>/dev/null; then - # later on, we'll run another test with --check-models on - check_models=true - fi -fi -check_proofs=false -check_unsat_cores=false -if [ "$proof" = yes ]; then - # proofs not currently supported in incremental mode, turn it off - if grep '^unsat$' "$expoutfile" &>/dev/null || grep '^valid$' "$expoutfile" &>/dev/null &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-proofs' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-proofs' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-proofs on - check_proofs=true - fi - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-unsat-cores on - check_unsat_cores=true - fi - fi -fi - -if [ -z "$expected_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$experrfile" -else - echo "$expected_error" >"$experrfile" -fi - -cvc4dir=`dirname "$cvc4"` -cvc4dirfull=`cd "$cvc4dir" && pwd` -if [ -z "$cvc4dirfull" ]; then - error "getting directory of \`$cvc4 !?" -fi -cvc4base=`basename "$cvc4"` -cvc4full="$cvc4dirfull/$cvc4base" -if [ $dump = no ]; then - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"` [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"`; - echo $? >"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -else - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` \| $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --lang=smt2 - [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` | $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --lang=smt2 -; - echo $? >"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -fi - -# we have to actual error file same treatment as other files. differences in -# versions of echo/bash were causing failure on some platforms and not on others -# (also grep out valgrind output, if 0 errors reported by valgrind) -actual_error=$(cat $errfile) -if [[ "$VALGRIND" = "1" ]]; then - #valgrind_output=$(cat $errfile|grep -E "^==[0-9]+== "|) - valgrind_num_errors=$(cat $errfile|grep -E "^==[0-9]+== "|tail -n1|awk '{print $4}') - echo "valgrind errors (not suppressed): $valgrind_num_errors" 1>&2 - - ((valgrind_num_errors == 0)) && actual_error=$(echo "$actual_error"|grep -vE "^==[0-9]+== ") -fi -if [ -z "$actual_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$errfilefix" -else - echo "$actual_error" >"$errfilefix" -fi - -# Scrub the output file if a scrubber has been specified. -if [ -n "$scrubber" ]; then - echo "About to scrub $outfile using $scrubber" - mv "$outfile" "$outfile.prescrub" - cat "$outfile.prescrub" | eval $scrubber >"$outfile" -else - echo "not scrubbing" -fi - -# Scrub the error file if a scrubber has been specified. -if [ -n "$errscrubber" ]; then - echo "About to scrub $errfilefix using $errscrubber" - mv "$errfilefix" "$errfilefix.prescrub" - cat "$errfilefix.prescrub" | eval $errscrubber >"$errfilefix" -else - echo "not scrubbing error file" -fi - -diffs=`diff -u --strip-trailing-cr "$expoutfile" "$outfile"` -diffexit=$? -diffserr=`diff -u --strip-trailing-cr "$experrfile" "$errfilefix"` -diffexiterr=$? -exit_status=`cat "$exitstatusfile"` - -exitcode=0 -if [ $diffexit -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stdout" - echo "$diffs" - exitcode=1 -fi -if [ $diffexiterr -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stderr" - echo "$diffserr" - exitcode=1 -fi - -if [ "$exit_status" != "$expected_exit_status" ]; then - echo "$prog: error: expected exit status \`$expected_exit_status' but got \`$exit_status'" - exitcode=1 -fi - -if $check_models || $check_proofs || $check_unsat_cores; then - check_cmdline= - if $check_models; then - check_cmdline="$check_cmdline --check-models" - fi - if $check_proofs; then - # taking: Make the extra flags part of --check-proofs. - check_cmdline="$check_cmdline --check-proofs --no-bv-eq --no-bv-ineq --no-bv-algebraic" - fi - if $check_unsat_cores; then - check_cmdline="$check_cmdline --check-unsat-cores" - fi - # run an extra model/proof/core-checking pass - if ! CVC4_REGRESSION_ARGS="$CVC4_REGRESSION_ARGS$check_cmdline" "$0" $wrapper "$cvc4" "$benchmark_orig"; then - exitcode=1 - fi -fi - - -exit $exitcode diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py new file mode 100755 index 000000000..c861b0d71 --- /dev/null +++ b/test/regress/run_regression.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +""" +Usage: + + run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary + [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] + +Runs benchmark and checks for correct exit status and output. +""" + +import argparse +import difflib +import os +import re +import shlex +import subprocess +import sys + +SCRUBBER = 'SCRUBBER: ' +ERROR_SCRUBBER = 'ERROR-SCRUBBER: ' +EXPECT = 'EXPECT: ' +EXPECT_ERROR = 'EXPECT-ERROR: ' +EXIT = 'EXIT: ' +COMMAND_LINE = 'COMMAND-LINE: ' + + +def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line, benchmark_dir, benchmark_filename): + """Runs CVC4 on the file `benchmark_filename` in the directory + `benchmark_dir` using the binary `cvc4_binary` with the command line + options `command_line`. The output is scrubbed using `scrubber` and + `error_scrubber` for stdout and stderr, respectively. If dump is true, the + function first uses CVC4 to read in and dump the benchmark file and then + uses that as input.""" + + bin_args = wrapper[:] + bin_args.append(cvc4_binary) + + output = None + error = None + exit_status = None + if dump: + dump_args = [ + '--preprocess-only', '--dump', 'raw-benchmark', + '--output-lang=smt2', '-qq' + ] + dump_process = subprocess.Popen( + bin_args + command_line + dump_args + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + dump_output, _ = dump_process.communicate() + process = subprocess.Popen( + bin_args + command_line + ['--lang=smt2', '-'], + cwd=benchmark_dir, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate(input=dump_output) + exit_status = process.returncode + else: + process = subprocess.Popen( + bin_args + command_line + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate() + exit_status = process.returncode + + # If a scrubber command has been specified then apply it to the output. + if scrubber: + scrubber_process = subprocess.Popen( + shlex.split(scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, _ = scrubber_process.communicate(input=output) + if error_scrubber: + error_scrubber_process = subprocess.Popen( + shlex.split(error_scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + error, _ = error_scrubber_process.communicate(input=error) + + # Popen in Python 3 returns a bytes object instead of a string for + # stdout/stderr. + if isinstance(output, bytes): + output = output.decode() + if isinstance(error, bytes): + error = error.decode() + return (output.strip(), error.strip(), exit_status) + + +def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path): + """Determines the expected output for a benchmark, runs CVC4 on it and then + checks whether the output corresponds to the expected output. Optionally + uses a wrapper `wrapper`, tests proof generation (if proof is true), or + dumps a benchmark and uses that as the input (if dump is true).""" + + if not os.access(cvc4_binary, os.X_OK): + sys.exit( + '"{}" does not exist or is not executable'.format(cvc4_binary)) + if not os.path.isfile(benchmark_path): + sys.exit('"{}" does not exist or is not a file'.format(benchmark_path)) + + basic_command_line_args = [] + + benchmark_basename = os.path.basename(benchmark_path) + benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename) + benchmark_dir = os.path.dirname(benchmark_path) + comment_char = '%' + status_regex = None + status_to_output = lambda s: s + if benchmark_ext == '.smt': + status_regex = r':status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.smt2': + status_regex = r'set-info\s*:status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.cvc': + pass + elif benchmark_ext == '.p': + basic_command_line_args.append('--finite-model-find') + status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)' + status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename) + elif benchmark_ext == '.sy': + comment_char = ';' + # Do not use proofs/unsat-cores with .sy files + proof = False + else: + sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format( + benchmark_basename)) + + # If there is an ".expect" file for the benchmark, read the metadata + # from there, otherwise from the benchmark file. + metadata_filename = benchmark_path + '.expect' + if os.path.isfile(metadata_filename): + comment_char = '%' + else: + metadata_filename = benchmark_path + + metadata_lines = None + with open(metadata_filename, 'r') as metadata_file: + metadata_lines = metadata_file.readlines() + metadata_content = ''.join(metadata_lines) + + # Extract the metadata for the benchmark. + scrubber = None + error_scrubber = None + expected_output = '' + expected_error = '' + expected_exit_status = None + command_line = '' + for line in metadata_lines: + # Skip lines that do not start with "%" + if line[0] != comment_char: + continue + line = line[2:] + + if line.startswith(SCRUBBER): + scrubber = line[len(SCRUBBER):] + elif line.startswith(ERROR_SCRUBBER): + error_scrubber = line[len(ERROR_SCRUBBER):] + elif line.startswith(EXPECT): + expected_output += line[len(EXPECT):] + elif line.startswith(EXPECT_ERROR): + expected_error += line[len(EXPECT_ERROR):] + elif line.startswith(EXIT): + expected_exit_status = int(line[len(EXIT):]) + elif line.startswith(COMMAND_LINE): + command_line += line[len(COMMAND_LINE):] + expected_output = expected_output.strip() + expected_error = expected_error.strip() + + # Expected output/expected error has not been defined in the metadata for + # the benchmark. Try to extract the information from the benchmark itself. + if expected_output == '' and expected_error == '': + match = None + if status_regex: + match = re.search(status_regex, metadata_content) + + if match: + expected_output = status_to_output(match.group(1)) + elif expected_exit_status is None: + # If there is no expected output/error and the exit status has not + # been set explicitly, the benchmark is invalid. + sys.exit('Cannot determine status of "{}"'.format(benchmark_path)) + + if not proof and ('(get-unsat-cores)' in metadata_content + or '(get-unsat-assumptions)' in metadata_content): + print( + '1..0 # Skipped: unsat cores not supported without proof support') + return + + if expected_exit_status is None: + expected_exit_status = 0 + + if 'CVC4_REGRESSION_ARGS' in os.environ: + basic_command_line_args += shlex.split( + os.environ['CVC4_REGRESSION_ARGS']) + basic_command_line_args += shlex.split(command_line) + command_line_args_configs = [basic_command_line_args] + + extra_command_line_args = [] + if benchmark_ext == '.sy' and \ + '--no-check-synth-sol' not in basic_command_line_args and \ + '--check-synth-sol' not in basic_command_line_args: + extra_command_line_args = ['--check-synth-sol'] + if re.search(r'^(sat|invalid|unknown)$', expected_output) and \ + '--no-check-models' not in basic_command_line_args: + extra_command_line_args = ['--check-models'] + if proof and re.search(r'^(sat|valid)$', expected_output): + if '--no-check-proofs' not in basic_command_line_args and \ + '--incremental' not in basic_command_line_args and \ + '--unconstrained-simp' not in basic_command_line_args and \ + not cvc4_binary.endswith('pcvc4'): + extra_command_line_args = [ + '--check-proofs', '--no-bv-eq', '--no-bv-ineq', + '--no-bv-algebraic' + ] + if '--no-check-unsat-cores' not in basic_command_line_args and \ + '--incremental' not in basic_command_line_args and \ + '--unconstrained-simp' not in basic_command_line_args and \ + not cvc4_binary.endswith('pcvc4'): + extra_command_line_args = [ + '--check-proofs', '--no-bv-eq', '--no-bv-ineq', + '--no-bv-algebraic' + ] + if extra_command_line_args: + command_line_args_configs.append( + basic_command_line_args + extra_command_line_args) + + # Run CVC4 on the benchmark with the different option sets and check + # whether the exit status, stdout output, stderr output are as expected. + print('1..{}'.format(len(command_line_args_configs))) + print('# Starting') + for command_line_args in command_line_args_configs: + output, error, exit_status = run_benchmark( + dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line_args, benchmark_dir, benchmark_basename) + if output != expected_output: + print( + 'not ok - Differences between expected and actual output on stdout - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(output.splitlines(), + expected_output.splitlines()): + print(line) + elif error != expected_error: + print( + 'not ok - Differences between expected and actual output on stderr - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(error.splitlines(), + expected_error.splitlines()): + print(line) + elif expected_exit_status != exit_status: + print( + 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'. + format(expected_exit_status, exit_status, command_line_args)) + else: + print('ok - Flags: {}'.format(command_line_args)) + + +def main(): + """Parses the command line arguments and then calls the core of the + script.""" + + parser = argparse.ArgumentParser( + description= + 'Runs benchmark and checks for correct exit status and output.') + parser.add_argument('--proof', action='store_true') + parser.add_argument('--dump', action='store_true') + parser.add_argument('wrapper', nargs='*') + parser.add_argument('cvc4_binary') + parser.add_argument('benchmark') + args = parser.parse_args() + cvc4_binary = os.path.abspath(args.cvc4_binary) + + wrapper = args.wrapper + if os.environ.get('VALGRIND') == '1' and not wrapper: + wrapper = ['libtool', '--mode=execute', 'valgrind'] + + run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark) + + +if __name__ == "__main__": + main() |