10 files changed, 957 insertions, 440 deletions
diff --git a/config/tap-driver.sh b/config/tap-driver.sh
new file mode 100755
index 000000000..d3e68bdc1
--- /dev/null
+++ b/config/tap-driver.sh
@@ -0,0 +1,651 @@
+#! /bin/sh
+# Copyright (C) 2011-2018 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+scriptversion=2013-12-23.17; # UTC
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u
+
+me=tap-driver.sh
+
+fatal ()
+{
+  echo "$me: fatal: $*" >&2
+  exit 1
+}
+
+usage_error ()
+{
+  echo "$me: $*" >&2
+  print_usage >&2
+  exit 2
+}
+
+print_usage ()
+{
+  cat <<END
+Usage:
+  tap-driver.sh --test-name=NAME --log-file=PATH --trs-file=PATH
+                [--expect-failure={yes|no}] [--color-tests={yes|no}]
+                [--enable-hard-errors={yes|no}] [--ignore-exit]
+                [--diagnostic-string=STRING] [--merge|--no-merge]
+                [--comments|--no-comments] [--] TEST-COMMAND
+The '--test-name', '-log-file' and '--trs-file' options are mandatory.
+END
+}
+
+# TODO: better error handling in option parsing (in particular, ensure
+# TODO: $log_file, $trs_file and $test_name are defined).
+test_name= # Used for reporting.
+log_file=  # Where to save the result and output of the test script.
+trs_file=  # Where to save the metadata of the test run.
+expect_failure=0
+color_tests=0
+merge=0
+ignore_exit=0
+comments=0
+diag_string='#'
+while test $# -gt 0; do
+  case $1 in
+  --help) print_usage; exit $?;;
+  --version) echo "$me $scriptversion"; exit $?;;
+  --test-name) test_name=$2; shift;;
+  --log-file) log_file=$2; shift;;
+  --trs-file) trs_file=$2; shift;;
+  --color-tests) color_tests=$2; shift;;
+  --expect-failure) expect_failure=$2; shift;;
+  --enable-hard-errors) shift;; # No-op.
+  --merge) merge=1;;
+  --no-merge) merge=0;;
+  --ignore-exit) ignore_exit=1;;
+  --comments) comments=1;;
+  --no-comments) comments=0;;
+  --diagnostic-string) diag_string=$2; shift;;
+  --) shift; break;;
+  -*) usage_error "invalid option: '$1'";;
+  esac
+  shift
+done
+
+test $# -gt 0 || usage_error "missing test command"
+
+case $expect_failure in
+  yes) expect_failure=1;;
+    *) expect_failure=0;;
+esac
+
+if test $color_tests = yes; then
+  init_colors='
+    color_map["red"]="[0;31m" # Red.
+    color_map["grn"]="[0;32m" # Green.
+    color_map["lgn"]="[1;32m" # Light green.
+    color_map["blu"]="[1;34m" # Blue.
+    color_map["mgn"]="[0;35m" # Magenta.
+    color_map["std"]="[m"     # No color.
+    color_for_result["ERROR"] = "mgn"
+    color_for_result["PASS"]  = "grn"
+    color_for_result["XPASS"] = "red"
+    color_for_result["FAIL"]  = "red"
+    color_for_result["XFAIL"] = "lgn"
+    color_for_result["SKIP"]  = "blu"'
+else
+  init_colors=''
+fi
+
+# :; is there to work around a bug in bash 3.2 (and earlier) which
+# does not always set '$?' properly on redirection failure.
+# See the Autoconf manual for more details.
+:;{
+  (
+    # Ignore common signals (in this subshell only!), to avoid potential
+    # problems with Korn shells.  Some Korn shells are known to propagate
+    # to themselves signals that have killed a child process they were
+    # waiting for; this is done at least for SIGINT (and usually only for
+    # it, in truth).  Without the `trap' below, such a behaviour could
+    # cause a premature exit in the current subshell, e.g., in case the
+    # test command it runs gets terminated by a SIGINT.  Thus, the awk
+    # script we are piping into would never seen the exit status it
+    # expects on its last input line (which is displayed below by the
+    # last `echo $?' statement), and would thus die reporting an internal
+    # error.
+    # For more information, see the Autoconf manual and the threads:
+    # <https://lists.gnu.org/archive/html/bug-autoconf/2011-09/msg00004.html>
+    # <http://mail.opensolaris.org/pipermail/ksh93-integration-discuss/2009-February/004121.html>
+    trap : 1 3 2 13 15
+    if test $merge -gt 0; then
+      exec 2>&1
+    else
+      exec 2>&3
+    fi
+    "$@"
+    echo $?
+  ) | LC_ALL=C ${AM_TAP_AWK-awk} \
+        -v me="$me" \
+        -v test_script_name="$test_name" \
+        -v log_file="$log_file" \
+        -v trs_file="$trs_file" \
+        -v expect_failure="$expect_failure" \
+        -v merge="$merge" \
+        -v ignore_exit="$ignore_exit" \
+        -v comments="$comments" \
+        -v diag_string="$diag_string" \
+'
+# TODO: the usages of "cat >&3" below could be optimized when using
+#       GNU awk, and/on on systems that supports /dev/fd/.
+
+# Implementation note: in what follows, `result_obj` will be an
+# associative array that (partly) simulates a TAP result object
+# from the `TAP::Parser` perl module.
+
+## ----------- ##
+##  FUNCTIONS  ##
+## ----------- ##
+
+function fatal(msg)
+{
+  print me ": " msg | "cat >&2"
+  exit 1
+}
+
+function abort(where)
+{
+  fatal("internal error " where)
+}
+
+# Convert a boolean to a "yes"/"no" string.
+function yn(bool)
+{
+  return bool ? "yes" : "no";
+}
+
+function add_test_result(result)
+{
+  if (!test_results_index)
+    test_results_index = 0
+  test_results_list[test_results_index] = result
+  test_results_index += 1
+  test_results_seen[result] = 1;
+}
+
+# Whether the test script should be re-run by "make recheck".
+function must_recheck()
+{
+  for (k in test_results_seen)
+    if (k != "XFAIL" && k != "PASS" && k != "SKIP")
+      return 1
+  return 0
+}
+
+# Whether the content of the log file associated to this test should
+# be copied into the "global" test-suite.log.
+function copy_in_global_log()
+{
+  for (k in test_results_seen)
+    if (k != "PASS")
+      return 1
+  return 0
+}
+
+function get_global_test_result()
+{
+    if ("ERROR" in test_results_seen)
+      return "ERROR"
+    if ("FAIL" in test_results_seen || "XPASS" in test_results_seen)
+      return "FAIL"
+    all_skipped = 1
+    for (k in test_results_seen)
+      if (k != "SKIP")
+        all_skipped = 0
+    if (all_skipped)
+      return "SKIP"
+    return "PASS";
+}
+
+function stringify_result_obj(result_obj)
+{
+  if (result_obj["is_unplanned"] || result_obj["number"] != testno)
+    return "ERROR"
+
+  if (plan_seen == LATE_PLAN)
+    return "ERROR"
+
+  if (result_obj["directive"] == "TODO")
+    return result_obj["is_ok"] ? "XPASS" : "XFAIL"
+
+  if (result_obj["directive"] == "SKIP")
+    return result_obj["is_ok"] ? "SKIP" : COOKED_FAIL;
+
+  if (length(result_obj["directive"]))
+      abort("in function stringify_result_obj()")
+
+  return result_obj["is_ok"] ? COOKED_PASS : COOKED_FAIL
+}
+
+function decorate_result(result)
+{
+  color_name = color_for_result[result]
+  if (color_name)
+    return color_map[color_name] "" result "" color_map["std"]
+  # If we are not using colorized output, or if we do not know how
+  # to colorize the given result, we should return it unchanged.
+  return result
+}
+
+function report(result, details)
+{
+  if (result ~ /^(X?(PASS|FAIL)|SKIP|ERROR)/)
+    {
+      msg = ": " test_script_name
+      add_test_result(result)
+    }
+  else if (result == "#")
+    {
+      msg = " " test_script_name ":"
+    }
+  else
+    {
+      abort("in function report()")
+    }
+  if (length(details))
+    msg = msg " " details
+  # Output on console might be colorized.
+  print decorate_result(result) msg
+  # Log the result in the log file too, to help debugging (this is
+  # especially true when said result is a TAP error or "Bail out!").
+  print result msg | "cat >&3";
+}
+
+function testsuite_error(error_message)
+{
+  report("ERROR", "- " error_message)
+}
+
+function handle_tap_result()
+{
+  details = result_obj["number"];
+  if (length(result_obj["description"]))
+    details = details " " result_obj["description"]
+
+  if (plan_seen == LATE_PLAN)
+    {
+      details = details " # AFTER LATE PLAN";
+    }
+  else if (result_obj["is_unplanned"])
+    {
+       details = details " # UNPLANNED";
+    }
+  else if (result_obj["number"] != testno)
+    {
+       details = sprintf("%s # OUT-OF-ORDER (expecting %d)",
+                         details, testno);
+    }
+  else if (result_obj["directive"])
+    {
+      details = details " # " result_obj["directive"];
+      if (length(result_obj["explanation"]))
+        details = details " " result_obj["explanation"]
+    }
+
+  report(stringify_result_obj(result_obj), details)
+}
+
+# `skip_reason` should be empty whenever planned > 0.
+function handle_tap_plan(planned, skip_reason)
+{
+  planned += 0 # Avoid getting confused if, say, `planned` is "00"
+  if (length(skip_reason) && planned > 0)
+    abort("in function handle_tap_plan()")
+  if (plan_seen)
+    {
+      # Error, only one plan per stream is acceptable.
+      testsuite_error("multiple test plans")
+      return;
+    }
+  planned_tests = planned
+  # The TAP plan can come before or after *all* the TAP results; we speak
+  # respectively of an "early" or a "late" plan.  If we see the plan line
+  # after at least one TAP result has been seen, assume we have a late
+  # plan; in this case, any further test result seen after the plan will
+  # be flagged as an error.
+  plan_seen = (testno >= 1 ? LATE_PLAN : EARLY_PLAN)
+  # If testno > 0, we have an error ("too many tests run") that will be
+  # automatically dealt with later, so do not worry about it here.  If
+  # $plan_seen is true, we have an error due to a repeated plan, and that
+  # has already been dealt with above.  Otherwise, we have a valid "plan
+  # with SKIP" specification, and should report it as a particular kind
+  # of SKIP result.
+  if (planned == 0 && testno == 0)
+    {
+      if (length(skip_reason))
+        skip_reason = "- "  skip_reason;
+      report("SKIP", skip_reason);
+    }
+}
+
+function extract_tap_comment(line)
+{
+  if (index(line, diag_string) == 1)
+    {
+      # Strip leading `diag_string` from `line`.
+      line = substr(line, length(diag_string) + 1)
+      # And strip any leading and trailing whitespace left.
+      sub("^[ \t]*", "", line)
+      sub("[ \t]*$", "", line)
+      # Return what is left (if any).
+      return line;
+    }
+  return "";
+}
+
+# When this function is called, we know that line is a TAP result line,
+# so that it matches the (perl) RE "^(not )?ok\b".
+function setup_result_obj(line)
+{
+  # Get the result, and remove it from the line.
+  result_obj["is_ok"] = (substr(line, 1, 2) == "ok" ? 1 : 0)
+  sub("^(not )?ok[ \t]*", "", line)
+
+  # If the result has an explicit number, get it and strip it; otherwise,
+  # automatically assing the next progresive number to it.
+  if (line ~ /^[0-9]+$/ || line ~ /^[0-9]+[^a-zA-Z0-9_]/)
+    {
+      match(line, "^[0-9]+")
+      # The final `+ 0` is to normalize numbers with leading zeros.
+      result_obj["number"] = substr(line, 1, RLENGTH) + 0
+      line = substr(line, RLENGTH + 1)
+    }
+  else
+    {
+      result_obj["number"] = testno
+    }
+
+  if (plan_seen == LATE_PLAN)
+    # No further test results are acceptable after a "late" TAP plan
+    # has been seen.
+    result_obj["is_unplanned"] = 1
+  else if (plan_seen && testno > planned_tests)
+    result_obj["is_unplanned"] = 1
+  else
+    result_obj["is_unplanned"] = 0
+
+  # Strip trailing and leading whitespace.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+
+  # This will have to be corrected if we have a "TODO"/"SKIP" directive.
+  result_obj["description"] = line
+  result_obj["directive"] = ""
+  result_obj["explanation"] = ""
+
+  if (index(line, "#") == 0)
+    return # No possible directive, nothing more to do.
+
+  # Directives are case-insensitive.
+  rx = "[ \t]*#[ \t]*([tT][oO][dD][oO]|[sS][kK][iI][pP])[ \t]*"
+
+  # See whether we have the directive, and if yes, where.
+  pos = match(line, rx "$")
+  if (!pos)
+    pos = match(line, rx "[^a-zA-Z0-9_]")
+
+  # If there was no TAP directive, we have nothing more to do.
+  if (!pos)
+    return
+
+  # Let`s now see if the TAP directive has been escaped.  For example:
+  #  escaped:     ok \# SKIP
+  #  not escaped: ok \\# SKIP
+  #  escaped:     ok \\\\\# SKIP
+  #  not escaped: ok \ # SKIP
+  if (substr(line, pos, 1) == "#")
+    {
+      bslash_count = 0
+      for (i = pos; i > 1 && substr(line, i - 1, 1) == "\\"; i--)
+        bslash_count += 1
+      if (bslash_count % 2)
+        return # Directive was escaped.
+    }
+
+  # Strip the directive and its explanation (if any) from the test
+  # description.
+  result_obj["description"] = substr(line, 1, pos - 1)
+  # Now remove the test description from the line, that has been dealt
+  # with already.
+  line = substr(line, pos)
+  # Strip the directive, and save its value (normalized to upper case).
+  sub("^[ \t]*#[ \t]*", "", line)
+  result_obj["directive"] = toupper(substr(line, 1, 4))
+  line = substr(line, 5)
+  # Now get the explanation for the directive (if any), with leading
+  # and trailing whitespace removed.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+  result_obj["explanation"] = line
+}
+
+function get_test_exit_message(status)
+{
+  if (status == 0)
+    return ""
+  if (status !~ /^[1-9][0-9]*$/)
+    abort("getting exit status")
+  if (status < 127)
+    exit_details = ""
+  else if (status == 127)
+    exit_details = " (command not found?)"
+  else if (status >= 128 && status <= 255)
+    exit_details = sprintf(" (terminated by signal %d?)", status - 128)
+  else if (status > 256 && status <= 384)
+    # We used to report an "abnormal termination" here, but some Korn
+    # shells, when a child process die due to signal number n, can leave
+    # in $? an exit status of 256+n instead of the more standard 128+n.
+    # Apparently, both behaviours are allowed by POSIX (2008), so be
+    # prepared to handle them both.  See also Austing Group report ID
+    # 0000051 <http://www.austingroupbugs.net/view.php?id=51>
+    exit_details = sprintf(" (terminated by signal %d?)", status - 256)
+  else
+    # Never seen in practice.
+    exit_details = " (abnormal termination)"
+  return sprintf("exited with status %d%s", status, exit_details)
+}
+
+function write_test_results()
+{
+  print ":global-test-result: " get_global_test_result() > trs_file
+  print ":recheck: "  yn(must_recheck()) > trs_file
+  print ":copy-in-global-log: " yn(copy_in_global_log()) > trs_file
+  for (i = 0; i < test_results_index; i += 1)
+    print ":test-result: " test_results_list[i] > trs_file
+  close(trs_file);
+}
+
+BEGIN {
+
+## ------- ##
+##  SETUP  ##
+## ------- ##
+
+'"$init_colors"'
+
+# Properly initialized once the TAP plan is seen.
+planned_tests = 0
+
+COOKED_PASS = expect_failure ? "XPASS": "PASS";
+COOKED_FAIL = expect_failure ? "XFAIL": "FAIL";
+
+# Enumeration-like constants to remember which kind of plan (if any)
+# has been seen.  It is important that NO_PLAN evaluates "false" as
+# a boolean.
+NO_PLAN = 0
+EARLY_PLAN = 1
+LATE_PLAN = 2
+
+testno = 0     # Number of test results seen so far.
+bailed_out = 0 # Whether a "Bail out!" directive has been seen.
+
+# Whether the TAP plan has been seen or not, and if yes, which kind
+# it is ("early" is seen before any test result, "late" otherwise).
+plan_seen = NO_PLAN
+
+## --------- ##
+##  PARSING  ##
+## --------- ##
+
+is_first_read = 1
+
+while (1)
+  {
+    # Involutions required so that we are able to read the exit status
+    # from the last input line.
+    st = getline
+    if (st < 0) # I/O error.
+      fatal("I/O error while reading from input stream")
+    else if (st == 0) # End-of-input
+      {
+        if (is_first_read)
+          abort("in input loop: only one input line")
+        break
+      }
+    if (is_first_read)
+      {
+        is_first_read = 0
+        nextline = $0
+        continue
+      }
+    else
+      {
+        curline = nextline
+        nextline = $0
+        $0 = curline
+      }
+    # Copy any input line verbatim into the log file.
+    print | "cat >&3"
+    # Parsing of TAP input should stop after a "Bail out!" directive.
+    if (bailed_out)
+      continue
+
+    # TAP test result.
+    if ($0 ~ /^(not )?ok$/ || $0 ~ /^(not )?ok[^a-zA-Z0-9_]/)
+      {
+        testno += 1
+        setup_result_obj($0)
+        handle_tap_result()
+      }
+    # TAP plan (normal or "SKIP" without explanation).
+    else if ($0 ~ /^1\.\.[0-9]+[ \t]*$/)
+      {
+        # The next two lines will put the number of planned tests in $0.
+        sub("^1\\.\\.", "")
+        sub("[^0-9]*$", "")
+        handle_tap_plan($0, "")
+        continue
+      }
+    # TAP "SKIP" plan, with an explanation.
+    else if ($0 ~ /^1\.\.0+[ \t]*#/)
+      {
+        # The next lines will put the skip explanation in $0, stripping
+        # any leading and trailing whitespace.  This is a little more
+        # tricky in truth, since we want to also strip a potential leading
+        # "SKIP" string from the message.
+        sub("^[^#]*#[ \t]*(SKIP[: \t][ \t]*)?", "")
+        sub("[ \t]*$", "");
+        handle_tap_plan(0, $0)
+      }
+    # "Bail out!" magic.
+    # Older versions of prove and TAP::Harness (e.g., 3.17) did not
+    # recognize a "Bail out!" directive when preceded by leading
+    # whitespace, but more modern versions (e.g., 3.23) do.  So we
+    # emulate the latter, "more modern" behaviour.
+    else if ($0 ~ /^[ \t]*Bail out!/)
+      {
+        bailed_out = 1
+        # Get the bailout message (if any), with leading and trailing
+        # whitespace stripped.  The message remains stored in `$0`.
+        sub("^[ \t]*Bail out![ \t]*", "");
+        sub("[ \t]*$", "");
+        # Format the error message for the
+        bailout_message = "Bail out!"
+        if (length($0))
+          bailout_message = bailout_message " " $0
+        testsuite_error(bailout_message)
+      }
+    # Maybe we have too look for dianogtic comments too.
+    else if (comments != 0)
+      {
+        comment = extract_tap_comment($0);
+        if (length(comment))
+          report("#", comment);
+      }
+  }
+
+## -------- ##
+##  FINISH  ##
+## -------- ##
+
+# A "Bail out!" directive should cause us to ignore any following TAP
+# error, as well as a non-zero exit status from the TAP producer.
+if (!bailed_out)
+  {
+    if (!plan_seen)
+      {
+        testsuite_error("missing test plan")
+      }
+    else if (planned_tests != testno)
+      {
+        bad_amount = testno > planned_tests ? "many" : "few"
+        testsuite_error(sprintf("too %s tests run (expected %d, got %d)",
+                                bad_amount, planned_tests, testno))
+      }
+    if (!ignore_exit)
+      {
+        # Fetch exit status from the last line.
+        exit_message = get_test_exit_message(nextline)
+        if (exit_message)
+          testsuite_error(exit_message)
+      }
+  }
+
+write_test_results()
+
+exit 0
+
+} # End of "BEGIN" block.
+'
+
+# TODO: document that we consume the file descriptor 3 :-(
+} 3>"$log_file"
+
+test $? -eq 0 || fatal "I/O or internal error"
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC0"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/configure.ac b/configure.ac
index fb225b688..92053daf6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -14,6 +14,7 @@ AC_PREREQ([2.61])
 AC_INIT([cvc4], _CVC4_RELEASE_STRING, [cvc4-bugs@cs.stanford.edu])
 AC_CONFIG_SRCDIR([src/include/cvc4_public.h])
 AC_CONFIG_AUX_DIR([config])
+AC_REQUIRE_AUX_FILE([tap-driver.sh])
 AC_CONFIG_MACRO_DIR([config])
 AC_CONFIG_LIBOBJ_DIR([src/lib])
 
diff --git a/test/regress/Makefile.am b/test/regress/Makefile.am
index f068ce9ad..19cbbae4a 100644
--- a/test/regress/Makefile.am
+++ b/test/regress/Makefile.am
@@ -9,7 +9,9 @@ TESTS = $(REG0_TESTS) $(REG1_TESTS) $(REG2_TESTS) $(REG3_TESTS) $(REG4_TESTS)
 @mk_empty@BINARY = cvc4
 end@mk_if@
 
-LOG_COMPILER = $(top_srcdir)/test/regress/run_regression
+LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \
+			 	 $(top_srcdir)/config/tap-driver.sh --comments
+LOG_COMPILER = @abs_top_srcdir@/test/regress/run_regression.py
 AM_LOG_FLAGS = $(RUN_REGRESSION_ARGS) @abs_top_builddir@/src/main/$(BINARY)$(EXEEXT)
 
 if AUTOMAKE_1_11
@@ -55,5 +57,5 @@ EXTRA_DIST = \
 	regress0/tptp/Axioms/SYN000_0.ax \
 	Makefile.levels \
 	Makefile.tests \
-	run_regression \
-	README.md
+  run_regression.py \
+	README.md
+\ No newline at end of file
diff --git a/test/regress/regress0/arrays/incorrect10.smt b/test/regress/regress0/arrays/incorrect10.smt
index 6fbdb7ee9..6aacc37c0 100644
--- a/test/regress/regress0/arrays/incorrect10.smt
+++ b/test/regress/regress0/arrays/incorrect10.smt
@@ -1,5 +1,5 @@
-% COMMAND-LINE: --no-check-proofs
-% EXPECT: unsat
+; COMMAND-LINE: --no-check-proofs
+; EXPECT: unsat
 (benchmark fuzzsmt
 :logic QF_AX
 :status unsat
diff --git a/test/regress/regress0/expect/scrub.02.smt b/test/regress/regress0/expect/scrub.02.smt
index 145801619..65c61ecd0 100644
--- a/test/regress/regress0/expect/scrub.02.smt
+++ b/test/regress/regress0/expect/scrub.02.smt
@@ -1,8 +1,8 @@
-% SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/'
-% EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic.
-% EXPECT: The fact in question: TERM
-% EXPECT: ")
-% EXIT: 1
+; SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/'
+; EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic.
+; EXPECT: The fact in question: TERM
+; EXPECT: ")
+; EXIT: 1
 (benchmark reject_nonlinear
 :logic QF_LRA
 :extrafuns ((n Real))
diff --git a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt
index f62f057e4..e8c7949dc 100644
--- a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt
+++ b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt
@@ -1,5 +1,5 @@
-% COMMAND-LINE: --finite-model-find --mbqi=gen-ev
-% EXPECT: unsat
+; COMMAND-LINE: --finite-model-find --mbqi=gen-ev
+; EXPECT: unsat
 (benchmark Isabelle
 :status sat
 :logic AUFLIA
diff --git a/test/regress/regress0/fmf/QEpres-uf.855035.smt b/test/regress/regress0/fmf/QEpres-uf.855035.smt
index 2945c8f4d..4fe592638 100644
--- a/test/regress/regress0/fmf/QEpres-uf.855035.smt
+++ b/test/regress/regress0/fmf/QEpres-uf.855035.smt
@@ -1,5 +1,5 @@
-% COMMAND-LINE: --finite-model-find --mbqi=gen-ev
-% EXPECT: sat
+; COMMAND-LINE: --finite-model-find --mbqi=gen-ev
+; EXPECT: sat
 (benchmark Isabelle
 :status sat
 :logic AUFLIA
diff --git a/test/regress/regress1/fmf/Hoare-z3.931718.smt b/test/regress/regress1/fmf/Hoare-z3.931718.smt
index 754e19e88..4ab90756c 100644
--- a/test/regress/regress1/fmf/Hoare-z3.931718.smt
+++ b/test/regress/regress1/fmf/Hoare-z3.931718.smt
@@ -1,5 +1,5 @@
-% COMMAND-LINE: --finite-model-find
-% EXPECT: sat
+; COMMAND-LINE: --finite-model-find
+; EXPECT: sat
 (benchmark Isabelle
 :status sat
 :logic AUFLIA
diff --git a/test/regress/run_regression b/test/regress/run_regression
deleted file mode 100755
index 4ae013911..000000000
--- a/test/regress/run_regression
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/bin/bash
-#
-# Morgan Deters <mdeters@cs.nyu.edu>
-# for the CVC4 project
-#
-# usage:
-#
-#     run_regression [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]
-#
-# Runs benchmark and checks for correct exit status and output.
-#
-# The TEST_GROUP and TEST_GROUPS environment variables can be used to split the
-# benchmarks into groups and only run one of the groups. The benchmarks are
-# associated with a group based on a hash of their name. TEST_GROUPS controls
-# how many groups are created while TEST_GROUP specifies the group to run.
-# Currently, the primary use case for this is to split the benchmarks across
-# multiple Travis jobs.
-
-# ulimit -t 1    # For detecting long running regressions
-ulimit -s 65000  # Needed for some (esp. portfolio and model-building)
-
-prog=`basename "$0"`
-
-if [ $# -lt 2 ]; then
-  echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2
-  exit 1
-fi
-
-proof=no
-dump=no
-if [ x"$1" = x--proof ]; then
-  proof=yes
-  shift
-elif [ x"$1" = x--dump ]; then
-  dump=yes
-  shift
-fi
-
-if [ $# -lt 2 ]; then
-  echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2
-  exit 1
-fi
-
-wrapper=
-while [ $# -gt 2 ]; do
-  wrapper="$wrapper$1 "
-  shift
-done
-
-[[ "$VALGRIND" = "1" ]] && {
-  wrapper="libtool --mode=execute valgrind $wrapper"
-}
-
-cvc4=$1
-benchmark_orig=$2
-benchmark="$benchmark_orig"
-
-if [ -n "$TEST_GROUP" ]; then
-  # Use the checksum of the benchmark name to determine the group of the
-  # benchmark
-  benchmark_group=$(( $(echo "$benchmark" | cksum | cut -d " " -f1) % $TEST_GROUPS ))
-  if (( $benchmark_group != $TEST_GROUP )); then
-    # Skip the benchmark if it is not in the expected test group
-    exit 77
-  fi
-fi
-
-function error {
-  echo "$prog: error: $*"
-  exit 1
-}
-
-if ! [ -x "$cvc4" ]; then
-  error "\`$cvc4' doesn't exist or isn't executable" >&2
-fi
-if ! [ -r "$benchmark" ]; then
-  error "\`$benchmark' doesn't exist or isn't readable" >&2
-fi
-
-# gettemp() and its associated tempfiles[] array are intended to never
-# allow a temporary file to leak---the trap ensures that when this script
-# exits, whether via a regular exit or an -INT or other signal, the
-# temp files are deleted.
-declare -a tempfiles
-trap -- 'test ${#tempfiles[@]} -gt 0 && rm -f "${tempfiles[@]}"' EXIT
-function gettemp {
-  local temp="`mktemp -t "$2"`"
-  tempfiles[${#tempfiles[@]}]="$temp"
-  eval "$1"="$temp"
-}
-
-tmpbenchmark=
-if expr "$benchmark" : '.*\.smt$' &>/dev/null; then
-  lang=smt1
-  if test -e "$benchmark.expect"; then
-    scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'`
-    errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'`
-    expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-    command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'`
-    if [ -z "$expected_exit_status" ]; then
-      expected_exit_status=0
-    fi
-  elif grep '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then
-    scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'`
-    errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^% EXPECT: ' "$benchmark" | sed 's,^% EXPECT: ,,'`
-    expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-    command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'`
-    # old mktemp from coreutils 7.x is broken, can't do XXXX in the middle
-    # this frustrates our auto-language-detection
-    gettemp tmpbenchmark cvc4_benchmark.smt.$$.XXXXXXXXXX
-    grep -v '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" >"$tmpbenchmark"
-    if [ -z "$expected_exit_status" ]; then
-      expected_exit_status=0
-    fi
-    benchmark=$tmpbenchmark
-  elif grep '^ *:status  *sat' "$benchmark" &>/dev/null; then
-    expected_output=sat
-    expected_exit_status=0
-    command_line=
-  elif grep '^ *:status  *unsat' "$benchmark" &>/dev/null; then
-    expected_output=unsat
-    expected_exit_status=0
-    command_line=
-  else
-    error "cannot determine status of \`$benchmark'"
-  fi
-elif expr "$benchmark" : '.*\.smt2$' &>/dev/null; then
-  lang=smt2
-
-  # If this test case requires unsat cores but CVC4 is not built with proof
-  # support, skip it. Note: checking $CVC4_REGRESSION_ARGS instead of $proof
-  # here because $proof is not set for the second run.
-  requires_proof=`grep '(get-unsat-core)\|(get-unsat-assumptions)' "$benchmark"`
-  if [[ ! "$CVC4_REGRESSION_ARGS" = *"--proof"* ]] && [ -n "$requires_proof" ]; then
-    exit 77
-  fi
-
-  if test -e "$benchmark.expect"; then
-    scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'`
-    errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'`
-    expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-    command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'`
-  elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then
-    scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'`
-    errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'`
-    expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'`
-    command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'`
-  fi
-
-  # If expected output/error was not given, try to determine the status from
-  # the commands.
-  if [ -z "$expected_output" -a -z "$expected_error" ]; then
-    if grep '^ *( *set-info  *:status  *sat' "$benchmark" &>/dev/null; then
-      expected_output=sat
-    elif grep '^ *( *set-info  *:status  *unsat' "$benchmark" &>/dev/null; then
-      expected_output=unsat
-    fi
-  fi
-
-  # A valid test case needs at least an expected output (explicit or through
-  # SMT2 commands) or an expected exit status.
-  if [ -z "$expected_output" -a -z "$expected_error" -a -z "$expected_exit_status" ]; then
-      error "cannot determine status of \`$benchmark'"
-  fi
-
-  # If no explicit expected exit status was given, we assume that the solver is
-  # supposed to succeed.
-  if [ -z "$expected_exit_status" ]; then
-    expected_exit_status=0
-  fi
-elif expr "$benchmark" : '.*\.cvc$' &>/dev/null; then
-  lang=cvc4
-  scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'`
-  errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'`
-  expected_output=$(grep '^% EXPECT: ' "$benchmark")
-  expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'`
-  if [ -z "$expected_output" -a -z "$expected_error" ]; then
-    error "cannot determine expected output of \`$benchmark': " \
-          "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures"
-  fi
-  expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,')
-  expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-  if [ -z "$expected_exit_status" ]; then
-    expected_exit_status=0
-  fi
-  command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'`
-elif expr "$benchmark" : '.*\.p$' &>/dev/null; then
-  lang=tptp
-  command_line=--finite-model-find
-  scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'`
-  errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'`
-  expected_output=$(grep '^% EXPECT: ' "$benchmark")
-  expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'`
-  if [ -z "$expected_output" -a -z "$expected_error" ]; then
-    if grep -q '^% Status *: ' "$benchmark"; then
-      expected_output="$(grep '^% *Status *: ' "$benchmark" | head -1 | awk '{print$NF}')"
-      case "$expected_output" in
-        Theorem|Unsatisfiable) expected_exit_status=0 ;;
-        CounterSatisfiable|Satisfiable) expected_exit_status=0 ;;
-        GaveUp) expected_exit_status=0 ;;
-      esac
-      expected_output="% SZS status $expected_output for $(basename "$benchmark" | sed 's,\.p$,,')"
-    else
-      error "cannot determine expected output of \`$benchmark': " \
-            "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures"
-    fi
-  else
-    expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,')
-    expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-  fi
-  if [ -z "$expected_exit_status" ]; then
-    expected_exit_status=0
-  fi
-  if grep -q '^% COMMAND-LINE: ' "$benchmark"; then
-    command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'`
-  fi
-elif expr "$benchmark" : '.*\.sy$' &>/dev/null; then
-  lang=sygus
-  if test -e "$benchmark.expect"; then
-    scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'`
-    errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'`
-    expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'`
-    command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'`
-    if [ -z "$expected_exit_status" ]; then
-      expected_exit_status=0
-    fi
-  elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then
-    scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'`
-    errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'`
-    expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'`
-    expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'`
-    expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'`
-    command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'`
-    if [ -z "$expected_exit_status" ]; then
-      expected_exit_status=0
-    fi
-  else
-    error "cannot determine expected output of \`$benchmark'"
-  fi
-else
-  error "benchmark \`$benchmark' must be *.cvc or *.smt or *.smt2 or *.p or *.sy"
-fi
-
-command_line="--lang=$lang ${command_line:+$command_line }"
-
-gettemp expoutfile cvc4_expect_stdout.$$.XXXXXXXXXX
-gettemp experrfile cvc4_expect_stderr.$$.XXXXXXXXXX
-gettemp outfile cvc4_stdout.$$.XXXXXXXXXX
-gettemp errfile cvc4_stderr.$$.XXXXXXXXXX
-gettemp errfilefix cvc4_stderr.$$.XXXXXXXXXX
-gettemp exitstatusfile cvc4_exitstatus.$$.XXXXXXXXXX
-
-if [ -z "$expected_output" ]; then
-  # in case expected stdout output is empty, make sure we don't differ
-  # by a newline, which we would if we echo "" >"$expoutfile"
-  touch "$expoutfile"
-else
-  echo "$expected_output" >"$expoutfile"
-fi
-
-if [ "$lang" = sygus ]; then
-  if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-synth-sol' &>/dev/null &&
-     ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-synth-sol' &>/dev/null; then
-    # later on, we'll run another test with --check-models on
-    command_line="$command_line --check-synth-sol"
-  fi
-fi
-check_models=false
-if grep '^sat$' "$expoutfile" &>/dev/null || grep '^invalid$' "$expoutfile" &>/dev/null || grep '^unknown$' "$expoptfile" &>/dev/null; then
-  if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-models' &>/dev/null &&
-     ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-models' &>/dev/null; then
-    # later on, we'll run another test with --check-models on
-    check_models=true
-  fi
-fi
-check_proofs=false
-check_unsat_cores=false
-if [ "$proof" = yes ]; then
-  # proofs not currently supported in incremental mode, turn it off
-  if grep '^unsat$' "$expoutfile" &>/dev/null || grep '^valid$' "$expoutfile" &>/dev/null &>/dev/null; then
-    if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-proofs' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-proofs' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null &&
-       ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null &&
-       ! expr "$BINARY" : '.*pcvc4' &>/dev/null &&
-       ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then
-      # later on, we'll run another test with --check-proofs on
-      check_proofs=true
-    fi
-    if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-unsat-cores' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-unsat-cores' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null &&
-       ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null &&
-       ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null &&
-       ! expr "$BINARY" : '.*pcvc4' &>/dev/null &&
-       ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then
-      # later on, we'll run another test with --check-unsat-cores on
-      check_unsat_cores=true
-    fi
-  fi
-fi
-
-if [ -z "$expected_error" ]; then
-  # in case expected stderr output is empty, make sure we don't differ
-  # by a newline, which we would if we echo "" >"$experrfile"
-  touch "$experrfile"
-else
-  echo "$expected_error" >"$experrfile"
-fi
-
-cvc4dir=`dirname "$cvc4"`
-cvc4dirfull=`cd "$cvc4dir" && pwd`
-if [ -z "$cvc4dirfull" ]; then
-  error "getting directory of \`$cvc4 !?"
-fi
-cvc4base=`basename "$cvc4"`
-cvc4full="$cvc4dirfull/$cvc4base"
-if [ $dump = no ]; then
-  echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"` [from working dir `dirname "$benchmark"`]
-  time ( :; \
-  ( cd `dirname "$benchmark"`;
-    $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"`;
-    echo $? >"$exitstatusfile"
-  ) > "$outfile" 2> "$errfile" )
-else
-  echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` \| $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --lang=smt2 - [from working dir `dirname "$benchmark"`]
-  time ( :; \
-  ( cd `dirname "$benchmark"`;
-    $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` | $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --lang=smt2 -;
-    echo $? >"$exitstatusfile"
-  ) > "$outfile" 2> "$errfile" )
-fi
-
-# we have to actual error file same treatment as other files. differences in
-# versions of echo/bash were causing failure on some platforms and not on others
-# (also grep out valgrind output, if 0 errors reported by valgrind)
-actual_error=$(cat $errfile)
-if [[ "$VALGRIND" = "1" ]]; then
-  #valgrind_output=$(cat $errfile|grep -E "^==[0-9]+== "|)
-  valgrind_num_errors=$(cat $errfile|grep -E "^==[0-9]+== "|tail -n1|awk '{print $4}')
-  echo "valgrind errors (not suppressed): $valgrind_num_errors" 1>&2
-
-  ((valgrind_num_errors == 0)) && actual_error=$(echo "$actual_error"|grep -vE "^==[0-9]+== ")
-fi
-if [ -z "$actual_error" ]; then
-  # in case expected stderr output is empty, make sure we don't differ
-  # by a newline, which we would if we echo "" >"$experrfile"
-  touch "$errfilefix"
-else
-  echo "$actual_error" >"$errfilefix"
-fi
-
-# Scrub the output file if a scrubber has been specified.
-if [ -n "$scrubber" ]; then
-  echo "About to scrub $outfile using $scrubber"
-  mv "$outfile" "$outfile.prescrub"
-  cat "$outfile.prescrub" | eval $scrubber >"$outfile"
-else
-  echo "not scrubbing"
-fi
-
-# Scrub the error file if a scrubber has been specified.
-if [ -n "$errscrubber" ]; then
-  echo "About to scrub $errfilefix using $errscrubber"
-  mv "$errfilefix" "$errfilefix.prescrub"
-  cat "$errfilefix.prescrub" | eval $errscrubber >"$errfilefix"
-else
-  echo "not scrubbing error file"
-fi
-
-diffs=`diff -u --strip-trailing-cr "$expoutfile" "$outfile"`
-diffexit=$?
-diffserr=`diff -u --strip-trailing-cr "$experrfile" "$errfilefix"`
-diffexiterr=$?
-exit_status=`cat "$exitstatusfile"`
-
-exitcode=0
-if [ $diffexit -ne 0 ]; then
-  echo "$prog: error: differences between expected and actual output on stdout"
-  echo "$diffs"
-  exitcode=1
-fi
-if [ $diffexiterr -ne 0 ]; then
-  echo "$prog: error: differences between expected and actual output on stderr"
-  echo "$diffserr"
-  exitcode=1
-fi
-
-if [ "$exit_status" != "$expected_exit_status" ]; then
-  echo "$prog: error: expected exit status \`$expected_exit_status' but got \`$exit_status'"
-  exitcode=1
-fi
-
-if $check_models || $check_proofs || $check_unsat_cores; then
-  check_cmdline=
-  if $check_models; then
-    check_cmdline="$check_cmdline --check-models"
-  fi
-  if $check_proofs; then
-    # taking: Make the extra flags part of --check-proofs.
-    check_cmdline="$check_cmdline --check-proofs --no-bv-eq --no-bv-ineq --no-bv-algebraic"
-  fi
-  if $check_unsat_cores; then
-    check_cmdline="$check_cmdline --check-unsat-cores"
-  fi
-  # run an extra model/proof/core-checking pass
-  if ! CVC4_REGRESSION_ARGS="$CVC4_REGRESSION_ARGS$check_cmdline" "$0" $wrapper "$cvc4" "$benchmark_orig"; then
-    exitcode=1
-  fi
-fi
-
-
-exit $exitcode
diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py
new file mode 100755
index 000000000..c861b0d71
--- /dev/null
+++ b/test/regress/run_regression.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""
+Usage:
+
+    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
+        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]
+
+Runs benchmark and checks for correct exit status and output.
+"""
+
+import argparse
+import difflib
+import os
+import re
+import shlex
+import subprocess
+import sys
+
+SCRUBBER = 'SCRUBBER: '
+ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
+EXPECT = 'EXPECT: '
+EXPECT_ERROR = 'EXPECT-ERROR: '
+EXIT = 'EXIT: '
+COMMAND_LINE = 'COMMAND-LINE: '
+
+
+def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+                  command_line, benchmark_dir, benchmark_filename):
+    """Runs CVC4 on the file `benchmark_filename` in the directory
+    `benchmark_dir` using the binary `cvc4_binary` with the command line
+    options `command_line`. The output is scrubbed using `scrubber` and
+    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
+    function first uses CVC4 to read in and dump the benchmark file and then
+    uses that as input."""
+
+    bin_args = wrapper[:]
+    bin_args.append(cvc4_binary)
+
+    output = None
+    error = None
+    exit_status = None
+    if dump:
+        dump_args = [
+            '--preprocess-only', '--dump', 'raw-benchmark',
+            '--output-lang=smt2', '-qq'
+        ]
+        dump_process = subprocess.Popen(
+            bin_args + command_line + dump_args + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        dump_output, _ = dump_process.communicate()
+        process = subprocess.Popen(
+            bin_args + command_line + ['--lang=smt2', '-'],
+            cwd=benchmark_dir,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate(input=dump_output)
+        exit_status = process.returncode
+    else:
+        process = subprocess.Popen(
+            bin_args + command_line + [benchmark_filename],
+            cwd=benchmark_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, error = process.communicate()
+        exit_status = process.returncode
+
+    # If a scrubber command has been specified then apply it to the output.
+    if scrubber:
+        scrubber_process = subprocess.Popen(
+            shlex.split(scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        output, _ = scrubber_process.communicate(input=output)
+    if error_scrubber:
+        error_scrubber_process = subprocess.Popen(
+            shlex.split(error_scrubber),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        error, _ = error_scrubber_process.communicate(input=error)
+
+    # Popen in Python 3 returns a bytes object instead of a string for
+    # stdout/stderr.
+    if isinstance(output, bytes):
+        output = output.decode()
+    if isinstance(error, bytes):
+        error = error.decode()
+    return (output.strip(), error.strip(), exit_status)
+
+
+def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path):
+    """Determines the expected output for a benchmark, runs CVC4 on it and then
+    checks whether the output corresponds to the expected output. Optionally
+    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
+    dumps a benchmark and uses that as the input (if dump is true)."""
+
+    if not os.access(cvc4_binary, os.X_OK):
+        sys.exit(
+            '"{}" does not exist or is not executable'.format(cvc4_binary))
+    if not os.path.isfile(benchmark_path):
+        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))
+
+    basic_command_line_args = []
+
+    benchmark_basename = os.path.basename(benchmark_path)
+    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
+    benchmark_dir = os.path.dirname(benchmark_path)
+    comment_char = '%'
+    status_regex = None
+    status_to_output = lambda s: s
+    if benchmark_ext == '.smt':
+        status_regex = r':status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.smt2':
+        status_regex = r'set-info\s*:status\s*(sat|unsat)'
+        comment_char = ';'
+    elif benchmark_ext == '.cvc':
+        pass
+    elif benchmark_ext == '.p':
+        basic_command_line_args.append('--finite-model-find')
+        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
+        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
+    elif benchmark_ext == '.sy':
+        comment_char = ';'
+        # Do not use proofs/unsat-cores with .sy files
+        proof = False
+    else:
+        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
+            benchmark_basename))
+
+    # If there is an ".expect" file for the benchmark, read the metadata
+    # from there, otherwise from the benchmark file.
+    metadata_filename = benchmark_path + '.expect'
+    if os.path.isfile(metadata_filename):
+        comment_char = '%'
+    else:
+        metadata_filename = benchmark_path
+
+    metadata_lines = None
+    with open(metadata_filename, 'r') as metadata_file:
+        metadata_lines = metadata_file.readlines()
+    metadata_content = ''.join(metadata_lines)
+
+    # Extract the metadata for the benchmark.
+    scrubber = None
+    error_scrubber = None
+    expected_output = ''
+    expected_error = ''
+    expected_exit_status = None
+    command_line = ''
+    for line in metadata_lines:
+        # Skip lines that do not start with "%"
+        if line[0] != comment_char:
+            continue
+        line = line[2:]
+
+        if line.startswith(SCRUBBER):
+            scrubber = line[len(SCRUBBER):]
+        elif line.startswith(ERROR_SCRUBBER):
+            error_scrubber = line[len(ERROR_SCRUBBER):]
+        elif line.startswith(EXPECT):
+            expected_output += line[len(EXPECT):]
+        elif line.startswith(EXPECT_ERROR):
+            expected_error += line[len(EXPECT_ERROR):]
+        elif line.startswith(EXIT):
+            expected_exit_status = int(line[len(EXIT):])
+        elif line.startswith(COMMAND_LINE):
+            command_line += line[len(COMMAND_LINE):]
+    expected_output = expected_output.strip()
+    expected_error = expected_error.strip()
+
+    # Expected output/expected error has not been defined in the metadata for
+    # the benchmark. Try to extract the information from the benchmark itself.
+    if expected_output == '' and expected_error == '':
+        match = None
+        if status_regex:
+            match = re.search(status_regex, metadata_content)
+
+        if match:
+            expected_output = status_to_output(match.group(1))
+        elif expected_exit_status is None:
+            # If there is no expected output/error and the exit status has not
+            # been set explicitly, the benchmark is invalid.
+            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
+
+    if not proof and ('(get-unsat-cores)' in metadata_content
+                      or '(get-unsat-assumptions)' in metadata_content):
+        print(
+            '1..0 # Skipped: unsat cores not supported without proof support')
+        return
+
+    if expected_exit_status is None:
+        expected_exit_status = 0
+
+    if 'CVC4_REGRESSION_ARGS' in os.environ:
+        basic_command_line_args += shlex.split(
+            os.environ['CVC4_REGRESSION_ARGS'])
+    basic_command_line_args += shlex.split(command_line)
+    command_line_args_configs = [basic_command_line_args]
+
+    extra_command_line_args = []
+    if benchmark_ext == '.sy' and \
+        '--no-check-synth-sol' not in basic_command_line_args and \
+        '--check-synth-sol' not in basic_command_line_args:
+        extra_command_line_args = ['--check-synth-sol']
+    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
+       '--no-check-models' not in basic_command_line_args:
+        extra_command_line_args = ['--check-models']
+    if proof and re.search(r'^(sat|valid)$', expected_output):
+        if '--no-check-proofs' not in basic_command_line_args and \
+           '--incremental' not in basic_command_line_args and \
+           '--unconstrained-simp' not in basic_command_line_args and \
+           not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+        if '--no-check-unsat-cores' not in basic_command_line_args and \
+           '--incremental' not in basic_command_line_args and \
+           '--unconstrained-simp' not in basic_command_line_args and \
+           not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+    if extra_command_line_args:
+        command_line_args_configs.append(
+            basic_command_line_args + extra_command_line_args)
+
+    # Run CVC4 on the benchmark with the different option sets and check
+    # whether the exit status, stdout output, stderr output are as expected.
+    print('1..{}'.format(len(command_line_args_configs)))
+    print('# Starting')
+    for command_line_args in command_line_args_configs:
+        output, error, exit_status = run_benchmark(
+            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
+            command_line_args, benchmark_dir, benchmark_basename)
+        if output != expected_output:
+            print(
+                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(output.splitlines(),
+                                             expected_output.splitlines()):
+                print(line)
+        elif error != expected_error:
+            print(
+                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
+                format(command_line_args))
+            for line in difflib.context_diff(error.splitlines(),
+                                             expected_error.splitlines()):
+                print(line)
+        elif expected_exit_status != exit_status:
+            print(
+                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
+                format(expected_exit_status, exit_status, command_line_args))
+        else:
+            print('ok - Flags: {}'.format(command_line_args))
+
+
+def main():
+    """Parses the command line arguments and then calls the core of the
+    script."""
+
+    parser = argparse.ArgumentParser(
+        description=
+        'Runs benchmark and checks for correct exit status and output.')
+    parser.add_argument('--proof', action='store_true')
+    parser.add_argument('--dump', action='store_true')
+    parser.add_argument('wrapper', nargs='*')
+    parser.add_argument('cvc4_binary')
+    parser.add_argument('benchmark')
+    args = parser.parse_args()
+    cvc4_binary = os.path.abspath(args.cvc4_binary)
+
+    wrapper = args.wrapper
+    if os.environ.get('VALGRIND') == '1' and not wrapper:
+        wrapper = ['libtool', '--mode=execute', 'valgrind']
+
+    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark)
+
+
+if __name__ == "__main__":
+    main()