Diffstat (limited to 'cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py')
-rwxr-xr-x  cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py  372
1 file changed, 372 insertions, 0 deletions
diff --git a/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py b/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py
new file mode 100755
index 000000000..68fc3bfba
--- /dev/null
+++ b/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2018 Mate Soos
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA.
+
+import sys
+import pandas as pd
+import pickle
+import sklearn
+import sklearn.svm
+import sklearn.tree
+import sklearn.ensemble
+import sklearn.metrics
+import sklearn.model_selection
+import optparse
+import numpy as np
+import time
+import itertools
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+
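+# Bin edges for pd.cut() over "x.lifetime": the first cut labels clauses
+# "throw" (lifetime <= 10000) or "longer"; the second cut, applied to the
+# "longer" subset further below, labels them "middle" (<= 30000) or "forever".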
+class_names = ["throw", "longer"]
+cuts = [-1, 10000, 1000000000000]
+class_names2 = ["middle", "forever"]
+cuts2 = [-1, 30000, 1000000000000]
+#class_names3 = ["middle2", "forever"]
+#cuts3 = [-1, 60000, 1000000000000]
+
+
+def output_to_dot(clf, features, nameextra):
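+    """Export the fitted decision tree to a Graphviz DOT file at options.dot + nameextra."""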
+ fname = options.dot+nameextra
+ sklearn.tree.export_graphviz(clf, out_file=fname,
+ feature_names=features,
+ class_names=class_names,
+ filled=True, rounded=True,
+ special_characters=True,
+ proportion=True)
+ print("Run dot:")
+ print("dot -Tpng {fname} -o {fname}.png".format(fname=fname))
+ print("gwenview {fname}.png".format(fname=fname))
+
+
+def calc_cross_val(clf, X_train, y_train, t):
+    # calculate accuracy/precision/recall with 10-fold cross-validation
+    # NOTE: the 'precision'/'recall' scorers assume a binary target
+    accuracy = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=10)
+    precision = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=10, scoring='precision')
+    recall = sklearn.model_selection.cross_val_score(clf, X_train, y_train, cv=10, scoring='recall')
+    print("cv-accuracy:", accuracy)
+    print("cv-precision:", precision)
+    print("cv-recall:", recall)
+    accuracy = np.mean(accuracy)
+    precision = np.mean(precision)
+    recall = np.mean(recall)
+    print("cv-prec: %-3.4f cv-recall: %-3.4f cv-accuracy: %-3.4f T: %-3.2f" %
+          (precision, recall, accuracy, (time.time() - t)))
+
+
+def plot_confusion_matrix(cm, classes,
+ normalize=False,
+ title='Confusion matrix',
+ cmap=plt.cm.Blues):
+ """
+ This function prints and plots the confusion matrix.
+ Normalization can be applied by setting `normalize=True`.
+ """
+ if normalize:
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ print("Normalized confusion matrix")
+ else:
+ print('Confusion matrix, without normalization')
+
+ print(cm)
+
+ plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.title(title)
+ plt.colorbar()
+ tick_marks = np.arange(len(classes))
+ plt.xticks(tick_marks, classes, rotation=45)
+ plt.yticks(tick_marks, classes)
+
+ fmt = '.2f' if normalize else 'd'
+ thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+ plt.text(j, i, format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="white" if cm[i, j] > thresh else "black")
+
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+
+
+def check_too_large_or_nan_values(df):
+    """Warn about NaN/inf values and values that overflow float32."""
+    features = df.columns.values.flatten().tolist()
+    for _, row in df.iterrows():
+        for x, name in zip(row, features):
+            if not np.isfinite(x) or x > np.finfo(np.float32).max:
+                print("issue with data for feature:", name, x)
+
+
+def get_code(tree, feature_names):
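+    """Print the fitted decision tree as nested C-style if/else pseudocode."""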
+ left = tree.tree_.children_left
+ right = tree.tree_.children_right
+ threshold = tree.tree_.threshold
+ features = [feature_names[i] for i in tree.tree_.feature]
+ value = tree.tree_.value
+
+ def recurse(left, right, threshold, features, node):
+        if threshold[node] != -2:  # sklearn marks leaf nodes with threshold -2
+ print("if ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
+ if left[node] != -1:
+ recurse(left, right, threshold, features, left[node])
+ print("} else {")
+ if right[node] != -1:
+ recurse(left, right, threshold, features, right[node])
+ print("}")
+ else:
+ print("return " + str(value[node]))
+
+ recurse(left, right, threshold, features, 0)
+
+
+def one_classifier(df, features, to_predict, names, w_name, w_number, final):
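+    """Train and score a single classifier for `to_predict`.
+
+    Training samples whose label equals w_name get sample weight w_number.
+    With final=False, a random forest is fit and the top features are
+    returned; with final=True, a depth-limited decision tree is fit and
+    printed as pseudocode (and optionally exported to DOT).
+    """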
+ print("================ predicting %s ================" % to_predict)
+ print("-> Number of features :", len(features))
+ print("-> Number of datapoints:", df.shape)
+ print("-> Predicting :", to_predict)
+
+ train, test = train_test_split(df, test_size=0.33)
+ X_train = train[features]
+ y_train = train[to_predict]
+ X_test = test[features]
+ y_test = test[to_predict]
+
+ t = time.time()
+ clf = None
+ # clf = sklearn.linear_model.LogisticRegression()
+ # clf = sklearn.svm.SVC()
+ if final:
+ clf = sklearn.tree.DecisionTreeClassifier(max_depth=options.tree_depth)
+ else:
+ clf = sklearn.ensemble.RandomForestClassifier(n_estimators=80)
+ #clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=80)
+
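+    # up-weight training samples whose label equals w_name by a factor of w_number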
+ sample_weight = [w_number if i == w_name else 1 for i in y_train]
+ clf.fit(X_train, y_train, sample_weight=sample_weight)
+
+ print("Training finished. T: %-3.2f" % (time.time() - t))
+
+ best_features = []
+ if not final:
+ importances = clf.feature_importances_
+ std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
+ indices = np.argsort(importances)[::-1]
+ indices = indices[:options.top_num_features]
+ myrange = min(X_train.shape[1], options.top_num_features)
+
+ # Print the feature ranking
+ print("Feature ranking:")
+
+ for f in range(myrange):
+ print("%-3d %-35s -- %8.4f" %
+ (f + 1, features[indices[f]], importances[indices[f]]))
+ best_features.append(features[indices[f]])
+
+ # Plot the feature importances of the clf
+ plt.figure()
+ plt.title("Feature importances")
+        plt.bar(range(myrange), importances[indices],
+                color="r", align="center", yerr=std[indices])
+ plt.xticks(range(myrange), [features[x] for x in indices], rotation=45)
+ plt.xlim([-1, myrange])
+ else:
+ get_code(clf, features)
+
+ print("Calculating scores....")
+ y_pred = clf.predict(X_test)
+ accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
+ precision = sklearn.metrics.precision_score(y_test, y_pred, average="macro")
+ recall = sklearn.metrics.recall_score(y_test, y_pred, average="macro")
+ print("prec: %-3.4f recall: %-3.4f accuracy: %-3.4f T: %-3.2f" % (
+ precision, recall, accuracy, (time.time() - t)))
+
+ if options.confusion:
+ sample_weight = [w_number if i == w_name else 1 for i in y_pred]
+ cnf_matrix = sklearn.metrics.confusion_matrix(
+ y_test, y_pred, labels=names, sample_weight=sample_weight)
+
+ np.set_printoptions(precision=2)
+
+ # Plot non-normalized confusion matrix
+ plt.figure()
+ plot_confusion_matrix(
+ cnf_matrix, classes=names,
+ title='Confusion matrix, without normalization')
+
+ # Plot normalized confusion matrix
+ plt.figure()
+ plot_confusion_matrix(
+ cnf_matrix, classes=names, normalize=True,
+ title='Normalized confusion matrix')
+
+ # TODO do L1 regularization
+
+    if options.cross_validate:
+        calc_cross_val(clf, X_train, y_train, t)
+
+ if options.dot is not None and final:
+ output_to_dot(clf, features, names[0])
+
+ return best_features
+
+
+def remove_old_clause_features(features):
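+    """Remove, in place, every feature whose name contains "cl2", "cl3" or "cl4"."""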
+ todel = []
+ for name in features:
+ if "cl2" in name or "cl3" in name or "cl4" in name:
+ todel.append(name)
+
+ for x in todel:
+ features.remove(x)
+ if options.verbose:
+ print("Removing old clause feature:", x)
+
+
+def rem_features(feat, to_remove):
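+    """Return a copy of `feat` without any feature whose name contains a
+    substring in `to_remove`, e.g. rem_features(["rdb1.x", "cl.y"], ["rdb1"])
+    returns ["cl.y"]."""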
+ feat_less = list(feat)
+ todel = []
+ for feature in feat:
+ for rem in to_remove:
+ if rem in feature:
+ feat_less.remove(feature)
+ if options.verbose:
+ print("Removing feature from feat_less:", feature)
+
+ return feat_less
+
+
+def learn(fname):
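+    """Load the pickled pandas DataFrame and train the two-stage predictors."""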
+ with open(fname, "rb") as f:
+ df = pickle.load(f)
+
+ if options.check_row_data:
+ check_too_large_or_nan_values(df)
+
+ print("total samples: %5d" % df.shape[0])
+
+ # lifetime to predict
+ df["x.lifetime_cut"] = pd.cut(
+ df["x.lifetime"],
+ cuts,
+ labels=class_names)
+
+ df["x.lifetime_cut2"] = pd.cut(
+ df["x.lifetime"],
+ cuts2,
+ labels=class_names2)
+
+ #df["x.lifetime_cut3"] = pd.cut(
+ #df["x.lifetime"],
+ #cuts3,
+ #labels=class_names3)
+
+ features = df.columns.values.flatten().tolist()
+ features = rem_features(features,
+ ["x.num_used", "x.class", "x.lifetime", "fname"])
+
+ # this needs binarization
+ features = rem_features(features, ["cl.cur_restart_type"])
+ # x = (df["cl.cur_restart_type"].values[:, np.newaxis] == df["cl.cur_restart_type"].unique()).astype(int)
+ # print(x)
+
+ if True:
+ remove_old_clause_features(features)
+
+ if options.raw_data_plots:
+        plt.style.use("default")  # pd.options.display.mpl_style was removed from pandas
+ df.hist()
+ df.boxplot()
+
+ if True:
+ feat_less = rem_features(features, ["rdb1", "rdb2", "rdb3", "rdb4"])
+ best_feats = one_classifier(df, feat_less, "x.lifetime_cut",
+ class_names, "longer", 17,
+ False)
+ if options.show:
+ plt.show()
+
+ one_classifier(df, best_feats, "x.lifetime_cut",
+ class_names, "longer", 3,
+ True)
+ if options.show:
+ plt.show()
+
+ if True:
+ feat_less = rem_features(features, ["rdb3", "rdb4"])
+ df2 = df[df["x.lifetime"] > cuts[1]]
+
+ best_feats = one_classifier(df2, feat_less, "x.lifetime_cut2",
+ class_names2, "middle", 30,
+ False)
+ if options.show:
+ plt.show()
+
+ one_classifier(df2, best_feats, "x.lifetime_cut2",
+ class_names2, "middle", 4,
+ True)
+
+ if options.show:
+ plt.show()
+
+ #if True:
+ #df3 = df[df["x.lifetime"] > cuts2[1]]
+
+ #best_feats = one_classifier(df3, features, "x.lifetime_cut3",
+ #class_names3, "middle2", 20,
+ #False)
+ #if options.show:
+ #plt.show()
+
+ #one_classifier(df3, best_feats, "x.lifetime_cut3",
+ #class_names3, "middle2", 8,
+ #True)
+
+
+if __name__ == "__main__":
+ usage = "usage: %prog [options] file.pandas"
+ parser = optparse.OptionParser(usage=usage)
+
+ parser.add_option("--verbose", "-v", action="store_true", default=False,
+ dest="verbose", help="Print more output")
+ parser.add_option("--cross", action="store_true", default=False,
+ dest="cross_validate", help="Cross-validate prec/recall/acc against training data")
+ parser.add_option("--depth", default=6, type=int,
+ dest="tree_depth", help="Depth of the tree to create")
+ parser.add_option("--dot", type=str, default=None,
+ dest="dot", help="Create DOT file")
+ parser.add_option("--conf", action="store_true", default=False,
+ dest="confusion", help="Create confusion matrix")
+ parser.add_option("--show", action="store_true", default=False,
+ dest="show", help="Show visual graphs")
+ parser.add_option("--check", action="store_true", default=False,
+ dest="check_row_data", help="Check row data for NaN or float overflow")
+ parser.add_option("--rawplots", action="store_true", default=False,
+ dest="raw_data_plots", help="Display raw data plots")
+ parser.add_option("--top", default=12, type=int,
+ dest="top_num_features", help="Number of top features to take to generate the final predictor")
+
+ (options, args) = parser.parse_args()
+
+ if len(args) < 1:
+ print("ERROR: You must give the pandas file!")
+        sys.exit(-1)
+
+ learn(args[0])
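+
+# Example invocation (data file name is illustrative):
+#   ./predict.py --conf --show --top 12 mydata.pandas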