diff options
Diffstat (limited to 'cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py')
-rwxr-xr-x | cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py b/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py new file mode 100755 index 000000000..68fc3bfba --- /dev/null +++ b/cryptominisat5/cryptominisat-5.6.3/scripts/learn/predict.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright (C) 2018 Mate Soos +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2 +# of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. + +import pandas as pd +import pickle +import sklearn +import sklearn.svm +import sklearn.tree +import sklearn.ensemble +import optparse +import numpy as np +import sklearn.metrics +import time +import itertools +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split + +class_names = ["throw", "longer"] +cuts = [-1, 10000, 1000000000000] +class_names2 = ["middle", "forever"] +cuts2 = [-1, 30000, 1000000000000] +#class_names3 = ["middle2", "forever"] +#cuts3 = [-1, 60000, 1000000000000] + + +def output_to_dot(clf, features, nameextra): + fname = options.dot+nameextra + sklearn.tree.export_graphviz(clf, out_file=fname, + feature_names=features, + class_names=class_names, + filled=True, rounded=True, + special_characters=True, + proportion=True) + print("Run dot:") + print("dot -Tpng {fname} -o {fname}.png".format(fname=fname)) + print("gwenview {fname}.png".format(fname=fname)) + + +def calc_cross_val(): + # calculate accuracy/prec/recall for cross-validation + accuracy = sklearn.model_selection.cross_val_score(self.clf, X_train, y_train, cv=10) + precision = sklearn.model_selection.cross_val_score(self.clf, X_train, y_train, cv=10, scoring='precision') + recall = sklearn.model_selection.cross_val_score(self.clf, X_train, y_train, cv=10, scoring='recall') + print("cv-accuracy:", accuracy) + print("cv-precision:", precision) + print("cv-recall:", recall) + accuracy = np.mean(accuracy) + precision = np.mean(precision) + recall = np.mean(recall) + print("cv-prec: %-3.4f cv-recall: %-3.4f cv-accuracy: %-3.4f T: %-3.2f" % + (precision, recall, accuracy, (time.time() - t))) + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. + """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + + +# to check for too large or NaN values: +def check_too_large_or_nan_values(df): + features = df.columns.values.flatten().tolist() + index = 0 + for index, row in df.iterrows(): + for x, name in zip(row, features): + if not np.isfinite(x) or x > np.finfo(np.float32).max: + print("issue with data for features: ", name, x) + index += 1 + + +def get_code(tree, feature_names): + left = tree.tree_.children_left + right = tree.tree_.children_right + threshold = tree.tree_.threshold + features = [feature_names[i] for i in tree.tree_.feature] + value = tree.tree_.value + + def recurse(left, right, threshold, features, node): + if (threshold[node] != -2): + print("if ( " + features[node] + " <= " + str(threshold[node]) + " ) {") + if left[node] != -1: + recurse(left, right, threshold, features, left[node]) + print("} else {") + if right[node] != -1: + recurse(left, right, threshold, features, right[node]) + print("}") + else: + print("return " + str(value[node])) + + recurse(left, right, threshold, features, 0) + + +def one_classifier(df, features, to_predict, names, w_name, w_number, final): + print("================ predicting %s ================" % to_predict) + print("-> Number of features :", len(features)) + print("-> Number of datapoints:", df.shape) + print("-> Predicting :", to_predict) + + train, test = train_test_split(df, test_size=0.33) + X_train = train[features] + y_train = train[to_predict] + X_test = test[features] + y_test = test[to_predict] + + t = time.time() + clf = None + # clf = sklearn.linear_model.LogisticRegression() + # clf = sklearn.svm.SVC() + if final: + clf = sklearn.tree.DecisionTreeClassifier(max_depth=options.tree_depth) + else: + clf = sklearn.ensemble.RandomForestClassifier(n_estimators=80) + #clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=80) + + sample_weight = [w_number if i == w_name else 1 for i in y_train] + clf.fit(X_train, y_train, sample_weight=sample_weight) + + print("Training finished. T: %-3.2f" % (time.time() - t)) + + best_features = [] + if not final: + importances = clf.feature_importances_ + std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0) + indices = np.argsort(importances)[::-1] + indices = indices[:options.top_num_features] + myrange = min(X_train.shape[1], options.top_num_features) + + # Print the feature ranking + print("Feature ranking:") + + for f in range(myrange): + print("%-3d %-35s -- %8.4f" % + (f + 1, features[indices[f]], importances[indices[f]])) + best_features.append(features[indices[f]]) + + # Plot the feature importances of the clf + plt.figure() + plt.title("Feature importances") + plt.bar(range(myrange), importances[indices], + color="r", align="center" + , yerr=std[indices]) + plt.xticks(range(myrange), [features[x] for x in indices], rotation=45) + plt.xlim([-1, myrange]) + else: + get_code(clf, features) + + print("Calculating scores....") + y_pred = clf.predict(X_test) + accuracy = sklearn.metrics.accuracy_score(y_test, y_pred) + precision = sklearn.metrics.precision_score(y_test, y_pred, average="macro") + recall = sklearn.metrics.recall_score(y_test, y_pred, average="macro") + print("prec: %-3.4f recall: %-3.4f accuracy: %-3.4f T: %-3.2f" % ( + precision, recall, accuracy, (time.time() - t))) + + if options.confusion: + sample_weight = [w_number if i == w_name else 1 for i in y_pred] + cnf_matrix = sklearn.metrics.confusion_matrix( + y_test, y_pred, labels=names, sample_weight=sample_weight) + + np.set_printoptions(precision=2) + + # Plot non-normalized confusion matrix + plt.figure() + plot_confusion_matrix( + cnf_matrix, classes=names, + title='Confusion matrix, without normalization') + + # Plot normalized confusion matrix + plt.figure() + plot_confusion_matrix( + cnf_matrix, classes=names, normalize=True, + title='Normalized confusion matrix') + + # TODO do L1 regularization + + if False: + calc_cross_val() + + if options.dot is not None and final: + output_to_dot(clf, features, names[0]) + + return best_features + + +def remove_old_clause_features(features): + todel = [] + for name in features: + if "cl2" in name or "cl3" in name or "cl4" in name: + todel.append(name) + + for x in todel: + features.remove(x) + if options.verbose: + print("Removing old clause feature:", x) + + +def rem_features(feat, to_remove): + feat_less = list(feat) + todel = [] + for feature in feat: + for rem in to_remove: + if rem in feature: + feat_less.remove(feature) + if options.verbose: + print("Removing feature from feat_less:", feature) + + return feat_less + + +def learn(fname): + with open(fname, "rb") as f: + df = pickle.load(f) + + if options.check_row_data: + check_too_large_or_nan_values(df) + + print("total samples: %5d" % df.shape[0]) + + # lifetime to predict + df["x.lifetime_cut"] = pd.cut( + df["x.lifetime"], + cuts, + labels=class_names) + + df["x.lifetime_cut2"] = pd.cut( + df["x.lifetime"], + cuts2, + labels=class_names2) + + #df["x.lifetime_cut3"] = pd.cut( + #df["x.lifetime"], + #cuts3, + #labels=class_names3) + + features = df.columns.values.flatten().tolist() + features = rem_features(features, + ["x.num_used", "x.class", "x.lifetime", "fname"]) + + # this needs binarization + features = rem_features(features, ["cl.cur_restart_type"]) + # x = (df["cl.cur_restart_type"].values[:, np.newaxis] == df["cl.cur_restart_type"].unique()).astype(int) + # print(x) + + if True: + remove_old_clause_features(features) + + if options.raw_data_plots: + pd.options.display.mpl_style = "default" + df.hist() + df.boxplot() + + if True: + feat_less = rem_features(features, ["rdb1", "rdb2", "rdb3", "rdb4"]) + best_feats = one_classifier(df, feat_less, "x.lifetime_cut", + class_names, "longer", 17, + False) + if options.show: + plt.show() + + one_classifier(df, best_feats, "x.lifetime_cut", + class_names, "longer", 3, + True) + if options.show: + plt.show() + + if True: + feat_less = rem_features(features, ["rdb3", "rdb4"]) + df2 = df[df["x.lifetime"] > cuts[1]] + + best_feats = one_classifier(df2, feat_less, "x.lifetime_cut2", + class_names2, "middle", 30, + False) + if options.show: + plt.show() + + one_classifier(df2, best_feats, "x.lifetime_cut2", + class_names2, "middle", 4, + True) + + if options.show: + plt.show() + + #if True: + #df3 = df[df["x.lifetime"] > cuts2[1]] + + #best_feats = one_classifier(df3, features, "x.lifetime_cut3", + #class_names3, "middle2", 20, + #False) + #if options.show: + #plt.show() + + #one_classifier(df3, best_feats, "x.lifetime_cut3", + #class_names3, "middle2", 8, + #True) + + +if __name__ == "__main__": + usage = "usage: %prog [options] file.pandas" + parser = optparse.OptionParser(usage=usage) + + parser.add_option("--verbose", "-v", action="store_true", default=False, + dest="verbose", help="Print more output") + parser.add_option("--cross", action="store_true", default=False, + dest="cross_validate", help="Cross-validate prec/recall/acc against training data") + parser.add_option("--depth", default=6, type=int, + dest="tree_depth", help="Depth of the tree to create") + parser.add_option("--dot", type=str, default=None, + dest="dot", help="Create DOT file") + parser.add_option("--conf", action="store_true", default=False, + dest="confusion", help="Create confusion matrix") + parser.add_option("--show", action="store_true", default=False, + dest="show", help="Show visual graphs") + parser.add_option("--check", action="store_true", default=False, + dest="check_row_data", help="Check row data for NaN or float overflow") + parser.add_option("--rawplots", action="store_true", default=False, + dest="raw_data_plots", help="Display raw data plots") + parser.add_option("--top", default=12, type=int, + dest="top_num_features", help="Number of top features to take to generate the final predictor") + + (options, args) = parser.parse_args() + + if len(args) < 1: + print("ERROR: You must give the pandas file!") + exit(-1) + + learn(args[0]) |