"""Plotting and evaluation helpers for binary (benign/malicious) classifiers."""
import itertools
import os

import matplotlib

matplotlib.use("agg")  # non-interactive backend, safe for headless runs

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    fbeta_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)


def scores(y_true):
    """Print threshold-0.5 metrics for every prediction dump under results/.

    Only `.npy` files in directories whose name ends in "1" are considered.
    """
    for path, _, fnames in os.walk("results/"):
        for f in fnames:
            if path[-1] == "1" and f.endswith(".npy"):
                y_pred = np.load(os.path.join(path, f)).flatten()
                print(path)
                tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))
                tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))
                fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))
                fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                accuracy = (tp + tn) / len(y_true)
                f1_score = 2 * (precision * recall) / (precision + recall)
                f05_score = (1 + 0.5 ** 2) * (precision * recall) / (0.5 ** 2 * precision + recall)
                print(" precision:", precision)
                print(" recall:", recall)
                print(" accuracy:", accuracy)
                print(" f1 score:", f1_score)
                print(" f0.5 score:", f05_score)


def plot_clf():
    """Reset the current figure and apply a uniform paper style."""
    plt.clf()
    sns.set_context("paper")
    sns.set_style("white")


def plot_save(path, dpi=600, set_size=True):
    """Save the current figure to `path` and close it."""
    fig = plt.gcf()
    if set_size:
        fig.set_size_inches(8, 4.5)
    fig.savefig(path, dpi=dpi, bbox_inches='tight')
    plt.close()


def plot_legend():
    plt.legend()


def matthews_correlation_curve(y, y_pred):
    # TODO: not implemented yet.
    pass


def plot_precision_recall(y, y_pred, label=""):
    """Draw a single precision-recall curve, annotated with the F1 score."""
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, _ = precision_recall_curve(y, y_pred)
    score = fbeta_score(y, y_pred.round(), beta=1)
    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    return calc_metrics_mean(y, y_preds, "prc")


def plot_mean_curve(x, ys, std, score, label):
    """Plot a mean curve with a +/- one standard deviation band."""
    plt.plot(x, ys, label=f"{label} - {score:5.4}")
    plt.fill_between(x, ys - std, ys + std, alpha=0.1)
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def plot_pr_mean(y, y_preds, label=""):
    x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, score = calc_pr_mean(y, y_preds)
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')


def score_model(y, prediction):
    """Print a classification report plus PR-AUC, ROC-AUC, F1 and F0.5 scores."""
    y = y.flatten()
    y_pred = prediction.flatten()
    precision, recall, _ = precision_recall_curve(y, y_pred)
    print(classification_report(y, y_pred.round()))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    print("F1 Score", fbeta_score(y, y_pred.round(), beta=1))
    print("F0.5 Score", fbeta_score(y, y_pred.round(), beta=0.5))


def plot_roc_curve(mask, prediction, label=""):
    """Draw a single ROC curve, annotated with its AUC."""
    y = mask.flatten()
    y_pred = prediction.flatten()
    fpr, tpr, _ = roc_curve(y, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.xscale('log')  # emphasize the low false-positive region
    plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_metrics_mean(y, y_preds, metric):
    """Interpolate several PR or ROC curves onto a common grid and average them.

    Returns the pointwise mean curve, its standard deviation, and the area
    under the mean curve.
    """
    y = y.flatten()
    x = np.linspace(0, 1, 10000)
    curves = []
    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        if metric == "prc":
            precision, recall, _ = precision_recall_curve(y, y_pred)
            # recall is decreasing; np.interp needs increasing x values
            curves.append(np.interp(x, recall[::-1], precision[::-1]))
        elif metric == "roc":
            fpr, tpr, _ = roc_curve(y, y_pred)
            curves.append(np.interp(x, fpr, tpr))
    ys = np.vstack(curves)
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    return ys_mean, ys_std, auc(x, ys_mean)


def calc_roc_mean(y, y_preds):
    return calc_metrics_mean(y, y_preds, "roc")


def plot_roc_mean(y, y_preds, label=""):
    x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, score = calc_roc_mean(y, y_preds)
    plt.xscale('log')
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def plot_confusion_matrix(y_true, y_pred, path, normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix', cmap="Blues", dpi=600):
    """Print and plot the confusion matrix.

    Normalization over the true labels can be applied by setting
    `normalize=True`.
    """
    plt.clf()
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    """Plot accuracy and F1 training curves; `key` prefixes the metric names."""
    plt.clf()
    plt.plot(logs[f"{key}acc"], label="accuracy")
    plt.plot(logs[f"{key}f1_score"], label="f1_score")
    plt.plot(logs[f"val_{key}acc"], label="val_accuracy")
    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_error_bars(results):
    """Bar-plot mean TN/FP/FN/TP rates with standard-deviation error bars.

    `results` maps model names to 2x2 confusion matrices; each matrix is
    normalized by its column sums before averaging. The aggregate "all"
    entry is skipped.
    """
    rates = []
    for m, r in results.items():
        if m == "all":
            continue
        rates.append((r / r.sum(axis=0, keepdims=True)).flatten())
    rates = pd.DataFrame(np.vstack(rates), columns=("TN", "FP", "FN", "TP"))
    ax = rates.mean().plot.bar(yerr=rates.std())
    for p in ax.patches:
        ax.annotate(str(np.round(p.get_height(), 4)), (p.get_x(), 0.5))


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    """Project an embedding to 2D with truncated SVD or t-SNE and scatter-plot it."""
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    else:
        raise ValueError(f"unknown method: {method}")
    domain_reduced = red.fit_transform(domain_embedding)
    if method == "svd":
        print(red.explained_variance_ratio_)  # t-SNE exposes no explained variance
    # To draw only a random subset of the predictions:
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
    plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1],
                # fold the two label columns into one integer class id for coloring
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma, s=3, alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path, shapes=True, layer_names=True):
    # Imported lazily so Keras (and pydot) are only required when used.
    from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=path, show_shapes=shapes,
               show_layer_names=layer_names)
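

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module's API): synthetic
# labels and three mock prediction runs stand in for real model output, and
# the output file names below are placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    y_true = rng.integers(0, 2, size=1000)
    # Three mock "runs": noisy scores loosely correlated with the labels.
    y_preds = [np.clip(0.35 * y_true + rng.random(1000) * 0.65, 0.0, 1.0)
               for _ in range(3)]

    # Summary metrics for a single run.
    score_model(y_true, y_preds[0])

    # Mean precision-recall curve across the three runs.
    plot_clf()
    plot_pr_mean(y_true, y_preds, label="mock model")
    plot_legend()
    plot_save("pr_mean.png")

    # Mean ROC curve across the three runs.
    plot_clf()
    plot_roc_mean(y_true, y_preds, label="mock model")
    plot_legend()
    plot_save("roc_mean.png")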