import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    fbeta_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)


def scores(y_true):
    """Print threshold-0.5 metrics for prediction files found under ``results/``.

    Walks ``results/`` and, for every file ending in ``npy`` whose directory
    path ends with the character ``"1"``, loads the array, flattens it and
    prints precision, recall, accuracy, F1 and F0.5 against ``y_true``.

    Predictions ``>= 0.5`` count as positive.  Denominators are not guarded
    against zero.

    :param y_true: array of 0/1 ground-truth labels compared element-wise
        with the flattened predictions.
    """
    beta_sq = 0.5 ** 2
    for (path, dirnames, fnames) in os.walk("results/"):
        if not path.endswith("1"):
            continue
        for f in fnames:
            if not f.endswith("npy"):
                continue
            y_pred = np.load(os.path.join(path, f)).flatten()
            print(path)

            tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))
            tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))
            fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))
            fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))

            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            accuracy = (tp + tn) / len(y_true)
            f1_score = 2 * (precision * recall) / (precision + recall)
            f05_score = ((1 + beta_sq) * (precision * recall)
                         / (beta_sq * precision + recall))

            print(" precision:", precision)
            print(" recall:", recall)
            print(" accuracy:", accuracy)
            print(" f1 score:", f1_score)
            print(" f0.5 score:", f05_score)


def plot_clf():
    """Clear the current matplotlib figure."""
    plt.clf()


def plot_save(path, dpi=300):
    """Title the current figure with ``path``, save it there and close it.

    The figure is resized to 18.5 x 10.5 inches before saving.
    """
    plt.title(path)
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    fig.savefig(path, dpi=dpi)
    plt.close()


def plot_legend():
    """Draw the legend on the current axes."""
    plt.legend()


def mathews_correlation_curve(y, y_pred):
    """Placeholder; not implemented (returns ``None``)."""
    pass


def plot_precision_recall(y, y_pred, label=""):
    """Plot a dashed precision-recall curve on the current axes.

    The legend entry is ``label`` followed by the F1 score of the rounded
    predictions.

    :param y: ground-truth labels (flattened before use).
    :param y_pred: predicted scores (flattened before use).
    :param label: legend prefix for this curve.
    """
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, _ = precision_recall_curve(y, y_pred)
    # ``beta`` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    score = fbeta_score(y, y_pred.round(), beta=1)
    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    """Average several precision-recall curves on a common recall grid.

    Each prediction's PR curve is linearly interpolated and sampled at
    10000 evenly spaced recall values in [0, 1].

    :param y: ground-truth labels (flattened before use).
    :param y_preds: iterable of prediction arrays, one per run.
    :return: ``(mean_precision, std_precision, mean_f1)`` where the first
        two are arrays over the recall grid and ``mean_f1`` is the mean F1
        score of the rounded predictions.
    """
    interpolators = []
    f1_scores = []
    y = y.flatten()
    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        precision, recall, _ = precision_recall_curve(y, y_pred)
        interpolators.append(interpolate.interp1d(recall, precision))
        f1_scores.append(fbeta_score(y, y_pred.round(), beta=1))
    x = np.linspace(0, 1, 10000)
    ys = np.vstack([f(x) for f in interpolators])
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    scores_mean = np.mean(f1_scores)
    return ys_mean, ys_std, scores_mean


def plot_mean_curve(x, ys, std, score, label):
    """Plot a mean curve with a shaded +/- one standard deviation band.

    Both axes are limited to [0, 1]; the legend entry is ``label`` followed
    by ``score``.
    """
    plt.plot(x, ys, label=f"{label} - {score:5.4}")
    plt.fill_between(x, ys - std, ys + std, alpha=0.1)
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def plot_pr_mean(y, y_preds, label=""):
    """Plot the mean precision-recall curve of several predictions."""
    x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, score = calc_pr_mean(y, y_preds)
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')


def score_model(y, prediction):
    """Print a classification report and summary scores for one prediction.

    Prints the classification report of the rounded predictions, the area
    under the PR curve, the ROC AUC, and the F1 / F0.5 scores.

    :param y: ground-truth labels (flattened before use).
    :param prediction: predicted scores (flattened before use).
    """
    y = y.flatten()
    y_pred = prediction.flatten()
    y_round = y_pred.round()
    precision, recall, _ = precision_recall_curve(y, y_pred)
    print(classification_report(y, y_round))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    # ``beta`` is keyword-only in current scikit-learn releases.
    print("F1 Score", fbeta_score(y, y_round, beta=1))
    print("F0.5 Score", fbeta_score(y, y_round, beta=0.5))


def plot_roc_curve(mask, prediction, label=""):
    """Plot a ROC curve with a logarithmic x axis on the current axes.

    The legend entry is ``label`` followed by the area under the curve.

    :param mask: ground-truth labels (flattened before use).
    :param prediction: predicted scores (flattened before use).
    :param label: legend prefix for this curve.
    """
    y = mask.flatten()
    y_pred = prediction.flatten()
    fpr, tpr, _ = roc_curve(y, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.xscale('log')
    plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_roc_mean(y, y_preds):
    """Average several ROC curves on a common false-positive-rate grid.

    Each prediction's ROC curve is linearly interpolated and sampled at
    10000 evenly spaced false positive rates in [0, 1].

    :param y: ground-truth labels (flattened before use).
    :param y_preds: iterable of prediction arrays, one per run.
    :return: ``(mean_tpr, std_tpr, mean_auc)`` where the first two are
        arrays over the grid and ``mean_auc`` is the mean area under the
        individual ROC curves.
    """
    interpolators = []
    aucs = []
    y = y.flatten()
    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        fpr, tpr, _ = roc_curve(y, y_pred)
        interpolators.append(interpolate.interp1d(fpr, tpr))
        aucs.append(auc(fpr, tpr))
    x = np.linspace(0, 1, 10000)
    ys = np.vstack([f(x) for f in interpolators])
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    auc_mean = np.mean(aucs)
    return ys_mean, ys_std, auc_mean
def plot_roc_mean(y, y_preds, label=""):
    """Plot the mean ROC curve of several predictions with a log x axis."""
    x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, score = calc_roc_mean(y, y_preds)
    plt.xscale('log')
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def plot_confusion_matrix(y_true, y_pred, path,
                          normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix',
                          cmap="Blues", dpi=600):
    """Print the confusion matrix and save it as an annotated image.

    Normalization can be applied by setting `normalize=True`; each row is
    then divided by its row sum.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :param path: output file for the figure.
    :param classes: tick labels for both axes.
    :param title: figure title.
    :param cmap: colormap passed to ``imshow``.
    :param dpi: resolution used when saving.
    """
    plt.clf()
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum get white text, the others black text.
    thresh = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    """Plot training accuracy, F1 and validation accuracy, then save.

    :param logs: mapping indexed with ``f"{key}acc"``, ``f"{key}f1_score"``
        and ``f"val_{key}acc"``; each value is plotted against its index.
    :param key: prefix inserted into the log keys.
    :param path: output file for the figure.
    :param dpi: resolution used when saving.
    """
    plt.clf()
    plt.plot(logs[f"{key}acc"], label="accuracy")
    plt.plot(logs[f"{key}f1_score"], label="f1_score")
    plt.plot(logs[f"val_{key}acc"], label="val_accuracy")
    # plt.plot(logs[f"val_{key}f1_score"], label="val_f1_score")
    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_error_bars(results):
    """Draw a bar chart of mean TN/FP/FN/TP rates with std error bars.

    Every entry of ``results`` except the one keyed ``"all"`` is divided by
    its column sums (``axis=0``), flattened and used as one row of a table
    with columns TN, FP, FN, TP.  Each bar is annotated with its height
    rounded to four decimals.

    :param results: mapping from a name to an array that flattens to four
        values (presumably 2x2 confusion matrices -- confirm with caller).
    """
    rates = []
    for m, r in results.items():
        if m == "all":
            continue
        rates.append((r / r.sum(axis=0, keepdims=True)).flatten())
    rates = pd.DataFrame(np.vstack(rates), columns=("TN", "FP", "FN", "TP"))
    ax = rates.mean().plot.bar(yerr=rates.std())
    for p in ax.patches:
        ax.annotate(str(np.round(p.get_height(), 4)), (p.get_x(), 0.5))


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    """Reduce an embedding to 2-D and save a scatter plot colored by label.

    :param domain_embedding: matrix passed to the reducer's
        ``fit_transform``.
    :param labels: array multiplied by ``(1, 2)`` and summed over axis 1 to
        obtain one integer color per point (so it needs two columns).
    :param path: output file for the figure.
    :param dpi: resolution used when saving.
    :param method: ``"svd"`` (TruncatedSVD) or ``"tsne"`` (TSNE).
    :raises ValueError: if ``method`` is neither ``"svd"`` nor ``"tsne"``.
    """
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError(
            f"unknown method {method!r}; expected 'svd' or 'tsne'")

    domain_reduced = red.fit_transform(domain_embedding)
    # Only some reducers expose this attribute (TSNE does not), so reading
    # it unconditionally would crash after the fit.
    if hasattr(red, "explained_variance_ratio_"):
        print(red.explained_variance_ratio_)

    # use if draw subset of predictions
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma,
                s=3, alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path):
    """Render ``model`` to an image file at ``path`` via Keras' plot_model.

    Layer shapes and layer names are included in the drawing.
    """
    # The ``vis_utils`` module is missing in newer Keras releases; fall back
    # to the public ``keras.utils.plot_model`` entry point.
    try:
        from keras.utils.vis_utils import plot_model
    except ImportError:
        from keras.utils import plot_model
    plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)