# ma_cisco_malware/visualize.py

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
    roc_auc_score, roc_curve
)


def scores(y_true):
    """Print 0.5-threshold classification metrics for stored predictions.

    Walks the ``results/`` directory tree.  Every file whose name ends with
    ``npy`` and whose containing directory path ends with the character
    ``"1"`` is loaded with ``np.load``, flattened and compared element-wise
    with ``y_true``.  For each such file the directory path is printed,
    followed by precision, recall, accuracy, F1 and F0.5.

    A prediction counts as positive when it is ``>= 0.5``; ``y_true`` is
    compared against the literal labels ``1`` and ``0``.

    :param y_true: ground-truth labels; must support element-wise ``==``
        and ``len`` (presumably a flat NumPy array - confirm with callers).
    """
    for directory, _subdirs, filenames in os.walk("results/"):
        for filename in filenames:
            # Guard clause: only ``*npy`` files inside directories ending in "1".
            if not (directory[-1] == "1" and filename.endswith("npy")):
                continue

            y_pred = np.load(os.path.join(directory, filename)).flatten()
            print(directory)

            predicted_pos = y_pred >= 0.5
            predicted_neg = y_pred < 0.5
            actual_pos = y_true == 1
            actual_neg = y_true == 0

            # Confusion-matrix cells.
            tp = np.sum(np.logical_and(predicted_pos, actual_pos))
            tn = np.sum(np.logical_and(predicted_neg, actual_neg))
            fp = np.sum(np.logical_and(predicted_pos, actual_neg))
            fn = np.sum(np.logical_and(predicted_neg, actual_pos))

            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            accuracy = (tp + tn) / len(y_true)

            pr_product = precision * recall
            beta_sq = 0.5 ** 2
            f1_score = 2 * pr_product / (precision + recall)
            f05_score = (1 + beta_sq) * pr_product / (beta_sq * precision + recall)

            report = (
                ("precision", precision),
                ("recall", recall),
                ("accuracy", accuracy),
                ("f1 score", f1_score),
                ("f0.5 score", f05_score),
            )
            for name, value in report:
                print(f"  {name}:", value)


def plot_clf():
    """Clear the current figure and apply the seaborn "paper"/"white" look.

    The figure is cleared first; afterwards the seaborn plotting context is
    set to ``"paper"`` and the seaborn style to ``"white"``.
    """
    plt.clf()
    sns.set_context(context="paper")
    sns.set_style(style="white")


def plot_save(path, dpi=600, set_size=True):
    """Write the current matplotlib figure to ``path`` and close it.

    :param path: destination handed to ``Figure.savefig``.
    :param dpi: resolution handed to ``Figure.savefig``.
    :param set_size: when truthy, the figure is resized to 8 x 4.5 inches
        before it is saved.
    """
    current_figure = plt.gcf()
    if set_size:
        current_figure.set_size_inches(8, 4.5)
    # bbox_inches='tight' crops the saved image to the drawn content.
    current_figure.savefig(path, dpi=dpi, bbox_inches='tight')
    plt.close()


def plot_legend():
    """Add a legend (with default settings) to the current axes."""
    current_axes = plt.gca()
    current_axes.legend()


def mathews_correlation_curve(y, y_pred):
    """Placeholder that is not implemented yet.

    Both arguments are ignored and ``None`` is returned.
    """


def plot_precision_recall(y, y_pred, label=""):
    """Draw a dashed precision-recall curve on the current axes.

    Both inputs are flattened before use, so they must expose ``.flatten()``.
    The legend entry consists of ``label`` followed by the F1 score that is
    obtained when the scores are rounded to hard labels with ``.round()``.

    :param y: ground-truth labels.
    :param y_pred: predicted scores.
    :param label: prefix of the legend entry.
    """
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, _thresholds = precision_recall_curve(y, y_pred)

    # ``beta`` has to be given by keyword: current scikit-learn releases
    # declare it keyword-only and reject the former positional form.
    score = fbeta_score(y, y_pred.round(), beta=1)

    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    """Return ``calc_metrics_mean`` evaluated for the ``"prc"`` metric.

    :param y: ground-truth labels, forwarded unchanged.
    :param y_preds: iterable of prediction arrays, forwarded unchanged.
    :return: whatever ``calc_metrics_mean(y, y_preds, "prc")`` returns.
    """
    return calc_metrics_mean(y, y_preds, metric="prc")


def plot_mean_curve(x, ys, std, score, label):
    """Plot a curve with a shaded ``+/- std`` band on the current axes.

    :param x: x coordinates of the curve.
    :param ys: y coordinates of the curve.
    :param std: band half-width; ``ys - std`` and ``ys + std`` delimit the
        shaded area.
    :param score: value appended to the legend entry using the ``5.4``
        format specification.
    :param label: prefix of the legend entry.
    """
    legend_entry = f"{label} - {score:5.4}"
    plt.plot(x, ys, label=legend_entry)

    band_low = ys - std
    band_high = ys + std
    plt.fill_between(x, band_low, band_high, alpha=0.1)

    # Both axes are limited to the unit interval.
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def plot_pr_mean(y, y_preds, label=""):
    """Plot the mean precision-recall curve of several prediction runs.

    The mean curve and its standard deviation come from ``calc_pr_mean`` and
    are drawn over 10000 evenly spaced recall values in ``[0, 1]``.

    :param y: ground-truth labels.
    :param y_preds: iterable of prediction arrays, one per run.
    :param label: prefix of the legend entry.
    """
    x = np.linspace(0, 1, 10000)
    # The third value returned by calc_pr_mean is the stacked array of the
    # individual curves, not a scalar; formatting it with "5.4" in
    # plot_mean_curve raises TypeError.
    ys_mean, ys_std, _curves = calc_pr_mean(y, y_preds)
    # Use the area under the mean curve as the scalar shown in the legend.
    score = auc(x, ys_mean)
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')


def score_model(y, prediction):
    """Print a set of evaluation metrics for one prediction array.

    Both inputs are flattened first.  Printed, in order: the scikit-learn
    classification report, the area under the precision-recall curve, the
    ROC AUC, the F1 score and the F0.5 score.  The report and the F-scores
    use the scores rounded to hard labels with ``.round()``; the two area
    metrics use the raw scores.

    :param y: ground-truth labels; must expose ``.flatten()``.
    :param prediction: predicted scores; must expose ``.flatten()``.
    """
    y = y.flatten()
    y_pred = prediction.flatten()
    y_label = y_pred.round()

    precision, recall, _thresholds = precision_recall_curve(y, y_pred)

    print(classification_report(y, y_label))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    # ``beta`` has to be given by keyword: current scikit-learn releases
    # declare it keyword-only and reject the former positional form.
    print("F1 Score", fbeta_score(y, y_label, beta=1))
    print("F0.5 Score", fbeta_score(y, y_label, beta=0.5))


def plot_roc_curve(mask, prediction, label=""):
    """Draw a ROC curve with a logarithmic x axis on the current axes.

    Both inputs are flattened before use.  The legend entry consists of
    ``label`` followed by the area under the plotted curve.

    :param mask: ground-truth labels; must expose ``.flatten()``.
    :param prediction: predicted scores; must expose ``.flatten()``.
    :param label: prefix of the legend entry.
    """
    truth = mask.flatten()
    predicted = prediction.flatten()

    false_pos_rate, true_pos_rate, _thresholds = roc_curve(truth, predicted)
    area = auc(false_pos_rate, true_pos_rate)

    plt.xscale('log')
    plt.plot(false_pos_rate, true_pos_rate, label=f"{label} - {area:5.4}")
    # NOTE(review): a lower x limit of 0.0 cannot be shown on a log axis;
    # matplotlib presumably ignores it - confirm the resulting range.
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_metrics_mean(y, y_preds, metric):
    """Average PR or ROC curves of several runs on a common grid.

    For every prediction array the requested curve is computed against ``y``
    and linearly interpolated.  All interpolants are then evaluated on 10000
    evenly spaced points in ``[0, 1]`` (recall for ``"prc"``, false positive
    rate for ``"roc"``).

    :param y: ground-truth labels; must expose ``.flatten()``.
    :param y_preds: iterable of prediction arrays, each exposing
        ``.flatten()``.
    :param metric: ``"prc"`` for precision-recall curves or ``"roc"`` for
        ROC curves.
    :return: tuple ``(ys_mean, ys_std, ys)`` - the point-wise mean and
        standard deviation over the runs, and the stacked per-run curves
        (one row per run).
    :raises ValueError: if ``metric`` is neither ``"prc"`` nor ``"roc"``, or
        if ``y_preds`` is empty.
    """
    # Fail early with a clear message; otherwise an unknown metric is
    # skipped silently and surfaces later as an opaque np.vstack error.
    if metric not in ("prc", "roc"):
        raise ValueError(f"unknown metric {metric!r}; expected 'prc' or 'roc'")

    y = y.flatten()
    appr = []
    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        if metric == "prc":
            precision, recall, _thresholds = precision_recall_curve(y, y_pred)
            appr.append(interpolate.interp1d(recall, precision))
        else:
            fpr, tpr, _thresholds = roc_curve(y, y_pred)
            appr.append(interpolate.interp1d(fpr, tpr))

    if not appr:
        raise ValueError("y_preds must contain at least one prediction array")

    x = np.linspace(0, 1, 10000)
    ys = np.vstack([f(x) for f in appr])
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    return ys_mean, ys_std, ys


def calc_roc_mean(y, y_preds):
    """Return ``calc_metrics_mean`` evaluated for the ``"roc"`` metric.

    :param y: ground-truth labels, forwarded unchanged.
    :param y_preds: iterable of prediction arrays, forwarded unchanged.
    :return: whatever ``calc_metrics_mean(y, y_preds, "roc")`` returns.
    """
    return calc_metrics_mean(y, y_preds, metric="roc")


def plot_roc_mean(y, y_preds, label=""):
    """Plot the mean ROC curve of several prediction runs on a log x axis.

    The mean curve and its standard deviation come from ``calc_roc_mean``
    and are drawn over 10000 evenly spaced false positive rates in
    ``[0, 1]``.

    :param y: ground-truth labels.
    :param y_preds: iterable of prediction arrays, one per run.
    :param label: prefix of the legend entry.
    """
    x = np.linspace(0, 1, 10000)
    # The third value returned by calc_roc_mean is the stacked array of the
    # individual curves, not a scalar; formatting it with "5.4" in
    # plot_mean_curve raises TypeError.
    ys_mean, ys_std, _curves = calc_roc_mean(y, y_preds)
    # Use the area under the mean curve as the scalar shown in the legend.
    score = auc(x, ys_mean)
    plt.xscale('log')
    plot_mean_curve(x, ys_mean, ys_std, score, label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def plot_confusion_matrix(y_true, y_pred, path,
                          normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix',
                          cmap="Blues", dpi=600):
    """
    Print the confusion matrix and save a plot of it to ``path``.

    The current figure is cleared first and closed after saving.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :param path: destination handed to ``plt.savefig``.
    :param normalize: when truthy, every row is divided by its sum, so the
        cells hold per-true-class fractions instead of counts.
    :param classes: tick labels, in the order of the matrix rows/columns.
    :param title: plot title.
    :param cmap: colormap used for the matrix image.
    :param dpi: resolution handed to ``plt.savefig``.
    """
    plt.clf()
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions are shown with two decimals, counts as plain integers;
    # unformatted floats would be drawn with their full repr.
    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so it accounts for them.
    plt.tight_layout()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    """Plot training metrics over epochs and save the figure to ``path``.

    Three series are read from ``logs`` and plotted in this order:
    ``"{key}acc"``, ``"{key}f1_score"`` and ``"val_{key}acc"``.  The current
    figure is cleared first and closed after saving.

    :param logs: mapping-like object indexed by the series names above.
    :param key: prefix inserted into the series names.
    :param path: destination handed to ``plt.savefig``.
    :param dpi: resolution handed to ``plt.savefig``.
    """
    plt.clf()

    series = (
        (f"{key}acc", "accuracy"),
        (f"{key}f1_score", "f1_score"),
        (f"val_{key}acc", "val_accuracy"),
    )
    for column, legend_label in series:
        plt.plot(logs[column], label=legend_label)

    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_error_bars(results):
    """Draw a bar chart of mean TN/FP/FN/TP rates with std error bars.

    Every value of ``results`` except the one stored under the key ``"all"``
    is divided by its column sums (``axis=0``) and flattened into one row of
    a four-column table labelled ``TN``, ``FP``, ``FN``, ``TP``.  The column
    means are plotted as bars, the column standard deviations as error bars,
    and each bar is annotated with its height rounded to four decimals.

    :param results: mapping of names to arrays that flatten to four values
        (presumably 2x2 confusion matrices - confirm with callers).
    """
    normalised_rows = [
        (matrix / matrix.sum(axis=0, keepdims=True)).flatten()
        for name, matrix in results.items()
        if name != "all"
    ]
    rates = pd.DataFrame(
        np.vstack(normalised_rows),
        columns=("TN", "FP", "FN", "TP"),
    )

    mean_rates = rates.mean()
    ax = mean_rates.plot.bar(yerr=rates.std())
    for bar in ax.patches:
        height_text = str(np.round(bar.get_height(), 4))
        # The text is anchored at the bar's left edge, at a fixed y of 0.5.
        ax.annotate(height_text, (bar.get_x(), 0.5))


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    """Project an embedding to 2-D, scatter-plot it and save it to ``path``.

    :param domain_embedding: data handed to the reducer's ``fit_transform``.
    :param labels: per-sample label array; it is multiplied by ``(1, 2)``
        and summed along axis 1 to obtain one integer colour code per point
        (presumably a two-column indicator array - confirm with callers).
    :param path: destination handed to ``plt.savefig``.
    :param dpi: resolution handed to ``plt.savefig``.
    :param method: ``"svd"`` for ``TruncatedSVD`` or ``"tsne"`` for ``TSNE``.
    :raises ValueError: if ``method`` is neither ``"svd"`` nor ``"tsne"``.
    """
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    else:
        # Without this branch ``red`` stays unbound and the call below fails
        # with an unrelated UnboundLocalError.
        raise ValueError(f"unknown method {method!r}; expected 'svd' or 'tsne'")

    domain_reduced = red.fit_transform(domain_embedding)
    # Only some reducers expose this attribute (TSNE does not), so print it
    # only when present instead of failing with AttributeError.
    if hasattr(red, "explained_variance_ratio_"):
        print(red.explained_variance_ratio_)

    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma,
                s=3,
                alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path, shapes=True, layer_names=True):
    """Render a Keras model graph to the file ``path``.

    Keras is imported lazily so that the rest of the module works without it.

    :param model: model handed to Keras' ``plot_model``.
    :param path: output file, passed as ``to_file``.
    :param shapes: passed as ``show_shapes``.
    :param layer_names: passed as ``show_layer_names``.
    """
    # ``keras.utils.vis_utils`` is not available in newer Keras releases;
    # try the public location first and fall back to the legacy module.
    try:
        from keras.utils import plot_model
    except ImportError:
        from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=path, show_shapes=shapes, show_layer_names=layer_names)