# ma_cisco_malware/visualize.py

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
    roc_auc_score, roc_curve
)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def scores(y_true):
    """Print threshold-0.5 classification metrics for stored predictions.

    Walks the ``results/`` directory tree. For every file whose name ends in
    ``npy`` and that lives in a directory whose path ends in ``"1"``, the
    array is loaded as predicted scores and precision, recall, accuracy, F1
    and F0.5 against ``y_true`` are printed.

    Ratios with a zero denominator (for example precision when nothing is
    predicted positive) are printed as 0.0 rather than ``nan``.

    Args:
        y_true: Ground-truth binary labels (0 / 1). It is flattened so that
            it lines up element-wise with the flattened predictions; without
            this an ``(n, 1)`` label array would broadcast against the
            ``(n,)`` predictions and inflate every count.
    """
    y_true = np.asarray(y_true).flatten()

    for path, _dirnames, fnames in os.walk("results/"):
        # Only directories whose path ends in "1" are evaluated.
        if path[-1] != "1":
            continue
        for f in fnames:
            if not f.endswith("npy"):
                continue
            y_pred = np.load(os.path.join(path, f)).flatten()
            print(path)

            predicted_pos = y_pred >= 0.5
            predicted_neg = y_pred < 0.5
            tp = np.sum(np.logical_and(predicted_pos, y_true == 1))
            tn = np.sum(np.logical_and(predicted_neg, y_true == 0))
            fp = np.sum(np.logical_and(predicted_pos, y_true == 0))
            fn = np.sum(np.logical_and(predicted_neg, y_true == 1))

            precision = _safe_ratio(tp, tp + fp)
            recall = _safe_ratio(tp, tp + fn)
            accuracy = _safe_ratio(tp + tn, y_true.size)
            f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
            f05_score = _safe_ratio(
                (1 + 0.5 ** 2) * (precision * recall),
                0.5 ** 2 * precision + recall,
            )
            print("  precision:", precision)
            print("  recall:", recall)
            print("  accuracy:", accuracy)
            print("  f1 score:", f1_score)
            print("  f0.5 score:", f05_score)


def plot_clf():
    """Clear the current matplotlib figure."""
    current_figure = plt.gcf()
    current_figure.clear()


def plot_save(path, dpi=300):
    """Title the current figure with ``path``, save it there and close it.

    Args:
        path: Output file path; it is also used verbatim as the plot title.
        dpi: Resolution passed to ``savefig``.
    """
    width_in, height_in = 18.5, 10.5

    plt.title(path)
    figure = plt.gcf()
    figure.set_size_inches(width_in, height_in)
    figure.savefig(path, dpi=dpi)
    plt.close()


def plot_legend():
    """Add a legend to the current matplotlib axes."""
    current_axes = plt.gca()
    current_axes.legend()


def mathews_correlation_curve(y, y_pred):
    """Placeholder: not implemented; ignores both arguments and returns None."""
    return None


def plot_precision_recall(y, y_pred, label=""):
    """Draw one precision-recall curve on the current matplotlib axes.

    The curve is drawn dashed and its legend entry is ``label`` followed by
    the F1 score of the predictions rounded to hard 0/1 decisions.

    Args:
        y: Array of ground-truth labels; flattened before use.
        y_pred: Array of predicted scores; flattened before use.
        label: Prefix for the legend entry.
    """
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, _thresholds = precision_recall_curve(y, y_pred)

    # ``beta`` must be passed by keyword: newer scikit-learn releases no
    # longer accept it as a positional argument.
    score = fbeta_score(y, y_pred.round(), beta=1)

    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    """Average several precision-recall curves on a common recall grid.

    Each prediction's precision-recall curve is linearly interpolated and
    sampled at 10000 evenly spaced recall values in [0, 1].

    Args:
        y: Array of ground-truth labels shared by all predictions; flattened
            before use.
        y_preds: Iterable of predicted-score arrays, one per run; each is
            flattened before use.

    Returns:
        Tuple ``(mean_precision, std_precision, mean_f1)``: the point-wise
        mean and standard deviation of the sampled curves, and the mean F1
        score of the predictions rounded to hard decisions.
    """
    interpolators = []
    # Named f1_scores so the module-level ``scores`` function is not shadowed.
    f1_scores = []
    y = y.flatten()

    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        precision, recall, _thresholds = precision_recall_curve(y, y_pred)
        interpolators.append(interpolate.interp1d(recall, precision))
        # ``beta`` must be passed by keyword: newer scikit-learn releases no
        # longer accept it as a positional argument.
        f1_scores.append(fbeta_score(y, y_pred.round(), beta=1))

    x = np.linspace(0, 1, 10000)
    ys = np.vstack([f(x) for f in interpolators])
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    scores_mean = np.mean(f1_scores)
    return ys_mean, ys_std, scores_mean


def plot_mean_curve(x, ys, std, score, label):
    """Plot a mean curve with a shaded band of one ``std`` on either side.

    Args:
        x: X coordinates of the curve.
        ys: Mean y values at ``x``.
        std: Spread at ``x``; the band covers ``ys - std`` to ``ys + std``.
        score: Number appended to the legend entry.
        label: Prefix for the legend entry.
    """
    legend_text = f"{label} - {score:5.4}"
    lower_band = ys - std
    upper_band = ys + std

    plt.plot(x, ys, label=legend_text)
    plt.fill_between(x, lower_band, upper_band, alpha=0.1)
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def plot_pr_mean(y, y_preds, label=""):
    """Plot the mean precision-recall curve of several predictions.

    Args:
        y: Ground-truth labels, forwarded to ``calc_pr_mean``.
        y_preds: Predicted scores of each run, forwarded to ``calc_pr_mean``.
        label: Prefix for the legend entry.
    """
    mean_curve, spread, mean_score = calc_pr_mean(y, y_preds)
    # Same recall grid that calc_pr_mean samples the curves on.
    recall_grid = np.linspace(0, 1, 10000)
    plot_mean_curve(recall_grid, mean_curve, spread, mean_score, label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')


def score_model(y, prediction):
    """Print a summary of classification metrics for one prediction.

    Prints the scikit-learn classification report, the area under the
    precision-recall curve, the ROC AUC, and the F1 and F0.5 scores. The
    report and the F-scores use the predictions rounded to hard decisions;
    the curve-based metrics use the raw scores.

    Args:
        y: Array of ground-truth labels; flattened before use.
        prediction: Array of predicted scores; flattened before use.
    """
    y = y.flatten()
    y_pred = prediction.flatten()
    y_hard = y_pred.round()

    precision, recall, _thresholds = precision_recall_curve(y, y_pred)

    print(classification_report(y, y_hard))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    # ``beta`` must be passed by keyword: newer scikit-learn releases no
    # longer accept it as a positional argument.
    print("F1 Score", fbeta_score(y, y_hard, beta=1))
    print("F0.5 Score", fbeta_score(y, y_hard, beta=0.5))


def plot_roc_curve(mask, prediction, label=""):
    """Draw one ROC curve on the current axes with a log-scaled x axis.

    The legend entry is ``label`` followed by the area under the curve.

    Args:
        mask: Array of ground-truth labels; flattened before use.
        prediction: Array of predicted scores; flattened before use.
        label: Prefix for the legend entry.
    """
    truth = mask.flatten()
    predicted = prediction.flatten()
    false_pos_rate, true_pos_rate, _ = roc_curve(truth, predicted)
    area = auc(false_pos_rate, true_pos_rate)
    legend_text = f"{label} - {area:5.4}"

    plt.xscale('log')
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)
    plt.ylim([0.0, 1.0])
    # NOTE(review): a lower x limit of 0 cannot be shown on a log axis;
    # confirm how matplotlib treats it.
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_roc_mean(y, y_preds):
    """Average several ROC curves on a common false-positive-rate grid.

    Each prediction's ROC curve is linearly interpolated and sampled at
    10000 evenly spaced false positive rates in [0, 1].

    Args:
        y: Array of ground-truth labels shared by all predictions; flattened
            before use.
        y_preds: Iterable of predicted-score arrays, one per run; each is
            flattened before use.

    Returns:
        Tuple ``(mean_tpr, std_tpr, mean_auc)``: the point-wise mean and
        standard deviation of the sampled curves, and the mean area under
        the individual curves.
    """
    truth = y.flatten()
    curve_fns = []
    areas = []

    for run_prediction in y_preds:
        fpr, tpr, _ = roc_curve(truth, run_prediction.flatten())
        curve_fns.append(interpolate.interp1d(fpr, tpr))
        areas.append(auc(fpr, tpr))

    grid = np.linspace(0, 1, 10000)
    sampled = np.vstack([curve_fn(grid) for curve_fn in curve_fns])
    return sampled.mean(axis=0), sampled.std(axis=0), np.mean(areas)


def plot_roc_mean(y, y_preds, label=""):
    """Plot the mean ROC curve of several predictions on a log x axis.

    Args:
        y: Ground-truth labels, forwarded to ``calc_roc_mean``.
        y_preds: Predicted scores of each run, forwarded to ``calc_roc_mean``.
        label: Prefix for the legend entry.
    """
    mean_curve, spread, mean_auc = calc_roc_mean(y, y_preds)
    # Same false-positive-rate grid that calc_roc_mean samples the curves on.
    fpr_grid = np.linspace(0, 1, 10000)
    plt.xscale('log')
    plot_mean_curve(fpr_grid, mean_curve, spread, mean_auc, label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def plot_confusion_matrix(y_true, y_pred, path,
                          normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix',
                          cmap="Blues", dpi=600):
    """
    Print the confusion matrix, render it as a heat map and save it to `path`.

    With `normalize=True` every row is divided by its sum before printing
    and plotting. `classes` supplies the tick labels of both axes. The
    current figure is cleared first and closed after saving.
    """
    plt.clf()
    matrix = confusion_matrix(y_true, y_pred)

    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        matrix = matrix.astype('float') / row_totals
        print("Normalized confusion matrix")
    print(matrix)

    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells above half of the maximum get white text, the rest black.
    half_max = matrix.max() / 2.
    for row, col in np.ndindex(*matrix.shape):
        cell = matrix[row, col]
        text_color = "white" if cell > half_max else "black"
        plt.text(col, row, cell,
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    """Plot accuracy and F1 series from training logs and save the figure.

    Args:
        logs: Mapping indexed by series name; the entries ``{key}acc``,
            ``{key}f1_score`` and ``val_{key}acc`` are read.
        key: Prefix inserted into the series names.
        path: Output file path.
        dpi: Resolution passed to ``savefig``.
    """
    # (series name in logs, legend label); the validation F1 series is not drawn.
    series = (
        (f"{key}acc", "accuracy"),
        (f"{key}f1_score", "f1_score"),
        (f"val_{key}acc", "val_accuracy"),
    )

    plt.clf()
    for series_name, legend_label in series:
        plt.plot(logs[series_name], label=legend_label)

    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_error_bars(results):
    """Bar-plot the mean TN/FP/FN/TP rates across models with std error bars.

    Args:
        results: Mapping from model name to an array of counts; the entry
            named ``"all"`` is skipped. Each array is divided by its sums
            along axis 0 and flattened into four values.
            NOTE(review): presumably 2x2 confusion matrices, given the four
            column names; verify against the caller.
    """
    per_model_rates = [
        (counts / counts.sum(axis=0, keepdims=True)).flatten()
        for model_name, counts in results.items()
        if model_name != "all"
    ]
    table = pd.DataFrame(np.vstack(per_model_rates),
                         columns=("TN", "FP", "FN", "TP"))

    axes = table.mean().plot.bar(yerr=table.std())
    # Write each bar's height (4 decimals) at y = 0.5 above its left edge.
    for bar in axes.patches:
        height_text = str(np.round(bar.get_height(), 4))
        axes.annotate(height_text, (bar.get_x(), 0.5))


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    """Project an embedding to 2-D, scatter-plot it and save the figure.

    Args:
        domain_embedding: Data passed to the reducer's ``fit_transform``.
        labels: Array combined as ``(labels * (1, 2)).sum(1)`` to give one
            integer colour code per point.
            NOTE(review): presumably a two-column 0/1 array; verify against
            the caller.
        path: Output file path.
        dpi: Resolution passed to ``savefig``.
        method: ``"svd"`` for TruncatedSVD or ``"tsne"`` for TSNE.

    Raises:
        ValueError: If ``method`` is neither ``"svd"`` nor ``"tsne"``.
    """
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    else:
        # Previously an unknown method fell through and failed later on the
        # unbound ``red`` variable.
        raise ValueError(
            f"unknown method {method!r}; expected 'svd' or 'tsne'")
    domain_reduced = red.fit_transform(domain_embedding)

    # TSNE has no explained_variance_ratio_; only report it when available.
    if hasattr(red, "explained_variance_ratio_"):
        print(red.explained_variance_ratio_)

    # use if draw subset of predictions
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma,
                s=3,
                alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path):
    """Render a Keras model's layer graph to an image file.

    Keras is imported lazily so the rest of the module works without it.

    Args:
        model: Keras model to draw.
        path: Output image path, passed as ``to_file``.
    """
    try:
        # Public import location.
        from keras.utils import plot_model
    except ImportError:
        # Legacy location, kept for Keras versions that only expose it there.
        from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)