# TODO: implement csv loading if already generated
def load_or_generate_domains(train_data, domain_length):
    """Build domain encodings and the matching (serverLabel, clientLabel) matrix.

    Args:
        train_data: Forwarded unchanged to ``get_user_flow_data`` to load
            the user-flow table.
        domain_length: Forwarded to ``get_domain_features`` for every domain.

    Returns:
        A tuple ``(domain_encs, labels)``. ``domain_encs`` is the ``np.stack``
        of ``get_domain_features`` results, one entry per retained row.
        ``labels`` holds the ``serverLabel`` and ``clientLabel`` columns of
        the same rows, in the same order.
    """
    char_dict = get_character_dict()
    user_flow_df = get_user_flow_data(train_data)

    # Drop incomplete rows BEFORE encoding the domains.  Encoding first (as
    # was done previously) yields more encodings than labels as soon as one
    # row has a missing value, and hands NaN domains to the encoder.
    columns = ["domain", "serverLabel", "trustedHits", "virusTotalHits"]
    user_flow_df = user_flow_df[columns].dropna(axis=0, how="any")
    # drop=True: the old index is not needed and must not become a column.
    user_flow_df = user_flow_df.reset_index(drop=True)

    domain_encs = np.stack([
        get_domain_features(domain, char_dict, domain_length)
        for domain in user_flow_df.domain
    ])

    # A flow is labelled positive (1.0) when it has any trusted hit or at
    # least three VirusTotal hits, otherwise 0.0.
    is_positive = np.logical_or(user_flow_df.trustedHits > 0,
                                user_flow_df.virusTotalHits >= 3)
    user_flow_df["clientLabel"] = np.where(is_positive, 1.0, 0.0)

    # NOTE(review): an earlier revision called
    # ``user_flow_df.groupby(user_flow_df.domain).mean()`` here and discarded
    # the result, so no per-domain aggregation ever took place.  The dead
    # call is removed; output stays one row per flow.  Confirm whether
    # aggregation per unique domain is actually wanted.

    # ``.values`` replaces ``DataFrame.as_matrix()``, which is deprecated and
    # absent from current pandas releases.
    return domain_encs, user_flow_df[["serverLabel", "clientLabel"]].values
def plot_embedding(domain_embedding, labels, path, dpi=600):
    """Scatter-plot a 2-component PCA projection of an embedding and save it.

    The explained variance ratio of the two components is printed to stdout.

    Args:
        domain_embedding: Samples to project; passed to ``PCA.fit_transform``.
        labels: Array-like with two columns per sample.  The columns are
            weighted by 1 and 2 and summed, giving one integer colour code
            per sample (presumably serverLabel / clientLabel -- confirm
            against the caller).
        path: Destination handed to ``plt.savefig``.
        dpi: Resolution handed to ``plt.savefig``.
    """
    pca = PCA(n_components=2)
    domain_reduced = pca.fit_transform(domain_embedding)
    print(pca.explained_variance_ratio_)

    # Coerce to an ndarray first so plain lists/tuples of label pairs work;
    # ``list * tuple`` would raise a TypeError.
    color_codes = (np.asarray(labels) * (1, 2)).sum(1).astype(int)

    # To draw only a subset of the points, sample row indices with e.g.
    # np.random.choice(np.arange(len(domain_reduced)), 10000) and index both
    # domain_reduced and color_codes with them.

    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=color_codes,
                cmap=plt.cm.plasma,
                s=3)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)
    # Release the figure so later plots start clean and memory is freed,
    # matching the other plotting helpers in this module.
    plt.close()