diff --git a/dataset.py b/dataset.py
index d8965a0..bfbc38e 100644
--- a/dataset.py
+++ b/dataset.py
@@ -9,25 +9,28 @@ import numpy as np
 import pandas as pd
 from tqdm import tqdm

-logger = logging.getLogger('logger')
+logger = logging.getLogger('cisco_logger')

-chars = dict((char, idx + 1) for (idx, char) in
-             enumerate(string.ascii_lowercase + string.punctuation + string.digits))
+char2idx = dict((char, idx + 1) for (idx, char) in
+                enumerate(string.ascii_lowercase + string.punctuation + string.digits))
+
+idx2char = {v: k for k, v in char2idx.items()}


 def get_character_dict():
-    return chars
+    return char2idx


 def get_vocab_size():
-    return len(chars) + 1
+    return len(char2idx) + 1


 def encode_char(c):
-    if c in chars:
-        return chars[c]
-    else:
-        return 0
+    return char2idx.get(c, 0)
+
+
+def decode_char(i):
+    return idx2char.get(i, "")


 encode_char = np.vectorize(encode_char)
@@ -84,11 +87,12 @@ def get_user_chunks(user_flow, window=10):
     return result


-def get_domain_features(domain, vocab: dict, max_length=40):
+# TODO: DATA CORRUPTION; reverse, 0! to n
+def get_domain_features(domain, max_length=40):
     encoding = np.zeros((max_length,))
     for j in range(min(len(domain), max_length)):
         char = domain[-j]  # TODO: why -j -> order reversed for domain url?
-        encoding[j] = vocab.get(char, 0)
+        encoding[j] = encode_char(char)
     return encoding


@@ -99,13 +103,6 @@ def get_all_flow_features(features):
     return np.log1p(flows)


-def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
-    domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, char_dict,
-                                                                                   max_len, window_size)
-    domain, flow, name, client, server = filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server)
-    return domain, flow, name, client, server
-
-
 def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
     # select only 1.0 and 0.0 from training data
     pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
@@ -122,7 +119,7 @@ def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server
     return domain, flow, name, client, server


-def create_raw_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
+def create_raw_dataset_from_flows(user_flow_df, max_len, window_size=10):
     logger.info("get chunks from user data frames")
     with Pool() as pool:
         results = []
@@ -131,7 +128,6 @@ def create_raw_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=
     windows = [window for res in results for window in res.get()]
     logger.info("create training dataset")
     domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
-                                                                                 vocab=char_dict,
                                                                                  max_len=max_len)
     # make client labels discrete with 4 different values
     hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))
@@ -158,7 +154,7 @@ def load_h5dataset(path):
     return data


-def create_dataset_from_windows(chunks, vocab, max_len):
+def create_dataset_from_windows(chunks, max_len):
     """
     combines domain and feature windows to sequential training data
     :param chunks: list of flow feature windows
@@ -168,7 +164,7 @@ def create_dataset_from_windows(chunks, max_len):
     """

     def get_domain_features_reduced(d):
-        return get_domain_features(d[0], vocab, max_len)
+        return get_domain_features(d[0], max_len)

     logger.info(" compute domain features")
     domain_features = []
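A note on the two TODOs in `get_domain_features` above: a small standalone check (not part of the patch) makes the flagged corruption concrete. For `j = 0`, `domain[-j]` is `domain[0]`, so the first slot of the encoding holds the *first* character while the remaining slots hold the reversed tail; `domain[-(j + 1)]` would reverse cleanly. The snippet also verifies that the new `char2idx`/`idx2char` tables round-trip.

```python
import string

# Same tables as in the patch: index 0 stays reserved for unknown characters.
char2idx = dict((char, idx + 1) for (idx, char) in
                enumerate(string.ascii_lowercase + string.punctuation + string.digits))
idx2char = {v: k for k, v in char2idx.items()}

# decode_char(encode_char(c)) == c for every known character
assert all(idx2char[char2idx[c]] == c for c in char2idx)

domain = "example.com"
print([domain[-j] for j in range(4)])        # ['e', 'm', 'o', 'c'] -- j = 0 yields the FIRST char
print([domain[-(j + 1)] for j in range(4)])  # ['m', 'o', 'c', '.'] -- a clean reversal
```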
@@ -257,7 +253,6 @@ def load_or_generate_h5data(h5data, train_data, domain_length, window_size):

 def load_or_generate_raw_h5data(h5data, train_data, domain_length, window_size):
     h5data = h5data + "_raw"
-    char_dict = get_character_dict()
     logger.info(f"check for h5data {h5data}")
     try:
         check_h5dataset(h5data)
@@ -265,8 +260,8 @@ def load_or_generate_raw_h5data(h5data, train_data, domain_length, window_size):
         logger.info("h5 data not found - load csv file")
         user_flow_df = get_user_flow_data(train_data)
         logger.info("create raw training dataset")
-        domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, char_dict,
-                                                                                       domain_length, window_size)
+        domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, domain_length,
+                                                                                       window_size)
         logger.info("store raw training dataset as h5 file")
         data = {
             "domain": domain.astype(np.int8),
@@ -298,7 +293,6 @@ def generate_names(train_data, window_size):

 def load_or_generate_domains(train_data, domain_length):
     fn = f"{train_data}_domains.gz"
-    char_dict = get_character_dict()

     try:
         user_flow_df = pd.read_csv(fn)
@@ -317,10 +311,10 @@ def load_or_generate_domains(train_data, domain_length):

     user_flow_df.to_csv(fn, compression="gzip")

-    domain_encs = user_flow_df.domain.apply(lambda d: get_domain_features(d, char_dict, domain_length))
+    domain_encs = user_flow_df.domain.apply(lambda d: get_domain_features(d, domain_length))
     domain_encs = np.stack(domain_encs)

-    return domain_encs, user_flow_df[["serverLabel", "clientLabel"]].as_matrix().astype(bool)
+    return domain_encs, user_flow_df[["clientLabel", "serverLabel"]].as_matrix().astype(bool)


 def save_predictions(path, results):
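Two details in the last hunk deserve a note. The column swap to `[["clientLabel", "serverLabel"]]` matters because callers index the returned array positionally (main.py below reads `labels[:, 0]` as client and `labels[:, 1]` as server). Also, `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0; a sketch of the equivalent on newer pandas, using a toy frame:

```python
import pandas as pd

user_flow_df = pd.DataFrame({"clientLabel": [1, 0], "serverLabel": [0, 1]})  # toy rows
labels = user_flow_df[["clientLabel", "serverLabel"]].to_numpy().astype(bool)
client, server = labels[:, 0], labels[:, 1]  # positions now match the names
```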
diff --git a/fancy.sh b/fancy.sh
index 30cc4ff..966a25c 100644
--- a/fancy.sh
+++ b/fancy.sh
@@ -5,25 +5,26 @@ N2=$2
 RESDIR=$3
 DATADIR=$4

-for ((i = ${N1}; i <= ${N2}; i++))
-do
-    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
-    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
-    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
-    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
-done
+#for ((i = ${N1}; i <= ${N2}; i++))
+#do
+#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
+#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
+#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
+#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
+#done
+#
+#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
+#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
+#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
+#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered

-python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
-python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
-python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
-python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
+#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
+#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
+#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
+#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
+#python3 main.py --mode all_beta --out-prefix ${RESDIR}/both_staggered

-python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
-python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
-python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
-python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
-
-python3 main.py --mode embedding --batch 1024 --model ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix --model ${RESDIR}/client_final
-python3 main.py --mode embedding --batch 1024 --model ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix --model ${RESDIR}/both_final
-python3 main.py --mode embedding --batch 1024 --model ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix --model ${RESDIR}/both_inter
-python3 main.py --mode embedding --batch 1024 --model ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix --model ${RESDIR}/both_staggered
\ No newline at end of file
+python3 main.py --mode embedding --batch 1024 --models ${RESDIR}/client_final_{1..20}/ ${RESDIR}/both_final_{1..20}/ \
+    ${RESDIR}/both_inter_{1..20}/ ${RESDIR}/both_staggered_{1..20}/ \
+    --data ${DATADIR} \
+    --out-prefix ${RESDIR}/figs/tsne/tsne
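The switch from globs (`client_final_*/`) to brace expansion (`client_final_{1..20}/`) changes semantics: a glob silently matches whatever directories happen to exist, while brace expansion always yields exactly runs 1 through 20, whether or not they exist on disk. A hypothetical Python equivalent of the two argument lists, with `results/` standing in for `${RESDIR}`:

```python
from glob import glob

explicit = [f"results/client_final_{i}/" for i in range(1, 21)]  # like {1..20}
matched = sorted(glob("results/client_final_*/"))                # like the old glob
```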
diff --git a/main.py b/main.py
index 6a2efe2..cbe74d2 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 import logging
 import os

+import joblib
 import numpy as np
 import pandas as pd
 import tensorflow as tf
@@ -15,35 +16,37 @@ import models
 # create logger
 import visualize
 from arguments import get_model_args
+from server import test_server_only, train_server_only
 from utils import exists_or_make_path, get_custom_class_weights, get_custom_sample_weights, load_model

-logger = logging.getLogger('logger')
+logger = logging.getLogger('cisco_logger')
 logger.setLevel(logging.DEBUG)
+logger.propagate = False

 # create console handler and set level to debug
 ch = logging.StreamHandler()
 ch.setLevel(logging.DEBUG)

 # create formatter
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+formatter1 = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

 # add formatter to ch
-ch.setFormatter(formatter)
+ch.setFormatter(formatter1)

 # add ch to logger
 logger.addHandler(ch)

-ch = logging.FileHandler("info.log")
-ch.setLevel(logging.DEBUG)
-
-# create formatter
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-# add formatter to ch
-ch.setFormatter(formatter)
-
-# add ch to logger
-logger.addHandler(ch)
+# ch = logging.FileHandler("info.log")
+# ch.setLevel(logging.DEBUG)
+#
+# # create formatter
+# formatter2 = logging.Formatter('!! %(asctime)s - %(name)s - %(levelname)s - %(message)s')
+#
+# # add formatter to ch
+# ch.setFormatter(formatter2)
+#
+# # add ch to logger
+# logger.addHandler(ch)

 args = arguments.parse()
@@ -100,7 +103,7 @@ def main_hyperband():
         "flow_features": [3],
         "domain_length": [args.domain_length],
         # model params
-        "embedding_size": [2 ** x for x in range(3, 7)],
+        "embedding": [2 ** x for x in range(3, 7)],
         "filter_embedding": [2 ** x for x in range(1, 10)],
         "kernel_embedding": [1, 3, 5, 7, 9],
         "dense_embedding": [2 ** x for x in range(4, 10)],
@@ -119,6 +122,12 @@ def main_hyperband():

     if args.model_type in ("inter", "staggered"):
         server_tr = np.expand_dims(server_windows_tr, 2)
+
+    idx = np.random.permutation(len(domain_tr))
+    domain_tr = domain_tr[idx]
+    flow_tr = flow_tr[idx]
+    client_tr = client_tr[idx]
+    server_tr = server_tr[idx]

     hp = hyperband.Hyperband(param_dist,
                              [domain_tr, flow_tr],
@@ -337,8 +346,8 @@ def main_test():
 def main_visualization():
     def plot_model(clf_model, path):
         embd, model = load_model(clf_model, custom_objects=models.get_custom_objects())
-        visualize.plot_model_as(embd, os.path.join(path, "model_embd.pdf"))
-        visualize.plot_model_as(model, os.path.join(path, "model_clf.pdf"))
+        visualize.plot_model_as(embd, os.path.join(path, "model_embd.pdf"), shapes=False)
+        visualize.plot_model_as(model, os.path.join(path, "model_clf.pdf"), shapes=False)

     def vis(model_name, model_path, df, df_paul, aggregation, curve):
         visualize.plot_clf()
@@ -411,13 +420,15 @@ def main_visualization():
     visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
                                     "{}/user_cov_norm.pdf".format(args.model_path),
                                     normalize=True, title="User Confusion Matrix")
-    plot_embedding(args.model_path, results["domain_embds"], args.data, args.domain_length)
-
-
-def plot_embedding(model_path, domain_embedding, data, domain_length):
-    logger.info("visualize embedding")
-    domain_encs, labels = dataset.load_or_generate_domains(data, domain_length)
-    visualize.plot_embedding(domain_embedding, labels, path="{}/embd_svd.png".format(model_path), method="svd")
+#     plot_embedding(args.model_path, results["domain_embds"], args.data, args.domain_length)
+
+
+# def plot_embedding(model_path, domain_embedding, data, domain_length):
+#     logger.info("visualize embedding")
+#     domain_encs, labels = dataset.load_or_generate_domains(data, domain_length)
+#     visualize.plot_embedding(domain_embedding, labels, path="{}/embd_svd.png".format(model_path), method="svd")


 def main_visualize_all():
@@ -477,11 +488,7 @@ def main_visualize_all():


 def main_visualize_all_embds():
-    import matplotlib.pyplot as plt
-    _, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
-                                                                                            args.data,
-                                                                                            args.domain_length,
-                                                                                            args.window)
+    import seaborn as sns

     def load_df(path):
         res = dataset.load_predictions(path)
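The shuffle added to `main_hyperband` above works because every aligned array is indexed with the same permutation, so windows stay matched with their labels. A self-contained toy version of the technique:

```python
import numpy as np

domain_tr = np.arange(10)  # toy stand-ins for the aligned training arrays
client_tr = np.arange(10)

idx = np.random.permutation(len(domain_tr))  # one permutation for all arrays
domain_tr, client_tr = domain_tr[idx], client_tr[idx]
assert (domain_tr == client_tr).all()        # rows remain aligned after the shuffle
```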
@@ -489,42 +496,50 @@ def main_visualize_all_embds():
     dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]

-    plt.clf()
+    from sklearn.manifold import TSNE

-    from sklearn.decomposition import TruncatedSVD
-
-    def vis(ax, domain_embedding, labels):
-        red = TruncatedSVD(n_components=2)
-        # use if draw subset of predictions
-        idx = np.random.choice(np.arange(len(domain_embedding)), 5000)
-        domain_embedding = domain_embedding[idx]
-        labels = labels[idx]
-        domain_reduced = red.fit_transform(domain_embedding)
-        ax.scatter(domain_reduced[:, 0],
-                   domain_reduced[:, 1],
-                   c=(labels * (1, 2)).sum(1).astype(int),
-                   cmap=plt.cm.plasma,
-                   s=3,
-                   alpha=0.1)
+    def vis2(domain_embedding, labels):
+        n_levels = 7
+        logger.info(f"reduction for {sub_sample} of {len(domain_embedding)} points")
+        red = TSNE(n_components=2)
+        domains = red.fit_transform(domain_embedding)
+        logger.info("plot kde")
+        sns.kdeplot(domains[labels.sum(axis=1) == 0, 0], domains[labels.sum(axis=1) == 0, 1],
+                    cmap="Blues", label="benign", n_levels=9, alpha=0.45, shade=True, shade_lowest=False)
+        sns.kdeplot(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
+                    cmap="Greens", label="server", n_levels=5, alpha=0.45, shade=True, shade_lowest=False)
+        sns.kdeplot(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
+                    cmap="Reds", label="client", n_levels=5, alpha=0.45, shade=True, shade_lowest=False)

     domain_encs, labels = dataset.load_or_generate_domains(args.data, args.domain_length)

-    fig, axes = plt.subplots(nrows=5, ncols=4)
+    idx = np.arange(len(labels))
+    client = labels[:, 0]
+    server = labels[:, 1]
+    benign = np.logical_not(np.logical_and(client, server))
+    print(client.sum(), server.sum(), benign.sum())

-    for (model_name, embd), ax in zip(dfs, axes.flat):
+    idx = np.concatenate((
+        np.random.choice(idx[client], 1000),
+        np.random.choice(idx[server], 1000),
+        np.random.choice(idx[benign], 6000)), axis=0)
+
+    print(idx.shape)
+    lls = labels[idx]
+
+    for model_name, embd in dfs:
         logger.info(f"plot embedding for {model_name}")
-        vis(ax, embd, labels)
-
-    visualize.plot_save("{}_svd.png".format(args.output_prefix, 600))
-
-import joblib
+        visualize.plot_clf()
+        embd = embd[idx]
+        vis2(embd, lls)
+        visualize.plot_save("{}_{}.pdf".format(args.output_prefix, model_name))


 def main_beta():
-    _, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
-                                                                                            args.data,
-                                                                                            args.domain_length,
-                                                                                            args.window)
+    domain_val, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
+                                                                                                     args.data,
+                                                                                                     args.domain_length,
+                                                                                                     args.window)
     path, model_prefix = os.path.split(os.path.normpath(args.output_prefix))
     try:
         results = joblib.load(f"{path}/curves.joblib")
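`vis2` above layers three KDE plots over one shared 2-D t-SNE projection, one per label group. (Note that `sub_sample` in its first log call is not defined anywhere in this hunk.) A minimal sketch of the idea on random data, assuming the pre-0.11 seaborn API that the patch targets (`kdeplot` with two positional arrays and `shade=`/`shade_lowest=`):

```python
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE

embedding = np.random.randn(300, 16)               # toy embedding vectors
xy = TSNE(n_components=2).fit_transform(embedding)
sns.kdeplot(xy[:, 0], xy[:, 1], cmap="Blues",
            shade=True, shade_lowest=False)        # one such layer per label group
```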
@@ -532,82 +547,101 @@
         results = {}
     results[model_prefix] = {"all": {}}

+    domains = domain_val.value.reshape(-1, 40)
+    domains = np.apply_along_axis(lambda d: "".join(map(dataset.decode_char, d)), 1, domains)
+
     def load_df(path):
+        df_server = None
         res = dataset.load_predictions(path)
-        res = pd.DataFrame(data={
+        data = {
             "names": name_val, "client_pred": res["client_pred"].flatten(),
-            "hits_vt": hits_vt, "hits_trusted": hits_trusted
-        })
+            "hits_vt": hits_vt, "hits_trusted": hits_trusted,
+        }
+        if "server_pred" in res:
+            print(res["server_pred"].shape, server_val.value.shape)
+            server = res["server_pred"] if len(res["server_pred"].shape) == 2 else res["server_pred"].max(axis=1)
+            val = server_val.value.max(axis=1)
+            data["server_pred"] = server.flatten()
+            data["server_val"] = val.flatten()
+
+            if res["server_pred"].flatten().shape == server_val.value.flatten().shape:
+                df_server = pd.DataFrame(data={
+                    "server_pred": res["server_pred"].flatten(),
+                    "domain": domains,
+                    "server_val": server_val.value.flatten()
+                })
+
+        res = pd.DataFrame(data=data)
         res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
-        return res
-
-    paul = dataset.load_predictions("results/paul/")
-    df_paul = pd.DataFrame(data={
-        "names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
-        "hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
-    })
-    df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)
-    df_paul_user = df_paul.groupby(df_paul.names).max()
+        return res, df_server

-    logger.info("plot pr curves")
-    visualize.plot_clf()
-    predictions = []
+    client_preds = []
+    server_preds = []
+    server_flow_preds = []
+    client_user_preds = []
+    server_user_preds = []
+    server_domain_preds = []
+    server_domain_avg_preds = []
     for model_args in get_model_args(args):
-        df = load_df(model_args["model_path"])
-        predictions.append(df.client_pred.as_matrix())
+        df, df_server = load_df(model_args["model_path"])
+        client_preds.append(df.client_pred.as_matrix())
+        if "server_val" in df.columns:
+            server_preds.append(df.server_pred.as_matrix())
+        if df_server is not None:
+            server_flow_preds.append(df_server.server_pred.as_matrix())
+            df_domain = df_server.groupby(df_server.domain).max()
+            server_domain_preds.append(df_domain.server_pred.as_matrix())
+            df_domain_avg = df_server.groupby(df_server.domain).rolling(10).mean()
+            server_domain_avg_preds.append(df_domain_avg.server_pred.as_matrix())
+
         results[model_prefix][model_args["model_name"]] = confusion_matrix(df.client_val.as_matrix(),
                                                                            df.client_pred.as_matrix().round())
-    results[model_prefix]["all"]["window_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), predictions)
-    visualize.plot_pr_mean(df.client_val.as_matrix(), predictions, "mean")
-    visualize.plot_pr_mean(df_paul.client_val.as_matrix(), [df_paul.client_pred.as_matrix()], "paul")
-    visualize.plot_legend()
-    visualize.plot_save(f"{args.output_prefix}_window_client_prc_all.pdf")
+        df_user = df.groupby(df.names).max()
+        client_user_preds.append(df_user.client_pred.as_matrix())
+        if "server_val" in df.columns:
+            server_user_preds.append(df_user.server_pred.as_matrix())

-    logger.info("plot roc curves")
-    visualize.plot_clf()
-    predictions = []
-    for model_args in get_model_args(args):
-        df = load_df(model_args["model_path"])
-        predictions.append(df.client_pred.as_matrix())
-        results[model_prefix][model_args["model_name"]] = confusion_matrix(df.client_val.as_matrix(),
-                                                                           df.client_pred.as_matrix().round())
-    results[model_prefix]["all"]["window_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), predictions)
-    visualize.plot_roc_mean(df.client_val.as_matrix(), predictions, "mean")
-    visualize.plot_roc_mean(df_paul.client_val.as_matrix(), [df_paul.client_pred.as_matrix()], "paul")
-    visualize.plot_legend()
-    visualize.plot_save(f"{args.output_prefix}_window_client_roc_all.pdf")
+    logger.info("plot client curves")
+    results[model_prefix]["all"]["client_window_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), client_preds)
+    results[model_prefix]["all"]["client_window_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), client_preds)
+    results[model_prefix]["all"]["client_user_prc"] = visualize.calc_pr_mean(df_user.client_val.as_matrix(),
+                                                                             client_user_preds)
+    results[model_prefix]["all"]["client_user_roc"] = visualize.calc_roc_mean(df_user.client_val.as_matrix(),
+                                                                              client_user_preds)

-    logger.info("plot user pr curves")
-    visualize.plot_clf()
-    predictions = []
-    for model_args in get_model_args(args):
-        df = load_df(model_args["model_path"])
-        df = df.groupby(df.names).max()
-        predictions.append(df.client_pred.as_matrix())
-        results[model_prefix][model_args["model_name"]] = confusion_matrix(df.client_val.as_matrix(),
-                                                                           df.client_pred.as_matrix().round())
-    results[model_prefix]["all"]["user_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), predictions)
-    visualize.plot_pr_mean(df.client_val.as_matrix(), predictions, "mean")
-    visualize.plot_pr_mean(df_paul_user.client_val.as_matrix(), [df_paul_user.client_pred.as_matrix()], "paul")
-    visualize.plot_legend()
-    visualize.plot_save(f"{args.output_prefix}_user_client_prc_all.pdf")
+    if "server_val" in df.columns:
+        logger.info("plot server curves")
+        results[model_prefix]["all"]["server_window_prc"] = visualize.calc_pr_mean(df.server_val.as_matrix(),
+                                                                                   server_preds)
+        results[model_prefix]["all"]["server_window_roc"] = visualize.calc_roc_mean(df.server_val.as_matrix(),
+                                                                                    server_preds)
+        results[model_prefix]["all"]["server_user_prc"] = visualize.calc_pr_mean(df_user.server_val.as_matrix(),
+                                                                                 server_user_preds)
+        results[model_prefix]["all"]["server_user_roc"] = visualize.calc_roc_mean(df_user.server_val.as_matrix(),
+                                                                                  server_user_preds)

-    logger.info("plot user roc curves")
-    visualize.plot_clf()
-    predictions = []
-    for model_args in get_model_args(args):
-        df = load_df(model_args["model_path"])
-        df = df.groupby(df.names).max()
-        predictions.append(df.client_pred.as_matrix())
-    results[model_prefix]["all"]["user_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), predictions)
-    visualize.plot_roc_mean(df.client_val.as_matrix(), predictions, "mean")
-    visualize.plot_roc_mean(df_paul_user.client_val.as_matrix(), [df_paul_user.client_pred.as_matrix()], "paul")
-    visualize.plot_legend()
-    visualize.plot_save(f"{args.output_prefix}_user_client_roc_all.pdf")
+    if df_server is not None:
+        logger.info("plot server flow curves")
+        results[model_prefix]["all"]["server_flow_prc"] = visualize.calc_pr_mean(df_server.server_val.as_matrix(),
+                                                                                 server_flow_preds)
+        results[model_prefix]["all"]["server_flow_roc"] = visualize.calc_roc_mean(df_server.server_val.as_matrix(),
+                                                                                  server_flow_preds)
+        results[model_prefix]["all"]["server_domain_prc"] = visualize.calc_pr_mean(df_domain.server_val.as_matrix(),
+                                                                                   server_domain_preds)
+        results[model_prefix]["all"]["server_domain_roc"] = visualize.calc_roc_mean(df_domain.server_val.as_matrix(),
+                                                                                    server_domain_preds)
+        results[model_prefix]["all"]["server_domain_avg_prc"] = visualize.calc_pr_mean(
+            df_domain_avg.server_val.as_matrix(),
+            server_domain_avg_preds)
+        results[model_prefix]["all"]["server_domain_avg_roc"] = visualize.calc_roc_mean(
+            df_domain_avg.server_val.as_matrix(),
+            server_domain_avg_preds)

     joblib.dump(results, f"{path}/curves.joblib")

-    plot_overall_result()
+    # plot_overall_result()


 def plot_overall_result():
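The aggregation steps in this hunk roll window-level scores up to coarser units: `groupby(names).max()` keeps one worst-case score per user, and `groupby(domain).max()` does the same per domain. Note that `groupby(...).rolling(10).mean()` returns a MultiIndexed frame (one row per original row, not one per domain), so `df_domain_avg` is shaped differently from `df_domain`. A toy illustration of the user-level rollup:

```python
import pandas as pd

df = pd.DataFrame({"names": ["u1", "u1", "u2"],
                   "client_pred": [0.2, 0.9, 0.4]})
df_user = df.groupby(df.names).max()   # one row per user, max window score
print(df_user.client_pred.tolist())    # [0.9, 0.4]
```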
@@ -619,12 +653,19 @@
     import matplotlib.pyplot as plt

     x = np.linspace(0, 1, 10000)
-    for vis in ["window_prc", "window_roc", "user_prc", "user_roc"]:
+    for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
+                "server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
+                "server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc",
+                "server_domain_avg_prc", "server_domain_avg_roc"]:
         logger.info(f"plot {vis}")
         visualize.plot_clf()
         for model_key in results.keys():
-            ys_mean, ys_std, score = results[model_key]["all"][vis]
-            plt.plot(x, ys_mean, label=f"{model_key} - {score:5.4}")
+            if vis not in results[model_key]["all"]:
+                continue
+            if "final" in model_key and vis.startswith("server_flow"):
+                continue
+            ys_mean, ys_std, ys = results[model_key]["all"][vis]
+            plt.plot(x, ys_mean, label=f"{model_key} - {np.mean(ys_mean):5.4} ({np.mean(ys_std):4.3})")
             plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2)
         if vis.endswith("prc"):
             plt.xlabel('Recall')
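The mean-curve plot in this loop reduces to a simple pattern: stack per-run curves sampled on one grid, then draw the mean with a ±1 std band. A standalone sketch with toy curves:

```python
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(0, 1, 10000)
ys = np.vstack([x ** (1 + 0.2 * i) for i in range(5)])  # toy per-run curves
ys_mean, ys_std = ys.mean(axis=0), ys.std(axis=0)

plt.plot(x, ys_mean, label=f"mean - {np.mean(ys_mean):5.4}")
plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2)
```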
"server_domain_roc", + "server_domain_avg_prc", "server_domain_avg_roc"]: logger.info(f"plot {vis}") visualize.plot_clf() for model_key in results.keys(): - ys_mean, ys_std, score = results[model_key]["all"][vis] - plt.plot(x, ys_mean, label=f"{model_key} - {score:5.4}") + if vis not in results[model_key]["all"]: + continue + if "final" in model_key and vis.startswith("server_flow"): + continue + ys_mean, ys_std, ys = results[model_key]["all"][vis] + plt.plot(x, ys_mean, label=f"{model_key} - {np.mean(ys_mean):5.4} ({np.mean(ys_std):4.3})") plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2) if vis.endswith("prc"): plt.xlabel('Recall') @@ -632,124 +673,37 @@ def plot_overall_result(): else: plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') + plt.xscale('log') plt.ylim([0.0, 1.0]) plt.xlim([0.0, 1.0]) visualize.plot_legend() - visualize.plot_save(f"{path}/{vis}_all.pdf") + visualize.plot_save(f"{path}/figs/curves/{vis}_all.pdf") - for cat, models in results.items(): + for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc", + "server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc", + "server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc", + "server_domain_avg_prc", "server_domain_avg_roc"]: + logger.info(f"plot {vis}") visualize.plot_clf() - visualize.plot_error_bars(models) - visualize.plot_legend() - visualize.plot_save(f"{path}/error_bars_{cat}.pdf") - - -def train_server_only(): - logger.info(f"Create model path {args.model_path}") - exists_or_make_path(args.model_path) - logger.info(f"Use command line arguments: {args}") - - domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data, - args.data, - args.domain_length, - args.window) - domain_tr = domain_tr.value.reshape(-1, 40) - flow_tr = flow_tr.value.reshape(-1, 3) - server_tr = server_windows_tr.value.reshape(-1) - - logger.info("define callbacks") - callbacks = [] - callbacks.append(ModelCheckpoint(filepath=args.clf_model, - monitor='loss', - verbose=False, - save_best_only=True)) - callbacks.append(CSVLogger(args.train_log)) - logger.info(f"Use early stopping: {args.stop_early}") - if args.stop_early: - callbacks.append(EarlyStopping(monitor='val_loss', - patience=5, - verbose=False)) - custom_metrics = models.get_metric_functions() - - model = models.get_server_model_by_params(params=PARAMS) - - features = {"ipt_domains": domain_tr, "ipt_flows": flow_tr} - if args.model_output == "both": - labels = {"client": client_tr, "server": server_tr} - elif args.model_output == "client": - labels = {"client": client_tr} - elif args.model_output == "server": - labels = {"server": server_tr} - else: - raise ValueError("unknown model output") - - logger.info("compile and train model") - logger.info(model.get_config()) - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy'] + custom_metrics) - - model.summary() - model.fit(features, labels, - batch_size=args.batch_size, - epochs=args.epochs, - callbacks=callbacks) - - -def test_server_only(): - logger.info("start test: load data") - domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, - args.data, - args.domain_length, - args.window) - domain_val = domain_val.value.reshape(-1, 40) - flow_val = flow_val.value.reshape(-1, 3) - domain_encs, _ = dataset.load_or_generate_domains(args.data, args.domain_length) - - for model_args in get_model_args(args): - results = {} - 
logger.info(f"process model {model_args['model_path']}") - embd_model, clf_model = load_model(model_args["clf_model"], custom_objects=models.get_custom_objects()) - - pred = clf_model.predict([domain_val, flow_val], - batch_size=args.batch_size, - verbose=1) - - results["server_pred"] = pred - - domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1) - results["domain_embds"] = domain_embeddings - - dataset.save_predictions(model_args["model_path"], results) - - -def vis_server(): - def load_model(m, c): - from keras.models import load_model - clf = load_model(m, custom_objects=c) - emdb = clf.layers[1] - return emdb, clf - - domain_raw, flow_raw, name_raw, hits_vt_raw, hits_trusted_raw, server_raw = dataset.load_or_generate_raw_h5data( - args.data, - args.data, - args.domain_length, - args.window) - - results = dataset.load_predictions(args.clf_model) - - visualize.plot_clf() - visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server") - visualize.plot_legend() - visualize.plot_save("results/server_model/windows_prc.pdf") - visualize.plot_clf() - visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server") - visualize.plot_legend() - visualize.plot_save("results/server_model/windows_prc.pdf") - visualize.plot_clf() - visualize.plot_roc_curve(server_raw.flatten(), results["server_pred"].flatten(), "server") - visualize.plot_legend() - visualize.plot_save("results/server_model/windows_roc.pdf") + for model_key in results.keys(): + if vis not in results[model_key]["all"]: + continue + if "final" in model_key and vis.startswith("server_flow"): + continue + _, _, ys = results[model_key]["all"][vis] + for y in ys: + plt.plot(x, y, label=f"{model_key} - {np.mean(y):5.4}") + if vis.endswith("prc"): + plt.xlabel('Recall') + plt.ylabel('Precision') + else: + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.xscale('log') + plt.ylim([0.0, 1.0]) + plt.xlim([0.0, 1.0]) + visualize.plot_legend() + visualize.plot_save(f"{path}/figs/appendix/{model_key}_{vis}.pdf") def main(): diff --git a/visualize.py b/visualize.py index 3e35c05..bc96167 100644 --- a/visualize.py +++ b/visualize.py @@ -3,6 +3,7 @@ import os import matplotlib.pyplot as plt import numpy as np import pandas as pd +import seaborn as sns from scipy import interpolate from sklearn.decomposition import TruncatedSVD from sklearn.manifold import TSNE @@ -36,13 +37,17 @@ def scores(y_true): def plot_clf(): plt.clf() + sns.set_context("paper") + sns.set_style("white") -def plot_save(path, dpi=300): - plt.title(path) +def plot_save(path, dpi=600, set_size=True): + # plt.title(path) fig = plt.gcf() - fig.set_size_inches(18.5, 10.5) - fig.savefig(path, dpi=dpi) + # fig.suptitle(path) + if set_size: + fig.set_size_inches(8, 4.5) + fig.savefig(path, dpi=dpi, bbox_inches='tight') plt.close() @@ -73,21 +78,7 @@ def plot_precision_recall(y, y_pred, label=""): def calc_pr_mean(y, y_preds): - appr = [] - scores = [] - y = y.flatten() - - for idx, y_pred in enumerate(y_preds): - y_pred = y_pred.flatten() - precision, recall, thresholds = precision_recall_curve(y, y_pred) - appr.append(interpolate.interp1d(recall, precision)) - scores.append(fbeta_score(y, y_pred.round(), 1)) - x = np.linspace(0, 1, 10000) - ys = np.vstack([f(x) for f in appr]) - ys_mean = ys.mean(axis=0) - ys_std = ys.std(axis=0) - scores_mean = np.mean(scores) - return ys_mean, ys_std, scores_mean + return calc_metrics_mean(y, y_preds, "prc") def 
@@ -131,22 +122,26 @@ def plot_roc_curve(mask, prediction, label=""):
     plt.ylabel('True Positive Rate')


-def calc_roc_mean(y, y_preds):
+def calc_metrics_mean(y, y_preds, metric):
     appr = []
-    aucs = []
     y = y.flatten()
-
     for idx, y_pred in enumerate(y_preds):
         y_pred = y_pred.flatten()
-        fpr, tpr, thresholds = roc_curve(y, y_pred)
-        appr.append(interpolate.interp1d(fpr, tpr))
-        aucs.append(auc(fpr, tpr))
+        if metric == "prc":
+            precision, recall, thresholds = precision_recall_curve(y, y_pred)
+            appr.append(interpolate.interp1d(recall, precision))
+        elif metric == "roc":
+            fpr, tpr, thresholds = roc_curve(y, y_pred)
+            appr.append(interpolate.interp1d(fpr, tpr))
     x = np.linspace(0, 1, 10000)
     ys = np.vstack([f(x) for f in appr])
     ys_mean = ys.mean(axis=0)
     ys_std = ys.std(axis=0)
-    auc_mean = np.mean(aucs)
-    return ys_mean, ys_std, auc_mean
+    return ys_mean, ys_std, ys
+
+
+def calc_roc_mean(y, y_preds):
+    return calc_metrics_mean(y, y_preds, "roc")


 def plot_roc_mean(y, y_preds, label=""):
@@ -243,6 +238,6 @@ def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
     plt.savefig(path, dpi=dpi)


-def plot_model_as(model, path):
+def plot_model_as(model, path, shapes=True, layer_names=True):
     from keras.utils.vis_utils import plot_model
-    plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)
+    plot_model(model, to_file=path, show_shapes=shapes, show_layer_names=layer_names)
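Finally, a usage sketch for the extended `plot_model_as` (the call sites and toy model are hypothetical; keras' `plot_model` additionally requires pydot and graphviz to be installed):

```python
from keras.models import Sequential
from keras.layers import Dense

from visualize import plot_model_as  # the function patched above

model = Sequential([Dense(1, input_shape=(3,))])   # toy model for illustration
plot_model_as(model, "model_default.pdf")              # shapes and layer names on
plot_model_as(model, "model_plain.pdf", shapes=False)  # what main_visualization now passes
```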