Compare commits

No commits in common. "dev" and "master" have entirely different histories.
dev ... master

20 changed files with 300 additions and 2470 deletions

.gitignore
View File

@@ -99,8 +99,4 @@ ENV/
*.tif
*.joblib
*.csv
*.csv.gz
*.csv.tar.*
*.h5
*.npy
*.png
*.csv.gz

View File

@@ -1,60 +1,3 @@
run:
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_client --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output client --runs 1
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_final --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output both --runs 1
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_inter --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type inter --model_output both --runs 1
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_soft --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type soft --model_output both --runs 1
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_long --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type long --model_output both --runs 1
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_staggered --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type staggered --model_output both --runs 1

test:
	python3 main.py --mode test --batch 128 --models results/test/test_both_* --data data/rk_mini.csv.gz --model_output both
	python3 main.py --mode test --batch 128 --models results/test/test_client_* --data data/rk_mini.csv.gz --model_output client
	python3 main.py --epochs 1 --batch 64

fancy:
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_1 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_2 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_3 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_4 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_5 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_client_1 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_client_2 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_client_3 --data data/rk_mini.csv.gz
	python3 main.py --mode fancy --batch 128 --model results/test/test_client_4 --data data/rk_mini.csv.gz

all-fancy:
	python3 main.py --mode all_fancy --batch 128 --models results/test/test* --data data/rk_mini.csv.gz \
		--out-prefix results/test/

hyper:
	python3 main.py --mode hyperband --batch 64 --train data/rk_data.csv.gz

clean:
	rm -r results/test/
	rm data/rk_mini.csv.gz_raw.h5
	rm data/rk_mini.csv.gz.h5

View File

@@ -1,146 +0,0 @@
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument("--mode", action="store", dest="mode",
default="")
# parser.add_argument("--train", action="store", dest="train_data",
# default="data/full_dataset.csv.tar.gz")
parser.add_argument("--data", action="store", dest="data",
default="data/full_dataset.csv.tar.gz")
# parser.add_argument("--test", action="store", dest="test_data",
# default="data/full_future_dataset.csv.tar.gz")
parser.add_argument("--hyper_result", action="store", dest="hyperband_results",
default="")
parser.add_argument("--model", action="store", dest="model_path",
default="results/model_x")
parser.add_argument("--model_src", action="store", dest="model_source",
default="results/model_x")
parser.add_argument("--model_dest", action="store", dest="model_destination",
default="results/model_x")
parser.add_argument("--models", action="store", dest="model_paths", nargs="+",
default=[])
parser.add_argument("--type", action="store", dest="model_type",
default="final")
parser.add_argument("--embd_type", action="store", dest="embedding_type",
default="small")
# parser.add_argument("--depth", action="store", dest="model_depth",
# default="flat1")
parser.add_argument("--model_output", action="store", dest="model_output",
default="both")
parser.add_argument("--batch", action="store", dest="batch_size",
default=64, type=int)
parser.add_argument("--epochs", action="store", dest="epochs",
default=10, type=int)
parser.add_argument("--init_epoch", action="store", dest="initial_epoch",
default=0, type=int)
parser.add_argument("--runs", action="store", dest="runs",
default=20, type=int)
parser.add_argument("--hyper_max_iter", action="store", dest="hyper_max_iter",
default=81, type=int)
# parser.add_argument("--samples", action="store", dest="samples",
# default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
# default=10000, type=int)
#
parser.add_argument("--embd", action="store", dest="embedding",
default=128, type=int)
parser.add_argument("--filter_embd", action="store", dest="filter_embedding",
default=128, type=int)
parser.add_argument("--dense_embd", action="store", dest="dense_embedding",
default=128, type=int)
parser.add_argument("--kernel_embd", action="store", dest="kernel_embedding",
default=3, type=int)
parser.add_argument("--filter_main", action="store", dest="filter_main",
default=128, type=int)
parser.add_argument("--dense_main", action="store", dest="dense_main",
default=128, type=int)
parser.add_argument("--kernel_main", action="store", dest="kernel_main",
default=3, type=int)
parser.add_argument("--window", action="store", dest="window",
default=10, type=int)
parser.add_argument("--domain_length", action="store", dest="domain_length",
default=40, type=int)
parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
default=512, type=int)
parser.add_argument("--out-prefix", action="store", dest="output_prefix",
default="", type=str)
# parser.add_argument("--queue", action="store", dest="queue_size",
# default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
# default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
# default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
# default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
parser.add_argument("--stop_early", action="store_true", dest="stop_early")
parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
parser.add_argument("--gpu", action="store_true", dest="gpu")
parser.add_argument("--new_model", action="store_true", dest="new_model")
def get_model_args(args):
    return [{
        "model_path": model_path,
        "model_name": os.path.split(os.path.normpath(model_path))[1],
        "embedding_model": os.path.join(model_path, "embd.h5"),
        "clf_model": os.path.join(model_path, "clf.h5"),
        "train_log": os.path.join(model_path, "train.log.csv"),
        # "train_h5data": args.train_data,
        # "test_h5data": args.test_data,
        "future_prediction": os.path.join(model_path, f"{os.path.basename(args.data)}_pred")
    } for model_path in args.model_paths]

def parse():
    args = parser.parse_args()
    args.result_path = os.path.split(os.path.normpath(args.output_prefix))[1]
    args.model_name = os.path.split(os.path.normpath(args.model_path))[1]
    args.embedding_model = os.path.join(args.model_path, "embd.h5")
    args.clf_model = os.path.join(args.model_path, "clf.h5")
    args.train_log = os.path.join(args.model_path, "train.log.csv")
    # args.train_h5data = args.train_data
    # args.test_h5data = args.test_data
    args.future_prediction = os.path.join(args.model_path, f"{os.path.basename(args.data)}_pred")
    return args
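# Illustrative aside (not part of the diff): a minimal sketch of how this argument module is
# consumed; main.py imports it as "arguments", and the attributes printed here are only examples.
import arguments

args = arguments.parse()
# parse() also attaches derived paths: the embedding/classifier model files and the training log
print(args.batch_size, args.epochs)
print(args.embedding_model, args.clf_model, args.train_log)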

View File

@@ -1,297 +1,198 @@
# -*- coding: utf-8 -*-
import logging
import string
from multiprocessing import Pool
import h5py
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
logger = logging.getLogger('cisco_logger')
char2idx = dict((char, idx + 1) for (idx, char) in
enumerate(string.ascii_lowercase + string.punctuation + string.digits + " "))
idx2char = {v: k for k, v in char2idx.items()}
chars = dict((char, idx + 1) for (idx, char) in
enumerate(string.ascii_lowercase + string.punctuation + string.digits))
def get_character_dict():
return char2idx
def get_vocab_size():
return len(char2idx) + 1
return chars
def encode_char(c):
return char2idx.get(c, 0)
def decode_char(i):
return idx2char.get(i, "")
if c in chars:
return chars[c]
else:
return 0
encode_char = np.vectorize(encode_char)
decode_char = np.vectorize(decode_char)
def encode_domain(domain: string):
return encode_char(list(domain))
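# Illustrative aside (not part of the diff): a round-trip sketch for the new character-encoding
# helpers above. encode_char is vectorized, so encode_domain returns a numpy array of char2idx
# indices; decode_domain (defined further down) inverts it, and unknown characters map to 0.
enc = encode_domain("example.com")
print(enc)                    # array of vocabulary indices
print(decode_domain(enc))     # -> "example.com"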
def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if overlapping == False:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID + windowSize]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
if len(outDomainLists[-1]) != windowSize:
outDomainLists.pop(-1)
outDFFrames.pop(-1)
return (outDomainLists, outDFFrames)
def decode_domain(domain):
return "".join(decode_char(domain))
def get_user_chunks(user_flow, window=10):
result = []
chunk_size = (len(user_flow) // window)
for i in range(chunk_size):
result.append(user_flow.iloc[i * window:(i + 1) * window])
if result and len(result[-1]) != window:
result.pop()
return result
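# Illustrative aside (not part of the diff): the rewritten get_user_chunks cuts one user's flows
# into consecutive, non-overlapping windows and drops an incomplete trailing window. The tiny
# DataFrame below is made up for demonstration.
import pandas as pd

user_flow = pd.DataFrame({"domain": ["d%d.example" % i for i in range(25)]})
chunks = get_user_chunks(user_flow, window=10)
print(len(chunks), [len(c) for c in chunks])   # 2 [10, 10] -- the last 5 flows are discarded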
def get_domain_features(domain: string, max_length=40):
def get_domain_features(domain, vocab, max_length=40):
encoding = np.zeros((max_length,))
for j in range(min(len(domain), max_length)):
c = domain[len(domain) - 1 - j]
encoding[max_length - 1 - j] = encode_char(c)
for j in range(np.min([len(domain), max_length])):
curCharacter = domain[-j]
if curCharacter in vocab:
encoding[j] = vocab[curCharacter]
return encoding
def get_all_flow_features(features):
flows = np.stack(
map(lambda f: f[["duration", "bytes_up", "bytes_down"]], features)
)
return np.log1p(flows)
def get_flow_features(flow):
keys = ['duration', 'bytes_down', 'bytes_up']
features = np.zeros([len(keys), ])
for i, key in enumerate(keys):
# TODO: does it still work after exceptions occur -- default: zero!
# I wonder whether something breaks
# if there are exceptions regarding inconsistent feature length
try:
features[i] = np.log1p(flow[key]).astype(float)
except:
pass
return features
def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
# select only 1.0 and 0.0 from training data
pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
neg_idx = np.where(hits == 0.0)[0]
idx = np.concatenate((pos_idx, neg_idx))
# choose selected sample to train on
domain = domain[idx]
flow = flow[idx]
client = np.zeros_like(idx, float)
client[:pos_idx.shape[-1]] = 1.0
server = server[idx]
name = name[idx]
return domain, flow, name, client, server
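# Illustrative aside (not part of the diff): a toy call of the filter above with made-up arrays.
# Windows with hits == 1.0 or trusted hits become positive client samples, windows with
# hits == 0.0 negative ones; everything else (unknown / grey-zone labels) is dropped.
domain = np.arange(3).reshape(3, 1)     # stand-in window features
flow = np.arange(3).reshape(3, 1)
name = np.array(["u1", "u2", "u3"])
server = np.zeros(3)
hits = np.array([1.0, 0.0, -1.0])
trusted = np.array([0.0, 0.0, 0.0])
_, _, _, client, _ = filter_window_dataset_by_hits(domain, flow, name, hits, trusted, server)
print(client)   # [1. 0.] -- only the first two windows survive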
def get_cisco_features(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# log transform
ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
def create_raw_dataset_from_flows(user_flow_df, max_len, window_size=10):
logger.info("get chunks from user data frames")
with Pool() as pool:
results = []
for user_flow in tqdm(get_flow_per_user(user_flow_df), total=len(user_flow_df['user_hash'].unique().tolist())):
results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
windows = [window for res in results for window in res.get()]
logger.info("create training dataset")
domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
max_len=max_len)
# make client labels discrete with 4 different values
hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))
def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
domains = []
features = []
print("get chunks from user data frames")
for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
(domain_windows, feature_windows) = get_user_chunks(user_flow,
windowSize=window_size,
overlapping=True,
maxLengthInSeconds=-1)
domains += domain_windows
features += feature_windows
# TODO: remove later
if i >= 10:
break
return domain, flow, name, hits, trusted_hits, server
print("create training dataset")
return create_dataset_from_lists(
domains=domains, features=features, vocab=char_dict,
max_len=max_len,
use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
window_size=window_size)
def store_h5dataset(path, data: dict):
f = h5py.File(path + ".h5", "w")
for key, val in data.items():
f.create_dataset(key, data=val)
f.close()
def check_h5dataset(path):
return open(path + ".h5", "r")
def load_h5dataset(path):
f = h5py.File(path + ".h5", "r")
data = {}
for k in f.keys():
data[k] = f[k]
return data
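# Illustrative aside (not part of the diff): a round-trip sketch for the h5 helpers above.
# The path and array are made up; the parent directory must already exist, and note that
# load_h5dataset returns h5py datasets backed by the still-open file, not in-memory arrays.
store_h5dataset("results/test/example", {"domain": np.zeros((4, 10, 40), dtype=np.int8)})
data = load_h5dataset("results/test/example")   # reads results/test/example.h5
print(data["domain"].shape)                     # (4, 10, 40)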
def create_dataset_from_windows(chunks, max_len):
def create_dataset_from_lists(domains, features, vocab, max_len,
use_cisco_features=False, urlSIPDIct=dict(),
window_size=10):
"""
combines domain and feature windows to sequential training data
:param chunks: list of flow feature windows
:param domains: list of domain windows
:param features: list of feature windows
:param vocab:
:param max_len:
:param use_cisco_features: idk
:param urlSIPDIct: idk
:param window_size: size of the flow window
:return:
"""
# TODO: check for hits vs vth consistency
# if 'hits' in dfs[0].keys():
# hits_col = 'hits'
# elif 'virusTotalHits' in dfs[0].keys():
# hits_col = 'virusTotalHits'
hits_col = "virusTotalHits"
def get_domain_features_reduced(d):
return get_domain_features(d[0], max_len)
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if use_cisco_features:
numFeatures += numCiscoFeatures
sample_size = len(domains)
hits = []
names = []
servers = []
trusted_hits = []
logger.info(" compute domain features")
domain_features = []
for ds in tqdm(map(lambda f: f.domain, chunks)):
domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
domain_features = np.concatenate(domain_features, 0)
logger.info(" compute flow features")
flow_features = get_all_flow_features(chunks)
logger.info(" select hits")
hits = np.max(np.stack(map(lambda f: f.virusTotalHits, chunks)), axis=1)
logger.info(" select names")
names = np.stack(map(lambda f: f.user_hash, chunks))
assert (names[:, :1].repeat(10, axis=1) == names).all()
names = names[:, 0]
logger.info(" select servers")
servers = np.stack(map(lambda f: f.serverLabel, chunks))
logger.info(" select trusted hits")
trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)
domain_features = np.zeros((sample_size, window_size, max_len))
flow_features = np.zeros((sample_size, window_size, numFeatures))
return (domain_features, flow_features,
hits, names, servers, trusted_hits)
for i in tqdm(np.arange(sample_size), miniters=10):
for j in range(window_size):
domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
flow_features[i, j] = get_flow_features(features[i].iloc[j])
# TODO: cisco features?
hits.append(np.max(features[i][hits_col]))
names.append(np.unique(features[i]['user_hash']))
servers.append(np.max(features[i]['serverLabel']))
trusted_hits.append(np.max(features[i]['trustedHits']))
X = [domain_features, flow_features]
return X, np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits)
def make_label_discrete(values, threshold):
max_val = np.max(values)
if max_val >= threshold:
def discretize_label(values, threshold):
maxVal = np.max(values)
if maxVal >= threshold:
return 1.0
elif max_val == -1:
elif maxVal == -1:
return -1.0
elif 0 < max_val < threshold:
elif 0 < maxVal < threshold:
return -2.0
else:
return 0.0
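# Illustrative aside (not part of the diff): the discretisation above maps the maximum hit count
# of a window onto four values (the sketch uses the new make_label_discrete name and the
# threshold of 3 used elsewhere in the repository):
#   max >= 3       -> 1.0   (malicious)
#   max == -1      -> -1.0  (unknown)
#   0 < max < 3    -> -2.0  (grey zone, filtered out before training)
#   max == 0       -> 0.0   (benign)
print(make_label_discrete(np.array([0, 5]), 3))   # 1.0
print(make_label_discrete(np.array([0, 0]), 3))   # 0.0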
def get_user_flow_data(csv_file):
types = {
"duration": int,
"bytes_down": int,
"bytes_up": int,
"domain": object,
"timeStamp": float,
"http_method": object,
"server_ip": object,
"user_hash": float,
"virusTotalHits": int,
"serverLabel": int,
"trustedHits": int
}
df = pd.read_csv(csv_file, index_col=False)
df = df[list(types.keys())]
# df.set_index(keys=['user_hash'], drop=False, inplace=True)
def get_user_flow_data():
df = pd.read_csv("data/rk_data.csv.gz")
df.drop("Unnamed: 0", 1, inplace=True)
df.set_index(keys=['user_hash'], drop=False, inplace=True)
return df
def get_flow_per_user(df):
users = df['user_hash'].unique().tolist()
for user in users:
yield df.loc[df.user_hash == user].dropna(axis=0, how="any")
def load_or_generate_h5data(train_data, domain_length, window_size):
logger.info(f"check for h5data {train_data}")
try:
check_h5dataset(train_data)
except FileNotFoundError:
logger.info("load raw training dataset")
domain, flow, name, hits, trusted_hits, server = load_or_generate_raw_h5data(train_data, domain_length,
window_size)
logger.info("filter training dataset")
domain, flow, name, client, server = filter_window_dataset_by_hits(domain.value, flow.value,
name.value, hits.value,
trusted_hits.value, server.value)
logger.info("store training dataset as h5 file")
data = {
"domain": domain.astype(np.int8),
"flow": flow,
"name": name,
"client": client.astype(np.bool),
"server": server.astype(np.bool)
}
store_h5dataset(train_data, data)
logger.info("load h5 dataset")
data = load_h5dataset(train_data)
return data["domain"], data["flow"], data["name"], data["client"], data["server"]
def load_or_generate_raw_h5data(train_data, domain_length, window_size):
h5data = train_data + "_raw"
logger.info(f"check for h5data {h5data}")
try:
check_h5dataset(h5data)
except FileNotFoundError:
logger.info("h5 data not found - load csv file")
user_flow_df = get_user_flow_data(train_data)
logger.info("create raw training dataset")
domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, domain_length,
window_size)
logger.info("store raw training dataset as h5 file")
data = {
"domain": domain.astype(np.int8),
"flow": flow,
"name": name,
"hits_vt": hits.astype(np.int8),
"hits_trusted": hits.astype(np.int8),
"server": server.astype(np.bool)
}
store_h5dataset(h5data, data)
logger.info("load h5 dataset")
data = load_h5dataset(h5data)
return data["domain"], data["flow"], data["name"], data["hits_vt"], data["hits_trusted"], data["server"]
def generate_names(train_data, window_size):
user_flow_df = get_user_flow_data(train_data)
with Pool() as pool:
results = []
for user_flow in tqdm(get_flow_per_user(user_flow_df),
total=len(user_flow_df['user_hash'].unique().tolist())):
results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
windows = [window for res in results for window in res.get()]
names = np.stack(map(lambda f: f.user_hash, windows))
names = names[:, 0]
return names
def load_or_generate_domains(train_data, domain_length):
fn = f"{train_data}_domains.gz"
try:
logger.info(f"Load file {fn}.")
user_flow_df = pd.read_csv(fn)
logger.info(f"File successfully loaded.")
except FileNotFoundError:
logger.info(f"File {fn} not found, recreate.")
user_flow_df = get_user_flow_data(train_data)
# user_flow_df.reset_index(inplace=True)
user_flow_df = user_flow_df[["domain", "serverLabel", "trustedHits", "virusTotalHits"]].dropna(axis=0,
how="any")
user_flow_df = user_flow_df.groupby(user_flow_df.domain).mean()
user_flow_df.reset_index(inplace=True)
user_flow_df["clientLabel"] = np.where(
np.logical_or(user_flow_df.trustedHits > 0, user_flow_df.virusTotalHits >= 3), True, False)
user_flow_df[["serverLabel", "clientLabel"]] = user_flow_df[["serverLabel", "clientLabel"]].astype(bool)
user_flow_df = user_flow_df[["domain", "serverLabel", "clientLabel"]]
user_flow_df.to_csv(fn, compression="gzip")
logger.info(f"Extract features from domains")
domain_encs = user_flow_df.domain.apply(lambda d: get_domain_features(d, domain_length))
domain_encs = np.stack(domain_encs)
return domain_encs, user_flow_df.domain, user_flow_df[["clientLabel", "serverLabel"]].as_matrix().astype(bool)
def save_predictions(path, results):
joblib.dump(results, path + "/results.joblib", compress=3)
def load_predictions(path):
return joblib.load(path + "/results.joblib")
yield df.loc[df.user_hash == user]

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env bash
N1=$1
N2=$2
RESDIR=$3
DATADIR=$4
#for ((i = ${N1}; i <= ${N2}; i++))
#do
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
#done
#
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode all_beta --out-prefix ${RESDIR}/both_staggered
python3 main.py --mode embedding --batch 1024 --models ${RESDIR}/client_final_{1..20}/ ${RESDIR}/both_final_{1..20}/ \
${RESDIR}/both_inter_{1..20}/ ${RESDIR}/both_staggered_{1..20}/ \
--data ${DATADIR} \
--out-prefix ${RESDIR}/figs/svd/svd
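# Illustrative aside (not part of the script): the four positional arguments are the run range
# (N1, N2), the result directory and the data file; only RESDIR and DATADIR are used by the
# remaining uncommented embedding command. Hypothetical invocation (script name and paths are
# examples only):
#   bash run_embeddings.sh 1 20 results/run1 data/futureData.csv.gz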

View File

@@ -1,146 +0,0 @@
# -*- coding: utf-8 -*-
# implementation of hyperband:
# https://arxiv.org/pdf/1603.06560.pdf
import logging
import random
from math import ceil, log
from random import random as rng
from time import ctime, time
import joblib
import numpy as np
from keras.callbacks import EarlyStopping
import models
logger = logging.getLogger('cisco_logger')
def sample_params(param_distribution: dict):
p = {}
for key, val in param_distribution.items():
p[key] = random.choice(val)
return p
class Hyperband:
def __init__(self, param_distribution, X, y, max_iter=81, savefile=None):
self.get_params = lambda: sample_params(param_distribution)
self.max_iter = max_iter # maximum iterations per configuration
self.eta = 3 # defines configuration downsampling rate (default = 3)
self.logeta = lambda x: log(x) / log(self.eta)
self.s_max = int(self.logeta(self.max_iter))
self.B = (self.s_max + 1) * self.max_iter
self.results = [] # list of dicts
self.counter = 0
self.best_loss = np.inf
self.best_counter = -1
self.savefile = savefile
self.X = X
self.y = y
def try_params(self, n_iterations, params):
n_iterations = int(round(n_iterations))
model = models.get_models_by_params(params)
callbacks = [EarlyStopping(monitor='val_loss',
patience=5,
verbose=False)]
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(self.X,
self.y[0] if params["model_output"] == "client" else self.y,
batch_size=params["batch_size"],
epochs=n_iterations,
callbacks=callbacks,
shuffle=True,
validation_split=0.4)
return {"loss": np.min(history.history['val_loss']),
"early_stop": len(history.history["loss"]) < n_iterations,
"stop_after": len(history.history["val_loss"])}
# can be called multiple times
def run(self, skip_last=0, dry_run=False):
for s in reversed(range(self.s_max + 1)):
# initial number of configurations
n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))
# initial number of iterations per config
r = self.max_iter * self.eta ** (-s)
# n random configurations
random_configs = [self.get_params() for _ in range(n)]
for i in range((s + 1) - int(skip_last)): # changed from s + 1
# Run each of the n configs for <iterations>
# and keep best (n_configs / eta) configurations
n_configs = n * self.eta ** (-i)
n_iterations = r * self.eta ** (i)
logger.info("*** {} configurations x {:.1f} iterations each".format(
n_configs, n_iterations))
val_losses = []
early_stops = []
for t in random_configs:
self.counter += 1
logger.info("Config {} | {} | lowest loss so far: {:.4f} (run {})".format(
self.counter, ctime(), self.best_loss, self.best_counter))
start_time = time()
if dry_run:
result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
else:
result = self.try_params(n_iterations, t) # <---
assert (type(result) == dict)
assert ('loss' in result)
seconds = int(round(time() - start_time))
logger.info("{} seconds.".format(seconds))
loss = result['loss']
val_losses.append(loss)
early_stop = result.get('early_stop', False)
early_stops.append(early_stop)
# keeping track of the best result so far (for display only)
# could do it by checking results each time, but hey
if loss < self.best_loss:
self.best_loss = loss
self.best_counter = self.counter
result['counter'] = self.counter
result['seconds'] = seconds
result['params'] = t
result['iterations'] = n_iterations
self.results.append(result)
# select a number of best configurations for the next loop
# filter out early stops, if any
indices = np.argsort(val_losses)
random_configs = [random_configs[i] for i in indices if not early_stops[i]]
random_configs = random_configs[0:int(n_configs / self.eta)]
if self.savefile:
joblib.dump(self.results, self.savefile)
return self.results
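# Illustrative aside (not part of the diff): a minimal usage sketch for the class above.
# X, y and the parameter grid are placeholders; in this repository the grid comes from
# get_param_dist in main.py, and every key maps to a list of candidate values for sample_params.
param_dist = {
    "model_output": ["client"],
    "batch_size": [64],
    "embedding": [8, 16, 32],
    # ... remaining keys expected by models.get_models_by_params
}
hb = Hyperband(param_dist, X, y, max_iter=81, savefile="results/hyperband.joblib")
results = hb.run()
best = min(results, key=lambda r: r["loss"])   # main.py picks the winning config the same way
print(best["params"])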

main.py
View File

@@ -1,860 +1,127 @@
import logging
import operator
import os
import argparse
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix
from keras.utils import np_utils
import arguments
import dataset
import hyperband
import models
# create logger
import visualize
from arguments import get_model_args
from utils import exists_or_make_path, get_custom_class_weights, get_custom_sample_weights, load_model
logger = logging.getLogger('cisco_logger')
logger.setLevel(logging.DEBUG)
logger.propagate = False
parser = argparse.ArgumentParser()
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
parser.add_argument("--modes", action="store", dest="modes", nargs="+")
# create formatter
formatter1 = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter1)
# add ch to logger
logger.addHandler(ch)
# ch = logging.FileHandler("info.log")
# ch.setLevel(logging.DEBUG)
# parser.add_argument("--data", action="store", dest="data",
# default="data/")
#
# # create formatter
# formatter2 = logging.Formatter('!! %(asctime)s - %(name)s - %(levelname)s - %(message)s')
# parser.add_argument("--h5data", action="store", dest="h5data",
# default="")
#
# # add formatter to ch
# ch.setFormatter(formatter2)
# parser.add_argument("--model", action="store", dest="model",
# default="model_x")
#
# # add ch to logger
# logger.addHandler(ch)
args = arguments.parse()
if args.gpu:
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# default parameter
PARAMS = {
"type": args.model_type,
"embedding_type": args.embedding_type,
# "depth": args.model_depth,
"batch_size": args.batch_size,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
#
'dropout': 0.5, # currently fix
'embedding': args.embedding,
'flow_features': 3,
'filter_embedding': args.filter_embedding,
'dense_embedding': args.dense_embedding,
'kernel_embedding': args.kernel_embedding,
'filter_main': args.filter_main,
'dense_main': args.dense_main,
'kernel_main': args.kernel_main,
'model_output': args.model_output
}
# TODO: remove inner global params
def get_param_dist(dist_size="small"):
if dist_size == "small":
return {
# static params
"type": [args.model_type],
"embedding_type": [args.embedding_type],
# "depth": [args.model_depth],
"model_output": [args.model_output],
"batch_size": [args.batch_size],
"window_size": [args.window],
"flow_features": [3],
"domain_length": [args.domain_length],
# model params
"embedding": [2 ** x for x in range(3, 6)],
"filter_embedding": [2 ** x for x in range(1, 8)],
"kernel_embedding": [1, 3, 5],
"dense_embedding": [2 ** x for x in range(4, 8)],
"dropout": [0.5],
"filter_main": [2 ** x for x in range(1, 8)],
"kernel_main": [1, 3, 5],
"dense_main": [2 ** x for x in range(1, 8)],
}
else:
return {
# static params
"type": [args.model_type],
"embedding_type": [args.embedding_type],
# "depth": [args.model_depth],
"model_output": [args.model_output],
"batch_size": [args.batch_size],
"window_size": [args.window],
"flow_features": [3],
"domain_length": [args.domain_length],
# model params
"embedding": [2 ** x for x in range(3, 7)],
"filter_embedding": [2 ** x for x in range(1, 10)],
"kernel_embedding": [1, 3, 5, 7, 9],
"dense_embedding": [2 ** x for x in range(4, 10)],
"dropout": [0.5],
"filter_main": [2 ** x for x in range(1, 10)],
"kernel_main": [1, 3, 5, 7, 9],
"dense_main": [2 ** x for x in range(1, 12)],
}
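# Illustrative aside (not part of the diff): a single configuration can be drawn from this
# distribution with the sampler in hyperband.py; the full search loops over many such draws.
param = hyperband.sample_params(get_param_dist("small"))
print(param["filter_main"], param["dense_main"], param["kernel_main"])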
def shuffle_training_data(domain, flow, client, server):
idx = np.random.permutation(len(domain))
domain = domain[idx]
flow = flow[idx]
client = client[idx]
server = server[idx]
return domain, flow, client, server
def main_paul_best():
pauls_best_params = {
"type": "paul",
"batch_size": 64,
"window_size": 10,
"domain_length": 40,
"flow_features": 3,
#
'dropout': 0.5,
'domain_features': 32,
'drop_out': 0.5,
'embedding_size': 64,
'filter_main': 512,
'flow_features': 3,
'dense_main': 32,
'filter_embedding': 32,
'hidden_embedding': 32,
'kernel_embedding': 8,
'kernels_main': 8,
'input_length': 40
}
main_train(pauls_best_params)
def main_hyperband(data, domain_length, window_size, model_type, result_file, max_iter, dist_size="small"):
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length, window_size, model_type, shuffled=True)
return run_hyperband(dist_size, domain_tr, flow_tr, client_tr, server_tr, max_iter, result_file)
def run_hyperband(dist_size, features, labels, max_iter, savefile):
param_dist = get_param_dist(dist_size)
hp = hyperband.Hyperband(param_dist, features, labels,
max_iter=max_iter,
savefile=savefile)
results = hp.run()
return results
def train(parameters, features, labels):
pass
def load_data(data, domain_length, window_size, model_type, shuffled=False):
# data preparation
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(data, domain_length,
window_size)
server_tr = np.max(server_windows_tr, axis=1)
if model_type in ("inter", "staggered"):
server_tr = np.expand_dims(server_windows_tr, 2)
if shuffled:
domain_tr, flow_tr, client_tr, server_tr = shuffle_training_data(domain_tr, flow_tr, client_tr, server_tr)
return domain_tr, flow_tr, client_tr, server_tr
def load_training_data(data, model_output, domain_length, window_size, model_type, shuffled=False):
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length,
window_size, model_type, shuffled)
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
if model_output == "both":
labels = {"client": client_tr.value, "server": server_tr}
loss_weights = {"client": 1.0, "server": 1.0}
elif model_output == "client":
labels = {"client": client_tr.value}
loss_weights = {"client": 1.0}
elif model_output == "server":
labels = {"server": server_tr}
loss_weights = {"server": 1.0}
else:
raise ValueError("unknown model output")
return features, labels, loss_weights
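# Illustrative aside (not part of the diff): what load_training_data hands to Keras for the
# default "both" output; the key names come from the code above, and the shapes follow the CLI
# defaults (window 10, domain_length 40, 3 flow features).
features, labels, loss_weights = load_training_data(
    "data/rk_mini.csv.gz", "both", domain_length=40, window_size=10, model_type="final")
# features["ipt_domains"]: (samples, 10, 40) encoded domains
# features["ipt_flows"]:   (samples, 10, 3)  log-scaled duration and byte counts
# labels == {"client": ..., "server": ...}, loss_weights == {"client": 1.0, "server": 1.0}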
def get_weighting(class_weights, sample_weights, labels):
return None, None
client, server = labels["client"], labels["server"]
if class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client, server)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
if sample_weights:
logger.info("class weights: compute custom weights")
custom_sample_weights = get_custom_sample_weights(client, server)
logger.info(custom_sample_weights)
else:
logger.info("class weights: set default")
custom_sample_weights = None
return custom_class_weights, custom_sample_weights
def main_train(param=None):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
logger.info(f"Use command line arguments: {args}")
# data preparation
features, labels, loss_weights = load_training_data(args.data, args.model_output, args.domain_length,
args.window, args.model_type)
# call hyperband if results are not accessible
if args.hyperband_results:
try:
hyper_results = joblib.load(args.hyperband_results)
except Exception:
logger.info("start hyperband parameter search")
hyper_results = run_hyperband("small", features, labels, args.hyper_max_iter,
args.hyperband_results)
param = sorted(hyper_results, key=operator.itemgetter("loss"))[0]["params"]
param["type"] = args.model_type
logger.info(f"select params from result: {param}")
if not param:
param = PARAMS
# custom class or sample weights
# TODO: should throw an error when using weights with only the client labels
custom_class_weights, custom_sample_weights = get_weighting(args.class_weights, args.sample_weights, labels)
for i in range(args.runs):
model_path = os.path.join(args.model_path, f"clf_{i}.h5")
train_log_path = os.path.join(args.model_path, f"train_{i}.log.csv")
# define training call backs
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=model_path,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(train_log_path))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
custom_metrics = models.get_metric_functions()
logger.info(f"Generator model with params: {param}")
model = models.get_models_by_params(param)
logger.info(f"select model: {args.model_type}")
if args.model_type == "staggered":
logger.info("compile and pre-train server model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
loss_weights={"client": 0.0, "server": 1.0},
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
class_weight=custom_class_weights,
sample_weight=custom_sample_weights)
logger.info("fix server model")
model.get_layer("domain_cnn").trainable = False
model.get_layer("domain_cnn").layer.trainable = False
model.get_layer("dense_server").trainable = False
model.get_layer("server").trainable = False
loss_weights = {"client": 1.0, "server": 0.0}
logger.info("compile and train model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
loss_weights=loss_weights,
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
class_weight=custom_class_weights,
sample_weight=custom_sample_weights)
def main_retrain():
source = os.path.join(args.model_source, "clf.h5")
destination = os.path.join(args.model_destination, "clf.h5")
logger.info(f"Use command line arguments: {args}")
exists_or_make_path(args.model_destination)
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
args.domain_length,
args.window)
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=destination,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
server_tr = np.max(server_windows_tr, axis=1)
if args.class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client_tr.value, server_tr)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
logger.info(f"Load pretrained model")
embedding, model = load_model(source, custom_objects=models.get_custom_objects())
if args.model_type in ("inter", "staggered"):
server_tr = np.expand_dims(server_windows_tr, 2)
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
if args.model_output == "both":
labels = {"client": client_tr.value, "server": server_tr}
elif args.model_output == "client":
labels = {"client": client_tr.value}
elif args.model_output == "server":
labels = {"server": server_tr}
else:
raise ValueError("unknown model output")
logger.info("re-train model")
embedding.summary()
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
class_weight=custom_class_weights,
initial_epoch=args.initial_epoch)
def main_test():
logger.info("load test data")
domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
logger.info("load test domains")
domain_encs, _, _ = dataset.load_or_generate_domains(args.data, args.domain_length)
def get_dir(path):
return os.path.split(os.path.normpath(path))
results = {}
for model_path in args.model_paths:
file = get_dir(model_path)[1]
results[file] = {}
logger.info(f"process model {model_path}")
embd_model, clf_model = load_model(model_path, custom_objects=models.get_custom_objects())
pred = clf_model.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
if args.model_output == "both":
c_pred, s_pred = pred
results[file]["client_pred"] = c_pred
results[file]["server_pred"] = s_pred
elif args.model_output == "client":
results[file]["client_pred"] = pred
else:
results[file]["server_pred"] = pred
domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
results["domain_embds"] = domain_embeddings
# store results every round - safety first!
dataset.save_predictions(get_dir(model_path)[0], results)
def main_visualization():
def plot_model(clf_model, path):
embd, model = load_model(clf_model, custom_objects=models.get_custom_objects())
visualize.plot_model_as(embd, os.path.join(path, "model_embd.pdf"), shapes=False)
visualize.plot_model_as(model, os.path.join(path, "model_clf.pdf"), shapes=False)
def vis(model_name, model_path, df, df_paul, aggregation, curve):
visualize.plot_clf()
if aggregation == "user":
df = df.groupby(df.names).max()
df_paul = df_paul.groupby(df_paul.names).max()
if curve == "prc":
visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
elif curve == "roc":
visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
visualize.plot_legend()
visualize.plot_save("{}/{}_{}.pdf".format(model_path, aggregation, curve))
_, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
results = dataset.load_predictions(args.model_path)
df = pd.DataFrame(data={
"names": name_val, "client_pred": results["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted
})
df["client_val"] = np.logical_or(df.hits_vt == 1.0, df.hits_trusted >= 3)
df_user = df.groupby(df.names).max()
paul = dataset.load_predictions("results/paul/")
df_paul = pd.DataFrame(data={
"names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
"hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
})
df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)
logger.info("plot model")
plot_model(args.clf_model, args.model_path)
# logger.info("plot training curve")
# logs = pd.read_csv(args.train_log)
# if "acc" in logs.keys():
# visualize.plot_training_curve(logs, "", "{}/client_train.png".format(args.model_path))
# elif "client_acc" in logs.keys() and "server_acc" in logs.keys():
# visualize.plot_training_curve(logs, "client_", "{}/client_train.png".format(args.model_path))
# visualize.plot_training_curve(logs, "server_", "{}/server_train.png".format(args.model_path))
# else:
# logger.warning("Error while plotting training curves")
logger.info("plot window prc")
vis(args.model_name, args.model_path, df, df_paul, "window", "prc")
logger.info("plot window roc")
vis(args.model_name, args.model_path, df, df_paul, "window", "roc")
logger.info("plot user prc")
vis(args.model_name, args.model_path, df, df_paul, "user", "prc")
logger.info("plot user roc")
vis(args.model_name, args.model_path, df, df_paul, "user", "roc")
# absolute values
visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
"{}/client_cov.pdf".format(args.model_path),
normalize=False, title="Client Confusion Matrix")
visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
"{}/user_cov.pdf".format(args.model_path),
normalize=False, title="User Confusion Matrix")
# normalized
visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
"{}/client_cov_norm.pdf".format(args.model_path),
normalize=True, title="Client Confusion Matrix")
visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
"{}/user_cov_norm.pdf".format(args.model_path),
normalize=True, title="User Confusion Matrix")
def main_visualize_all():
_, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
def load_df(path):
res = dataset.load_predictions(path)
res = pd.DataFrame(data={
"names": name_val, "client_pred": res["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted
})
res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
return res
dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
paul = dataset.load_predictions("results/paul/")
df_paul = pd.DataFrame(data={
"names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
"hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
})
df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)
def vis(output_prefix, dfs, df_paul, aggregation, curve):
visualize.plot_clf()
if curve == "prc":
for model_name, df in dfs:
if aggregation == "user":
df = df.groupby(df.names).max()
visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
if aggregation == "user":
df_paul = df_paul.groupby(df_paul.names).max()
visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
elif curve == "roc":
for model_name, df in dfs:
if aggregation == "user":
df = df.groupby(df.names).max()
visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
if aggregation == "user":
df_paul = df_paul.groupby(df_paul.names).max()
visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
visualize.plot_legend()
visualize.plot_save("{}_{}_{}.pdf".format(output_prefix, aggregation, curve))
logger.info("plot pr curves")
vis(args.output_prefix, dfs, df_paul, "window", "prc")
logger.info("plot roc curves")
vis(args.output_prefix, dfs, df_paul, "window", "roc")
logger.info("plot user pr curves")
vis(args.output_prefix, dfs, df_paul, "user", "prc")
logger.info("plot user roc curves")
vis(args.output_prefix, dfs, df_paul, "user", "roc")
def main_visualize_all_embds():
def load_df(path):
res = dataset.load_predictions(path)
return res["domain_embds"]
dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
from sklearn.decomposition import TruncatedSVD
def vis2(domain_embedding, labels):
n_levels = 7
logger.info(f"reduction for {len(domain_embedding)} points")
red = TruncatedSVD(n_components=2, algorithm="arpack")
domains = red.fit_transform(domain_embedding)
logger.info("plot kde")
benign = domains[labels.sum(axis=1) == 0]
# print(domains.shape)
# print(benign.shape)
# benign_idx
# sns.kdeplot(domains[labels.sum(axis=1) == 0, 0], domains[labels.sum(axis=1) == 0, 1],
# cmap="Blues", label="benign", n_levels=9, alpha=0.35, shade=True, shade_lowest=False)
# sns.kdeplot(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
# cmap="Greens", label="server", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
# sns.kdeplot(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
# cmap="Reds", label="client", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
plt.scatter(benign[benign_idx, 0], benign[benign_idx, 1],
cmap="Blues", label="benign", alpha=0.35, s=10)
plt.scatter(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
cmap="Greens", label="server", alpha=0.35, s=10)
plt.scatter(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
cmap="Reds", label="client", alpha=0.35, s=10)
return np.concatenate((domains[:1000], domains[1000:2000], domains[2000:3000]), axis=0)
domain_encs, _, labels = dataset.load_or_generate_domains(args.data, args.domain_length)
idx = np.arange(len(labels))
client = labels[:, 0]
server = labels[:, 1]
benign = np.logical_not(np.logical_or(client, server))
print(client.sum(), server.sum(), benign.sum())
idx = np.concatenate((
np.random.choice(idx[client], 1000),
np.random.choice(idx[server], 1000),
np.random.choice(idx[benign], 6000)), axis=0)
benign_idx = np.random.choice(np.arange(6000), 1000)
print(idx.shape)
lls = labels[idx]
for model_name, embd in dfs:
logger.info(f"plot embedding for {model_name}")
visualize.plot_clf()
embd = embd[idx]
points = vis2(embd, lls)
# np.savetxt("{}_{}.csv".format(args.output_prefix, model_name), points, delimiter=",")
visualize.plot_save("{}_{}.pdf".format(args.output_prefix, model_name))
def main_beta():
domain_val, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
path, model_prefix = os.path.split(os.path.normpath(args.model_path))
curves = {
model_prefix: {"all": {}}
}
# domains = domain_val.value.reshape(-1, 40)
# domains = np.apply_along_axis(lambda d: dataset.decode_domain(d), 1, domains)
def load_df(res):
df_server = None
data = {
"names": name_val, "client_pred": res["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted,
}
if "server_pred" in res:
server = res["server_pred"] if len(res["server_pred"].shape) == 2 else res["server_pred"].max(axis=1)
val = server_val.value.max(axis=1)
data["server_pred"] = server.flatten()
data["server_val"] = val.flatten()
# if res["server_pred"].flatten().shape == server_val.value.flatten().shape:
# df_server = pd.DataFrame(data={
# "server_pred": res["server_pred"].flatten(),
# "domain": domains,
# "server_val": server_val.value.flatten()
# })
res = pd.DataFrame(data=data)
res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
return res, df_server
logger.info(f"load results from {args.model_path}")
res = dataset.load_predictions(args.model_path)
model_keys = sorted(filter(lambda x: x.startswith("clf"), res.keys()), key=lambda x: int(x[4:-3]))
client_preds = []
server_preds = []
server_flow_preds = []
client_user_preds = []
server_user_preds = []
server_domain_preds = []
server_domain_avg_preds = []
for model_name in model_keys:
logger.info(f"load model {model_name}")
df, df_server = load_df(res[model_name])
client_preds.append(df.client_pred.as_matrix())
if "server_val" in df.columns:
server_preds.append(df.server_pred.as_matrix())
if df_server is not None:
logger.info(f" group servers")
server_flow_preds.append(df_server.server_pred.as_matrix())
df_domain = df_server.groupby(df_server.domain).max()
server_domain_preds.append(df_domain.server_pred.as_matrix())
df_domain_avg = df_server.groupby(df_server.domain).rolling(10).mean()
server_domain_avg_preds.append(df_domain_avg.server_pred.as_matrix())
curves[model_prefix][model_name] = confusion_matrix(df.client_val.as_matrix(),
df.client_pred.as_matrix().round())
logger.info(f" group users")
df_user = df.groupby(df.names).max()
client_user_preds.append(df_user.client_pred.as_matrix())
if "server_val" in df.columns:
server_user_preds.append(df_user.server_pred.as_matrix())
logger.info("compute client curves")
curves[model_prefix]["all"]["client_window_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), client_preds)
curves[model_prefix]["all"]["client_window_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), client_preds)
curves[model_prefix]["all"]["client_user_prc"] = visualize.calc_pr_mean(df_user.client_val.as_matrix(),
client_user_preds)
curves[model_prefix]["all"]["client_user_roc"] = visualize.calc_roc_mean(df_user.client_val.as_matrix(),
client_user_preds)
if "server_val" in df.columns:
logger.info("compute server curves")
curves[model_prefix]["all"]["server_window_prc"] = visualize.calc_pr_mean(df.server_val.as_matrix(),
server_preds)
curves[model_prefix]["all"]["server_window_roc"] = visualize.calc_roc_mean(df.server_val.as_matrix(),
server_preds)
curves[model_prefix]["all"]["server_user_prc"] = visualize.calc_pr_mean(df_user.server_val.as_matrix(),
server_user_preds)
curves[model_prefix]["all"]["server_user_roc"] = visualize.calc_roc_mean(df_user.server_val.as_matrix(),
server_user_preds)
if df_server is not None:
logger.info("compute server flow curves")
curves[model_prefix]["all"]["server_flow_prc"] = visualize.calc_pr_mean(df_server.server_val.as_matrix(),
server_flow_preds)
curves[model_prefix]["all"]["server_flow_roc"] = visualize.calc_roc_mean(df_server.server_val.as_matrix(),
server_flow_preds)
curves[model_prefix]["all"]["server_domain_prc"] = visualize.calc_pr_mean(df_domain.server_val.as_matrix(),
server_domain_preds)
curves[model_prefix]["all"]["server_domain_roc"] = visualize.calc_roc_mean(df_domain.server_val.as_matrix(),
server_domain_preds)
curves[model_prefix]["all"]["server_domain_avg_prc"] = visualize.calc_pr_mean(
df_domain_avg.server_val.as_matrix(),
server_domain_avg_preds)
curves[model_prefix]["all"]["server_domain_avg_roc"] = visualize.calc_roc_mean(
df_domain_avg.server_val.as_matrix(),
server_domain_avg_preds)
joblib.dump(curves, f"{args.model_path}_curves.joblib")
try:
curves_all: dict = joblib.load(f"{path}/curves.joblib")
logger.info(f"load file {path}/curves.joblib successfully")
curves_all[model_prefix] = curves[model_prefix]
except Exception:
curves_all = curves
logger.info(f"currently {len(curves_all)} models in file: {curves_all.keys()}")
joblib.dump(curves_all, f"{path}/curves.joblib")
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
def plot_overall_result():
path, model_prefix = os.path.split(os.path.normpath(args.model_path))
exists_or_make_path(f"{path}/figs/curves/")
try:
results = joblib.load(f"{path}/curves.joblib")
logger.info("curves successfully loaded")
except Exception:
results = {}
x = np.linspace(0, 1, 10000)
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
logger.info(f"plot {vis}")
visualize.plot_clf()
for model_key in results.keys():
if vis not in results[model_key]["all"]:
continue
if "final" in model_key and vis.startswith("server_flow"):
continue
ys_mean, ys_std, ys = results[model_key]["all"][vis]
plt.plot(x, ys_mean, label=f"{model_key} - {np.mean(ys_mean):5.4} ({np.mean(ys_std):4.3})")
plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2)
if vis.endswith("prc"):
plt.xlabel('Recall')
plt.ylabel('Precision')
else:
plt.plot(x, x, label="random classifier", ls="--", c=".3", alpha=0.4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xscale('log')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
visualize.plot_legend()
visualize.plot_save(f"{path}/figs/curves/{vis}_all.pdf")
return
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
logger.info(f"plot {vis}")
visualize.plot_clf()
for model_key in results.keys():
if vis not in results[model_key]["all"]:
continue
if "final" in model_key and vis.startswith("server_flow"):
continue
_, _, ys = results[model_key]["all"][vis]
for y in ys:
plt.plot(x, y, label=f"{model_key} - {np.mean(y):5.4}")
if vis.endswith("prc"):
plt.xlabel('Recall')
plt.ylabel('Precision')
else:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xscale('log')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
visualize.plot_legend()
visualize.plot_save(f"{path}/figs/Appendices/{model_key}_{vis}.pdf")
def main_stats():
path, model_prefix = os.path.split(os.path.normpath(args.output_prefix))
for time in ("current", "future"):
df = dataset.get_user_flow_data(f"data/{time}Data.csv.gz")
df["clientlabel"] = np.logical_or(df.virusTotalHits > 3, df.trustedHits > 0)
# df_user = df.groupby(df.user_hash).max()
# df_server = df.groupby(df.domain).max()
# len(df)
# df.clientlabel.sum()
# df.serverLabel.sum()
for col in ["duration", "bytes_down", "bytes_up"]:
# visualize.plot_clf()
plt.clf()
plt.hist(df[col])
visualize.plot_save(f"{path}/figs/hist_{time}_{col}.pdf")
print(".")
# visualize.plot_clf()
plt.clf()
plt.hist(np.log1p(df[col]))
visualize.plot_save(f"{path}/figs/hist_{time}_norm_{col}.pdf")
print("-")
def main_stats2():
import joblib
res = joblib.load("results/variance_test_hyper/curves.joblib")
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc",
"server_domain_avg_prc", "server_domain_avg_roc"]:
tab = []
for m, r in res.items():
if vis not in r: continue
tab.append(r["all"][vis][2].mean(axis=1))
if not tab: continue
df = pd.DataFrame(data=np.vstack(tab).T, columns=list(res.keys()),
index=range(1, 21))
df.to_csv(f"{vis}.csv")
print(f"% {vis}")
print(df.round(4).to_latex())
print()
# parser.add_argument("--pred", action="store", dest="pred",
# default="")
#
# parser.add_argument("--type", action="store", dest="model_type",
# default="simple_conv")
#
parser.add_argument("--batch", action="store", dest="batch_size",
default=64, type=int)
parser.add_argument("--epochs", action="store", dest="epochs",
default=10, type=int)
# parser.add_argument("--samples", action="store", dest="samples",
# default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
# default=10000, type=int)
#
# parser.add_argument("--area", action="store", dest="area_size",
# default=25, type=int)
#
# parser.add_argument("--queue", action="store", dest="queue_size",
# default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
# default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
# default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
# default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
# parser.add_argument("--test", action="store", dest="test_image",
# default=6, choices=range(7), type=int)
args = parser.parse_args()
# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)
def main():
if "train" == args.mode:
main_train()
if "retrain" == args.mode:
main_retrain()
if "hyperband" == args.mode:
main_hyperband(args.data, args.domain_length, args.window, args.model_type, args.hyperband_results,
args.hyper_max_iter)
if "test" == args.mode:
main_test()
if "beta" == args.mode:
main_beta()
if "all_beta" == args.mode:
plot_overall_result()
if "embedding" == args.mode:
main_visualize_all_embds()
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data()
print("create training dataset")
(X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
user_flow_df, char_dict,
max_len=maxLen, window_size=windowSize)
# make client labels discrete with 4 different values
# TODO: use trusted_hits_tr for client classification too
client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
# select only 1.0 and 0.0 from training data
pos_idx = np.where(client_labels == 1.0)[0]
neg_idx = np.where(client_labels == 0.0)[0]
idx = np.concatenate((pos_idx, neg_idx))
# select labels for prediction
client_labels = client_labels[idx]
server_labels = server_tr[idx]
shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
cnnHiddenDims, cnnDropout)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
client_labels = np_utils.to_categorical(client_labels, 2)
server_labels = np_utils.to_categorical(server_labels, 2)
model.fit(X_tr,
[client_labels, server_labels],
batch_size=args.batch_size,
epochs=args.epochs,
shuffle=True)
# TODO: for validation use future data -> validation_data=(testData, testLabel)
if __name__ == "__main__":
    main()  # assumed: dispatch to main(); the original body of this guard is truncated in the diff

models.py (new file)
@@ -0,0 +1,41 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
hidden_dims, drop_out):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dense(hidden_dims)(y)
y = Dropout(drop_out)(y)
y = Activation('relu')(y)
return Model(x, y)
def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
filters, h1, h2, dropout, dense):
pass
def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
encoded = TimeDistributed(cnn)(ipt_domains)
ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# add second cnn
y = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(merged)
# TODO: why global pooling? -> 3D to 2D
# we use max pooling:
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(cnnHiddenDims, activation='relu')(y)
y1 = Dense(2, activation='softmax', name="client")(y)
y2 = Dense(2, activation='softmax', name="server")(y)
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
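# A minimal sketch of wiring the two builders above together; the values mirror the
# constants in the old main.py shown earlier, and vocab_size=41 is an assumption
# (the 40-character vocabulary plus one padding index).
shared = get_shared_cnn(vocab_size=41, embedding_size=100, input_length=40,
                        filters=512, kernel_size=2, hidden_dims=512, drop_out=0.5)
top = get_top_cnn(shared, numFeatures=3, maxLen=40, windowSize=10, domainFeatures=512,
                  filters=2, kernel_size=2, cnnHiddenDims=1024, cnnDropout=0.5)
# expects ipt_domains of shape (batch, 10, 40) and ipt_flows of shape (batch, 10, 3);
# returns two 2-way softmax outputs named "client" and "server"
top.summary()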

@@ -1,117 +0,0 @@
from collections import namedtuple
from keras.models import Model
from . import networks
from .metrics import *
NetworkParameters = namedtuple("NetworkParameters", [
"type", "flow_features", "window_size", "domain_length", "output",
"embedding_size",
"domain_filter", "domain_kernel", "domain_dense", "domain_dropout",
"main_filter", "main_kernel", "main_dense", "main_dropout",
])
def create_model(model, output_type):
if output_type == "both":
return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client, model.out_server))
elif output_type == "client":
return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client,))
else:
raise Exception("unknown model output")
def get_models_by_params(params: dict):
K.clear_session()
# decomposing param section
# mainly embedding model
embedding_type = params.get("embedding_type", "small")
network_type = params.get("type")
# network_depth = params.get("depth")
embedding_size = params.get("embedding")
filter_embedding = params.get("filter_embedding")
kernel_embedding = params.get("kernel_embedding")
hidden_embedding = params.get("dense_embedding")
# dropout = params.get("dropout")
# mainly prediction model
flow_features = params.get("flow_features")
window_size = params.get("window_size")
domain_length = params.get("domain_length")
filter_main = params.get("filter_main")
kernel_main = params.get("kernel_main")
dense_dim = params.get("dense_main")
model_output = params.get("model_output", "both")
if embedding_type == "small":
domain_cnn = networks.get_domain_embedding_model(embedding_size, domain_length, filter_embedding,
kernel_embedding, hidden_embedding, 0.5)
elif embedding_type == "deep":
domain_cnn = networks.get_domain_embedding_model2(embedding_size, domain_length, filter_embedding,
kernel_embedding, hidden_embedding, 0.5)
else:
raise ValueError("embedding type not found")
if network_type == "final":
model = networks.get_final_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type in ("inter", "staggered"):
model = networks.get_inter_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type == "long":
model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type == "soft":
model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
conv_server = model.get_layer("conv_server").trainable_weights
conv_client = model.get_layer("conv_client").trainable_weights
l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
model.add_loss(l1)
dense_server = model.get_layer("dense_server").trainable_weights
dense_client = model.get_layer("dense_client").trainable_weights
l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
model.add_loss(l2)
elif network_type == "sluice":
model = networks.get_sluice_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
conv_server = model.get_layer("conv_server").trainable_weights
conv_client = model.get_layer("conv_client").trainable_weights
l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
model.add_loss(l1)
dense_server = model.get_layer("dense_server").trainable_weights
dense_client = model.get_layer("dense_client").trainable_weights
l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
model.add_loss(l2)
else:
raise ValueError("network type not found")
return model
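# A hedged usage sketch of the params dict this factory reads; the keys match the
# params.get(...) calls above, the values are borrowed from the run.sh training flags
# and are illustrative only.
params = {
    "type": "final", "embedding_type": "small", "model_output": "both",
    "embedding": 128,
    "filter_embedding": 256, "kernel_embedding": 8, "dense_embedding": 128,
    "flow_features": 3, "window_size": 10, "domain_length": 40,
    "filter_main": 32, "kernel_main": 8, "dense_main": 1024,
}
model = get_models_by_params(params)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])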
def get_server_model_by_params(params: dict):
# decomposing param section
# mainly embedding model
network_depth = params.get("depth")
embedding_size = params.get("embedding")
input_length = params.get("input_length")
filter_embedding = params.get("filter_embedding")
kernel_embedding = params.get("kernel_embedding")
hidden_embedding = params.get("dense_embedding")
# mainly prediction model
flow_features = params.get("flow_features")
domain_length = params.get("domain_length")
dense_dim = params.get("dense_main")
embedding_model = networks.get_domain_embedding_model(embedding_size, input_length, filter_embedding,
kernel_embedding,
hidden_embedding, 0.5)
return networks.get_server_model(flow_features, domain_length, dense_dim, embedding_model)

@@ -1,64 +0,0 @@
import keras.backend as K
from keras.activations import elu
def get_custom_objects():
return dict([
("precision", precision),
("recall", recall),
("f1_score", f1_score),
("selu", selu)
])
def selu(x):
"""Scaled Exponential Linear Unit. (Klambauer et al., 2017)
# Arguments
x: A tensor or variable to compute the activation function for.
# References
- [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
# copied from keras.io
"""
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
return scale * elu(x, alpha)
def get_metric_functions():
return [precision, recall, f1_score]
def precision(y_true, y_pred):
# Count positive samples.
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
return true_positives / (predicted_positives + K.epsilon())
def recall(y_true, y_pred):
# Count positive samples.
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
return true_positives / (possible_positives + K.epsilon())
def f1_score(y_true, y_pred):
return f_score(1)(y_true, y_pred)
def f05_score(y_true, y_pred):
return f_score(0.5)(y_true, y_pred)
def f_score(beta):
def _f(y_true, y_pred):
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = beta ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
return _f
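# A small sanity check of these backend metrics against scikit-learn on made-up
# 0/1 predictions; illustrative only, the values agree up to K.epsilon().
import numpy as np
from sklearn.metrics import fbeta_score, precision_score, recall_score
y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1, 1])
yt, yp = K.constant(y_true.astype("float32")), K.constant(y_pred.astype("float32"))
print(precision_score(y_true, y_pred), K.eval(precision(yt, yp)))     # both ~0.75
print(recall_score(y_true, y_pred), K.eval(recall(yt, yp)))           # both ~0.75
print(fbeta_score(y_true, y_pred, beta=1), K.eval(f1_score(yt, yp)))  # both ~0.75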

@@ -1,249 +0,0 @@
from collections import namedtuple
import keras
import keras.backend as K
import numpy as np
from keras.engine import Input, Model as KerasModel
from keras.engine.topology import Layer
from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
from keras.regularizers import Regularizer
import dataset
Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])
def get_domain_embedding_model(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
drop_out=0.5) -> KerasModel:
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dropout(drop_out)(y)
y = Dense(hidden_dims, activation="relu")(y)
return KerasModel(x, y)
def get_domain_embedding_model2(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
drop_out=0.5) -> KerasModel:
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = GlobalAveragePooling1D()(y)
y = Dense(hidden_dims, activation="relu")(y)
return KerasModel(x, y)
def get_final_model(cnnDropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    # CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu')(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(dense_dim, activation='relu')(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
out_server = Dense(1, activation='sigmoid', name="server")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
def get_inter_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(merged)
out_server = Dense(1, activation="sigmoid", name="server")(y)
merged = keras.layers.concatenate([merged,
y], -1)
    # CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu')(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation='relu',
name="dense_client")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
def get_server_model(flow_features, domain_length, dense_dim, cnn):
ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
encoded = cnn(ipt_domains)
cnn.name = "domain_cnn"
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(merged)
out_server = Dense(1, activation="sigmoid", name="server")(y)
return KerasModel(inputs=[ipt_domains, ipt_flows], outputs=out_server)
def get_long_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_server")(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(y)
out_server = Dense(1, activation="sigmoid", name="server")(y)
    # CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_client")(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation='relu',
name="dense_client")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
class CrossStitch2(Layer):
def __init__(self, **kwargs):
super(CrossStitch2, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.s = self.add_weight(name='cross-stitch-s',
shape=(1,),
initializer='uniform',
trainable=True)
self.d = self.add_weight(name='cross-stitch-d',
shape=(1,),
initializer='uniform',
trainable=True)
super(CrossStitch2, self).build(input_shape)
def call(self, xs):
x1, x2 = xs
out = x1 * self.s + x2 * self.d
return out
def compute_output_shape(self, input_shape):
return input_shape[0]
class CrossStitchMix2(Layer):
def __init__(self, **kwargs):
super(CrossStitchMix2, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.s = self.add_weight(name='cross-stitch-s',
shape=(1,),
initializer='uniform',
trainable=True)
self.d = self.add_weight(name='cross-stitch-d',
shape=(1,),
initializer='uniform',
trainable=True)
super(CrossStitchMix2, self).build(input_shape)
def call(self, xs):
x1, x2 = xs
out = K.concatenate((x1 * self.s, x2 * self.d), axis=-1)
return out
def compute_output_shape(self, input_shape):
return (input_shape[0][0], input_shape[0][1] + input_shape[1][1])
class L21(Regularizer):
"""Regularizer for L21 regularization.
Found at: https://bitbucket.org/ispamm/group-lasso-for-neural-networks-tensorflow-keras
# Arguments
C: Float; L21 regularization factor.
"""
def __init__(self, C=0.):
self.C = K.cast_to_floatx(C)
def __call__(self, x):
const_coeff = np.sqrt(K.int_shape(x)[1])
return self.C * const_coeff * K.sum(K.sqrt(K.sum(K.square(x), axis=1)))
def get_config(self):
return {'C': float(self.C)}
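# For readability, a plain numpy restatement of the penalty this regularizer adds;
# a sketch of the same expression, not part of the original module.
def l21_penalty(W, C=0.001):
    # C * sqrt(n_cols) * sum of the Euclidean norms of the rows of W,
    # mirroring the Keras expression in L21.__call__ above
    return C * np.sqrt(W.shape[1]) * np.sum(np.sqrt(np.sum(W ** 2, axis=1)))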
def get_sluice_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y1 = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_server")(merged)
y1 = GlobalMaxPooling1D()(y1)
y2 = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_client")(merged)
y2 = GlobalMaxPooling1D()(y2)
c11 = CrossStitch2()([y1, y2])
c12 = CrossStitch2()([y1, y2])
y1 = Dropout(dropout)(c11)
y1 = Dense(dense_dim,
activation="relu",
name="dense_server")(y1)
y2 = Dropout(dropout)(c12)
y2 = Dense(dense_dim,
activation='relu',
name="dense_client")(y2)
c21 = CrossStitch2()([y1, y2])
c22 = CrossStitch2()([y1, y2])
beta1 = CrossStitchMix2()([c11, c21])
beta2 = CrossStitchMix2()([c12, c22])
out_server = Dense(1, activation="sigmoid", name="server")(beta1)
out_client = Dense(1, activation='sigmoid', name="client")(beta2)
return Model(ipt_domains, ipt_flows, out_client, out_server)

@@ -1,16 +0,0 @@
#!/usr/bin/env bash
SRC=$1
DEST=$2
DATADIR=$3
INIT=$4
EPOCHS=$5
BS=128
for i in `ls -d $SRC*/`
do
echo "retrain model in ${i}"
name=$(basename $i)
python3 main.py --mode retrain --model_src ${i} --model_dest ${DEST}/${name} --init_epoch $INIT --epochs $EPOCHS --batch $BS --train ${DATADIR}
done

run.sh
@@ -1,53 +0,0 @@
#!/usr/bin/env bash
RESDIR=$1
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$2
EPOCHS=10
for output in client both
do
for depth in small
do
for mtype in inter final
do
python main.py --mode train \
--train ${DATADIR}/currentData.csv \
--model ${RESDIR}/${output}_${depth}_${mtype} \
--epochs $EPOCHS \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 256 \
--balanced_weights \
--model_output ${output} \
--type ${mtype} \
--depth ${depth}
done
done
done
for depth in small
do
python main.py --mode train \
--train ${DATADIR}/currentData.csv \
--model ${RESDIR}/both_${depth}_staggered \
--epochs $EPOCHS \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 256 \
--balanced_weights \
--model_output both \
--type staggered \
--depth ${depth}
done
# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output ${output} --type ${mtype} --depth ${depth} --train ${DATADIR}/currentData.csv --model ${RESDIR}/${output}_${depth}_${mtype}
# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output client --type final --depth small --train /tmp/rk/data/currentData.csv --model /tmp/rk/results/paul3/client_final

@@ -1,30 +0,0 @@
#!/usr/bin/env bash
N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7
EPOCHS=10
for ((i = ${N1}; i <= ${N2}; i++))
do
python main.py --mode train \
--data ${DATADIR} \
--model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
--epochs ${EPOCHS} \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 128 \
--model_output ${OUTPUT} \
--type ${TYPE} \
--depth ${DEPTH} \
--gpu
done

@@ -1,30 +0,0 @@
#!/usr/bin/env bash
N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7
EPOCHS=10
for ((i = ${N1}; i <= ${N2}; i++))
do
python main.py --mode train \
--train ${DATADIR} \
--model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
--epochs ${EPOCHS} \
--embd 64 \
--filter_embd 128 --kernel_embd 5 --dense_embd 64 \
--domain_embd 16 \
--filter_main 32 --kernel_main 5 --dense_main 256 \
--batch 128 \
--model_output ${OUTPUT} \
--type ${TYPE} \
--depth ${DEPTH} \
--gpu
done

@@ -1,26 +1,10 @@
#!/usr/bin/python2
import sys
import joblib
import numpy as np
import pandas as pd
fn = sys.argv[1]
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/{}.joblib".format(fn))
df = pd.concat(df["data"])
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
df = df["data"]
df = pd.concat(df)
df.reset_index(inplace=True)
df.dropna(axis=0, how="any", inplace=True)
df.serverLabel = pd.to_numeric(df.serverLabel, errors='coerce')
df.duration = pd.to_numeric(df.duration, errors='coerce')
df.bytes_down = pd.to_numeric(df.bytes_down, errors='coerce')
df.bytes_up = pd.to_numeric(df.bytes_up, errors='coerce')
df.http_method = df.http_method.astype("category")
df.serverLabel = df.serverLabel.astype(np.bool)
df.virusTotalHits = df.virusTotalHits.astype(np.int8)
df.trustedHits = df.trustedHits.astype(np.int8)
df.to_csv("/tmp/rk/data/{}.csv".format(fn), encoding="utf-8")
df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")

server.py
@@ -1,116 +0,0 @@
import logging
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
import arguments
import dataset
import models
# create logger
import visualize
from arguments import get_model_args
from utils import exists_or_make_path, load_model
logger = logging.getLogger('cisco_logger')
args = arguments.parse()
def train_server_only(params):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
logger.info(f"Use command line arguments: {args}")
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
args.domain_length,
args.window)
domain_tr = domain_tr.value.reshape(-1, 40)
flow_tr = flow_tr.value.reshape(-1, 3)
server_tr = server_windows_tr.value.reshape(-1)
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
custom_metrics = models.get_metric_functions()
model = models.get_server_model_by_params(params=params)
features = {"ipt_domains": domain_tr, "ipt_flows": flow_tr}
if args.model_output == "both":
labels = {"client": client_tr, "server": server_tr}
elif args.model_output == "client":
labels = {"client": client_tr}
elif args.model_output == "server":
labels = {"server": server_tr}
else:
raise ValueError("unknown model output")
logger.info("compile and train model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks)
def test_server_only():
logger.info("start test: load data")
domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
domain_val = domain_val.value.reshape(-1, 40)
flow_val = flow_val.value.reshape(-1, 3)
domain_encs, _ = dataset.load_or_generate_domains(args.data, args.domain_length)
for model_args in get_model_args(args):
results = {}
logger.info(f"process model {model_args['model_path']}")
embd_model, clf_model = load_model(model_args["clf_model"], custom_objects=models.get_custom_objects())
pred = clf_model.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
results["server_pred"] = pred
domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
results["domain_embds"] = domain_embeddings
dataset.save_predictions(model_args["model_path"], results)
def vis_server():
def load_model(m, c):
from keras.models import load_model
clf = load_model(m, custom_objects=c)
emdb = clf.layers[1]
return emdb, clf
domain_raw, flow_raw, name_raw, hits_vt_raw, hits_trusted_raw, server_raw = dataset.load_or_generate_raw_h5data(
args.data, args.domain_length, args.window)
results = dataset.load_predictions(args.clf_model)
visualize.plot_clf()
visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server")
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_prc.pdf")
visualize.plot_clf()
visualize.plot_roc_curve(server_raw.flatten(), results["server_pred"].flatten(), "server")
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_roc.pdf")

test.sh
@@ -1,12 +0,0 @@
#!/usr/bin/env bash
RESDIR=$1
DATADIR=$2
for output in client both
do
python3 main.py --mode test --batch 1024 \
--models ${RESDIR}/${output}_*/ \
--test ${DATADIR}/futureData.csv \
--model_output ${output}
done

@@ -1,46 +0,0 @@
import os
from operator import itemgetter
import joblib
import numpy as np
from keras.models import load_model as load_keras_model
from sklearn.utils import class_weight
def exists_or_make_path(p):
if not os.path.exists(p):
os.makedirs(p)
def get_custom_class_weights(client, server):
return {
"client": class_weight.compute_class_weight('balanced', np.unique(client), client),
"server": class_weight.compute_class_weight('balanced', np.unique(server), server)
}
def get_custom_sample_weights(client, server):
return {
"client": class_weight.compute_sample_weight("balanced", client),
"server": class_weight.compute_sample_weight("balanced", server)
}
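# A quick illustration with toy labels (not real data) of what these helpers return
# for an imbalanced client task and a balanced server task.
client = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
server = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
print(get_custom_class_weights(client, server))
# roughly {'client': array([0.625, 2.5]), 'server': array([1., 1.])}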
def load_ordered_hyperband_results(path):
results = joblib.load(path)
    return sorted(results, key=itemgetter("loss"))
def load_model(path, custom_objects=None):
clf = load_keras_model(path, custom_objects)
try:
embd = clf.get_layer("domain_cnn").layer
except Exception:
        # in some versions the embedding submodel was not named "domain_cnn";
        # this fallback keeps those older models loadable
try:
embd = clf.layers[1].layer
except Exception:
embd = clf.get_layer("domain_cnn")
return embd, clf

@@ -1,247 +0,0 @@
import os
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
roc_auc_score, roc_curve
)
def scores(y_true):
for (path, dirnames, fnames) in os.walk("results/"):
for f in fnames:
if path[-1] == "1" and f.endswith("npy"):
y_pred = np.load(os.path.join(path, f)).flatten()
print(path)
tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))
tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))
fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))
fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / len(y_true)
f1_score = 2 * (precision * recall) / (precision + recall)
f05_score = (1 + 0.5 ** 2) * (precision * recall) / (0.5 ** 2 * precision + recall)
print(" precision:", precision)
print(" recall:", recall)
print(" accuracy:", accuracy)
print(" f1 score:", f1_score)
print(" f0.5 score:", f05_score)
def plot_clf():
plt.clf()
sns.set_context("paper")
sns.set_style("white")
def plot_save(path, dpi=600, set_size=True):
# plt.title(path)
fig = plt.gcf()
# fig.suptitle(path)
if set_size:
fig.set_size_inches(8, 4.5)
fig.savefig(path, dpi=dpi, bbox_inches='tight')
plt.close()
def plot_legend():
plt.legend()
def mathews_correlation_curve(y, y_pred):
pass
def plot_precision_recall(y, y_pred, label=""):
y = y.flatten()
y_pred = y_pred.flatten()
precision, recall, thresholds = precision_recall_curve(y, y_pred)
# decreasing_max_precision = np.maximum.accumulate(precision)[::-1]
# fig, ax = plt.subplots(1, 1)
# ax.hold(True)
score = fbeta_score(y, y_pred.round(), 1)
# prc_ap = average_precision_score(y, y_pred)
plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
# ax.step(recall[::-1], decreasing_max_precision, '-r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
def calc_pr_mean(y, y_preds):
return calc_metrics_mean(y, y_preds, "prc")
def plot_mean_curve(x, ys, std, score, label):
plt.plot(x, ys, label=f"{label} - {score:5.4}")
plt.fill_between(x, ys - std, ys + std, alpha=0.1)
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
def plot_pr_mean(y, y_preds, label=""):
x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, _ = calc_pr_mean(y, y_preds)
    # calc_pr_mean returns the raw per-run curves as its third value; summarize by the mean
    plot_mean_curve(x, ys_mean, ys_std, np.mean(ys_mean), label)
plt.xlabel('Recall')
plt.ylabel('Precision')
def score_model(y, prediction):
y = y.flatten()
y_pred = prediction.flatten()
precision, recall, thresholds = precision_recall_curve(y, y_pred)
print(classification_report(y, y_pred.round()))
print("Area under PR curve", auc(recall, precision))
print("roc auc score", roc_auc_score(y, y_pred))
print("F1 Score", fbeta_score(y, y_pred.round(), 1))
print("F0.5 Score", fbeta_score(y, y_pred.round(), 0.5))
def plot_roc_curve(mask, prediction, label=""):
y = mask.flatten()
y_pred = prediction.flatten()
fpr, tpr, thresholds = roc_curve(y, y_pred)
roc_auc = auc(fpr, tpr)
plt.xscale('log')
plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
def calc_metrics_mean(y, y_preds, metric):
appr = []
y = y.flatten()
for idx, y_pred in enumerate(y_preds):
y_pred = y_pred.flatten()
if metric == "prc":
precision, recall, thresholds = precision_recall_curve(y, y_pred)
appr.append(interpolate.interp1d(recall, precision))
elif metric == "roc":
fpr, tpr, thresholds = roc_curve(y, y_pred)
appr.append(interpolate.interp1d(fpr, tpr))
x = np.linspace(0, 1, 10000)
ys = np.vstack([f(x) for f in appr])
ys_mean = ys.mean(axis=0)
ys_std = ys.std(axis=0)
return ys_mean, ys_std, ys
def calc_roc_mean(y, y_preds):
return calc_metrics_mean(y, y_preds, "roc")
def plot_roc_mean(y, y_preds, label=""):
x = np.linspace(0, 1, 10000)
    ys_mean, ys_std, _ = calc_roc_mean(y, y_preds)
    plt.xscale('log')
    plot_mean_curve(x, ys_mean, ys_std, np.mean(ys_mean), label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
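# A hedged usage sketch for the curve-averaging helpers; y_preds stands in for the
# per-run prediction arrays that repeated trainings would normally produce.
y = np.random.randint(0, 2, size=1000)
y_preds = [np.clip(y + np.random.normal(0, 0.4, size=1000), 0, 1) for _ in range(5)]
plot_clf()
plot_pr_mean(y, y_preds, label="example model")
plot_legend()
plot_save("example_pr_mean.pdf")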
def plot_confusion_matrix(y_true, y_pred, path,
normalize=False,
classes=("benign", "malicious"),
title='Confusion matrix',
cmap="Blues", dpi=600):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.clf()
cm = confusion_matrix(y_true, y_pred)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
thresh = cm.max() / 2.
for i, j in ((i, j) for i in range(cm.shape[0]) for j in range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig(path, dpi=dpi)
plt.close()
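# A short, self-contained call of plot_confusion_matrix on rounded scores; the arrays
# and the output path are made up for illustration.
y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])
y_scores = np.array([0.1, 0.8, 0.4, 0.2, 0.9, 0.6, 0.7, 0.3])
plot_confusion_matrix(y_true, y_scores.round().astype(int), "example_cm.pdf", normalize=True)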
def plot_training_curve(logs, key, path, dpi=600):
plt.clf()
plt.plot(logs[f"{key}acc"], label="accuracy")
plt.plot(logs[f"{key}f1_score"], label="f1_score")
plt.plot(logs[f"val_{key}acc"], label="val_accuracy")
# plt.plot(logs[f"val_{key}f1_score"], label="val_f1_score")
plt.xlabel('epoch')
plt.ylabel('percentage')
plt.legend()
plt.savefig(path, dpi=dpi)
plt.close()
def plot_error_bars(results):
rates = []
for m, r in results.items():
if m == "all": continue
rates.append((r / r.sum(axis=0, keepdims=True)).flatten())
rates = pd.DataFrame(np.vstack(rates), columns=("TN", "FP", "FN", "TP"))
ax = rates.mean().plot.bar(yerr=rates.std())
for p in ax.patches:
ax.annotate(str(np.round(p.get_height(), 4)), (p.get_x(), 0.5))
def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
if method == "svd":
red = TruncatedSVD(n_components=2)
elif method == "tsne":
red = TSNE(n_components=2, verbose=2)
domain_reduced = red.fit_transform(domain_embedding)
    if method == "svd":
        # t-SNE does not expose explained_variance_ratio_
        print(red.explained_variance_ratio_)
    # use this to draw only a random subset of the predictions
# idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
plt.scatter(domain_reduced[:, 0],
domain_reduced[:, 1],
c=(labels * (1, 2)).sum(1).astype(int),
cmap=plt.cm.plasma,
s=3,
alpha=0.2)
plt.colorbar()
plt.savefig(path, dpi=dpi)
def plot_model_as(model, path, shapes=True, layer_names=True):
from keras.utils.vis_utils import plot_model
plot_model(model, to_file=path, show_shapes=shapes, show_layer_names=layer_names)