Compare commits
110 Commits
Author | SHA1 | Date |
---|---|---|
René Knaebel | b3d646c9e7 | |
René Knaebel | f382d06eb5 | |
René Knaebel | 349bc92a61 | |
René Knaebel | d58dbcb101 | |
René Knaebel | 461d4cab8f | |
René Knaebel | 3ce385eca6 | |
René Knaebel | c19d649bc4 | |
René Knaebel | 27f4d086eb | |
René Knaebel | 4fc2f0c925 | |
René Knaebel | 9ce11e4db4 | |
René Knaebel | 9b8ca8abab | |
René Knaebel | 903e81c931 | |
René Knaebel | 826357a41f | |
René Knaebel | e12bbda8c5 | |
René Knaebel | b1f48c1895 | |
René Knaebel | 7b8dfcebbe | |
René Knaebel | 14fef66a55 | |
René Knaebel | 88e3eda595 | |
René Knaebel | 8b17bd0701 | |
René Knaebel | a860f0da34 | |
René Knaebel | d1da3d6ca3 | |
René Knaebel | a686f147f0 | |
René Knaebel | 33063f3081 | |
René Knaebel | e8473048cb | |
René Knaebel | 0b26c6125c | |
René Knaebel | 5741f8ee0e | |
René Knaebel | 508667d1d0 | |
René Knaebel | 345afbaef5 | |
René Knaebel | b24fa770f9 | |
René Knaebel | 371a1dad05 | |
René Knaebel | 68254d6629 | |
René Knaebel | f02e0b7f99 | |
René Knaebel | a1e553f8f1 | |
René Knaebel | 605447440f | |
René Knaebel | 090c89a127 | |
René Knaebel | b157ca6a19 | |
René Knaebel | 49ad506a96 | |
René Knaebel | 607d74998c | |
René Knaebel | e2bf2dc90f | |
René Knaebel | fbe6d6a584 | |
René Knaebel | 6a47b5f245 | |
René Knaebel | f2845e635e | |
René Knaebel | ec5a1101be | |
René Knaebel | b0e0cd904e | |
René Knaebel | 7f49021a63 | |
René Knaebel | 6ce8fb464f | |
René Knaebel | 3a44efa775 | |
René Knaebel | 6d8d7b19f3 | |
René Knaebel | 6121eac448 | |
René Knaebel | 1cf62423e1 | |
René Knaebel | 6fef2b8b84 | |
René Knaebel | 9a51b6ea34 | |
René Knaebel | edc75f4f44 | |
René Knaebel | 528829bb33 | |
René Knaebel | 70d00efb01 | |
René Knaebel | 1ab0108c78 | |
René Knaebel | 595c2ea894 | |
René Knaebel | 71f218888d | |
René Knaebel | 5bd8e41711 | |
René Knaebel | 2080444fb7 | |
René Knaebel | ed4f478bad | |
René Knaebel | 1da31cc97c | |
René Knaebel | 3f6779fa3d | |
René Knaebel | 0db8427457 | |
René Knaebel | dc9180da10 | |
René Knaebel | 933eaae04a | |
René Knaebel | dceaf47211 | |
René Knaebel | 954dfcf9f9 | |
René Knaebel | 5a02f582cd | |
René Knaebel | 6e7dc1297c | |
René Knaebel | 7f1d13658f | |
René Knaebel | 452f9e0456 | |
René Knaebel | 787f43b328 | |
René Knaebel | 8ac195ba6f | |
René Knaebel | 1e781d5491 | |
René Knaebel | f4da147688 | |
René Knaebel | e24f596f40 | |
René Knaebel | ebaeb6b96e | |
René Knaebel | d97785f646 | |
René Knaebel | b0da2de0ea | |
René Knaebel | 820a5d1a4d | |
René Knaebel | 8cd1023165 | |
René Knaebel | 2593131e9e | |
René Knaebel | c1535b941b | |
René Knaebel | 18b60e1754 | |
René Knaebel | 79fc441fe1 | |
René Knaebel | d33c9f44ec | |
René Knaebel | 844494eca9 | |
René Knaebel | 336be37032 | |
René Knaebel | 6b787792db | |
René Knaebel | b35f23e518 | |
René Knaebel | d0418b9efa | |
René Knaebel | 2afaccc84b | |
René Knaebel | 9f0bae33d5 | |
René Knaebel | a196daa895 | |
René Knaebel | 522854ee0d | |
René Knaebel | 41b38de1ab | |
René Knaebel | fdc03c9922 | |
René Knaebel | 4a9f94a029 | |
René Knaebel | 21b9d7be73 | |
René Knaebel | 36cdba3fdf | |
René Knaebel | a3324b5e04 | |
René Knaebel | be56112b33 | |
René Knaebel | 3c4be52bb6 | |
René Knaebel | 933f6bf1d7 | |
René Knaebel | b2f5c56019 | |
René Knaebel | 772b07847f | |
René Knaebel | a70d1cb03a | |
René Knaebel | 7c05ef6a12 | |
René Knaebel | 3862dce975 |

.gitignore

@@ -99,4 +99,8 @@ ENV/
*.tif
*.joblib
*.csv
*.csv.gz
*.csv.gz
*.csv.tar.*
*.h5
*.npy
*.png
61 Makefile
@@ -1,3 +1,60 @@
test:
	python3 main.py --epochs 1 --batch 64

run:
	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_client --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output client --runs 1

	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_final --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output both --runs 1

	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_inter --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type inter --model_output both --runs 1

	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_soft --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type soft --model_output both --runs 1

	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_long --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type long --model_output both --runs 1

	python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_staggered --epochs 2 \
		--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
		--dense_embd 8 --domain_embd 8 --batch 64 --type staggered --model_output both --runs 1


test:
	python3 main.py --mode test --batch 128 --models results/test/test_both_* --data data/rk_mini.csv.gz --model_output both
	python3 main.py --mode test --batch 128 --models results/test/test_client_* --data data/rk_mini.csv.gz --model_output client

fancy:
	python3 main.py --mode fancy --batch 128 --model results/test/test_both_1 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_both_2 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_both_3 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_both_4 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_both_5 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_client_1 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_client_2 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_client_3 --data data/rk_mini.csv.gz

	python3 main.py --mode fancy --batch 128 --model results/test/test_client_4 --data data/rk_mini.csv.gz

all-fancy:
	python3 main.py --mode all_fancy --batch 128 --models results/test/test* --data data/rk_mini.csv.gz \
		--out-prefix results/test/

hyper:
	python3 main.py --mode hyperband --batch 64 --train data/rk_data.csv.gz

clean:
	rm -r results/test/
	rm data/rk_mini.csv.gz_raw.h5
	rm data/rk_mini.csv.gz.h5

arguments.py

@@ -0,0 +1,146 @@
import argparse
import os

parser = argparse.ArgumentParser()

parser.add_argument("--mode", action="store", dest="mode",
                    default="")

# parser.add_argument("--train", action="store", dest="train_data",
#                     default="data/full_dataset.csv.tar.gz")

parser.add_argument("--data", action="store", dest="data",
                    default="data/full_dataset.csv.tar.gz")

# parser.add_argument("--test", action="store", dest="test_data",
#                     default="data/full_future_dataset.csv.tar.gz")

parser.add_argument("--hyper_result", action="store", dest="hyperband_results",
                    default="")


parser.add_argument("--model", action="store", dest="model_path",
                    default="results/model_x")

parser.add_argument("--model_src", action="store", dest="model_source",
                    default="results/model_x")

parser.add_argument("--model_dest", action="store", dest="model_destination",
                    default="results/model_x")

parser.add_argument("--models", action="store", dest="model_paths", nargs="+",
                    default=[])

parser.add_argument("--type", action="store", dest="model_type",
                    default="final")

parser.add_argument("--embd_type", action="store", dest="embedding_type",
                    default="small")

# parser.add_argument("--depth", action="store", dest="model_depth",
#                     default="flat1")

parser.add_argument("--model_output", action="store", dest="model_output",
                    default="both")

parser.add_argument("--batch", action="store", dest="batch_size",
                    default=64, type=int)

parser.add_argument("--epochs", action="store", dest="epochs",
                    default=10, type=int)

parser.add_argument("--init_epoch", action="store", dest="initial_epoch",
                    default=0, type=int)

parser.add_argument("--runs", action="store", dest="runs",
                    default=20, type=int)

parser.add_argument("--hyper_max_iter", action="store", dest="hyper_max_iter",
                    default=81, type=int)

# parser.add_argument("--samples", action="store", dest="samples",
#                     default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
#                     default=10000, type=int)
#
parser.add_argument("--embd", action="store", dest="embedding",
                    default=128, type=int)

parser.add_argument("--filter_embd", action="store", dest="filter_embedding",
                    default=128, type=int)

parser.add_argument("--dense_embd", action="store", dest="dense_embedding",
                    default=128, type=int)

parser.add_argument("--kernel_embd", action="store", dest="kernel_embedding",
                    default=3, type=int)

parser.add_argument("--filter_main", action="store", dest="filter_main",
                    default=128, type=int)

parser.add_argument("--dense_main", action="store", dest="dense_main",
                    default=128, type=int)

parser.add_argument("--kernel_main", action="store", dest="kernel_main",
                    default=3, type=int)


parser.add_argument("--window", action="store", dest="window",
                    default=10, type=int)

parser.add_argument("--domain_length", action="store", dest="domain_length",
                    default=40, type=int)

parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
                    default=512, type=int)

parser.add_argument("--out-prefix", action="store", dest="output_prefix",
                    default="", type=str)


# parser.add_argument("--queue", action="store", dest="queue_size",
#                     default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
#                     default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
#                     default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
#                     default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
parser.add_argument("--stop_early", action="store_true", dest="stop_early")
parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
parser.add_argument("--gpu", action="store_true", dest="gpu")
parser.add_argument("--new_model", action="store_true", dest="new_model")


def get_model_args(args):
    return [{
        "model_path": model_path,
        "model_name": os.path.split(os.path.normpath(model_path))[1],
        "embedding_model": os.path.join(model_path, "embd.h5"),
        "clf_model": os.path.join(model_path, "clf.h5"),
        "train_log": os.path.join(model_path, "train.log.csv"),
        # "train_h5data": args.train_data,
        # "test_h5data": args.test_data,
        "future_prediction": os.path.join(model_path, f"{os.path.basename(args.data)}_pred")
    } for model_path in args.model_paths]


def parse():
    args = parser.parse_args()
    args.result_path = os.path.split(os.path.normpath(args.output_prefix))[1]
    args.model_name = os.path.split(os.path.normpath(args.model_path))[1]
    args.embedding_model = os.path.join(args.model_path, "embd.h5")
    args.clf_model = os.path.join(args.model_path, "clf.h5")
    args.train_log = os.path.join(args.model_path, "train.log.csv")
    # args.train_h5data = args.train_data
    # args.test_h5data = args.test_data
    args.future_prediction = os.path.join(args.model_path, f"{os.path.basename(args.data)}_pred")
    return args
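Since every entry point in main.py below goes through this module, a minimal usage sketch may help; the flag values shown are placeholders, and the printed attributes are the derived paths that parse() attaches to the namespace:

# sketch: assumes arguments.py (above) is importable; flag values are examples only
# e.g. invoked as: python3 main.py --model results/test/test_both --data data/rk_mini.csv.gz
import arguments

args = arguments.parse()
print(args.model_name)       # last path component of --model, e.g. "test_both"
print(args.embedding_model)  # <model_path>/embd.h5
print(args.clf_model)        # <model_path>/clf.h5
print(args.train_log)        # <model_path>/train.log.csv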
385 dataset.py
@@ -1,198 +1,297 @@
# -*- coding: utf-8 -*-
import logging
import string
from multiprocessing import Pool

import h5py
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm

chars = dict((char, idx + 1) for (idx, char) in
             enumerate(string.ascii_lowercase + string.punctuation + string.digits))
logger = logging.getLogger('cisco_logger')

char2idx = dict((char, idx + 1) for (idx, char) in
                enumerate(string.ascii_lowercase + string.punctuation + string.digits + " "))

idx2char = {v: k for k, v in char2idx.items()}


def get_character_dict():
    return chars
    return char2idx


def get_vocab_size():
    return len(char2idx) + 1


def encode_char(c):
    if c in chars:
        return chars[c]
    else:
        return 0
    return char2idx.get(c, 0)


def decode_char(i):
    return idx2char.get(i, "")


encode_char = np.vectorize(encode_char)
decode_char = np.vectorize(decode_char)


def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                    maxLengthInSeconds=300):
    maxMilliSeconds = maxLengthInSeconds * 1000
    outDomainLists = []
    outDFFrames = []
    if overlapping == False:
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
            # print(curIDs)
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
                    useData = dataFrame.iloc[curIDs]
                    curDomains = useData['domain']
            outDomainLists.append(list(curDomains))
            outDFFrames.append(useData)
    else:
        numBlocks = len(dataFrame) + 1 - windowSize
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[blockID:blockID + windowSize]
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
                    useData = dataFrame.iloc[curIDs]
                    curDomains = useData['domain']
            outDomainLists.append(list(curDomains))
            outDFFrames.append(useData)
    if len(outDomainLists[-1]) != windowSize:
        outDomainLists.pop(-1)
        outDFFrames.pop(-1)
    return (outDomainLists, outDFFrames)


def encode_domain(domain: string):
    return encode_char(list(domain))


def get_domain_features(domain, vocab, max_length=40):
def decode_domain(domain):
    return "".join(decode_char(domain))


def get_user_chunks(user_flow, window=10):
    result = []
    chunk_size = (len(user_flow) // window)
    for i in range(chunk_size):
        result.append(user_flow.iloc[i * window:(i + 1) * window])
    if result and len(result[-1]) != window:
        result.pop()
    return result


def get_domain_features(domain: string, max_length=40):
    encoding = np.zeros((max_length,))
    for j in range(np.min([len(domain), max_length])):
        curCharacter = domain[-j]
        if curCharacter in vocab:
            encoding[j] = vocab[curCharacter]
    for j in range(min(len(domain), max_length)):
        c = domain[len(domain) - 1 - j]
        encoding[max_length - 1 - j] = encode_char(c)
    return encoding


def get_flow_features(flow):
    keys = ['duration', 'bytes_down', 'bytes_up']
    features = np.zeros([len(keys), ])
    for i, key in enumerate(keys):
        # TODO: does it still work after exceptions occur -- default: zero!
        # wonder whether something breaks
        # if there are exceptions due to inconsistent feature length
        try:
            features[i] = np.log1p(flow[key]).astype(float)
        except:
            pass
    return features


def get_all_flow_features(features):
    flows = np.stack(
        map(lambda f: f[["duration", "bytes_up", "bytes_down"]], features)
    )
    return np.log1p(flows)


def get_cisco_features(curDataLine, urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
        return ciscoFeatures.ravel()
    except:
        return np.zeros([numCiscoFeatures, ]).ravel()


def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
    # select only 1.0 and 0.0 from training data
    pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
    neg_idx = np.where(hits == 0.0)[0]
    idx = np.concatenate((pos_idx, neg_idx))
    # choose selected sample to train on
    domain = domain[idx]
    flow = flow[idx]
    client = np.zeros_like(idx, float)
    client[:pos_idx.shape[-1]] = 1.0
    server = server[idx]
    name = name[idx]

    return domain, flow, name, client, server


def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
    domains = []
    features = []
    print("get chunks from user data frames")
    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
        (domain_windows, feature_windows) = get_user_chunks(user_flow,
                                                            windowSize=window_size,
                                                            overlapping=True,
                                                            maxLengthInSeconds=-1)
        domains += domain_windows
        features += feature_windows
        # TODO: remove later
        if i >= 10:
            break


def create_raw_dataset_from_flows(user_flow_df, max_len, window_size=10):
    logger.info("get chunks from user data frames")
    with Pool() as pool:
        results = []
        for user_flow in tqdm(get_flow_per_user(user_flow_df), total=len(user_flow_df['user_hash'].unique().tolist())):
            results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
        windows = [window for res in results for window in res.get()]
    logger.info("create training dataset")
    domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
                                                                                 max_len=max_len)
    # make client labels discrete with 4 different values
    hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))

    print("create training dataset")
    return create_dataset_from_lists(
        domains=domains, features=features, vocab=char_dict,
        max_len=max_len,
        use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
        window_size=window_size)
    return domain, flow, name, hits, trusted_hits, server


def create_dataset_from_lists(domains, features, vocab, max_len,
                              use_cisco_features=False, urlSIPDIct=dict(),
                              window_size=10):
def store_h5dataset(path, data: dict):
    f = h5py.File(path + ".h5", "w")
    for key, val in data.items():
        f.create_dataset(key, data=val)
    f.close()


def check_h5dataset(path):
    return open(path + ".h5", "r")


def load_h5dataset(path):
    f = h5py.File(path + ".h5", "r")
    data = {}
    for k in f.keys():
        data[k] = f[k]
    return data


def create_dataset_from_windows(chunks, max_len):
    """
    combines domain and feature windows to sequential training data
    :param domains: list of domain windows
    :param features: list of feature windows
    :param chunks: list of flow feature windows
    :param vocab:
    :param max_len:
    :param use_cisco_features: idk
    :param urlSIPDIct: idk
    :param window_size: size of the flow window
    :return:
    """
    # TODO: check for hits vs vth consistency
    # if 'hits' in dfs[0].keys():
    #     hits_col = 'hits'
    # elif 'virusTotalHits' in dfs[0].keys():
    #     hits_col = 'virusTotalHits'
    hits_col = "virusTotalHits"

    numFlowFeatures = 3
    numCiscoFeatures = 30
    numFeatures = numFlowFeatures
    if use_cisco_features:
        numFeatures += numCiscoFeatures
    sample_size = len(domains)
    hits = []
    names = []
    servers = []
    trusted_hits = []
    def get_domain_features_reduced(d):
        return get_domain_features(d[0], max_len)

    domain_features = np.zeros((sample_size, window_size, max_len))
    flow_features = np.zeros((sample_size, window_size, numFeatures))
    logger.info(" compute domain features")
    domain_features = []
    for ds in tqdm(map(lambda f: f.domain, chunks)):
        domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
    domain_features = np.concatenate(domain_features, 0)
    logger.info(" compute flow features")
    flow_features = get_all_flow_features(chunks)
    logger.info(" select hits")
    hits = np.max(np.stack(map(lambda f: f.virusTotalHits, chunks)), axis=1)
    logger.info(" select names")
    names = np.stack(map(lambda f: f.user_hash, chunks))
    assert (names[:, :1].repeat(10, axis=1) == names).all()
    names = names[:, 0]
    logger.info(" select servers")
    servers = np.stack(map(lambda f: f.serverLabel, chunks))
    logger.info(" select trusted hits")
    trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)

    for i in tqdm(np.arange(sample_size), miniters=10):
        for j in range(window_size):
            domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
            flow_features[i, j] = get_flow_features(features[i].iloc[j])
            # TODO: cisco features?

        hits.append(np.max(features[i][hits_col]))
        names.append(np.unique(features[i]['user_hash']))
        servers.append(np.max(features[i]['serverLabel']))
        trusted_hits.append(np.max(features[i]['trustedHits']))
    X = [domain_features, flow_features]
    return X, np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits)
    return (domain_features, flow_features,
            hits, names, servers, trusted_hits)


def discretize_label(values, threshold):
    maxVal = np.max(values)
    if maxVal >= threshold:
def make_label_discrete(values, threshold):
    max_val = np.max(values)
    if max_val >= threshold:
        return 1.0
    elif maxVal == -1:
    elif max_val == -1:
        return -1.0
    elif 0 < maxVal < threshold:
    elif 0 < max_val < threshold:
        return -2.0
    else:
        return 0.0


def get_user_flow_data():
    df = pd.read_csv("data/rk_data.csv.gz")
    df.drop("Unnamed: 0", 1, inplace=True)
    df.set_index(keys=['user_hash'], drop=False, inplace=True)
def get_user_flow_data(csv_file):
    types = {
        "duration": int,
        "bytes_down": int,
        "bytes_up": int,
        "domain": object,
        "timeStamp": float,
        "http_method": object,
        "server_ip": object,
        "user_hash": float,
        "virusTotalHits": int,
        "serverLabel": int,
        "trustedHits": int
    }
    df = pd.read_csv(csv_file, index_col=False)
    df = df[list(types.keys())]
    # df.set_index(keys=['user_hash'], drop=False, inplace=True)
    return df


def get_flow_per_user(df):
    users = df['user_hash'].unique().tolist()
    for user in users:
        yield df.loc[df.user_hash == user]
        yield df.loc[df.user_hash == user].dropna(axis=0, how="any")


def load_or_generate_h5data(train_data, domain_length, window_size):
    logger.info(f"check for h5data {train_data}")
    try:
        check_h5dataset(train_data)
    except FileNotFoundError:
        logger.info("load raw training dataset")
        domain, flow, name, hits, trusted_hits, server = load_or_generate_raw_h5data(train_data, domain_length,
                                                                                     window_size)
        logger.info("filter training dataset")
        domain, flow, name, client, server = filter_window_dataset_by_hits(domain.value, flow.value,
                                                                           name.value, hits.value,
                                                                           trusted_hits.value, server.value)
        logger.info("store training dataset as h5 file")
        data = {
            "domain": domain.astype(np.int8),
            "flow": flow,
            "name": name,
            "client": client.astype(np.bool),
            "server": server.astype(np.bool)
        }
        store_h5dataset(train_data, data)
    logger.info("load h5 dataset")
    data = load_h5dataset(train_data)
    return data["domain"], data["flow"], data["name"], data["client"], data["server"]


def load_or_generate_raw_h5data(train_data, domain_length, window_size):
    h5data = train_data + "_raw"
    logger.info(f"check for h5data {h5data}")
    try:
        check_h5dataset(h5data)
    except FileNotFoundError:
        logger.info("h5 data not found - load csv file")
        user_flow_df = get_user_flow_data(train_data)
        logger.info("create raw training dataset")
        domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, domain_length,
                                                                                       window_size)
        logger.info("store raw training dataset as h5 file")
        data = {
            "domain": domain.astype(np.int8),
            "flow": flow,
            "name": name,
            "hits_vt": hits.astype(np.int8),
            "hits_trusted": hits.astype(np.int8),
            "server": server.astype(np.bool)
        }
        store_h5dataset(h5data, data)
    logger.info("load h5 dataset")
    data = load_h5dataset(h5data)
    return data["domain"], data["flow"], data["name"], data["hits_vt"], data["hits_trusted"], data["server"]


def generate_names(train_data, window_size):
    user_flow_df = get_user_flow_data(train_data)
    with Pool() as pool:
        results = []
        for user_flow in tqdm(get_flow_per_user(user_flow_df),
                              total=len(user_flow_df['user_hash'].unique().tolist())):
            results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
        windows = [window for res in results for window in res.get()]
    names = np.stack(map(lambda f: f.user_hash, windows))
    names = names[:, 0]

    return names


def load_or_generate_domains(train_data, domain_length):
    fn = f"{train_data}_domains.gz"

    try:
        logger.info(f"Load file {fn}.")
        user_flow_df = pd.read_csv(fn)
        logger.info(f"File successfully loaded.")
    except FileNotFoundError:
        logger.info(f"File {fn} not found, recreate.")
        user_flow_df = get_user_flow_data(train_data)
        # user_flow_df.reset_index(inplace=True)
        user_flow_df = user_flow_df[["domain", "serverLabel", "trustedHits", "virusTotalHits"]].dropna(axis=0,
                                                                                                       how="any")
        user_flow_df = user_flow_df.groupby(user_flow_df.domain).mean()
        user_flow_df.reset_index(inplace=True)

        user_flow_df["clientLabel"] = np.where(
            np.logical_or(user_flow_df.trustedHits > 0, user_flow_df.virusTotalHits >= 3), True, False)
        user_flow_df[["serverLabel", "clientLabel"]] = user_flow_df[["serverLabel", "clientLabel"]].astype(bool)
        user_flow_df = user_flow_df[["domain", "serverLabel", "clientLabel"]]

        user_flow_df.to_csv(fn, compression="gzip")

    logger.info(f"Extract features from domains")
    domain_encs = user_flow_df.domain.apply(lambda d: get_domain_features(d, domain_length))
    domain_encs = np.stack(domain_encs)

    return domain_encs, user_flow_df.domain, user_flow_df[["clientLabel", "serverLabel"]].as_matrix().astype(bool)


def save_predictions(path, results):
    joblib.dump(results, path + "/results.joblib", compress=3)


def load_predictions(path):
    return joblib.load(path + "/results.joblib")

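A short sketch of the encoding helpers and the two-level h5 cache defined above; the domain string is an arbitrary example, and the commented call assumes the csv from the Makefile exists:

import dataset

# encode_domain vectorizes char2idx over the characters (unknown chars map to 0)
enc = dataset.encode_domain("example.com")
assert dataset.decode_domain(enc) == "example.com"  # round-trips for in-vocab chars

# get_domain_features right-aligns the (possibly truncated) encoding inside a
# fixed-length vector of size max_length
vec = dataset.get_domain_features("example.com", max_length=40)
assert vec.shape == (40,)

# load_or_generate_h5data builds <data>_raw.h5 on first use, filters it via
# filter_window_dataset_by_hits, and caches the result as <data>.h5
# domain, flow, name, client, server = dataset.load_or_generate_h5data(
#     "data/rk_mini.csv.gz", domain_length=40, window_size=10)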
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

N1=$1
N2=$2
RESDIR=$3
DATADIR=$4

#for ((i = ${N1}; i <= ${N2}; i++))
#do
#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
#    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
#done
#
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered

#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode all_beta --out-prefix ${RESDIR}/both_staggered

python3 main.py --mode embedding --batch 1024 --models ${RESDIR}/client_final_{1..20}/ ${RESDIR}/both_final_{1..20}/ \
    ${RESDIR}/both_inter_{1..20}/ ${RESDIR}/both_staggered_{1..20}/ \
    --data ${DATADIR} \
    --out-prefix ${RESDIR}/figs/svd/svd

hyperband.py

@@ -0,0 +1,146 @@
# -*- coding: utf-8 -*-
# implementation of hyperband:
# https://arxiv.org/pdf/1603.06560.pdf
import logging
import random
from math import ceil, log
from random import random as rng
from time import ctime, time

import joblib
import numpy as np
from keras.callbacks import EarlyStopping

import models

logger = logging.getLogger('cisco_logger')


def sample_params(param_distribution: dict):
    p = {}
    for key, val in param_distribution.items():
        p[key] = random.choice(val)
    return p


class Hyperband:
    def __init__(self, param_distribution, X, y, max_iter=81, savefile=None):
        self.get_params = lambda: sample_params(param_distribution)

        self.max_iter = max_iter  # maximum iterations per configuration
        self.eta = 3  # defines configuration downsampling rate (default = 3)

        self.logeta = lambda x: log(x) / log(self.eta)
        self.s_max = int(self.logeta(self.max_iter))
        self.B = (self.s_max + 1) * self.max_iter

        self.results = []  # list of dicts
        self.counter = 0
        self.best_loss = np.inf
        self.best_counter = -1

        self.savefile = savefile

        self.X = X
        self.y = y

    def try_params(self, n_iterations, params):
        n_iterations = int(round(n_iterations))
        model = models.get_models_by_params(params)

        callbacks = [EarlyStopping(monitor='val_loss',
                                   patience=5,
                                   verbose=False)]

        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        history = model.fit(self.X,
                            self.y[0] if params["model_output"] == "client" else self.y,
                            batch_size=params["batch_size"],
                            epochs=n_iterations,
                            callbacks=callbacks,
                            shuffle=True,
                            validation_split=0.4)

        return {"loss": np.min(history.history['val_loss']),
                "early_stop": len(history.history["loss"]) < n_iterations,
                "stop_after": len(history.history["val_loss"])}

    # can be called multiple times
    def run(self, skip_last=0, dry_run=False):

        for s in reversed(range(self.s_max + 1)):

            # initial number of configurations
            n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))

            # initial number of iterations per config
            r = self.max_iter * self.eta ** (-s)

            # n random configurations
            random_configs = [self.get_params() for _ in range(n)]

            for i in range((s + 1) - int(skip_last)):  # changed from s + 1

                # Run each of the n configs for <iterations>
                # and keep best (n_configs / eta) configurations

                n_configs = n * self.eta ** (-i)
                n_iterations = r * self.eta ** (i)

                logger.info("*** {} configurations x {:.1f} iterations each".format(
                    n_configs, n_iterations))

                val_losses = []
                early_stops = []

                for t in random_configs:

                    self.counter += 1
                    logger.info("Config {} | {} | lowest loss so far: {:.4f} (run {})".format(
                        self.counter, ctime(), self.best_loss, self.best_counter))

                    start_time = time()

                    if dry_run:
                        result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
                    else:
                        result = self.try_params(n_iterations, t)  # <---

                    assert (type(result) == dict)
                    assert ('loss' in result)

                    seconds = int(round(time() - start_time))
                    logger.info("{} seconds.".format(seconds))

                    loss = result['loss']
                    val_losses.append(loss)

                    early_stop = result.get('early_stop', False)
                    early_stops.append(early_stop)

                    # keeping track of the best result so far (for display only)
                    # could do it by checking results each time, but hey
                    if loss < self.best_loss:
                        self.best_loss = loss
                        self.best_counter = self.counter

                    result['counter'] = self.counter
                    result['seconds'] = seconds
                    result['params'] = t
                    result['iterations'] = n_iterations

                    self.results.append(result)

                # select a number of best configurations for the next loop
                # filter out early stops, if any
                indices = np.argsort(val_losses)
                random_configs = [random_configs[i] for i in indices if not early_stops[i]]
                random_configs = random_configs[0:int(n_configs / self.eta)]

        if self.savefile:
            joblib.dump(self.results, self.savefile)

        return self.results

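For reference, a sketch of how this class is driven (cf. run_hyperband in main.py below); the parameter lists are toy values, the environment is assumed to have the repository's models module and Keras importable, and a dry run never touches the training data, so X/y can stay empty:

import hyperband

# every key maps to the candidate values sample_params() may draw from
param_dist = {
    "model_output": ["both"],
    "batch_size": [64],
    "filter_main": [16, 32, 64],
    "kernel_main": [1, 3, 5],
}

hp = hyperband.Hyperband(param_dist, X=None, y=None,  # unused with dry_run=True
                         max_iter=81, savefile=None)
results = hp.run(dry_run=True)  # random losses instead of model.fit
best = min(results, key=lambda r: r["loss"])
print(best["params"], best["loss"])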
951 main.py
@ -1,127 +1,860 @@
|
|||
import argparse
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
from keras.utils import np_utils
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
import arguments
|
||||
import dataset
|
||||
import hyperband
|
||||
import models
|
||||
# create logger
|
||||
import visualize
|
||||
from arguments import get_model_args
|
||||
from utils import exists_or_make_path, get_custom_class_weights, get_custom_sample_weights, load_model
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
logger = logging.getLogger('cisco_logger')
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.propagate = False
|
||||
|
||||
parser.add_argument("--modes", action="store", dest="modes", nargs="+")
|
||||
# create console handler and set level to debug
|
||||
ch = logging.StreamHandler()
|
||||
ch.setLevel(logging.DEBUG)
|
||||
|
||||
# parser.add_argument("--data", action="store", dest="data",
|
||||
# default="data/")
|
||||
#
|
||||
# parser.add_argument("--h5data", action="store", dest="h5data",
|
||||
# default="")
|
||||
#
|
||||
# parser.add_argument("--model", action="store", dest="model",
|
||||
# default="model_x")
|
||||
#
|
||||
# parser.add_argument("--pred", action="store", dest="pred",
|
||||
# default="")
|
||||
#
|
||||
# parser.add_argument("--type", action="store", dest="model_type",
|
||||
# default="simple_conv")
|
||||
#
|
||||
parser.add_argument("--batch", action="store", dest="batch_size",
|
||||
default=64, type=int)
|
||||
# create formatter
|
||||
formatter1 = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
parser.add_argument("--epochs", action="store", dest="epochs",
|
||||
default=10, type=int)
|
||||
# add formatter to ch
|
||||
ch.setFormatter(formatter1)
|
||||
|
||||
# parser.add_argument("--samples", action="store", dest="samples",
|
||||
# default=100000, type=int)
|
||||
#
|
||||
# parser.add_argument("--samples_val", action="store", dest="samples_val",
|
||||
# default=10000, type=int)
|
||||
#
|
||||
# parser.add_argument("--area", action="store", dest="area_size",
|
||||
# default=25, type=int)
|
||||
#
|
||||
# parser.add_argument("--queue", action="store", dest="queue_size",
|
||||
# default=50, type=int)
|
||||
#
|
||||
# parser.add_argument("--p", action="store", dest="p_train",
|
||||
# default=0.5, type=float)
|
||||
#
|
||||
# parser.add_argument("--p_val", action="store", dest="p_val",
|
||||
# default=0.01, type=float)
|
||||
#
|
||||
# parser.add_argument("--gpu", action="store", dest="gpu",
|
||||
# default=0, type=int)
|
||||
#
|
||||
# parser.add_argument("--tmp", action="store_true", dest="tmp")
|
||||
#
|
||||
# parser.add_argument("--test", action="store", dest="test_image",
|
||||
# default=6, choices=range(7), type=int)
|
||||
# add ch to logger
|
||||
logger.addHandler(ch)
|
||||
|
||||
args = parser.parse_args()
|
||||
# ch = logging.FileHandler("info.log")
|
||||
# ch.setLevel(logging.DEBUG)
|
||||
#
|
||||
# # create formatter
|
||||
# formatter2 = logging.Formatter('!! %(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
#
|
||||
# # add formatter to ch
|
||||
# ch.setFormatter(formatter2)
|
||||
#
|
||||
# # add ch to logger
|
||||
# logger.addHandler(ch)
|
||||
|
||||
# config = tf.ConfigProto(log_device_placement=True)
|
||||
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
|
||||
# config.gpu_options.allow_growth = True
|
||||
# session = tf.Session(config=config)
|
||||
args = arguments.parse()
|
||||
|
||||
if args.gpu:
|
||||
config = tf.ConfigProto(log_device_placement=True)
|
||||
config.gpu_options.per_process_gpu_memory_fraction = 0.5
|
||||
config.gpu_options.allow_growth = True
|
||||
session = tf.Session(config=config)
|
||||
|
||||
# default parameter
|
||||
PARAMS = {
|
||||
"type": args.model_type,
|
||||
"embedding_type": args.embedding_type,
|
||||
# "depth": args.model_depth,
|
||||
"batch_size": args.batch_size,
|
||||
"window_size": args.window,
|
||||
"domain_length": args.domain_length,
|
||||
"flow_features": 3,
|
||||
#
|
||||
'dropout': 0.5, # currently fix
|
||||
'embedding': args.embedding,
|
||||
'flow_features': 3,
|
||||
'filter_embedding': args.filter_embedding,
|
||||
'dense_embedding': args.dense_embedding,
|
||||
'kernel_embedding': args.kernel_embedding,
|
||||
'filter_main': args.filter_main,
|
||||
'dense_main': args.dense_main,
|
||||
'kernel_main': args.kernel_main,
|
||||
'model_output': args.model_output
|
||||
}
|
||||
|
||||
|
||||
# TODO: remove inner global params
|
||||
def get_param_dist(dist_size="small"):
|
||||
if dist_size == "small":
|
||||
return {
|
||||
# static params
|
||||
"type": [args.model_type],
|
||||
"embedding_type": [args.embedding_type],
|
||||
# "depth": [args.model_depth],
|
||||
"model_output": [args.model_output],
|
||||
"batch_size": [args.batch_size],
|
||||
"window_size": [args.window],
|
||||
"flow_features": [3],
|
||||
"domain_length": [args.domain_length],
|
||||
# model params
|
||||
"embedding": [2 ** x for x in range(3, 6)],
|
||||
"filter_embedding": [2 ** x for x in range(1, 8)],
|
||||
"kernel_embedding": [1, 3, 5],
|
||||
"dense_embedding": [2 ** x for x in range(4, 8)],
|
||||
"dropout": [0.5],
|
||||
"filter_main": [2 ** x for x in range(1, 8)],
|
||||
"kernel_main": [1, 3, 5],
|
||||
"dense_main": [2 ** x for x in range(1, 8)],
|
||||
}
|
||||
else:
|
||||
return {
|
||||
# static params
|
||||
"type": [args.model_type],
|
||||
"embedding_type": [args.embedding_type],
|
||||
# "depth": [args.model_depth],
|
||||
"model_output": [args.model_output],
|
||||
"batch_size": [args.batch_size],
|
||||
"window_size": [args.window],
|
||||
"flow_features": [3],
|
||||
"domain_length": [args.domain_length],
|
||||
# model params
|
||||
"embedding": [2 ** x for x in range(3, 7)],
|
||||
"filter_embedding": [2 ** x for x in range(1, 10)],
|
||||
"kernel_embedding": [1, 3, 5, 7, 9],
|
||||
"dense_embedding": [2 ** x for x in range(4, 10)],
|
||||
"dropout": [0.5],
|
||||
"filter_main": [2 ** x for x in range(1, 10)],
|
||||
"kernel_main": [1, 3, 5, 7, 9],
|
||||
"dense_main": [2 ** x for x in range(1, 12)],
|
||||
}
|
||||
|
||||
|
||||
def shuffle_training_data(domain, flow, client, server):
|
||||
idx = np.random.permutation(len(domain))
|
||||
domain = domain[idx]
|
||||
flow = flow[idx]
|
||||
client = client[idx]
|
||||
server = server[idx]
|
||||
return domain, flow, client, server
|
||||
|
||||
|
||||
def main_paul_best():
|
||||
pauls_best_params = {
|
||||
"type": "paul",
|
||||
"batch_size": 64,
|
||||
"window_size": 10,
|
||||
"domain_length": 40,
|
||||
"flow_features": 3,
|
||||
#
|
||||
'dropout': 0.5,
|
||||
'domain_features': 32,
|
||||
'drop_out': 0.5,
|
||||
'embedding_size': 64,
|
||||
'filter_main': 512,
|
||||
'flow_features': 3,
|
||||
'dense_main': 32,
|
||||
'filter_embedding': 32,
|
||||
'hidden_embedding': 32,
|
||||
'kernel_embedding': 8,
|
||||
'kernels_main': 8,
|
||||
'input_length': 40
|
||||
}
|
||||
main_train(pauls_best_params)
|
||||
|
||||
|
||||
def main_hyperband(data, domain_length, window_size, model_type, result_file, max_iter, dist_size="small"):
|
||||
logger.info("create training dataset")
|
||||
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length, window_size, model_type, shuffled=True)
|
||||
return run_hyperband(dist_size, domain_tr, flow_tr, client_tr, server_tr, max_iter, result_file)
|
||||
|
||||
|
||||
def run_hyperband(dist_size, features, labels, max_iter, savefile):
|
||||
param_dist = get_param_dist(dist_size)
|
||||
hp = hyperband.Hyperband(param_dist, features, labels,
|
||||
max_iter=max_iter,
|
||||
savefile=savefile)
|
||||
results = hp.run()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def train(parameters, features, labels):
|
||||
pass
|
||||
|
||||
|
||||
def load_data(data, domain_length, window_size, model_type, shuffled=False):
|
||||
# data preparation
|
||||
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(data, domain_length,
|
||||
window_size)
|
||||
server_tr = np.max(server_windows_tr, axis=1)
|
||||
if model_type in ("inter", "staggered"):
|
||||
server_tr = np.expand_dims(server_windows_tr, 2)
|
||||
if shuffled:
|
||||
domain_tr, flow_tr, client_tr, server_tr = shuffle_training_data(domain_tr, flow_tr, client_tr, server_tr)
|
||||
return domain_tr, flow_tr, client_tr, server_tr
|
||||
|
||||
|
||||
def load_training_data(data, model_output, domain_length, window_size, model_type, shuffled=False):
|
||||
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length,
|
||||
window_size, model_type, shuffled)
|
||||
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
|
||||
if model_output == "both":
|
||||
labels = {"client": client_tr.value, "server": server_tr}
|
||||
loss_weights = {"client": 1.0, "server": 1.0}
|
||||
elif model_output == "client":
|
||||
labels = {"client": client_tr.value}
|
||||
loss_weights = {"client": 1.0}
|
||||
elif model_output == "server":
|
||||
labels = {"server": server_tr}
|
||||
loss_weights = {"server": 1.0}
|
||||
else:
|
||||
raise ValueError("unknown model output")
|
||||
return features, labels, loss_weights
|
||||
|
||||
|
||||
def get_weighting(class_weights, sample_weights, labels):
|
||||
return None, None
|
||||
client, server = labels["client"], labels["server"]
|
||||
if class_weights:
|
||||
logger.info("class weights: compute custom weights")
|
||||
custom_class_weights = get_custom_class_weights(client, server)
|
||||
logger.info(custom_class_weights)
|
||||
else:
|
||||
logger.info("class weights: set default")
|
||||
custom_class_weights = None
|
||||
|
||||
if sample_weights:
|
||||
logger.info("class weights: compute custom weights")
|
||||
custom_sample_weights = get_custom_sample_weights(client, server)
|
||||
logger.info(custom_sample_weights)
|
||||
else:
|
||||
logger.info("class weights: set default")
|
||||
custom_sample_weights = None
|
||||
|
||||
return custom_class_weights, custom_sample_weights
|
||||
|
||||
|
||||
def main_train(param=None):
|
||||
logger.info(f"Create model path {args.model_path}")
|
||||
exists_or_make_path(args.model_path)
|
||||
logger.info(f"Use command line arguments: {args}")
|
||||
|
||||
# data preparation
|
||||
features, labels, loss_weights = load_training_data(args.data, args.model_output, args.domain_length,
|
||||
args.window, args.model_type)
|
||||
|
||||
# call hyperband if results are not accessible
|
||||
if args.hyperband_results:
|
||||
try:
|
||||
hyper_results = joblib.load(args.hyperband_results)
|
||||
except Exception:
|
||||
logger.info("start hyperband parameter search")
|
||||
hyper_results = run_hyperband("small", features, labels, args.hyper_max_iter,
|
||||
args.hyperband_results)
|
||||
param = sorted(hyper_results, key=operator.itemgetter("loss"))[0]["params"]
|
||||
param["type"] = args.model_type
|
||||
logger.info(f"select params from result: {param}")
|
||||
if not param:
|
||||
param = PARAMS
|
||||
|
||||
# custom class or sample weights
|
||||
# TODO: should throw an error when using weights with only the client labels
|
||||
custom_class_weights, custom_sample_weights = get_weighting(args.class_weights, args.sample_weights, labels)
|
||||
|
||||
for i in range(args.runs):
|
||||
model_path = os.path.join(args.model_path, f"clf_{i}.h5")
|
||||
train_log_path = os.path.join(args.model_path, f"train_{i}.log.csv")
|
||||
# define training call backs
|
||||
logger.info("define callbacks")
|
||||
callbacks = []
|
||||
callbacks.append(ModelCheckpoint(filepath=model_path,
|
||||
monitor='loss',
|
||||
verbose=False,
|
||||
save_best_only=True))
|
||||
callbacks.append(CSVLogger(train_log_path))
|
||||
logger.info(f"Use early stopping: {args.stop_early}")
|
||||
if args.stop_early:
|
||||
callbacks.append(EarlyStopping(monitor='val_loss',
|
||||
patience=5,
|
||||
verbose=False))
|
||||
custom_metrics = models.get_metric_functions()
|
||||
|
||||
logger.info(f"Generator model with params: {param}")
|
||||
model = models.get_models_by_params(param)
|
||||
|
||||
logger.info(f"select model: {args.model_type}")
|
||||
if args.model_type == "staggered":
|
||||
logger.info("compile and pre-train server model")
|
||||
logger.info(model.get_config())
|
||||
|
||||
model.compile(optimizer='adam',
|
||||
loss='binary_crossentropy',
|
||||
loss_weights={"client": 0.0, "server": 1.0},
|
||||
metrics=['accuracy'] + custom_metrics)
|
||||
|
||||
model.summary()
|
||||
model.fit(features, labels,
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
class_weight=custom_class_weights,
|
||||
sample_weight=custom_sample_weights)
|
||||
|
||||
logger.info("fix server model")
|
||||
model.get_layer("domain_cnn").trainable = False
|
||||
model.get_layer("domain_cnn").layer.trainable = False
|
||||
model.get_layer("dense_server").trainable = False
|
||||
model.get_layer("server").trainable = False
|
||||
loss_weights = {"client": 1.0, "server": 0.0}
|
||||
|
||||
logger.info("compile and train model")
|
||||
logger.info(model.get_config())
|
||||
model.compile(optimizer='adam',
|
||||
loss='binary_crossentropy',
|
||||
loss_weights=loss_weights,
|
||||
metrics=['accuracy'] + custom_metrics)
|
||||
|
||||
model.summary()
|
||||
model.fit(features, labels,
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
callbacks=callbacks,
|
||||
class_weight=custom_class_weights,
|
||||
sample_weight=custom_sample_weights)
|
||||
|
||||
|
||||
def main_retrain():
|
||||
source = os.path.join(args.model_source, "clf.h5")
|
||||
destination = os.path.join(args.model_destination, "clf.h5")
|
||||
|
||||
logger.info(f"Use command line arguments: {args}")
|
||||
exists_or_make_path(args.model_destination)
|
||||
|
||||
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
|
||||
args.domain_length,
|
||||
args.window)
|
||||
logger.info("define callbacks")
|
||||
callbacks = []
|
||||
callbacks.append(ModelCheckpoint(filepath=destination,
|
||||
monitor='loss',
|
||||
verbose=False,
|
||||
save_best_only=True))
|
||||
callbacks.append(CSVLogger(args.train_log))
|
||||
logger.info(f"Use early stopping: {args.stop_early}")
|
||||
if args.stop_early:
|
||||
callbacks.append(EarlyStopping(monitor='val_loss',
|
||||
patience=5,
|
||||
verbose=False))
|
||||
|
||||
server_tr = np.max(server_windows_tr, axis=1)
|
||||
|
||||
if args.class_weights:
|
||||
logger.info("class weights: compute custom weights")
|
||||
custom_class_weights = get_custom_class_weights(client_tr.value, server_tr)
|
||||
logger.info(custom_class_weights)
|
||||
else:
|
||||
logger.info("class weights: set default")
|
||||
custom_class_weights = None
|
||||
|
||||
logger.info(f"Load pretrained model")
|
||||
embedding, model = load_model(source, custom_objects=models.get_custom_objects())
|
||||
|
||||
if args.model_type in ("inter", "staggered"):
|
||||
server_tr = np.expand_dims(server_windows_tr, 2)
|
||||
|
||||
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
|
||||
if args.model_output == "both":
|
||||
labels = {"client": client_tr.value, "server": server_tr}
|
||||
elif args.model_output == "client":
|
||||
labels = {"client": client_tr.value}
|
||||
elif args.model_output == "server":
|
||||
labels = {"server": server_tr}
|
||||
else:
|
||||
raise ValueError("unknown model output")
|
||||
|
||||
logger.info("re-train model")
|
||||
embedding.summary()
|
||||
model.summary()
|
||||
model.fit(features, labels,
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
callbacks=callbacks,
|
||||
class_weight=custom_class_weights,
|
||||
initial_epoch=args.initial_epoch)
|
||||
|
||||
|
||||
def main_test():
|
||||
logger.info("load test data")
|
||||
domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
|
||||
logger.info("load test domains")
|
||||
domain_encs, _, _ = dataset.load_or_generate_domains(args.data, args.domain_length)
|
||||
|
||||
def get_dir(path):
|
||||
return os.path.split(os.path.normpath(path))
|
||||
|
||||
results = {}
|
||||
for model_path in args.model_paths:
|
||||
file = get_dir(model_path)[1]
|
||||
results[file] = {}
|
||||
logger.info(f"process model {model_path}")
|
||||
embd_model, clf_model = load_model(model_path, custom_objects=models.get_custom_objects())
|
||||
|
||||
pred = clf_model.predict([domain_val, flow_val],
|
||||
batch_size=args.batch_size,
|
||||
verbose=1)
|
||||
|
||||
if args.model_output == "both":
|
||||
c_pred, s_pred = pred
|
||||
results[file]["client_pred"] = c_pred
|
||||
results[file]["server_pred"] = s_pred
|
||||
elif args.model_output == "client":
|
||||
results[file]["client_pred"] = pred
|
||||
else:
|
||||
results[file]["server_pred"] = pred
|
||||
|
||||
domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
|
||||
results["domain_embds"] = domain_embeddings
|
||||
# store results every round - safety first!
|
||||
dataset.save_predictions(get_dir(model_path)[0], results)
|
||||
|
||||
|
||||
def main_visualization():
    def plot_model(clf_model, path):
        embd, model = load_model(clf_model, custom_objects=models.get_custom_objects())
        visualize.plot_model_as(embd, os.path.join(path, "model_embd.pdf"), shapes=False)
        visualize.plot_model_as(model, os.path.join(path, "model_clf.pdf"), shapes=False)

    def vis(model_name, model_path, df, df_paul, aggregation, curve):
        visualize.plot_clf()
        if aggregation == "user":
            df = df.groupby(df.names).max()
            df_paul = df_paul.groupby(df_paul.names).max()
        if curve == "prc":
            visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
            visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
        elif curve == "roc":
            visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
            visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")

        visualize.plot_legend()
        visualize.plot_save("{}/{}_{}.pdf".format(model_path, aggregation, curve))

    _, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
                                                                                            args.domain_length,
                                                                                            args.window)

    results = dataset.load_predictions(args.model_path)
    df = pd.DataFrame(data={
        "names": name_val, "client_pred": results["client_pred"].flatten(),
        "hits_vt": hits_vt, "hits_trusted": hits_trusted
    })
    df["client_val"] = np.logical_or(df.hits_vt == 1.0, df.hits_trusted >= 3)
    df_user = df.groupby(df.names).max()

    paul = dataset.load_predictions("results/paul/")
    df_paul = pd.DataFrame(data={
        "names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
        "hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
    })
    df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)

    logger.info("plot model")
    plot_model(args.clf_model, args.model_path)

    # logger.info("plot training curve")
    # logs = pd.read_csv(args.train_log)
    # if "acc" in logs.keys():
    #     visualize.plot_training_curve(logs, "", "{}/client_train.png".format(args.model_path))
    # elif "client_acc" in logs.keys() and "server_acc" in logs.keys():
    #     visualize.plot_training_curve(logs, "client_", "{}/client_train.png".format(args.model_path))
    #     visualize.plot_training_curve(logs, "server_", "{}/server_train.png".format(args.model_path))
    # else:
    #     logger.warning("Error while plotting training curves")

    logger.info("plot window prc")
    vis(args.model_name, args.model_path, df, df_paul, "window", "prc")
    logger.info("plot window roc")
    vis(args.model_name, args.model_path, df, df_paul, "window", "roc")
    logger.info("plot user prc")
    vis(args.model_name, args.model_path, df, df_paul, "user", "prc")
    logger.info("plot user roc")
    vis(args.model_name, args.model_path, df, df_paul, "user", "roc")

    # absolute values
    visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
                                    "{}/client_cov.pdf".format(args.model_path),
                                    normalize=False, title="Client Confusion Matrix")
    visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
                                    "{}/user_cov.pdf".format(args.model_path),
                                    normalize=False, title="User Confusion Matrix")
    # normalized
    visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
                                    "{}/client_cov_norm.pdf".format(args.model_path),
                                    normalize=True, title="Client Confusion Matrix")
    visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
                                    "{}/user_cov_norm.pdf".format(args.model_path),
                                    normalize=True, title="User Confusion Matrix")

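The client ground truth used above marks a window as malicious if it has a VirusTotal hit or at least three trusted-list hits. A toy worked example (made-up values) of that rule:

import numpy as np
import pandas as pd

# Toy frame with the two hit counters used above (values invented for illustration).
df = pd.DataFrame({"hits_vt": [0.0, 1.0, 0.0], "hits_trusted": [4, 0, 1]})
df["client_val"] = np.logical_or(df.hits_vt == 1.0, df.hits_trusted >= 3)
# -> [True, True, False]
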
def main_visualize_all():
    _, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
                                                                                            args.domain_length,
                                                                                            args.window)

    def load_df(path):
        res = dataset.load_predictions(path)
        res = pd.DataFrame(data={
            "names": name_val, "client_pred": res["client_pred"].flatten(),
            "hits_vt": hits_vt, "hits_trusted": hits_trusted
        })
        res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
        return res

    dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]

    paul = dataset.load_predictions("results/paul/")
    df_paul = pd.DataFrame(data={
        "names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
        "hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
    })
    df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)

    def vis(output_prefix, dfs, df_paul, aggregation, curve):
        visualize.plot_clf()
        if curve == "prc":
            for model_name, df in dfs:
                if aggregation == "user":
                    df = df.groupby(df.names).max()
                visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
            if aggregation == "user":
                df_paul = df_paul.groupby(df_paul.names).max()
            visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
        elif curve == "roc":
            for model_name, df in dfs:
                if aggregation == "user":
                    df = df.groupby(df.names).max()
                visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
            if aggregation == "user":
                df_paul = df_paul.groupby(df_paul.names).max()
            visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
        visualize.plot_legend()
        visualize.plot_save("{}_{}_{}.pdf".format(output_prefix, aggregation, curve))

    logger.info("plot pr curves")
    vis(args.output_prefix, dfs, df_paul, "window", "prc")
    logger.info("plot roc curves")
    vis(args.output_prefix, dfs, df_paul, "window", "roc")

    logger.info("plot user pr curves")
    vis(args.output_prefix, dfs, df_paul, "user", "prc")
    logger.info("plot user roc curves")
    vis(args.output_prefix, dfs, df_paul, "user", "roc")

def main_visualize_all_embds():
    def load_df(path):
        res = dataset.load_predictions(path)
        return res["domain_embds"]

    dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]

    from sklearn.decomposition import TruncatedSVD

    def vis2(domain_embedding, labels):
        n_levels = 7
        logger.info(f"reduction for {len(domain_embedding)} points")
        red = TruncatedSVD(n_components=2, algorithm="arpack")
        domains = red.fit_transform(domain_embedding)
        logger.info("plot kde")
        benign = domains[labels.sum(axis=1) == 0]
        # print(domains.shape)
        # print(benign.shape)
        # sns.kdeplot(domains[labels.sum(axis=1) == 0, 0], domains[labels.sum(axis=1) == 0, 1],
        #             cmap="Blues", label="benign", n_levels=9, alpha=0.35, shade=True, shade_lowest=False)
        # sns.kdeplot(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
        #             cmap="Greens", label="server", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
        # sns.kdeplot(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
        #             cmap="Reds", label="client", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
        plt.scatter(benign[benign_idx, 0], benign[benign_idx, 1],
                    cmap="Blues", label="benign", alpha=0.35, s=10)
        plt.scatter(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
                    cmap="Greens", label="server", alpha=0.35, s=10)
        plt.scatter(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
                    cmap="Reds", label="client", alpha=0.35, s=10)

        return np.concatenate((domains[:1000], domains[1000:2000], domains[2000:3000]), axis=0)

    domain_encs, _, labels = dataset.load_or_generate_domains(args.data, args.domain_length)

    idx = np.arange(len(labels))
    client = labels[:, 0]
    server = labels[:, 1]
    benign = np.logical_not(np.logical_or(client, server))
    print(client.sum(), server.sum(), benign.sum())

    idx = np.concatenate((
        np.random.choice(idx[client], 1000),
        np.random.choice(idx[server], 1000),
        np.random.choice(idx[benign], 6000)), axis=0)
    benign_idx = np.random.choice(np.arange(6000), 1000)

    print(idx.shape)
    lls = labels[idx]

    for model_name, embd in dfs:
        logger.info(f"plot embedding for {model_name}")
        visualize.plot_clf()
        embd = embd[idx]
        points = vis2(embd, lls)
        # np.savetxt("{}_{}.csv".format(args.output_prefix, model_name), points, delimiter=",")
        visualize.plot_save("{}_{}.pdf".format(args.output_prefix, model_name))

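For reference, the 2-D scatter above comes from a rank-2 truncated SVD of the embedding matrix $X \in \mathbb{R}^{n \times d}$:

$$X \approx U_2 \Sigma_2 V_2^{\top}, \qquad \text{plotted coordinates} = X V_2 = U_2 \Sigma_2,$$

i.e. each domain embedding is projected onto the two leading right-singular directions before plotting.
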
def main_beta():
    domain_val, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
                                                                                                     args.domain_length,
                                                                                                     args.window)
    path, model_prefix = os.path.split(os.path.normpath(args.model_path))
    curves = {
        model_prefix: {"all": {}}
    }

    # domains = domain_val.value.reshape(-1, 40)
    # domains = np.apply_along_axis(lambda d: dataset.decode_domain(d), 1, domains)

    def load_df(res):
        df_server = None
        data = {
            "names": name_val, "client_pred": res["client_pred"].flatten(),
            "hits_vt": hits_vt, "hits_trusted": hits_trusted,
        }
        if "server_pred" in res:
            server = res["server_pred"] if len(res["server_pred"].shape) == 2 else res["server_pred"].max(axis=1)
            val = server_val.value.max(axis=1)
            data["server_pred"] = server.flatten()
            data["server_val"] = val.flatten()

            # if res["server_pred"].flatten().shape == server_val.value.flatten().shape:
            #     df_server = pd.DataFrame(data={
            #         "server_pred": res["server_pred"].flatten(),
            #         "domain": domains,
            #         "server_val": server_val.value.flatten()
            #     })

        res = pd.DataFrame(data=data)
        res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)

        return res, df_server

    logger.info(f"load results from {args.model_path}")
    res = dataset.load_predictions(args.model_path)
    model_keys = sorted(filter(lambda x: x.startswith("clf"), res.keys()), key=lambda x: int(x[4:-3]))

    client_preds = []
    server_preds = []
    server_flow_preds = []
    client_user_preds = []
    server_user_preds = []
    server_domain_preds = []
    server_domain_avg_preds = []
    for model_name in model_keys:
        logger.info(f"load model {model_name}")
        df, df_server = load_df(res[model_name])
        client_preds.append(df.client_pred.as_matrix())
        if "server_val" in df.columns:
            server_preds.append(df.server_pred.as_matrix())
        if df_server is not None:
            logger.info("  group servers")
            server_flow_preds.append(df_server.server_pred.as_matrix())
            df_domain = df_server.groupby(df_server.domain).max()
            server_domain_preds.append(df_domain.server_pred.as_matrix())
            df_domain_avg = df_server.groupby(df_server.domain).rolling(10).mean()
            server_domain_avg_preds.append(df_domain_avg.server_pred.as_matrix())

        curves[model_prefix][model_name] = confusion_matrix(df.client_val.as_matrix(),
                                                            df.client_pred.as_matrix().round())
        logger.info("  group users")
        df_user = df.groupby(df.names).max()
        client_user_preds.append(df_user.client_pred.as_matrix())
        if "server_val" in df.columns:
            server_user_preds.append(df_user.server_pred.as_matrix())

    logger.info("compute client curves")
    curves[model_prefix]["all"]["client_window_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), client_preds)
    curves[model_prefix]["all"]["client_window_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), client_preds)
    curves[model_prefix]["all"]["client_user_prc"] = visualize.calc_pr_mean(df_user.client_val.as_matrix(),
                                                                            client_user_preds)
    curves[model_prefix]["all"]["client_user_roc"] = visualize.calc_roc_mean(df_user.client_val.as_matrix(),
                                                                             client_user_preds)

    if "server_val" in df.columns:
        logger.info("compute server curves")
        curves[model_prefix]["all"]["server_window_prc"] = visualize.calc_pr_mean(df.server_val.as_matrix(),
                                                                                  server_preds)
        curves[model_prefix]["all"]["server_window_roc"] = visualize.calc_roc_mean(df.server_val.as_matrix(),
                                                                                   server_preds)
        curves[model_prefix]["all"]["server_user_prc"] = visualize.calc_pr_mean(df_user.server_val.as_matrix(),
                                                                                server_user_preds)
        curves[model_prefix]["all"]["server_user_roc"] = visualize.calc_roc_mean(df_user.server_val.as_matrix(),
                                                                                 server_user_preds)

    if df_server is not None:
        logger.info("compute server flow curves")
        curves[model_prefix]["all"]["server_flow_prc"] = visualize.calc_pr_mean(df_server.server_val.as_matrix(),
                                                                                server_flow_preds)
        curves[model_prefix]["all"]["server_flow_roc"] = visualize.calc_roc_mean(df_server.server_val.as_matrix(),
                                                                                 server_flow_preds)
        curves[model_prefix]["all"]["server_domain_prc"] = visualize.calc_pr_mean(df_domain.server_val.as_matrix(),
                                                                                  server_domain_preds)
        curves[model_prefix]["all"]["server_domain_roc"] = visualize.calc_roc_mean(df_domain.server_val.as_matrix(),
                                                                                   server_domain_preds)
        curves[model_prefix]["all"]["server_domain_avg_prc"] = visualize.calc_pr_mean(
            df_domain_avg.server_val.as_matrix(),
            server_domain_avg_preds)
        curves[model_prefix]["all"]["server_domain_avg_roc"] = visualize.calc_roc_mean(
            df_domain_avg.server_val.as_matrix(),
            server_domain_avg_preds)

    joblib.dump(curves, f"{args.model_path}_curves.joblib")
    try:
        curves_all: dict = joblib.load(f"{path}/curves.joblib")
        logger.info(f"loaded file {path}/curves.joblib successfully")
        curves_all[model_prefix] = curves[model_prefix]
    except Exception:
        curves_all = curves
    logger.info(f"currently {len(curves_all)} models in file: {curves_all.keys()}")
    joblib.dump(curves_all, f"{path}/curves.joblib")

import matplotlib

matplotlib.use("agg")
import matplotlib.pyplot as plt


def plot_overall_result():
    path, model_prefix = os.path.split(os.path.normpath(args.model_path))
    exists_or_make_path(f"{path}/figs/curves/")
    try:
        results = joblib.load(f"{path}/curves.joblib")
        logger.info("curves successfully loaded")
    except Exception:
        results = {}

    x = np.linspace(0, 1, 10000)
    for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
                "server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
                "server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
        logger.info(f"plot {vis}")
        visualize.plot_clf()
        for model_key in results.keys():
            if vis not in results[model_key]["all"]:
                continue
            if "final" in model_key and vis.startswith("server_flow"):
                continue
            ys_mean, ys_std, ys = results[model_key]["all"][vis]
            plt.plot(x, ys_mean, label=f"{model_key} - {np.mean(ys_mean):5.4} ({np.mean(ys_std):4.3})")
            plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2)
        if vis.endswith("prc"):
            plt.xlabel('Recall')
            plt.ylabel('Precision')
        else:
            plt.plot(x, x, label="random classifier", ls="--", c=".3", alpha=0.4)
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.xscale('log')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])
        visualize.plot_legend()
        visualize.plot_save(f"{path}/figs/curves/{vis}_all.pdf")
    # NOTE: the early return below skips the per-run plots that follow; they are kept for reference
    return

    for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
                "server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
                "server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
        logger.info(f"plot {vis}")
        visualize.plot_clf()
        for model_key in results.keys():
            if vis not in results[model_key]["all"]:
                continue
            if "final" in model_key and vis.startswith("server_flow"):
                continue
            _, _, ys = results[model_key]["all"][vis]
            for y in ys:
                plt.plot(x, y, label=f"{model_key} - {np.mean(y):5.4}")
        if vis.endswith("prc"):
            plt.xlabel('Recall')
            plt.ylabel('Precision')
        else:
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.xscale('log')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])
        visualize.plot_legend()
        visualize.plot_save(f"{path}/figs/Appendices/{model_key}_{vis}.pdf")

def main_stats():
    path, model_prefix = os.path.split(os.path.normpath(args.output_prefix))

    for time in ("current", "future"):
        df = dataset.get_user_flow_data(f"data/{time}Data.csv.gz")
        df["clientlabel"] = np.logical_or(df.virusTotalHits > 3, df.trustedHits > 0)
        # df_user = df.groupby(df.user_hash).max()
        # df_server = df.groupby(df.domain).max()

        # len(df)
        # df.clientlabel.sum()
        # df.serverLabel.sum()

        for col in ["duration", "bytes_down", "bytes_up"]:
            # visualize.plot_clf()
            plt.clf()
            plt.hist(df[col])
            visualize.plot_save(f"{path}/figs/hist_{time}_{col}.pdf")
            print(".")
            # visualize.plot_clf()
            plt.clf()
            plt.hist(np.log1p(df[col]))
            visualize.plot_save(f"{path}/figs/hist_{time}_norm_{col}.pdf")
            print("-")

def main_stats2():
    import joblib
    res = joblib.load("results/variance_test_hyper/curves.joblib")

    for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
                "server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
                "server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc",
                "server_domain_avg_prc", "server_domain_avg_roc"]:
        tab = []
        for m, r in res.items():
            # fixed: per-model curves are stored under the "all" key
            if vis not in r["all"]:
                continue
            tab.append(r["all"][vis][2].mean(axis=1))
        if not tab:
            continue

        df = pd.DataFrame(data=np.vstack(tab).T, columns=list(res.keys()),
                          index=range(1, 21))
        df.to_csv(f"{vis}.csv")

        print(f"% {vis}")
        print(df.round(4).to_latex())
        print()

def main():
    # parameter
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100

    char_dict = dataset.get_character_dict()
    user_flow_df = dataset.get_user_flow_data()

    print("create training dataset")
    (X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
        user_flow_df, char_dict,
        max_len=maxLen, window_size=windowSize)
    # make client labels discrete with 4 different values
    # TODO: use trusted_hits_tr for client classification too
    client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
    # select only 1.0 and 0.0 from training data
    pos_idx = np.where(client_labels == 1.0)[0]
    neg_idx = np.where(client_labels == 0.0)[0]
    idx = np.concatenate((pos_idx, neg_idx))
    # select labels for prediction
    client_labels = client_labels[idx]
    server_labels = server_tr[idx]

    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
                                       domainFeatures, kernel_size, domainFeatures, 0.5)

    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
                               cnnHiddenDims, cnnDropout)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    client_labels = np_utils.to_categorical(client_labels, 2)
    server_labels = np_utils.to_categorical(server_labels, 2)
    model.fit(X_tr,
              [client_labels, server_labels],
              batch_size=args.batch_size,
              epochs=args.epochs,
              shuffle=True)
    # TODO: for validation we use future data -> validation_data=(testData,testLabel))
    # NOTE: the block above appears to be the old training path still visible in this diff;
    # the mode dispatch below is what actually routes the run.
    if "train" == args.mode:
        main_train()
    if "retrain" == args.mode:
        main_retrain()
    if "hyperband" == args.mode:
        main_hyperband(args.data, args.domain_length, args.window, args.model_type, args.hyperband_results,
                       args.hyper_max_iter)
    if "test" == args.mode:
        main_test()
    if "beta" == args.mode:
        main_beta()
    if "all_beta" == args.mode:
        plot_overall_result()
    if "embedding" == args.mode:
        main_visualize_all_embds()


if __name__ == "__main__":
    main()

models.py

@@ -1,41 +0,0 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed


def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
                   hidden_dims, drop_out):
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(x, y)


def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                   filters, h1, h2, dropout, dense):
    pass


def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
    ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
    encoded = TimeDistributed(cnn)(ipt_domains)
    ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    # add second cnn
    y = Conv1D(filters,
               kernel_size,
               activation='relu',
               input_shape=(windowSize, domainFeatures + numFeatures))(merged)
    # TODO: why global pooling? -> 3D to 2D
    # we use max pooling:
    y = GlobalMaxPooling1D()(y)
    y = Dropout(cnnDropout)(y)
    y = Dense(cnnHiddenDims, activation='relu')(y)
    y1 = Dense(2, activation='softmax', name="client")(y)
    y2 = Dense(2, activation='softmax', name="server")(y)

    return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))

@@ -0,0 +1,117 @@
from collections import namedtuple

from keras.models import Model

from . import networks
from .metrics import *

NetworkParameters = namedtuple("NetworkParameters", [
    "type", "flow_features", "window_size", "domain_length", "output",
    "embedding_size",
    "domain_filter", "domain_kernel", "domain_dense", "domain_dropout",
    "main_filter", "main_kernel", "main_dense", "main_dropout",
])


def create_model(model, output_type):
    if output_type == "both":
        return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client, model.out_server))
    elif output_type == "client":
        return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client,))
    else:
        raise Exception("unknown model output")


def get_models_by_params(params: dict):
    K.clear_session()
    # decomposing param section
    # mainly embedding model
    embedding_type = params.get("embedding_type", "small")
    network_type = params.get("type")
    # network_depth = params.get("depth")
    embedding_size = params.get("embedding")
    filter_embedding = params.get("filter_embedding")
    kernel_embedding = params.get("kernel_embedding")
    hidden_embedding = params.get("dense_embedding")
    # dropout = params.get("dropout")
    # mainly prediction model
    flow_features = params.get("flow_features")
    window_size = params.get("window_size")
    domain_length = params.get("domain_length")
    filter_main = params.get("filter_main")
    kernel_main = params.get("kernel_main")
    dense_dim = params.get("dense_main")
    model_output = params.get("model_output", "both")

    if embedding_type == "small":
        domain_cnn = networks.get_domain_embedding_model(embedding_size, domain_length, filter_embedding,
                                                         kernel_embedding, hidden_embedding, 0.5)
    elif embedding_type == "deep":
        domain_cnn = networks.get_domain_embedding_model2(embedding_size, domain_length, filter_embedding,
                                                          kernel_embedding, hidden_embedding, 0.5)
    else:
        raise ValueError("embedding type not found")

    if network_type == "final":
        model = networks.get_final_model(0.25, flow_features, window_size, domain_length,
                                         filter_main, kernel_main, dense_dim, domain_cnn)
        model = create_model(model, model_output)
    elif network_type in ("inter", "staggered"):
        model = networks.get_inter_model(0.25, flow_features, window_size, domain_length,
                                         filter_main, kernel_main, dense_dim, domain_cnn)
        model = create_model(model, model_output)
    elif network_type == "long":
        model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
                                        filter_main, kernel_main, dense_dim, domain_cnn)
        model = create_model(model, model_output)
    elif network_type == "soft":
        model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
                                        filter_main, kernel_main, dense_dim, domain_cnn)
        model = create_model(model, model_output)
        conv_server = model.get_layer("conv_server").trainable_weights
        conv_client = model.get_layer("conv_client").trainable_weights
        l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
        model.add_loss(l1)

        dense_server = model.get_layer("dense_server").trainable_weights
        dense_client = model.get_layer("dense_client").trainable_weights
        l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
        model.add_loss(l2)
    elif network_type == "sluice":
        model = networks.get_sluice_model(0.25, flow_features, window_size, domain_length,
                                          filter_main, kernel_main, dense_dim, domain_cnn)
        model = create_model(model, model_output)
        conv_server = model.get_layer("conv_server").trainable_weights
        conv_client = model.get_layer("conv_client").trainable_weights
        l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
        model.add_loss(l1)

        dense_server = model.get_layer("dense_server").trainable_weights
        dense_client = model.get_layer("dense_client").trainable_weights
        l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
        model.add_loss(l2)
    else:
        raise ValueError("network type not found")

    return model

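The "soft" and "sluice" branches above implement soft parameter sharing: the paired conv and dense weights of the client and server towers are pulled toward each other by an extra L1 term registered via add_loss. Written out:

$$\mathcal{L}_{\text{share}} = \lambda \sum_{(W_s,\, W_c)} \lVert W_s - W_c \rVert_1, \qquad \lambda = 0.001,$$

where the sum runs over the weight pairs of conv_server/conv_client and dense_server/dense_client.
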
def get_server_model_by_params(params: dict):
    # decomposing param section
    # mainly embedding model
    network_depth = params.get("depth")
    embedding_size = params.get("embedding")
    input_length = params.get("input_length")
    filter_embedding = params.get("filter_embedding")
    kernel_embedding = params.get("kernel_embedding")
    hidden_embedding = params.get("dense_embedding")
    # mainly prediction model
    flow_features = params.get("flow_features")
    domain_length = params.get("domain_length")
    dense_dim = params.get("dense_main")

    embedding_model = networks.get_domain_embedding_model(embedding_size, input_length, filter_embedding,
                                                          kernel_embedding,
                                                          hidden_embedding, 0.5)

    return networks.get_server_model(flow_features, domain_length, dense_dim, embedding_model)

@@ -0,0 +1,64 @@
import keras.backend as K
from keras.activations import elu


def get_custom_objects():
    return dict([
        ("precision", precision),
        ("recall", recall),
        ("f1_score", f1_score),
        ("selu", selu)
    ])


def selu(x):
    """Scaled Exponential Linear Unit. (Klambauer et al., 2017)

    # Arguments
        x: A tensor or variable to compute the activation function for.

    # References
        - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)

    # copied from keras.io
    """
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * elu(x, alpha)

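In closed form, the activation computed above is

$$\operatorname{selu}(x) = \lambda \begin{cases} x & x > 0 \\ \alpha\,(e^{x} - 1) & x \le 0 \end{cases}, \qquad \alpha \approx 1.67326,\ \lambda \approx 1.05070,$$

with the fixed-point constants from Klambauer et al. (2017) that make activations self-normalizing.
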
def get_metric_functions():
    return [precision, recall, f1_score]


def precision(y_true, y_pred):
    # Count positive samples.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())


def recall(y_true, y_pred):
    # Count positive samples.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())


def f1_score(y_true, y_pred):
    return f_score(1)(y_true, y_pred)


def f05_score(y_true, y_pred):
    return f_score(0.5)(y_true, y_pred)


def f_score(beta):
    def _f(y_true, y_pred):
        p = precision(y_true, y_pred)
        r = recall(y_true, y_pred)

        bb = beta ** 2

        fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())

        return fbeta_score

    return _f

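In formula form, with precision $P$ and recall $R$:

$$F_\beta = (1 + \beta^2)\,\frac{P\,R}{\beta^2 P + R},$$

so f1_score ($\beta = 1$) weighs both equally, while f05_score ($\beta = 0.5$) favors precision.
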
@@ -0,0 +1,249 @@
from collections import namedtuple

import keras
import keras.backend as K
import numpy as np
from keras.engine import Input, Model as KerasModel
from keras.engine.topology import Layer
from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
from keras.regularizers import Regularizer

import dataset

Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])


def get_domain_embedding_model(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
                               drop_out=0.5) -> KerasModel:
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
    y = Conv1D(filter_size,
               kernel_size,
               activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dropout(drop_out)(y)
    y = Dense(hidden_dims, activation="relu")(y)
    return KerasModel(x, y)


def get_domain_embedding_model2(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
                                drop_out=0.5) -> KerasModel:
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
    y = Conv1D(filter_size,
               kernel_size,
               activation='relu')(y)
    y = Conv1D(filter_size,
               kernel_size,
               activation='relu')(y)
    y = Conv1D(filter_size,
               kernel_size,
               activation='relu')(y)
    y = GlobalAveragePooling1D()(y)
    y = Dense(hidden_dims, activation="relu")(y)
    return KerasModel(x, y)

def get_final_model(cnnDropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
                    dense_dim, cnn) -> Model:
    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    # CNN processing small slices of the flow window
    y = Conv1D(cnn_dims,
               kernel_size,
               activation='relu')(merged)
    # remove temporal dimension by global max pooling
    y = GlobalMaxPooling1D()(y)
    y = Dropout(cnnDropout)(y)
    y = Dense(dense_dim, activation='relu')(y)
    out_client = Dense(1, activation='sigmoid', name="client")(y)
    out_server = Dense(1, activation='sigmoid', name="server")(y)

    return Model(ipt_domains, ipt_flows, out_client, out_server)

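A minimal sketch of how these pieces fit together, assuming the package layout of this commit (models/__init__.py exporting create_model and the networks module) and hyper-parameter values taken from the training scripts further below; it also assumes the repo's dataset module is importable, since the embedding model reads the vocabulary size from it:

# Sketch only: wire the domain CNN into the window model and build a Keras
# classifier with both task outputs.
from models import create_model, networks

domain_cnn = networks.get_domain_embedding_model(embedding_size=128, input_length=40,
                                                 filter_size=256, kernel_size=8,
                                                 hidden_dims=128, drop_out=0.5)
net = networks.get_final_model(0.25, 3, 10, 40,          # dropout, flow features, window size, domain length
                               32, 8, 1024, domain_cnn)  # filter_main, kernel_main, dense_main
clf = create_model(net, "both")  # Keras model with "client" and "server" outputs
clf.compile(optimizer="adam", loss="binary_crossentropy")
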
def get_inter_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
                    dense_dim, cnn) -> Model:
    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    y = Dense(dense_dim,
              activation="relu",
              name="dense_server")(merged)
    out_server = Dense(1, activation="sigmoid", name="server")(y)
    merged = keras.layers.concatenate([merged, y], -1)
    # CNN processing small slices of the flow window
    y = Conv1D(cnn_dims,
               kernel_size,
               activation='relu')(merged)
    # remove temporal dimension by global max pooling
    y = GlobalMaxPooling1D()(y)
    y = Dropout(dropout)(y)
    y = Dense(dense_dim,
              activation='relu',
              name="dense_client")(y)

    out_client = Dense(1, activation='sigmoid', name="client")(y)

    return Model(ipt_domains, ipt_flows, out_client, out_server)


def get_server_model(flow_features, domain_length, dense_dim, cnn):
    ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
    ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
    encoded = cnn(ipt_domains)
    cnn.name = "domain_cnn"

    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    y = Dense(dense_dim,
              activation="relu",
              name="dense_server")(merged)
    out_server = Dense(1, activation="sigmoid", name="server")(y)

    return KerasModel(inputs=[ipt_domains, ipt_flows], outputs=out_server)


def get_long_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
                   dense_dim, cnn) -> Model:
    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    y = Conv1D(cnn_dims,
               kernel_size,
               activation='relu', name="conv_server")(merged)
    # remove temporal dimension by global max pooling
    y = GlobalMaxPooling1D()(y)
    y = Dropout(dropout)(y)
    y = Dense(dense_dim,
              activation="relu",
              name="dense_server")(y)
    out_server = Dense(1, activation="sigmoid", name="server")(y)
    # CNN processing small slices of the flow window
    y = Conv1D(cnn_dims,
               kernel_size,
               activation='relu', name="conv_client")(merged)
    # remove temporal dimension by global max pooling
    y = GlobalMaxPooling1D()(y)
    y = Dropout(dropout)(y)
    y = Dense(dense_dim,
              activation='relu',
              name="dense_client")(y)

    out_client = Dense(1, activation='sigmoid', name="client")(y)

    return Model(ipt_domains, ipt_flows, out_client, out_server)


class CrossStitch2(Layer):
    def __init__(self, **kwargs):
        super(CrossStitch2, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.s = self.add_weight(name='cross-stitch-s',
                                 shape=(1,),
                                 initializer='uniform',
                                 trainable=True)
        self.d = self.add_weight(name='cross-stitch-d',
                                 shape=(1,),
                                 initializer='uniform',
                                 trainable=True)
        super(CrossStitch2, self).build(input_shape)

    def call(self, xs):
        x1, x2 = xs
        out = x1 * self.s + x2 * self.d
        return out

    def compute_output_shape(self, input_shape):
        return input_shape[0]

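CrossStitch2 is a minimal two-input take on the cross-stitch units of Misra et al. (2016): each unit learns two scalars $s$ and $d$ and outputs the blend

$$\tilde{x} = s\,x_1 + d\,x_2,$$

letting the network learn how much the client and server towers share at that point. CrossStitchMix2 below instead keeps both scaled streams by concatenating them.
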
class CrossStitchMix2(Layer):
    def __init__(self, **kwargs):
        super(CrossStitchMix2, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.s = self.add_weight(name='cross-stitch-s',
                                 shape=(1,),
                                 initializer='uniform',
                                 trainable=True)
        self.d = self.add_weight(name='cross-stitch-d',
                                 shape=(1,),
                                 initializer='uniform',
                                 trainable=True)
        super(CrossStitchMix2, self).build(input_shape)

    def call(self, xs):
        x1, x2 = xs
        out = K.concatenate((x1 * self.s, x2 * self.d), axis=-1)
        return out

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1] + input_shape[1][1])

class L21(Regularizer):
    """Regularizer for L21 regularization.

    Found at: https://bitbucket.org/ispamm/group-lasso-for-neural-networks-tensorflow-keras

    # Arguments
        C: Float; L21 regularization factor.
    """

    def __init__(self, C=0.):
        self.C = K.cast_to_floatx(C)

    def __call__(self, x):
        const_coeff = np.sqrt(K.int_shape(x)[1])
        return self.C * const_coeff * K.sum(K.sqrt(K.sum(K.square(x), axis=1)))

    def get_config(self):
        return {'C': float(self.C)}

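Written out, the penalty implemented by L21 is the group lasso over the rows of a weight matrix $W \in \mathbb{R}^{d \times k}$:

$$\Omega(W) = C\,\sqrt{k}\,\sum_{i=1}^{d} \Big(\sum_{j=1}^{k} w_{ij}^{2}\Big)^{1/2},$$

which drives entire rows (whole input units) to zero rather than individual weights.
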
def get_sluice_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
                     dense_dim, cnn) -> Model:
    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    y1 = Conv1D(cnn_dims,
                kernel_size,
                activation='relu', name="conv_server")(merged)
    y1 = GlobalMaxPooling1D()(y1)

    y2 = Conv1D(cnn_dims,
                kernel_size,
                activation='relu', name="conv_client")(merged)
    y2 = GlobalMaxPooling1D()(y2)

    c11 = CrossStitch2()([y1, y2])
    c12 = CrossStitch2()([y1, y2])

    y1 = Dropout(dropout)(c11)
    y1 = Dense(dense_dim,
               activation="relu",
               name="dense_server")(y1)

    y2 = Dropout(dropout)(c12)
    y2 = Dense(dense_dim,
               activation='relu',
               name="dense_client")(y2)

    c21 = CrossStitch2()([y1, y2])
    c22 = CrossStitch2()([y1, y2])

    beta1 = CrossStitchMix2()([c11, c21])
    beta2 = CrossStitchMix2()([c12, c22])

    out_server = Dense(1, activation="sigmoid", name="server")(beta1)
    out_client = Dense(1, activation='sigmoid', name="client")(beta2)

    return Model(ipt_domains, ipt_flows, out_client, out_server)

@@ -0,0 +1,16 @@
#!/usr/bin/env bash

SRC=$1
DEST=$2
DATADIR=$3
INIT=$4
EPOCHS=$5
BS=128

for i in `ls -d $SRC*/`
do
    echo "retrain model in ${i}"
    name=$(basename $i)
    python3 main.py --mode retrain --model_src ${i} --model_dest ${DEST}/${name} --init_epoch $INIT --epochs $EPOCHS --batch $BS --train ${DATADIR}
done

@@ -0,0 +1,53 @@
#!/usr/bin/env bash


RESDIR=$1
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$2

EPOCHS=10

for output in client both
do
    for depth in small
    do
        for mtype in inter final
        do
            python main.py --mode train \
                --train ${DATADIR}/currentData.csv \
                --model ${RESDIR}/${output}_${depth}_${mtype} \
                --epochs $EPOCHS \
                --embd 128 \
                --filter_embd 256 --kernel_embd 8 --dense_embd 128 \
                --domain_embd 32 \
                --filter_main 32 --kernel_main 8 --dense_main 1024 \
                --batch 256 \
                --balanced_weights \
                --model_output ${output} \
                --type ${mtype} \
                --depth ${depth}
        done
    done
done

for depth in small
do
    python main.py --mode train \
        --train ${DATADIR}/currentData.csv \
        --model ${RESDIR}/both_${depth}_staggered \
        --epochs $EPOCHS \
        --embd 128 \
        --filter_embd 256 --kernel_embd 8 --dense_embd 128 \
        --domain_embd 32 \
        --filter_main 32 --kernel_main 8 --dense_main 1024 \
        --batch 256 \
        --balanced_weights \
        --model_output both \
        --type staggered \
        --depth ${depth}
done

# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output ${output} --type ${mtype} --depth ${depth} --train ${DATADIR}/currentData.csv --model ${RESDIR}/${output}_${depth}_${mtype}
# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output client --type final --depth small --train /tmp/rk/data/currentData.csv --model /tmp/rk/results/paul3/client_final

@@ -0,0 +1,30 @@
#!/usr/bin/env bash


N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7

EPOCHS=10

for ((i = ${N1}; i <= ${N2}; i++))
do
    python main.py --mode train \
        --data ${DATADIR} \
        --model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
        --epochs ${EPOCHS} \
        --embd 128 \
        --filter_embd 256 --kernel_embd 8 --dense_embd 128 \
        --domain_embd 32 \
        --filter_main 32 --kernel_main 8 --dense_main 1024 \
        --batch 128 \
        --model_output ${OUTPUT} \
        --type ${TYPE} \
        --depth ${DEPTH} \
        --gpu
done

@@ -0,0 +1,30 @@
#!/usr/bin/env bash


N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7

EPOCHS=10

for ((i = ${N1}; i <= ${N2}; i++))
do
    python main.py --mode train \
        --train ${DATADIR} \
        --model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
        --epochs ${EPOCHS} \
        --embd 64 \
        --filter_embd 128 --kernel_embd 5 --dense_embd 64 \
        --domain_embd 16 \
        --filter_main 32 --kernel_main 5 --dense_main 256 \
        --batch 128 \
        --model_output ${OUTPUT} \
        --type ${TYPE} \
        --depth ${DEPTH} \
        --gpu
done

@@ -1,10 +1,26 @@
#!/usr/bin/python2

import sys

import joblib
import numpy as np
import pandas as pd

df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
df = df["data"]
df = pd.concat(df)
fn = sys.argv[1]

df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/{}.joblib".format(fn))
df = pd.concat(df["data"])
df.reset_index(inplace=True)
df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
df.dropna(axis=0, how="any", inplace=True)

df.serverLabel = pd.to_numeric(df.serverLabel, errors='coerce')
df.duration = pd.to_numeric(df.duration, errors='coerce')
df.bytes_down = pd.to_numeric(df.bytes_down, errors='coerce')
df.bytes_up = pd.to_numeric(df.bytes_up, errors='coerce')

df.http_method = df.http_method.astype("category")
df.serverLabel = df.serverLabel.astype(np.bool)
df.virusTotalHits = df.virusTotalHits.astype(np.int8)
df.trustedHits = df.trustedHits.astype(np.int8)

df.to_csv("/tmp/rk/data/{}.csv".format(fn), encoding="utf-8")

@@ -0,0 +1,116 @@
import logging

from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

import arguments
import dataset
import models
# create logger
import visualize
from arguments import get_model_args
from utils import exists_or_make_path, load_model

logger = logging.getLogger('cisco_logger')

args = arguments.parse()


def train_server_only(params):
    logger.info(f"Create model path {args.model_path}")
    exists_or_make_path(args.model_path)
    logger.info(f"Use command line arguments: {args}")

    domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
                                                                                                args.domain_length,
                                                                                                args.window)
    domain_tr = domain_tr.value.reshape(-1, 40)
    flow_tr = flow_tr.value.reshape(-1, 3)
    server_tr = server_windows_tr.value.reshape(-1)

    logger.info("define callbacks")
    callbacks = []
    callbacks.append(ModelCheckpoint(filepath=args.clf_model,
                                     monitor='loss',
                                     verbose=False,
                                     save_best_only=True))
    callbacks.append(CSVLogger(args.train_log))
    logger.info(f"Use early stopping: {args.stop_early}")
    if args.stop_early:
        callbacks.append(EarlyStopping(monitor='val_loss',
                                       patience=5,
                                       verbose=False))
    custom_metrics = models.get_metric_functions()

    model = models.get_server_model_by_params(params=params)

    features = {"ipt_domains": domain_tr, "ipt_flows": flow_tr}
    if args.model_output == "both":
        labels = {"client": client_tr, "server": server_tr}
    elif args.model_output == "client":
        labels = {"client": client_tr}
    elif args.model_output == "server":
        labels = {"server": server_tr}
    else:
        raise ValueError("unknown model output")

    logger.info("compile and train model")
    logger.info(model.get_config())
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'] + custom_metrics)

    model.summary()
    model.fit(features, labels,
              batch_size=args.batch_size,
              epochs=args.epochs,
              callbacks=callbacks)


def test_server_only():
    logger.info("start test: load data")
    domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
    domain_val = domain_val.value.reshape(-1, 40)
    flow_val = flow_val.value.reshape(-1, 3)
    domain_encs, _ = dataset.load_or_generate_domains(args.data, args.domain_length)

    for model_args in get_model_args(args):
        results = {}
        logger.info(f"process model {model_args['model_path']}")
        embd_model, clf_model = load_model(model_args["clf_model"], custom_objects=models.get_custom_objects())

        pred = clf_model.predict([domain_val, flow_val],
                                 batch_size=args.batch_size,
                                 verbose=1)

        results["server_pred"] = pred

        domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
        results["domain_embds"] = domain_embeddings

        dataset.save_predictions(model_args["model_path"], results)


def vis_server():
    def load_model(m, c):
        from keras.models import load_model
        clf = load_model(m, custom_objects=c)
        emdb = clf.layers[1]
        return emdb, clf

    domain_raw, flow_raw, name_raw, hits_vt_raw, hits_trusted_raw, server_raw = dataset.load_or_generate_raw_h5data(
        args.data, args.domain_length, args.window)

    results = dataset.load_predictions(args.clf_model)

    visualize.plot_clf()
    visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server")
    visualize.plot_legend()
    visualize.plot_save("results/server_model/windows_prc.pdf")
    visualize.plot_clf()
    visualize.plot_roc_curve(server_raw.flatten(), results["server_pred"].flatten(), "server")
    visualize.plot_legend()
    visualize.plot_save("results/server_model/windows_roc.pdf")

@@ -0,0 +1,12 @@
#!/usr/bin/env bash

RESDIR=$1
DATADIR=$2

for output in client both
do
    python3 main.py --mode test --batch 1024 \
        --models ${RESDIR}/${output}_*/ \
        --test ${DATADIR}/futureData.csv \
        --model_output ${output}
done

@@ -0,0 +1,46 @@
import os
from operator import itemgetter

import joblib
import numpy as np
from keras.models import load_model as load_keras_model
from sklearn.utils import class_weight


def exists_or_make_path(p):
    if not os.path.exists(p):
        os.makedirs(p)


def get_custom_class_weights(client, server):
    return {
        "client": class_weight.compute_class_weight('balanced', np.unique(client), client),
        "server": class_weight.compute_class_weight('balanced', np.unique(server), server)
    }


def get_custom_sample_weights(client, server):
    return {
        "client": class_weight.compute_sample_weight("balanced", client),
        "server": class_weight.compute_sample_weight("balanced", server)
    }

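A worked example of the balanced weighting (the toy labels are made up; the arithmetic follows sklearn's "balanced" rule):

import numpy as np
# Toy labels: 3 benign / 1 malicious client windows, balanced server windows.
client = np.array([0, 0, 0, 1])
server = np.array([0, 1, 0, 1])
w = get_custom_class_weights(client, server)
# "balanced" weight per class = n_samples / (n_classes * class_count):
# client -> [4/(2*3), 4/(2*1)] = [0.667, 2.0]; server -> [1.0, 1.0]
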
def load_ordered_hyperband_results(path):
    results = joblib.load(path)
    # fixed: sorted() takes the ordering function via the key= argument
    return sorted(results, key=itemgetter("loss"))


def load_model(path, custom_objects=None):
    clf = load_keras_model(path, custom_objects)
    try:
        embd = clf.get_layer("domain_cnn").layer
    except Exception:
        # in some versions the TimeDistributed wrapper was not named "domain_cnn";
        # these fallbacks keep older checkpoints loadable
        try:
            embd = clf.layers[1].layer
        except Exception:
            embd = clf.get_layer("domain_cnn")

    return embd, clf

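Typical use, assuming a checkpoint produced by the training scripts (the path here is hypothetical) and the custom objects the checkpoint was saved with:

# Hypothetical checkpoint path; get_custom_objects supplies the selu/metric
# functions referenced inside the saved model.
from models import get_custom_objects
embd, clf = load_model("results/test/both_final/clf.h5",
                       custom_objects=get_custom_objects())
embd.summary()  # domain embedding sub-model
clf.summary()   # full window classifier
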
@@ -0,0 +1,247 @@
import os

import matplotlib

matplotlib.use("agg")
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
    roc_auc_score, roc_curve
)


def scores(y_true):
    for (path, dirnames, fnames) in os.walk("results/"):
        for f in fnames:
            if path[-1] == "1" and f.endswith("npy"):
                y_pred = np.load(os.path.join(path, f)).flatten()
                print(path)
                tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))
                tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))
                fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))
                fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))
                precision = tp / (tp + fp)
                recall = tp / (tp + fn)
                accuracy = (tp + tn) / len(y_true)
                f1_score = 2 * (precision * recall) / (precision + recall)
                f05_score = (1 + 0.5 ** 2) * (precision * recall) / (0.5 ** 2 * precision + recall)
                print(" precision:", precision)
                print(" recall:", recall)
                print(" accuracy:", accuracy)
                print(" f1 score:", f1_score)
                print(" f0.5 score:", f05_score)


def plot_clf():
    plt.clf()
    sns.set_context("paper")
    sns.set_style("white")


def plot_save(path, dpi=600, set_size=True):
    # plt.title(path)
    fig = plt.gcf()
    # fig.suptitle(path)
    if set_size:
        fig.set_size_inches(8, 4.5)
    fig.savefig(path, dpi=dpi, bbox_inches='tight')
    plt.close()


def plot_legend():
    plt.legend()


def mathews_correlation_curve(y, y_pred):
    pass


def plot_precision_recall(y, y_pred, label=""):
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, thresholds = precision_recall_curve(y, y_pred)
    # decreasing_max_precision = np.maximum.accumulate(precision)[::-1]

    # fig, ax = plt.subplots(1, 1)
    # ax.hold(True)
    score = fbeta_score(y, y_pred.round(), 1)
    # prc_ap = average_precision_score(y, y_pred)
    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    # ax.step(recall[::-1], decreasing_max_precision, '-r')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    return calc_metrics_mean(y, y_preds, "prc")


def plot_mean_curve(x, ys, std, score, label):
    plt.plot(x, ys, label=f"{label} - {score:5.4}")
    plt.fill_between(x, ys - std, ys + std, alpha=0.1)
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def plot_pr_mean(y, y_preds, label=""):
    x = np.linspace(0, 1, 10000)
    # fixed: calc_pr_mean returns the stacked per-run curves as its third value;
    # summarize the mean curve for the legend label
    ys_mean, ys_std, ys = calc_pr_mean(y, y_preds)
    plot_mean_curve(x, ys_mean, ys_std, np.mean(ys_mean), label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')


def score_model(y, prediction):
    y = y.flatten()
    y_pred = prediction.flatten()

    precision, recall, thresholds = precision_recall_curve(y, y_pred)

    print(classification_report(y, y_pred.round()))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    print("F1 Score", fbeta_score(y, y_pred.round(), 1))
    print("F0.5 Score", fbeta_score(y, y_pred.round(), 0.5))


def plot_roc_curve(mask, prediction, label=""):
    y = mask.flatten()
    y_pred = prediction.flatten()
    fpr, tpr, thresholds = roc_curve(y, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.xscale('log')
    plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_metrics_mean(y, y_preds, metric):
    appr = []
    y = y.flatten()
    for idx, y_pred in enumerate(y_preds):
        y_pred = y_pred.flatten()
        if metric == "prc":
            precision, recall, thresholds = precision_recall_curve(y, y_pred)
            appr.append(interpolate.interp1d(recall, precision))
        elif metric == "roc":
            fpr, tpr, thresholds = roc_curve(y, y_pred)
            appr.append(interpolate.interp1d(fpr, tpr))
    x = np.linspace(0, 1, 10000)
    ys = np.vstack([f(x) for f in appr])
    ys_mean = ys.mean(axis=0)
    ys_std = ys.std(axis=0)
    return ys_mean, ys_std, ys

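calc_metrics_mean averages curves from repeated runs: each run's PR or ROC curve is interpolated onto a common grid of 10,000 points on $[0, 1]$, then mean and standard deviation are taken pointwise,

$$\bar{y}(x) = \frac{1}{n}\sum_{i=1}^{n} f_i(x), \qquad \sigma(x) = \Big(\frac{1}{n}\sum_{i=1}^{n} \big(f_i(x) - \bar{y}(x)\big)^2\Big)^{1/2},$$

where $f_i$ is the interpolant of run $i$. This is what the shaded bands in plot_overall_result visualize.
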
def calc_roc_mean(y, y_preds):
    return calc_metrics_mean(y, y_preds, "roc")


def plot_roc_mean(y, y_preds, label=""):
    x = np.linspace(0, 1, 10000)
    # fixed: summarize the mean curve for the legend label (third return value
    # holds the individual per-run curves)
    ys_mean, ys_std, ys = calc_roc_mean(y, y_preds)
    plt.xscale('log')
    plot_mean_curve(x, ys_mean, ys_std, np.mean(ys_mean), label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def plot_confusion_matrix(y_true, y_pred, path,
                          normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix',
                          cmap="Blues", dpi=600):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.clf()
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in ((i, j) for i in range(cm.shape[0]) for j in range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    plt.clf()
    plt.plot(logs[f"{key}acc"], label="accuracy")
    plt.plot(logs[f"{key}f1_score"], label="f1_score")

    plt.plot(logs[f"val_{key}acc"], label="val_accuracy")
    # plt.plot(logs[f"val_{key}f1_score"], label="val_f1_score")

    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_error_bars(results):
    rates = []
    for m, r in results.items():
        if m == "all":
            continue
        rates.append((r / r.sum(axis=0, keepdims=True)).flatten())
    rates = pd.DataFrame(np.vstack(rates), columns=("TN", "FP", "FN", "TP"))

    ax = rates.mean().plot.bar(yerr=rates.std())
    for p in ax.patches:
        ax.annotate(str(np.round(p.get_height(), 4)), (p.get_x(), 0.5))


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    domain_reduced = red.fit_transform(domain_embedding)
    if method == "svd":
        # fixed: explained variance is only available for the SVD reduction
        print(red.explained_variance_ratio_)
    # use if draw subset of predictions
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma,
                s=3,
                alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path, shapes=True, layer_names=True):
    from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=path, show_shapes=shapes, show_layer_names=layer_names)