Compare commits

...

110 Commits
master ... dev

Author SHA1 Message Date
René Knaebel b3d646c9e7 add group lasso regularizer impl 2017-11-30 09:34:45 +01:00
René Knaebel f382d06eb5 add initial version of sluice network with alphas, betas, and soft share 2017-11-27 16:17:19 +01:00
René Knaebel 349bc92a61 reorder curve storing 2017-11-13 14:37:41 +01:00
René Knaebel d58dbcb101 refactor class weights 2017-11-10 14:31:32 +01:00
René Knaebel 461d4cab8f add deeper domain cnn; refactor hyperband using load_data function 2017-11-10 12:52:18 +01:00
René Knaebel 3ce385eca6 update plotting function according to the test and beta results 2017-11-10 12:34:21 +01:00
René Knaebel c19d649bc4 add some print lines for better following the process structure 2017-11-10 11:38:57 +01:00
René Knaebel 27f4d086eb add matplotlib agg mode; update beta vis function according to test results 2017-11-10 11:04:27 +01:00
René Knaebel 4fc2f0c925 extract weighting function 2017-11-10 10:18:13 +01:00
René Knaebel 9ce11e4db4 refactor test function according to the new training procedure 2017-11-09 13:12:50 +01:00
René Knaebel 9b8ca8abab add parameter for hyperband iteration, use hyperband results in new runs 2017-11-08 11:09:56 +01:00
René Knaebel 903e81c931 remove identical parameter from data loading function; add runs argument 2017-11-07 20:47:41 +01:00
René Knaebel 826357a41f refactor network models; remove depths 2017-11-07 20:32:08 +01:00
René Knaebel e12bbda8c5 move model creation back into models package 2017-11-07 20:09:20 +01:00
René Knaebel b1f48c1895 add soft parameter sharing network 2017-11-06 21:51:49 +01:00
René Knaebel 7b8dfcebbe add long final implementation 2017-11-05 22:52:50 +01:00
René Knaebel 14fef66a55 train multiple models at once 2017-11-04 17:58:21 +01:00
René Knaebel 88e3eda595 refactor hyperband; fix domain generation
integrate hyperband option in training procedure - start refactoring - remove the index error in generation and add helper functions
2017-11-04 12:47:08 +01:00
René Knaebel 8b17bd0701 add TSNE embedding; server evaluation visualization 2017-10-19 17:39:37 +02:00
René Knaebel a860f0da34 refactor server training into separate file; add additional info to hyperband log 2017-10-19 17:37:29 +02:00
René Knaebel d1da3d6ca3 fix model selection 2017-10-09 15:10:15 +02:00
René Knaebel a686f147f0 add bulk embedding visualization and deep1 network 2017-10-09 14:19:01 +02:00
René Knaebel 33063f3081 add sample weight metrics to fit function 2017-10-08 22:09:09 +02:00
René Knaebel e8473048cb refactor visualization; start plotting server results 2017-10-08 11:52:10 +02:00
René Knaebel 0b26c6125c fix flat 2 model bug caused by changing input layer 2017-10-06 16:36:58 +02:00
René Knaebel 5741f8ee0e fix staggered training 2017-10-06 10:38:00 +02:00
René Knaebel 508667d1d0 add server classification model 2017-10-05 15:26:53 +02:00
René Knaebel 345afbaef5 add error bar vis, comment unused parameters from parser 2017-10-05 14:50:59 +02:00
René Knaebel b24fa770f9 change hyperband to count minimal val_loss over all losses 2017-10-05 12:55:46 +02:00
René Knaebel 371a1dad05 add hyperband savefile config, minor change of parameter name 2017-10-03 18:58:54 +02:00
René Knaebel 68254d6629 add load function for hyperband results 2017-10-02 07:34:04 +02:00
René Knaebel f02e0b7f99 fix hyperband wrong variable names 2017-09-29 23:34:39 +02:00
René Knaebel a1e553f8f1 fix rerun wrong path 2017-09-29 23:03:07 +02:00
René Knaebel 605447440f refactor hyperband implementation 2017-09-29 22:59:57 +02:00
René Knaebel 090c89a127 add retrain mode 2017-09-28 12:23:22 +02:00
René Knaebel b157ca6a19 add first version of model averaging visualization 2017-09-26 19:25:37 +02:00
René Knaebel 49ad506a96 remove file name in run script 2017-09-22 10:03:01 +02:00
René Knaebel 607d74998c add tsne (does not work with big data)
fix model loading with custom selu function
2017-09-22 10:01:12 +02:00
René Knaebel e2bf2dc90f fix missing parameters, add flat network structure, make larger graphics 2017-09-20 14:43:28 +02:00
René Knaebel fbe6d6a584 remove input shape of first conv layer in networks because it is unnecessary
add selu activation to deeper network designs
2017-09-17 17:26:09 +02:00
René Knaebel 6a47b5f245 add sample weights function to utils for later usage 2017-09-17 10:23:23 +02:00
René Knaebel f2845e635e fix validation split removal by loading h5data into memory 2017-09-17 09:56:18 +02:00
René Knaebel ec5a1101be remove model selection based on validation loss 2017-09-16 15:25:34 +02:00
René Knaebel b0e0cd904e add start parameter for run script 2017-09-15 11:12:42 +02:00
René Knaebel 7f49021a63 refactor training - separate staggered training; make differences as small as possible 2017-09-12 08:36:23 +02:00
René Knaebel 6ce8fb464f remove regularizer on dense too 2017-09-11 16:48:49 +02:00
René Knaebel 3a44efa775 minor changes for pauls test 2017-09-11 12:59:13 +02:00
René Knaebel 6d8d7b19f3 fix covariance normalization; add run_model script for training multiple times 2017-09-11 12:42:44 +02:00
René Knaebel 6121eac448 remove regularizer for conv and domain 2017-09-10 23:40:14 +02:00
René Knaebel 1cf62423e1 add regularization to small networks, fix model name in args, fix visualizations 2017-09-10 18:06:40 +02:00
René Knaebel 6fef2b8b84 refactor all visualization for pauls changes - evaluate on max windows per user 2017-09-08 22:59:55 +02:00
René Knaebel 9a51b6ea34 refactor test function working on full unfiltered data 2017-09-08 19:10:23 +02:00
René Knaebel edc75f4f44 refactor dataset creation, split up functions 2017-09-08 17:11:13 +02:00
René Knaebel 528829bb33 fix file name in run script 2017-09-08 13:57:18 +02:00
René Knaebel 70d00efb01 refactor using joblib for test results, make h5py store/load more flexible 2017-09-08 13:55:13 +02:00
René Knaebel 1ab0108c78 add window to file names for visualization 2017-09-07 17:38:21 +02:00
René Knaebel 595c2ea894 change argument interface
- add more properties for network specification
 - change names for consistency
2017-09-07 15:53:58 +02:00
René Knaebel 71f218888d set server to be not trainable too; refactor visualization script 2017-09-07 15:31:04 +02:00
René Knaebel 5bd8e41711 add staggered model training for intermediate server prediction; refactor model return values 2017-09-07 14:24:55 +02:00
René Knaebel 2080444fb7 add parser argument for naming in multi model modes, minor fixes, re-set fix vals for network - need to make them flexible 2017-09-05 17:40:57 +02:00
René Knaebel ed4f478bad add parser argument for naming in multi model modes, minor fixes, 2017-09-05 12:40:37 +02:00
René Knaebel 1da31cc97c visualize per user stats 2017-09-04 13:37:26 +02:00
René Knaebel 3f6779fa3d load names with data for per-user evaluation 2017-09-02 16:02:48 +02:00
René Knaebel 0db8427457 update test script with bash loop 2017-09-02 12:58:48 +02:00
René Knaebel dc9180da10 refactor visualization, change arguments for model type and its depth 2017-09-01 10:42:26 +02:00
René Knaebel 933eaae04a change exception type in get_flow_per_user function and replace index with a new range index 2017-08-31 13:49:33 +02:00
René Knaebel dceaf47211 fix typo 2017-08-31 11:25:49 +02:00
René Knaebel 954dfcf9f9 fix run scripts 2017-08-05 09:40:40 +02:00
René Knaebel 5a02f582cd change model - add dense before server output in new model
add some new run scripts
2017-08-05 09:33:07 +02:00
René Knaebel 6e7dc1297c fix lazy domain loading and generation process 2017-08-03 12:27:17 +02:00
René Knaebel 7f1d13658f store domain embeddings during test main 2017-08-03 09:08:24 +02:00
René Knaebel 452f9e0456 fix array() to zeros(0) 2017-08-03 08:00:20 +02:00
René Knaebel 787f43b328 fix test predictions depending on model output specification 2017-08-03 07:51:58 +02:00
René Knaebel 8ac195ba6f add some static configs to run script 2017-08-02 15:13:09 +02:00
René Knaebel 1e781d5491 add argument for model outputs
BUG: need to check --new_model --model_output server
2017-08-02 12:58:09 +02:00
René Knaebel f4da147688 refactor cmd argument to have single value for mode 2017-07-30 15:49:37 +02:00
René Knaebel e24f596f40 add argument for using the new model architecture 2017-07-30 14:07:39 +02:00
René Knaebel ebaeb6b96e move vocab_size into implementation (not user dependent) 2017-07-30 13:47:11 +02:00
René Knaebel d97785f646 replace softmax by sigmoid in final layer, also adjust dataset for that 2017-07-30 12:50:26 +02:00
René Knaebel b0da2de0ea move utils functions to new file 2017-07-29 19:47:02 +02:00
René Knaebel 820a5d1a4d add new network architecture - server label moves to the middle 2017-07-29 19:42:36 +02:00
René Knaebel 8cd1023165 replace pca reduction by sklearn's truncated svd 2017-07-29 19:41:14 +02:00
René Knaebel 2593131e9e add embedding visualization and domain encoding generator 2017-07-29 10:43:59 +02:00
René Knaebel c1535b941b remove max pooling from models for better information flow 2017-07-28 17:25:08 +02:00
René Knaebel 18b60e1754 add extended test mode for embeddings 2017-07-17 19:30:56 +02:00
René Knaebel 79fc441fe1 wip 2017-07-17 08:44:58 +02:00
René Knaebel d33c9f44ec fix chunks per user function bug caused by numpy version of array_split 2017-07-16 18:49:14 +02:00
René Knaebel 844494eca9 add multi-threading for pre-processing 2017-07-16 09:42:52 +02:00
René Knaebel 336be37032 fix network props, add PCA to visualize main 2017-07-14 21:01:08 +02:00
René Knaebel 6b787792db add custom class weights based on sklearn balance 2017-07-14 15:57:52 +02:00
René Knaebel b35f23e518 add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00
René Knaebel d0418b9efa fix hyperband adding h5data and remove param sample 2017-07-12 15:18:45 +02:00
René Knaebel 2afaccc84b refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00
René Knaebel 9f0bae33d5 refactor dataset generation, add callbacks 2017-07-11 21:06:58 +02:00
René Knaebel a196daa895 add simple flow feature extraction function 2017-07-11 13:46:25 +02:00
René Knaebel 522854ee0d add h5 support for pauls best config main 2017-07-11 11:12:03 +02:00
René Knaebel 41b38de1ab add feature: generate and use h5 data 2017-07-09 23:58:08 +02:00
René Knaebel fdc03c9922 add h5py example 2017-07-08 17:46:07 +02:00
René Knaebel 4a9f94a029 add output for main_test 2017-07-08 15:04:58 +02:00
René Knaebel 21b9d7be73 merged 2017-07-08 13:04:07 +02:00
René Knaebel 36cdba3fdf add pauls config test (TMP) 2017-07-08 13:01:58 +02:00
René Knaebel a3324b5e04 add pauls config test (TMP) 2017-07-08 11:53:51 +02:00
René Knaebel be56112b33 added params 2017-07-07 16:59:08 +02:00
René Knaebel 3c4be52bb6 refactor main functions - separate things into different functions 2017-07-07 08:43:16 +02:00
René Knaebel 933f6bf1d7 add feature to use both hits information from dataset 2017-07-06 16:27:47 +02:00
René Knaebel b2f5c56019 refactor dataset generation 2017-07-05 21:19:19 +02:00
René Knaebel 772b07847f WIP 2017-07-05 19:16:03 +02:00
René Knaebel a70d1cb03a fix: replace X_tr by its elements; choose selected samples for training data too 2017-07-05 18:37:29 +02:00
René Knaebel 7c05ef6a12 refactor models package: create separate modules for pauls and renes networks 2017-07-05 18:10:22 +02:00
René Knaebel 3862dce975 added new networks for domain embedding and classification task 2017-07-05 17:37:08 +02:00
20 changed files with 2470 additions and 300 deletions

6
.gitignore

@@ -99,4 +99,8 @@ ENV/
*.tif
*.joblib
*.csv
*.csv.gz
*.csv.gz
*.csv.tar.*
*.h5
*.npy
*.png

Makefile

@@ -1,3 +1,60 @@
test:
python3 main.py --epochs 1 --batch 64
run:
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_client --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output client --runs 1
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_final --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type final --model_output both --runs 1
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_inter --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type inter --model_output both --runs 1
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_soft --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type soft --model_output both --runs 1
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_long --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type long --model_output both --runs 1
python3 main.py --mode train --data data/rk_mini.csv.gz --model results/test/test_staggered --epochs 2 \
--filter_embd 8 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 16 \
--dense_embd 8 --domain_embd 8 --batch 64 --type staggered --model_output both --runs 1
test:
python3 main.py --mode test --batch 128 --models results/test/test_both_* --data data/rk_mini.csv.gz --model_output both
python3 main.py --mode test --batch 128 --models results/test/test_client_* --data data/rk_mini.csv.gz --model_output client
fancy:
python3 main.py --mode fancy --batch 128 --model results/test/test_both_1 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_both_2 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_both_3 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_both_4 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_both_5 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_client_1 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_client_2 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_client_3 --data data/rk_mini.csv.gz
python3 main.py --mode fancy --batch 128 --model results/test/test_client_4 --data data/rk_mini.csv.gz
all-fancy:
python3 main.py --mode all_fancy --batch 128 --models results/test/test* --data data/rk_mini.csv.gz \
--out-prefix results/test/
hyper:
python3 main.py --mode hyperband --batch 64 --train data/rk_data.csv.gz
clean:
rm -r results/test/
rm data/rk_mini.csv.gz_raw.h5
rm data/rk_mini.csv.gz.h5

146
arguments.py

@@ -0,0 +1,146 @@
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument("--mode", action="store", dest="mode",
default="")
# parser.add_argument("--train", action="store", dest="train_data",
# default="data/full_dataset.csv.tar.gz")
parser.add_argument("--data", action="store", dest="data",
default="data/full_dataset.csv.tar.gz")
# parser.add_argument("--test", action="store", dest="test_data",
# default="data/full_future_dataset.csv.tar.gz")
parser.add_argument("--hyper_result", action="store", dest="hyperband_results",
default="")
parser.add_argument("--model", action="store", dest="model_path",
default="results/model_x")
parser.add_argument("--model_src", action="store", dest="model_source",
default="results/model_x")
parser.add_argument("--model_dest", action="store", dest="model_destination",
default="results/model_x")
parser.add_argument("--models", action="store", dest="model_paths", nargs="+",
default=[])
parser.add_argument("--type", action="store", dest="model_type",
default="final")
parser.add_argument("--embd_type", action="store", dest="embedding_type",
default="small")
# parser.add_argument("--depth", action="store", dest="model_depth",
# default="flat1")
parser.add_argument("--model_output", action="store", dest="model_output",
default="both")
parser.add_argument("--batch", action="store", dest="batch_size",
default=64, type=int)
parser.add_argument("--epochs", action="store", dest="epochs",
default=10, type=int)
parser.add_argument("--init_epoch", action="store", dest="initial_epoch",
default=0, type=int)
parser.add_argument("--runs", action="store", dest="runs",
default=20, type=int)
parser.add_argument("--hyper_max_iter", action="store", dest="hyper_max_iter",
default=81, type=int)
# parser.add_argument("--samples", action="store", dest="samples",
# default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
# default=10000, type=int)
#
parser.add_argument("--embd", action="store", dest="embedding",
default=128, type=int)
parser.add_argument("--filter_embd", action="store", dest="filter_embedding",
default=128, type=int)
parser.add_argument("--dense_embd", action="store", dest="dense_embedding",
default=128, type=int)
parser.add_argument("--kernel_embd", action="store", dest="kernel_embedding",
default=3, type=int)
parser.add_argument("--filter_main", action="store", dest="filter_main",
default=128, type=int)
parser.add_argument("--dense_main", action="store", dest="dense_main",
default=128, type=int)
parser.add_argument("--kernel_main", action="store", dest="kernel_main",
default=3, type=int)
parser.add_argument("--window", action="store", dest="window",
default=10, type=int)
parser.add_argument("--domain_length", action="store", dest="domain_length",
default=40, type=int)
parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
default=512, type=int)
parser.add_argument("--out-prefix", action="store", dest="output_prefix",
default="", type=str)
# parser.add_argument("--queue", action="store", dest="queue_size",
# default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
# default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
# default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
# default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
parser.add_argument("--stop_early", action="store_true", dest="stop_early")
parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
parser.add_argument("--gpu", action="store_true", dest="gpu")
parser.add_argument("--new_model", action="store_true", dest="new_model")
def get_model_args(args):
return [{
"model_path": model_path,
"model_name": os.path.split(os.path.normpath(model_path))[1],
"embedding_model": os.path.join(model_path, "embd.h5"),
"clf_model": os.path.join(model_path, "clf.h5"),
"train_log": os.path.join(model_path, "train.log.csv"),
# "train_h5data": args.train_data,
# "test_h5data": args.test_data,
"future_prediction": os.path.join(model_path, f"{os.path.basename(args.data)}_pred")
} for model_path in args.model_paths]
def parse():
args = parser.parse_args()
args.result_path = os.path.split(os.path.normpath(args.output_prefix))[1]
args.model_name = os.path.split(os.path.normpath(args.model_path))[1]
args.embedding_model = os.path.join(args.model_path, "embd.h5")
args.clf_model = os.path.join(args.model_path, "clf.h5")
args.train_log = os.path.join(args.model_path, "train.log.csv")
# args.train_h5data = args.train_data
# args.test_h5data = args.test_data
args.future_prediction = os.path.join(args.model_path, f"{os.path.basename(args.data)}_pred")
return args
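
A minimal usage sketch (not part of the diff) of what parse() derives from the command line; the --model value is just an example taken from the Makefile above, and sketch.py is a hypothetical caller.

# sketch.py -- run e.g. as: python3 sketch.py --mode train --model results/test/test_client
import arguments

args = arguments.parse()
print(args.model_name)       # -> "test_client"
print(args.embedding_model)  # -> "results/test/test_client/embd.h5"
print(args.clf_model)        # -> "results/test/test_client/clf.h5"
print(args.train_log)        # -> "results/test/test_client/train.log.csv"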

dataset.py

@@ -1,198 +1,297 @@
# -*- coding: utf-8 -*-
import logging
import string
from multiprocessing import Pool
import h5py
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
chars = dict((char, idx + 1) for (idx, char) in
enumerate(string.ascii_lowercase + string.punctuation + string.digits))
logger = logging.getLogger('cisco_logger')
char2idx = dict((char, idx + 1) for (idx, char) in
enumerate(string.ascii_lowercase + string.punctuation + string.digits + " "))
idx2char = {v: k for k, v in char2idx.items()}
def get_character_dict():
return chars
return char2idx
def get_vocab_size():
return len(char2idx) + 1
def encode_char(c):
if c in chars:
return chars[c]
else:
return 0
return char2idx.get(c, 0)
def decode_char(i):
return idx2char.get(i, "")
encode_char = np.vectorize(encode_char)
decode_char = np.vectorize(decode_char)
def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if overlapping == False:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID + windowSize]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
if len(outDomainLists[-1]) != windowSize:
outDomainLists.pop(-1)
outDFFrames.pop(-1)
return (outDomainLists, outDFFrames)
def encode_domain(domain: string):
return encode_char(list(domain))
def get_domain_features(domain, vocab, max_length=40):
def decode_domain(domain):
return "".join(decode_char(domain))
def get_user_chunks(user_flow, window=10):
result = []
chunk_size = (len(user_flow) // window)
for i in range(chunk_size):
result.append(user_flow.iloc[i * window:(i + 1) * window])
if result and len(result[-1]) != window:
result.pop()
return result
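# e.g. with window=10, a user with 25 flows yields two chunks of 10 rows each; the trailing 5 flows are dropped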
def get_domain_features(domain: string, max_length=40):
encoding = np.zeros((max_length,))
for j in range(np.min([len(domain), max_length])):
curCharacter = domain[-j]
if curCharacter in vocab:
encoding[j] = vocab[curCharacter]
for j in range(min(len(domain), max_length)):
c = domain[len(domain) - 1 - j]
encoding[max_length - 1 - j] = encode_char(c)
return encoding
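# e.g. the new right-aligned encoder gives get_domain_features("abc", max_length=5)
# -> [0, 0, encode_char("a"), encode_char("b"), encode_char("c")]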
def get_flow_features(flow):
keys = ['duration', 'bytes_down', 'bytes_up']
features = np.zeros([len(keys), ])
for i, key in enumerate(keys):
# TODO: does it still work after exceptions occur -- default: zero!
# i wonder whether something breaks
# if there are exceptions regarding inconsistent feature length
try:
features[i] = np.log1p(flow[key]).astype(float)
except:
pass
return features
def get_all_flow_features(features):
flows = np.stack(
map(lambda f: f[["duration", "bytes_up", "bytes_down"]], features)
)
return np.log1p(flows)
def get_cisco_features(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# log transform
ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
# select only 1.0 and 0.0 from training data
pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
neg_idx = np.where(hits == 0.0)[0]
idx = np.concatenate((pos_idx, neg_idx))
# choose selected sample to train on
domain = domain[idx]
flow = flow[idx]
client = np.zeros_like(idx, float)
client[:pos_idx.shape[-1]] = 1.0
server = server[idx]
name = name[idx]
return domain, flow, name, client, server
def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
domains = []
features = []
print("get chunks from user data frames")
for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
(domain_windows, feature_windows) = get_user_chunks(user_flow,
windowSize=window_size,
overlapping=True,
maxLengthInSeconds=-1)
domains += domain_windows
features += feature_windows
# TODO: remove later
if i >= 10:
break
def create_raw_dataset_from_flows(user_flow_df, max_len, window_size=10):
logger.info("get chunks from user data frames")
with Pool() as pool:
results = []
for user_flow in tqdm(get_flow_per_user(user_flow_df), total=len(user_flow_df['user_hash'].unique().tolist())):
results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
windows = [window for res in results for window in res.get()]
logger.info("create training dataset")
domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
max_len=max_len)
# make client labels discrete with 4 different values
hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))
print("create training dataset")
return create_dataset_from_lists(
domains=domains, features=features, vocab=char_dict,
max_len=max_len,
use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
window_size=window_size)
return domain, flow, name, hits, trusted_hits, server
def create_dataset_from_lists(domains, features, vocab, max_len,
use_cisco_features=False, urlSIPDIct=dict(),
window_size=10):
def store_h5dataset(path, data: dict):
f = h5py.File(path + ".h5", "w")
for key, val in data.items():
f.create_dataset(key, data=val)
f.close()
def check_h5dataset(path):
return open(path + ".h5", "r")
def load_h5dataset(path):
f = h5py.File(path + ".h5", "r")
data = {}
for k in f.keys():
data[k] = f[k]
return data
def create_dataset_from_windows(chunks, max_len):
"""
combines domain and feature windows to sequential training data
:param domains: list of domain windows
:param features: list of feature windows
:param chunks: list of flow feature windows
:param vocab:
:param max_len:
:param use_cisco_features: idk
:param urlSIPDIct: idk
:param window_size: size of the flow window
:return:
"""
# TODO: check for hits vs vth consistency
# if 'hits' in dfs[0].keys():
# hits_col = 'hits'
# elif 'virusTotalHits' in dfs[0].keys():
# hits_col = 'virusTotalHits'
hits_col = "virusTotalHits"
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if use_cisco_features:
numFeatures += numCiscoFeatures
sample_size = len(domains)
hits = []
names = []
servers = []
trusted_hits = []
def get_domain_features_reduced(d):
return get_domain_features(d[0], max_len)
domain_features = np.zeros((sample_size, window_size, max_len))
flow_features = np.zeros((sample_size, window_size, numFeatures))
logger.info(" compute domain features")
domain_features = []
for ds in tqdm(map(lambda f: f.domain, chunks)):
domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
domain_features = np.concatenate(domain_features, 0)
logger.info(" compute flow features")
flow_features = get_all_flow_features(chunks)
logger.info(" select hits")
hits = np.max(np.stack(map(lambda f: f.virusTotalHits, chunks)), axis=1)
logger.info(" select names")
names = np.stack(map(lambda f: f.user_hash, chunks))
assert (names[:, :1].repeat(10, axis=1) == names).all()
names = names[:, 0]
logger.info(" select servers")
servers = np.stack(map(lambda f: f.serverLabel, chunks))
logger.info(" select trusted hits")
trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)
for i in tqdm(np.arange(sample_size), miniters=10):
for j in range(window_size):
domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
flow_features[i, j] = get_flow_features(features[i].iloc[j])
# TODO: cisco features?
hits.append(np.max(features[i][hits_col]))
names.append(np.unique(features[i]['user_hash']))
servers.append(np.max(features[i]['serverLabel']))
trusted_hits.append(np.max(features[i]['trustedHits']))
X = [domain_features, flow_features]
return X, np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits)
return (domain_features, flow_features,
hits, names, servers, trusted_hits)
def discretize_label(values, threshold):
maxVal = np.max(values)
if maxVal >= threshold:
def make_label_discrete(values, threshold):
max_val = np.max(values)
if max_val >= threshold:
return 1.0
elif maxVal == -1:
elif max_val == -1:
return -1.0
elif 0 < maxVal < threshold:
elif 0 < max_val < threshold:
return -2.0
else:
return 0.0
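# worked examples: make_label_discrete([0, 1, 4], 3) -> 1.0; make_label_discrete([0, 2], 3) -> -2.0;
# make_label_discrete([-1, -1], 3) -> -1.0; make_label_discrete([0, 0], 3) -> 0.0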
def get_user_flow_data():
df = pd.read_csv("data/rk_data.csv.gz")
df.drop("Unnamed: 0", 1, inplace=True)
df.set_index(keys=['user_hash'], drop=False, inplace=True)
def get_user_flow_data(csv_file):
types = {
"duration": int,
"bytes_down": int,
"bytes_up": int,
"domain": object,
"timeStamp": float,
"http_method": object,
"server_ip": object,
"user_hash": float,
"virusTotalHits": int,
"serverLabel": int,
"trustedHits": int
}
df = pd.read_csv(csv_file, index_col=False)
df = df[list(types.keys())]
# df.set_index(keys=['user_hash'], drop=False, inplace=True)
return df
def get_flow_per_user(df):
users = df['user_hash'].unique().tolist()
for user in users:
yield df.loc[df.user_hash == user]
yield df.loc[df.user_hash == user].dropna(axis=0, how="any")
def load_or_generate_h5data(train_data, domain_length, window_size):
logger.info(f"check for h5data {train_data}")
try:
check_h5dataset(train_data)
except FileNotFoundError:
logger.info("load raw training dataset")
domain, flow, name, hits, trusted_hits, server = load_or_generate_raw_h5data(train_data, domain_length,
window_size)
logger.info("filter training dataset")
domain, flow, name, client, server = filter_window_dataset_by_hits(domain.value, flow.value,
name.value, hits.value,
trusted_hits.value, server.value)
logger.info("store training dataset as h5 file")
data = {
"domain": domain.astype(np.int8),
"flow": flow,
"name": name,
"client": client.astype(np.bool),
"server": server.astype(np.bool)
}
store_h5dataset(train_data, data)
logger.info("load h5 dataset")
data = load_h5dataset(train_data)
return data["domain"], data["flow"], data["name"], data["client"], data["server"]
def load_or_generate_raw_h5data(train_data, domain_length, window_size):
h5data = train_data + "_raw"
logger.info(f"check for h5data {h5data}")
try:
check_h5dataset(h5data)
except FileNotFoundError:
logger.info("h5 data not found - load csv file")
user_flow_df = get_user_flow_data(train_data)
logger.info("create raw training dataset")
domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, domain_length,
window_size)
logger.info("store raw training dataset as h5 file")
data = {
"domain": domain.astype(np.int8),
"flow": flow,
"name": name,
"hits_vt": hits.astype(np.int8),
"hits_trusted": hits.astype(np.int8),
"server": server.astype(np.bool)
}
store_h5dataset(h5data, data)
logger.info("load h5 dataset")
data = load_h5dataset(h5data)
return data["domain"], data["flow"], data["name"], data["hits_vt"], data["hits_trusted"], data["server"]
def generate_names(train_data, window_size):
user_flow_df = get_user_flow_data(train_data)
with Pool() as pool:
results = []
for user_flow in tqdm(get_flow_per_user(user_flow_df),
total=len(user_flow_df['user_hash'].unique().tolist())):
results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
windows = [window for res in results for window in res.get()]
names = np.stack(map(lambda f: f.user_hash, windows))
names = names[:, 0]
return names
def load_or_generate_domains(train_data, domain_length):
fn = f"{train_data}_domains.gz"
try:
logger.info(f"Load file {fn}.")
user_flow_df = pd.read_csv(fn)
logger.info(f"File successfully loaded.")
except FileNotFoundError:
logger.info(f"File {fn} not found, recreate.")
user_flow_df = get_user_flow_data(train_data)
# user_flow_df.reset_index(inplace=True)
user_flow_df = user_flow_df[["domain", "serverLabel", "trustedHits", "virusTotalHits"]].dropna(axis=0,
how="any")
user_flow_df = user_flow_df.groupby(user_flow_df.domain).mean()
user_flow_df.reset_index(inplace=True)
user_flow_df["clientLabel"] = np.where(
np.logical_or(user_flow_df.trustedHits > 0, user_flow_df.virusTotalHits >= 3), True, False)
user_flow_df[["serverLabel", "clientLabel"]] = user_flow_df[["serverLabel", "clientLabel"]].astype(bool)
user_flow_df = user_flow_df[["domain", "serverLabel", "clientLabel"]]
user_flow_df.to_csv(fn, compression="gzip")
logger.info(f"Extract features from domains")
domain_encs = user_flow_df.domain.apply(lambda d: get_domain_features(d, domain_length))
domain_encs = np.stack(domain_encs)
return domain_encs, user_flow_df.domain, user_flow_df[["clientLabel", "serverLabel"]].as_matrix().astype(bool)
def save_predictions(path, results):
joblib.dump(results, path + "/results.joblib", compress=3)
def load_predictions(path):
return joblib.load(path + "/results.joblib")
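
A minimal round-trip sketch for the h5 helpers above (store_h5dataset / load_h5dataset); the path and array shapes are placeholders, the window and domain-length sizes mirror the defaults in arguments.py.

import numpy as np
import dataset

sample = {
    "domain": np.zeros((100, 10, 40), dtype=np.int8),  # window_size=10, domain_length=40
    "flow": np.ones((100, 10, 3)),                      # 3 flow features per request
    "client": np.zeros(100, dtype=bool),
}
dataset.store_h5dataset("data/example", sample)   # writes data/example.h5
loaded = dataset.load_h5dataset("data/example")   # values are lazy h5py datasets
print(loaded["domain"].shape)                      # (100, 10, 40)
# main.py reads these eagerly via the h5py 2.x .value attribute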

30
fancy.sh

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
N1=$1
N2=$2
RESDIR=$3
DATADIR=$4
#for ((i = ${N1}; i <= ${N2}; i++))
#do
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
# python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
#done
#
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_{1..20}/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
#python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_{1..20}/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode all_beta --out-prefix ${RESDIR}/both_staggered
python3 main.py --mode embedding --batch 1024 --models ${RESDIR}/client_final_{1..20}/ ${RESDIR}/both_final_{1..20}/ \
${RESDIR}/both_inter_{1..20}/ ${RESDIR}/both_staggered_{1..20}/ \
--data ${DATADIR} \
--out-prefix ${RESDIR}/figs/svd/svd

146
hyperband.py

@@ -0,0 +1,146 @@
# -*- coding: utf-8 -*-
# implementation of hyperband:
# https://arxiv.org/pdf/1603.06560.pdf
import logging
import random
from math import ceil, log
from random import random as rng
from time import ctime, time
import joblib
import numpy as np
from keras.callbacks import EarlyStopping
import models
logger = logging.getLogger('cisco_logger')
def sample_params(param_distribution: dict):
p = {}
for key, val in param_distribution.items():
p[key] = random.choice(val)
return p
class Hyperband:
def __init__(self, param_distribution, X, y, max_iter=81, savefile=None):
self.get_params = lambda: sample_params(param_distribution)
self.max_iter = max_iter # maximum iterations per configuration
self.eta = 3 # defines configuration downsampling rate (default = 3)
self.logeta = lambda x: log(x) / log(self.eta)
self.s_max = int(self.logeta(self.max_iter))
self.B = (self.s_max + 1) * self.max_iter
self.results = [] # list of dicts
self.counter = 0
self.best_loss = np.inf
self.best_counter = -1
self.savefile = savefile
self.X = X
self.y = y
def try_params(self, n_iterations, params):
n_iterations = int(round(n_iterations))
model = models.get_models_by_params(params)
callbacks = [EarlyStopping(monitor='val_loss',
patience=5,
verbose=False)]
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
history = model.fit(self.X,
self.y[0] if params["model_output"] == "client" else self.y,
batch_size=params["batch_size"],
epochs=n_iterations,
callbacks=callbacks,
shuffle=True,
validation_split=0.4)
return {"loss": np.min(history.history['val_loss']),
"early_stop": len(history.history["loss"]) < n_iterations,
"stop_after": len(history.history["val_loss"])}
# can be called multiple times
def run(self, skip_last=0, dry_run=False):
for s in reversed(range(self.s_max + 1)):
# initial number of configurations
n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))
# initial number of iterations per config
r = self.max_iter * self.eta ** (-s)
# n random configurations
random_configs = [self.get_params() for _ in range(n)]
for i in range((s + 1) - int(skip_last)): # changed from s + 1
# Run each of the n configs for <iterations>
# and keep best (n_configs / eta) configurations
n_configs = n * self.eta ** (-i)
n_iterations = r * self.eta ** (i)
logger.info("*** {} configurations x {:.1f} iterations each".format(
n_configs, n_iterations))
val_losses = []
early_stops = []
for t in random_configs:
self.counter += 1
logger.info("Config {} | {} | lowest loss so far: {:.4f} (run {})".format(
self.counter, ctime(), self.best_loss, self.best_counter))
start_time = time()
if dry_run:
result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
else:
result = self.try_params(n_iterations, t) # <---
assert (type(result) == dict)
assert ('loss' in result)
seconds = int(round(time() - start_time))
logger.info("{} seconds.".format(seconds))
loss = result['loss']
val_losses.append(loss)
early_stop = result.get('early_stop', False)
early_stops.append(early_stop)
# keeping track of the best result so far (for display only)
# could do it be checking results each time, but hey
if loss < self.best_loss:
self.best_loss = loss
self.best_counter = self.counter
result['counter'] = self.counter
result['seconds'] = seconds
result['params'] = t
result['iterations'] = n_iterations
self.results.append(result)
# select a number of best configurations for the next loop
# filter out early stops, if any
indices = np.argsort(val_losses)
random_configs = [random_configs[i] for i in indices if not early_stops[i]]
random_configs = random_configs[0:int(n_configs / self.eta)]
if self.savefile:
joblib.dump(self.results, self.savefile)
return self.results
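
A minimal sketch of driving the Hyperband class above; X_train, y_train and the shortened value lists are placeholders, the real distributions come from get_param_dist() in main.py.

from hyperband import Hyperband

# every key maps to a list of candidate values; sample_params picks one value per key
param_dist = {
    "type": ["final"], "embedding_type": ["small"], "model_output": ["client"],
    "batch_size": [64], "window_size": [10], "flow_features": [3], "domain_length": [40],
    "embedding": [32, 64], "filter_embedding": [64, 128], "kernel_embedding": [1, 3, 5],
    "dense_embedding": [64, 128], "dropout": [0.5],
    "filter_main": [64, 128], "kernel_main": [1, 3, 5], "dense_main": [64, 128],
}

hb = Hyperband(param_dist, X_train, y_train, max_iter=27, savefile="hyper.joblib")
results = hb.run()                                  # list of dicts with "loss", "params", ...
best = min(results, key=lambda r: r["loss"])["params"]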

951
main.py

@@ -1,127 +1,860 @@
import argparse
import logging
import operator
import os
import joblib
import numpy as np
from keras.utils import np_utils
import pandas as pd
import tensorflow as tf
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix
import arguments
import dataset
import hyperband
import models
# create logger
import visualize
from arguments import get_model_args
from utils import exists_or_make_path, get_custom_class_weights, get_custom_sample_weights, load_model
parser = argparse.ArgumentParser()
logger = logging.getLogger('cisco_logger')
logger.setLevel(logging.DEBUG)
logger.propagate = False
parser.add_argument("--modes", action="store", dest="modes", nargs="+")
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# parser.add_argument("--data", action="store", dest="data",
# default="data/")
#
# parser.add_argument("--h5data", action="store", dest="h5data",
# default="")
#
# parser.add_argument("--model", action="store", dest="model",
# default="model_x")
#
# parser.add_argument("--pred", action="store", dest="pred",
# default="")
#
# parser.add_argument("--type", action="store", dest="model_type",
# default="simple_conv")
#
parser.add_argument("--batch", action="store", dest="batch_size",
default=64, type=int)
# create formatter
formatter1 = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
parser.add_argument("--epochs", action="store", dest="epochs",
default=10, type=int)
# add formatter to ch
ch.setFormatter(formatter1)
# parser.add_argument("--samples", action="store", dest="samples",
# default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
# default=10000, type=int)
#
# parser.add_argument("--area", action="store", dest="area_size",
# default=25, type=int)
#
# parser.add_argument("--queue", action="store", dest="queue_size",
# default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
# default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
# default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
# default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
# parser.add_argument("--test", action="store", dest="test_image",
# default=6, choices=range(7), type=int)
# add ch to logger
logger.addHandler(ch)
args = parser.parse_args()
# ch = logging.FileHandler("info.log")
# ch.setLevel(logging.DEBUG)
#
# # create formatter
# formatter2 = logging.Formatter('!! %(asctime)s - %(name)s - %(levelname)s - %(message)s')
#
# # add formatter to ch
# ch.setFormatter(formatter2)
#
# # add ch to logger
# logger.addHandler(ch)
# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)
args = arguments.parse()
if args.gpu:
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# default parameter
PARAMS = {
"type": args.model_type,
"embedding_type": args.embedding_type,
# "depth": args.model_depth,
"batch_size": args.batch_size,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
#
'dropout': 0.5, # currently fix
'embedding': args.embedding,
'flow_features': 3,
'filter_embedding': args.filter_embedding,
'dense_embedding': args.dense_embedding,
'kernel_embedding': args.kernel_embedding,
'filter_main': args.filter_main,
'dense_main': args.dense_main,
'kernel_main': args.kernel_main,
'model_output': args.model_output
}
# TODO: remove inner global params
def get_param_dist(dist_size="small"):
if dist_size == "small":
return {
# static params
"type": [args.model_type],
"embedding_type": [args.embedding_type],
# "depth": [args.model_depth],
"model_output": [args.model_output],
"batch_size": [args.batch_size],
"window_size": [args.window],
"flow_features": [3],
"domain_length": [args.domain_length],
# model params
"embedding": [2 ** x for x in range(3, 6)],
"filter_embedding": [2 ** x for x in range(1, 8)],
"kernel_embedding": [1, 3, 5],
"dense_embedding": [2 ** x for x in range(4, 8)],
"dropout": [0.5],
"filter_main": [2 ** x for x in range(1, 8)],
"kernel_main": [1, 3, 5],
"dense_main": [2 ** x for x in range(1, 8)],
}
else:
return {
# static params
"type": [args.model_type],
"embedding_type": [args.embedding_type],
# "depth": [args.model_depth],
"model_output": [args.model_output],
"batch_size": [args.batch_size],
"window_size": [args.window],
"flow_features": [3],
"domain_length": [args.domain_length],
# model params
"embedding": [2 ** x for x in range(3, 7)],
"filter_embedding": [2 ** x for x in range(1, 10)],
"kernel_embedding": [1, 3, 5, 7, 9],
"dense_embedding": [2 ** x for x in range(4, 10)],
"dropout": [0.5],
"filter_main": [2 ** x for x in range(1, 10)],
"kernel_main": [1, 3, 5, 7, 9],
"dense_main": [2 ** x for x in range(1, 12)],
}
def shuffle_training_data(domain, flow, client, server):
idx = np.random.permutation(len(domain))
domain = domain[idx]
flow = flow[idx]
client = client[idx]
server = server[idx]
return domain, flow, client, server
def main_paul_best():
pauls_best_params = {
"type": "paul",
"batch_size": 64,
"window_size": 10,
"domain_length": 40,
"flow_features": 3,
#
'dropout': 0.5,
'domain_features': 32,
'drop_out': 0.5,
'embedding_size': 64,
'filter_main': 512,
'flow_features': 3,
'dense_main': 32,
'filter_embedding': 32,
'hidden_embedding': 32,
'kernel_embedding': 8,
'kernels_main': 8,
'input_length': 40
}
main_train(pauls_best_params)
def main_hyperband(data, domain_length, window_size, model_type, result_file, max_iter, dist_size="small"):
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length, window_size, model_type, shuffled=True)
return run_hyperband(dist_size, domain_tr, flow_tr, client_tr, server_tr, max_iter, result_file)
def run_hyperband(dist_size, features, labels, max_iter, savefile):
param_dist = get_param_dist(dist_size)
hp = hyperband.Hyperband(param_dist, features, labels,
max_iter=max_iter,
savefile=savefile)
results = hp.run()
return results
def train(parameters, features, labels):
pass
def load_data(data, domain_length, window_size, model_type, shuffled=False):
# data preparation
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(data, domain_length,
window_size)
server_tr = np.max(server_windows_tr, axis=1)
if model_type in ("inter", "staggered"):
server_tr = np.expand_dims(server_windows_tr, 2)
if shuffled:
domain_tr, flow_tr, client_tr, server_tr = shuffle_training_data(domain_tr, flow_tr, client_tr, server_tr)
return domain_tr, flow_tr, client_tr, server_tr
def load_training_data(data, model_output, domain_length, window_size, model_type, shuffled=False):
domain_tr, flow_tr, client_tr, server_tr = load_data(data, domain_length,
window_size, model_type, shuffled)
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
if model_output == "both":
labels = {"client": client_tr.value, "server": server_tr}
loss_weights = {"client": 1.0, "server": 1.0}
elif model_output == "client":
labels = {"client": client_tr.value}
loss_weights = {"client": 1.0}
elif model_output == "server":
labels = {"server": server_tr}
loss_weights = {"server": 1.0}
else:
raise ValueError("unknown model output")
return features, labels, loss_weights
def get_weighting(class_weights, sample_weights, labels):
return None, None
client, server = labels["client"], labels["server"]
if class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client, server)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
if sample_weights:
logger.info("class weights: compute custom weights")
custom_sample_weights = get_custom_sample_weights(client, server)
logger.info(custom_sample_weights)
else:
logger.info("class weights: set default")
custom_sample_weights = None
return custom_class_weights, custom_sample_weights
def main_train(param=None):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
logger.info(f"Use command line arguments: {args}")
# data preparation
features, labels, loss_weights = load_training_data(args.data, args.model_output, args.domain_length,
args.window, args.model_type)
# call hyperband if results are not accessible
if args.hyperband_results:
try:
hyper_results = joblib.load(args.hyperband_results)
except Exception:
logger.info("start hyperband parameter search")
hyper_results = run_hyperband("small", features, labels, args.hyper_max_iter,
args.hyperband_results)
param = sorted(hyper_results, key=operator.itemgetter("loss"))[0]["params"]
param["type"] = args.model_type
logger.info(f"select params from result: {param}")
if not param:
param = PARAMS
# custom class or sample weights
# TODO: should throw an error when using weights with only the client labels
custom_class_weights, custom_sample_weights = get_weighting(args.class_weights, args.sample_weights, labels)
for i in range(args.runs):
model_path = os.path.join(args.model_path, f"clf_{i}.h5")
train_log_path = os.path.join(args.model_path, f"train_{i}.log.csv")
# define training call backs
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=model_path,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(train_log_path))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
custom_metrics = models.get_metric_functions()
logger.info(f"Generator model with params: {param}")
model = models.get_models_by_params(param)
logger.info(f"select model: {args.model_type}")
if args.model_type == "staggered":
logger.info("compile and pre-train server model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
loss_weights={"client": 0.0, "server": 1.0},
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
class_weight=custom_class_weights,
sample_weight=custom_sample_weights)
logger.info("fix server model")
model.get_layer("domain_cnn").trainable = False
model.get_layer("domain_cnn").layer.trainable = False
model.get_layer("dense_server").trainable = False
model.get_layer("server").trainable = False
loss_weights = {"client": 1.0, "server": 0.0}
logger.info("compile and train model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
loss_weights=loss_weights,
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
class_weight=custom_class_weights,
sample_weight=custom_sample_weights)
def main_retrain():
source = os.path.join(args.model_source, "clf.h5")
destination = os.path.join(args.model_destination, "clf.h5")
logger.info(f"Use command line arguments: {args}")
exists_or_make_path(args.model_destination)
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
args.domain_length,
args.window)
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=destination,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
server_tr = np.max(server_windows_tr, axis=1)
if args.class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client_tr.value, server_tr)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
logger.info(f"Load pretrained model")
embedding, model = load_model(source, custom_objects=models.get_custom_objects())
if args.model_type in ("inter", "staggered"):
server_tr = np.expand_dims(server_windows_tr, 2)
features = {"ipt_domains": domain_tr.value, "ipt_flows": flow_tr.value}
if args.model_output == "both":
labels = {"client": client_tr.value, "server": server_tr}
elif args.model_output == "client":
labels = {"client": client_tr.value}
elif args.model_output == "server":
labels = {"server": server_tr}
else:
raise ValueError("unknown model output")
logger.info("re-train model")
embedding.summary()
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
class_weight=custom_class_weights,
initial_epoch=args.initial_epoch)
def main_test():
logger.info("load test data")
domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
logger.info("load test domains")
domain_encs, _, _ = dataset.load_or_generate_domains(args.data, args.domain_length)
def get_dir(path):
return os.path.split(os.path.normpath(path))
results = {}
for model_path in args.model_paths:
file = get_dir(model_path)[1]
results[file] = {}
logger.info(f"process model {model_path}")
embd_model, clf_model = load_model(model_path, custom_objects=models.get_custom_objects())
pred = clf_model.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
if args.model_output == "both":
c_pred, s_pred = pred
results[file]["client_pred"] = c_pred
results[file]["server_pred"] = s_pred
elif args.model_output == "client":
results[file]["client_pred"] = pred
else:
results[file]["server_pred"] = pred
domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
results["domain_embds"] = domain_embeddings
# store results every round - safety first!
dataset.save_predictions(get_dir(model_path)[0], results)
def main_visualization():
def plot_model(clf_model, path):
embd, model = load_model(clf_model, custom_objects=models.get_custom_objects())
visualize.plot_model_as(embd, os.path.join(path, "model_embd.pdf"), shapes=False)
visualize.plot_model_as(model, os.path.join(path, "model_clf.pdf"), shapes=False)
def vis(model_name, model_path, df, df_paul, aggregation, curve):
visualize.plot_clf()
if aggregation == "user":
df = df.groupby(df.names).max()
df_paul = df_paul.groupby(df_paul.names).max()
if curve == "prc":
visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
elif curve == "roc":
visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
visualize.plot_legend()
visualize.plot_save("{}/{}_{}.pdf".format(model_path, aggregation, curve))
_, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
results = dataset.load_predictions(args.model_path)
df = pd.DataFrame(data={
"names": name_val, "client_pred": results["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted
})
df["client_val"] = np.logical_or(df.hits_vt == 1.0, df.hits_trusted >= 3)
df_user = df.groupby(df.names).max()
paul = dataset.load_predictions("results/paul/")
df_paul = pd.DataFrame(data={
"names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
"hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
})
df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)
logger.info("plot model")
plot_model(args.clf_model, args.model_path)
# logger.info("plot training curve")
# logs = pd.read_csv(args.train_log)
# if "acc" in logs.keys():
# visualize.plot_training_curve(logs, "", "{}/client_train.png".format(args.model_path))
# elif "client_acc" in logs.keys() and "server_acc" in logs.keys():
# visualize.plot_training_curve(logs, "client_", "{}/client_train.png".format(args.model_path))
# visualize.plot_training_curve(logs, "server_", "{}/server_train.png".format(args.model_path))
# else:
# logger.warning("Error while plotting training curves")
logger.info("plot window prc")
vis(args.model_name, args.model_path, df, df_paul, "window", "prc")
logger.info("plot window roc")
vis(args.model_name, args.model_path, df, df_paul, "window", "roc")
logger.info("plot user prc")
vis(args.model_name, args.model_path, df, df_paul, "user", "prc")
logger.info("plot user roc")
vis(args.model_name, args.model_path, df, df_paul, "user", "roc")
# absolute values
visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
"{}/client_cov.pdf".format(args.model_path),
normalize=False, title="Client Confusion Matrix")
visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
"{}/user_cov.pdf".format(args.model_path),
normalize=False, title="User Confusion Matrix")
# normalized
visualize.plot_confusion_matrix(df.client_val.as_matrix(), df.client_pred.as_matrix().round(),
"{}/client_cov_norm.pdf".format(args.model_path),
normalize=True, title="Client Confusion Matrix")
visualize.plot_confusion_matrix(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix().round(),
"{}/user_cov_norm.pdf".format(args.model_path),
normalize=True, title="User Confusion Matrix")
def main_visualize_all():
_, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
def load_df(path):
res = dataset.load_predictions(path)
res = pd.DataFrame(data={
"names": name_val, "client_pred": res["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted
})
res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
return res
dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
paul = dataset.load_predictions("results/paul/")
df_paul = pd.DataFrame(data={
"names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(),
"hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten()
})
df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3)
def vis(output_prefix, dfs, df_paul, aggregation, curve):
visualize.plot_clf()
if curve == "prc":
for model_name, df in dfs:
if aggregation == "user":
df = df.groupby(df.names).max()
visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
if aggregation == "user":
df_paul = df_paul.groupby(df_paul.names).max()
visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
elif curve == "roc":
for model_name, df in dfs:
if aggregation == "user":
df = df.groupby(df.names).max()
visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_name)
if aggregation == "user":
df_paul = df_paul.groupby(df_paul.names).max()
visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul")
visualize.plot_legend()
visualize.plot_save("{}_{}_{}.pdf".format(output_prefix, aggregation, curve))
logger.info("plot pr curves")
vis(args.output_prefix, dfs, df_paul, "window", "prc")
logger.info("plot roc curves")
vis(args.output_prefix, dfs, df_paul, "window", "roc")
logger.info("plot user pr curves")
vis(args.output_prefix, dfs, df_paul, "user", "prc")
logger.info("plot user roc curves")
vis(args.output_prefix, dfs, df_paul, "user", "roc")
def main_visualize_all_embds():
def load_df(path):
res = dataset.load_predictions(path)
return res["domain_embds"]
dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
from sklearn.decomposition import TruncatedSVD
def vis2(domain_embedding, labels):
n_levels = 7
logger.info(f"reduction for {len(domain_embedding)} points")
red = TruncatedSVD(n_components=2, algorithm="arpack")
domains = red.fit_transform(domain_embedding)
logger.info("plot kde")
benign = domains[labels.sum(axis=1) == 0]
# print(domains.shape)
# print(benign.shape)
# benign_idx
# sns.kdeplot(domains[labels.sum(axis=1) == 0, 0], domains[labels.sum(axis=1) == 0, 1],
# cmap="Blues", label="benign", n_levels=9, alpha=0.35, shade=True, shade_lowest=False)
# sns.kdeplot(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
# cmap="Greens", label="server", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
# sns.kdeplot(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
# cmap="Reds", label="client", n_levels=5, alpha=0.35, shade=True, shade_lowest=False)
plt.scatter(benign[benign_idx, 0], benign[benign_idx, 1],
cmap="Blues", label="benign", alpha=0.35, s=10)
plt.scatter(domains[labels[:, 1], 0], domains[labels[:, 1], 1],
cmap="Greens", label="server", alpha=0.35, s=10)
plt.scatter(domains[labels[:, 0], 0], domains[labels[:, 0], 1],
cmap="Reds", label="client", alpha=0.35, s=10)
return np.concatenate((domains[:1000], domains[1000:2000], domains[2000:3000]), axis=0)
domain_encs, _, labels = dataset.load_or_generate_domains(args.data, args.domain_length)
idx = np.arange(len(labels))
client = labels[:, 0]
server = labels[:, 1]
benign = np.logical_not(np.logical_or(client, server))
print(client.sum(), server.sum(), benign.sum())
idx = np.concatenate((
np.random.choice(idx[client], 1000),
np.random.choice(idx[server], 1000),
np.random.choice(idx[benign], 6000)), axis=0)
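# sample 1000 client, 1000 server and 6000 benign domains for the embedding plots;
# benign_idx later thins the benign points down to 1000 in the scatter plot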
benign_idx = np.random.choice(np.arange(6000), 1000)
print(idx.shape)
lls = labels[idx]
for model_name, embd in dfs:
logger.info(f"plot embedding for {model_name}")
visualize.plot_clf()
embd = embd[idx]
points = vis2(embd, lls)
# np.savetxt("{}_{}.csv".format(args.output_prefix, model_name), points, delimiter=",")
visualize.plot_save("{}_{}.pdf".format(args.output_prefix, model_name))
def main_beta():
domain_val, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.domain_length,
args.window)
path, model_prefix = os.path.split(os.path.normpath(args.model_path))
curves = {
model_prefix: {"all": {}}
}
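# curves[<model_prefix>]["all"] collects the PR/ROC curves averaged over all clf_* runs;
# the per-run confusion matrices go to curves[<model_prefix>][<model_name>]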
# domains = domain_val.value.reshape(-1, 40)
# domains = np.apply_along_axis(lambda d: dataset.decode_domain(d), 1, domains)
def load_df(res):
df_server = None
data = {
"names": name_val, "client_pred": res["client_pred"].flatten(),
"hits_vt": hits_vt, "hits_trusted": hits_trusted,
}
if "server_pred" in res:
server = res["server_pred"] if len(res["server_pred"].shape) == 2 else res["server_pred"].max(axis=1)
val = server_val.value.max(axis=1)
data["server_pred"] = server.flatten()
data["server_val"] = val.flatten()
# if res["server_pred"].flatten().shape == server_val.value.flatten().shape:
# df_server = pd.DataFrame(data={
# "server_pred": res["server_pred"].flatten(),
# "domain": domains,
# "server_val": server_val.value.flatten()
# })
res = pd.DataFrame(data=data)
res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3)
return res, df_server
logger.info(f"load results from {args.model_path}")
res = dataset.load_predictions(args.model_path)
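# prediction entries are assumed to be keyed like "clf_<run>.h5"; sort them numerically by run id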
model_keys = sorted(filter(lambda x: x.startswith("clf"), res.keys()), key=lambda x: int(x[4:-3]))
client_preds = []
server_preds = []
server_flow_preds = []
client_user_preds = []
server_user_preds = []
server_domain_preds = []
server_domain_avg_preds = []
for model_name in model_keys:
logger.info(f"load model {model_name}")
df, df_server = load_df(res[model_name])
client_preds.append(df.client_pred.as_matrix())
if "server_val" in df.columns:
server_preds.append(df.server_pred.as_matrix())
if df_server is not None:
logger.info(f" group servers")
server_flow_preds.append(df_server.server_pred.as_matrix())
df_domain = df_server.groupby(df_server.domain).max()
server_domain_preds.append(df_domain.server_pred.as_matrix())
df_domain_avg = df_server.groupby(df_server.domain).rolling(10).mean()
server_domain_avg_preds.append(df_domain_avg.server_pred.as_matrix())
curves[model_prefix][model_name] = confusion_matrix(df.client_val.as_matrix(),
df.client_pred.as_matrix().round())
logger.info(f" group users")
df_user = df.groupby(df.names).max()
client_user_preds.append(df_user.client_pred.as_matrix())
if "server_val" in df.columns:
server_user_preds.append(df_user.server_pred.as_matrix())
logger.info("compute client curves")
curves[model_prefix]["all"]["client_window_prc"] = visualize.calc_pr_mean(df.client_val.as_matrix(), client_preds)
curves[model_prefix]["all"]["client_window_roc"] = visualize.calc_roc_mean(df.client_val.as_matrix(), client_preds)
curves[model_prefix]["all"]["client_user_prc"] = visualize.calc_pr_mean(df_user.client_val.as_matrix(),
client_user_preds)
curves[model_prefix]["all"]["client_user_roc"] = visualize.calc_roc_mean(df_user.client_val.as_matrix(),
client_user_preds)
if "server_val" in df.columns:
logger.info("compute server curves")
curves[model_prefix]["all"]["server_window_prc"] = visualize.calc_pr_mean(df.server_val.as_matrix(),
server_preds)
curves[model_prefix]["all"]["server_window_roc"] = visualize.calc_roc_mean(df.server_val.as_matrix(),
server_preds)
curves[model_prefix]["all"]["server_user_prc"] = visualize.calc_pr_mean(df_user.server_val.as_matrix(),
server_user_preds)
curves[model_prefix]["all"]["server_user_roc"] = visualize.calc_roc_mean(df_user.server_val.as_matrix(),
server_user_preds)
if df_server is not None:
logger.info("compute server flow curves")
curves[model_prefix]["all"]["server_flow_prc"] = visualize.calc_pr_mean(df_server.server_val.as_matrix(),
server_flow_preds)
curves[model_prefix]["all"]["server_flow_roc"] = visualize.calc_roc_mean(df_server.server_val.as_matrix(),
server_flow_preds)
curves[model_prefix]["all"]["server_domain_prc"] = visualize.calc_pr_mean(df_domain.server_val.as_matrix(),
server_domain_preds)
curves[model_prefix]["all"]["server_domain_roc"] = visualize.calc_roc_mean(df_domain.server_val.as_matrix(),
server_domain_preds)
curves[model_prefix]["all"]["server_domain_avg_prc"] = visualize.calc_pr_mean(
df_domain_avg.server_val.as_matrix(),
server_domain_avg_preds)
curves[model_prefix]["all"]["server_domain_avg_roc"] = visualize.calc_roc_mean(
df_domain_avg.server_val.as_matrix(),
server_domain_avg_preds)
joblib.dump(curves, f"{args.model_path}_curves.joblib")
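# merge this model's curves into the shared curves.joblib one directory up, creating it if necessary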
try:
curves_all: dict = joblib.load(f"{path}/curves.joblib")
logger.info(f"load file {path}/curves.joblib successfully")
curves_all[model_prefix] = curves[model_prefix]
except Exception:
curves_all = curves
logger.info(f"currently {len(curves_all)} models in file: {curves_all.keys()}")
joblib.dump(curves_all, f"{path}/curves.joblib")
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
def plot_overall_result():
path, model_prefix = os.path.split(os.path.normpath(args.model_path))
exists_or_make_path(f"{path}/figs/curves/")
try:
results = joblib.load(f"{path}/curves.joblib")
logger.info("curves successfully loaded")
except Exception:
results = {}
x = np.linspace(0, 1, 10000)
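# common x grid used by calc_pr_mean/calc_roc_mean; each stored entry is (mean curve, std, per-run curves)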
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
logger.info(f"plot {vis}")
visualize.plot_clf()
for model_key in results.keys():
if vis not in results[model_key]["all"]:
continue
if "final" in model_key and vis.startswith("server_flow"):
continue
ys_mean, ys_std, ys = results[model_key]["all"][vis]
plt.plot(x, ys_mean, label=f"{model_key} - {np.mean(ys_mean):5.4} ({np.mean(ys_std):4.3})")
plt.fill_between(x, ys_mean - ys_std, ys_mean + ys_std, alpha=0.2)
if vis.endswith("prc"):
plt.xlabel('Recall')
plt.ylabel('Precision')
else:
plt.plot(x, x, label="random classifier", ls="--", c=".3", alpha=0.4)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xscale('log')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
visualize.plot_legend()
visualize.plot_save(f"{path}/figs/curves/{vis}_all.pdf")
return
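# NOTE: the early return above disables the per-run appendix plots below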
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc"]:
logger.info(f"plot {vis}")
visualize.plot_clf()
for model_key in results.keys():
if vis not in results[model_key]["all"]:
continue
if "final" in model_key and vis.startswith("server_flow"):
continue
_, _, ys = results[model_key]["all"][vis]
for y in ys:
plt.plot(x, y, label=f"{model_key} - {np.mean(y):5.4}")
if vis.endswith("prc"):
plt.xlabel('Recall')
plt.ylabel('Precision')
else:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xscale('log')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
visualize.plot_legend()
visualize.plot_save(f"{path}/figs/Appendices/{model_key}_{vis}.pdf")
def main_stats():
path, model_prefix = os.path.split(os.path.normpath(args.output_prefix))
for time in ("current", "future"):
df = dataset.get_user_flow_data(f"data/{time}Data.csv.gz")
df["clientlabel"] = np.logical_or(df.virusTotalHits > 3, df.trustedHits > 0)
# df_user = df.groupby(df.user_hash).max()
# df_server = df.groupby(df.domain).max()
# len(df)
# df.clientlabel.sum()
# df.serverLabel.sum()
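# plot one histogram per numeric flow feature, once on the raw scale and once log1p-transformed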
for col in ["duration", "bytes_down", "bytes_up"]:
# visualize.plot_clf()
plt.clf()
plt.hist(df[col])
visualize.plot_save(f"{path}/figs/hist_{time}_{col}.pdf")
print(".")
# visualize.plot_clf()
plt.clf()
plt.hist(np.log1p(df[col]))
visualize.plot_save(f"{path}/figs/hist_{time}_norm_{col}.pdf")
print("-")
def main_stats2():
import joblib
res = joblib.load("results/variance_test_hyper/curves.joblib")
for vis in ["client_window_prc", "client_window_roc", "client_user_prc", "client_user_roc",
"server_window_prc", "server_window_roc", "server_user_prc", "server_user_roc",
"server_flow_prc", "server_flow_roc", "server_domain_prc", "server_domain_roc",
"server_domain_avg_prc", "server_domain_avg_roc"]:
tab = []
for m, r in res.items():
if vis not in r["all"]: continue
tab.append(r["all"][vis][2].mean(axis=1))
if not tab: continue
df = pd.DataFrame(data=np.vstack(tab).T, columns=list(res.keys()),
index=range(1, 21))
df.to_csv(f"{vis}.csv")
print(f"% {vis}")
print(df.round(4).to_latex())
print()
def main():
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data()
print("create training dataset")
(X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
user_flow_df, char_dict,
max_len=maxLen, window_size=windowSize)
# make client labels discrete with 4 different values
# TODO: use trusted_hits_tr for client classification too
client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
# select only 1.0 and 0.0 from training data
pos_idx = np.where(client_labels == 1.0)[0]
neg_idx = np.where(client_labels == 0.0)[0]
idx = np.concatenate((pos_idx, neg_idx))
# select labels for prediction
client_labels = client_labels[idx]
server_labels = server_tr[idx]
shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
cnnHiddenDims, cnnDropout)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
client_labels = np_utils.to_categorical(client_labels, 2)
server_labels = np_utils.to_categorical(server_labels, 2)
model.fit(X_tr,
[client_labels, server_labels],
batch_size=args.batch_size,
epochs=args.epochs,
shuffle=True)
# TODO: for validation we use future data -> validation_data=(testData,testLabel))
if "train" == args.mode:
main_train()
if "retrain" == args.mode:
main_retrain()
if "hyperband" == args.mode:
main_hyperband(args.data, args.domain_length, args.window, args.model_type, args.hyperband_results,
args.hyper_max_iter)
if "test" == args.mode:
main_test()
if "beta" == args.mode:
main_beta()
if "all_beta" == args.mode:
plot_overall_result()
if "embedding" == args.mode:
main_visualize_all_embds()
if __name__ == "__main__":
    main()

@ -1,41 +0,0 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
hidden_dims, drop_out):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dense(hidden_dims)(y)
y = Dropout(drop_out)(y)
y = Activation('relu')(y)
return Model(x, y)
def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
filters, h1, h2, dropout, dense):
pass
def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
encoded = TimeDistributed(cnn)(ipt_domains)
ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# add second cnn
y = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(merged)
# TODO: why global pooling? -> 3D to 2D
# we use max pooling:
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(cnnHiddenDims, activation='relu')(y)
y1 = Dense(2, activation='softmax', name="client")(y)
y2 = Dense(2, activation='softmax', name="server")(y)
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))

models/__init__.py Normal file
@ -0,0 +1,117 @@
from collections import namedtuple
from keras.models import Model
from . import networks
from .metrics import *
NetworkParameters = namedtuple("NetworkParameters", [
"type", "flow_features", "window_size", "domain_length", "output",
"embedding_size",
"domain_filter", "domain_kernel", "domain_dense", "domain_dropout",
"main_filter", "main_kernel", "main_dense", "main_dropout",
])
def create_model(model, output_type):
if output_type == "both":
return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client, model.out_server))
elif output_type == "client":
return Model(inputs=[model.in_domains, model.in_flows], outputs=(model.out_client,))
else:
raise Exception("unknown model output")
def get_models_by_params(params: dict):
K.clear_session()
# decomposing param section
# mainly embedding model
embedding_type = params.get("embedding_type", "small")
network_type = params.get("type")
# network_depth = params.get("depth")
embedding_size = params.get("embedding")
filter_embedding = params.get("filter_embedding")
kernel_embedding = params.get("kernel_embedding")
hidden_embedding = params.get("dense_embedding")
# dropout = params.get("dropout")
# mainly prediction model
flow_features = params.get("flow_features")
window_size = params.get("window_size")
domain_length = params.get("domain_length")
filter_main = params.get("filter_main")
kernel_main = params.get("kernel_main")
dense_dim = params.get("dense_main")
model_output = params.get("model_output", "both")
if embedding_type == "small":
domain_cnn = networks.get_domain_embedding_model(embedding_size, domain_length, filter_embedding,
kernel_embedding, hidden_embedding, 0.5)
elif embedding_type == "deep":
domain_cnn = networks.get_domain_embedding_model2(embedding_size, domain_length, filter_embedding,
kernel_embedding, hidden_embedding, 0.5)
else:
raise ValueError("embedding type not found")
if network_type == "final":
model = networks.get_final_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type in ("inter", "staggered"):
model = networks.get_inter_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type == "long":
model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
elif network_type == "soft":
model = networks.get_long_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
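# soft parameter sharing: penalise the L1 distance between the client and server towers'
# conv/dense weights so the two towers stay similar without being hard-tied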
conv_server = model.get_layer("conv_server").trainable_weights
conv_client = model.get_layer("conv_client").trainable_weights
l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
model.add_loss(l1)
dense_server = model.get_layer("dense_server").trainable_weights
dense_client = model.get_layer("dense_client").trainable_weights
l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
model.add_loss(l2)
elif network_type == "sluice":
model = networks.get_sluice_model(0.25, flow_features, window_size, domain_length,
filter_main, kernel_main, dense_dim, domain_cnn)
model = create_model(model, model_output)
conv_server = model.get_layer("conv_server").trainable_weights
conv_client = model.get_layer("conv_client").trainable_weights
l1 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(conv_server, conv_client)]
model.add_loss(l1)
dense_server = model.get_layer("dense_server").trainable_weights
dense_client = model.get_layer("dense_client").trainable_weights
l2 = [0.001 * K.sum(K.abs(x - y)) for (x, y) in zip(dense_server, dense_client)]
model.add_loss(l2)
else:
raise ValueError("network type not found")
return model
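# example params dict (sketch; values are illustrative, taken from the run scripts):
#   {"type": "final", "embedding_type": "small", "embedding": 128,
#    "filter_embedding": 256, "kernel_embedding": 8, "dense_embedding": 128,
#    "flow_features": 3, "window_size": 10, "domain_length": 40,
#    "filter_main": 32, "kernel_main": 8, "dense_main": 1024, "model_output": "both"}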
def get_server_model_by_params(params: dict):
# decomposing param section
# mainly embedding model
network_depth = params.get("depth")
embedding_size = params.get("embedding")
input_length = params.get("input_length")
filter_embedding = params.get("filter_embedding")
kernel_embedding = params.get("kernel_embedding")
hidden_embedding = params.get("dense_embedding")
# mainly prediction model
flow_features = params.get("flow_features")
domain_length = params.get("domain_length")
dense_dim = params.get("dense_main")
embedding_model = networks.get_domain_embedding_model(embedding_size, input_length, filter_embedding,
kernel_embedding,
hidden_embedding, 0.5)
return networks.get_server_model(flow_features, domain_length, dense_dim, embedding_model)

models/metrics.py Normal file
@ -0,0 +1,64 @@
import keras.backend as K
from keras.activations import elu
def get_custom_objects():
return dict([
("precision", precision),
("recall", recall),
("f1_score", f1_score),
("selu", selu)
])
def selu(x):
"""Scaled Exponential Linear Unit. (Klambauer et al., 2017)
# Arguments
x: A tensor or variable to compute the activation function for.
# References
- [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
# copied from keras.io
"""
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
return scale * elu(x, alpha)
def get_metric_functions():
return [precision, recall, f1_score]
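# note: Keras evaluates these metrics per batch, so they are batch-wise approximations
# of the global precision/recall/F1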
def precision(y_true, y_pred):
# Count positive samples.
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
return true_positives / (predicted_positives + K.epsilon())
def recall(y_true, y_pred):
# Count positive samples.
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
return true_positives / (possible_positives + K.epsilon())
def f1_score(y_true, y_pred):
return f_score(1)(y_true, y_pred)
def f05_score(y_true, y_pred):
return f_score(0.5)(y_true, y_pred)
def f_score(beta):
def _f(y_true, y_pred):
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = beta ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
return _f

models/networks.py Normal file
@ -0,0 +1,249 @@
from collections import namedtuple
import keras
import keras.backend as K
import numpy as np
from keras.engine import Input, Model as KerasModel
from keras.engine.topology import Layer
from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
from keras.regularizers import Regularizer
import dataset
Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])
def get_domain_embedding_model(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
drop_out=0.5) -> KerasModel:
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dropout(drop_out)(y)
y = Dense(hidden_dims, activation="relu")(y)
return KerasModel(x, y)
def get_domain_embedding_model2(embedding_size, input_length, filter_size, kernel_size, hidden_dims,
drop_out=0.5) -> KerasModel:
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = Conv1D(filter_size,
kernel_size,
activation='relu')(y)
y = GlobalAveragePooling1D()(y)
y = Dense(hidden_dims, activation="relu")(y)
return KerasModel(x, y)
def get_final_model(cnnDropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu')(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(dense_dim, activation='relu')(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
out_server = Dense(1, activation='sigmoid', name="server")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
def get_inter_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(merged)
out_server = Dense(1, activation="sigmoid", name="server")(y)
merged = keras.layers.concatenate([merged,
y], -1)
# CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu')(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation='relu',
name="dense_client")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
def get_server_model(flow_features, domain_length, dense_dim, cnn):
ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
encoded = cnn(ipt_domains)
cnn.name = "domain_cnn"
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(merged)
out_server = Dense(1, activation="sigmoid", name="server")(y)
return KerasModel(inputs=[ipt_domains, ipt_flows], outputs=out_server)
def get_long_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_server")(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(y)
out_server = Dense(1, activation="sigmoid", name="server")(y)
# CNN processing small slices of the flow window
y = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_client")(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim,
activation='relu',
name="dense_client")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
class CrossStitch2(Layer):
def __init__(self, **kwargs):
super(CrossStitch2, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.s = self.add_weight(name='cross-stitch-s',
shape=(1,),
initializer='uniform',
trainable=True)
self.d = self.add_weight(name='cross-stitch-d',
shape=(1,),
initializer='uniform',
trainable=True)
super(CrossStitch2, self).build(input_shape)
def call(self, xs):
x1, x2 = xs
out = x1 * self.s + x2 * self.d
return out
def compute_output_shape(self, input_shape):
return input_shape[0]
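# CrossStitch2 learns two scalars s and d and returns s*x1 + d*x2, a minimal scalar
# variant of a cross-stitch unit for soft sharing between the two task streams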
class CrossStitchMix2(Layer):
def __init__(self, **kwargs):
super(CrossStitchMix2, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.s = self.add_weight(name='cross-stitch-s',
shape=(1,),
initializer='uniform',
trainable=True)
self.d = self.add_weight(name='cross-stitch-d',
shape=(1,),
initializer='uniform',
trainable=True)
super(CrossStitchMix2, self).build(input_shape)
def call(self, xs):
x1, x2 = xs
out = K.concatenate((x1 * self.s, x2 * self.d), axis=-1)
return out
def compute_output_shape(self, input_shape):
return (input_shape[0][0], input_shape[0][1] + input_shape[1][1])
class L21(Regularizer):
"""Regularizer for L21 regularization.
Found at: https://bitbucket.org/ispamm/group-lasso-for-neural-networks-tensorflow-keras
# Arguments
C: Float; L21 regularization factor.
"""
def __init__(self, C=0.):
self.C = K.cast_to_floatx(C)
def __call__(self, x):
const_coeff = np.sqrt(K.int_shape(x)[1])
return self.C * const_coeff * K.sum(K.sqrt(K.sum(K.square(x), axis=1)))
def get_config(self):
return {'C': float(self.C)}
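# usage sketch (not part of the original file): attach the group-lasso penalty to a layer
# kernel, e.g. Dense(64, kernel_regularizer=L21(0.001)); it pushes entire rows of the
# kernel (i.e. whole input features) towards zero at once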
def get_sluice_model(dropout, flow_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn) -> Model:
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y1 = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_server")(merged)
y1 = GlobalMaxPooling1D()(y1)
y2 = Conv1D(cnn_dims,
kernel_size,
activation='relu', name="conv_client")(merged)
y2 = GlobalMaxPooling1D()(y2)
c11 = CrossStitch2()([y1, y2])
c12 = CrossStitch2()([y1, y2])
y1 = Dropout(dropout)(c11)
y1 = Dense(dense_dim,
activation="relu",
name="dense_server")(y1)
y2 = Dropout(dropout)(c12)
y2 = Dense(dense_dim,
activation='relu',
name="dense_client")(y2)
c21 = CrossStitch2()([y1, y2])
c22 = CrossStitch2()([y1, y2])
beta1 = CrossStitchMix2()([c11, c21])
beta2 = CrossStitchMix2()([c12, c22])
out_server = Dense(1, activation="sigmoid", name="server")(beta1)
out_client = Dense(1, activation='sigmoid', name="client")(beta2)
return Model(ipt_domains, ipt_flows, out_client, out_server)
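# the sluice variant cross-stitches the two towers after the conv stage (c11/c12) and after
# the dense stage (c21/c22); beta1/beta2 then mix both levels per task before the outputs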

rerun_models.sh Normal file
@ -0,0 +1,16 @@
#!/usr/bin/env bash
SRC=$1
DEST=$2
DATADIR=$3
INIT=$4
EPOCHS=$5
BS=128
for i in `ls -d $SRC*/`
do
echo "retrain model in ${i}"
name=$(basename $i)
python3 main.py --mode retrain --model_src ${i} --model_dest ${DEST}/${name} --init_epoch $INIT --epochs $EPOCHS --batch $BS --train ${DATADIR}
done

run.sh Normal file
@ -0,0 +1,53 @@
#!/usr/bin/env bash
RESDIR=$1
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$2
EPOCHS=10
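# grid: client-only and joint (both) outputs x {inter, final} architectures with the small
# embedding; a separate staggered run follows below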
for output in client both
do
for depth in small
do
for mtype in inter final
do
python main.py --mode train \
--train ${DATADIR}/currentData.csv \
--model ${RESDIR}/${output}_${depth}_${mtype} \
--epochs $EPOCHS \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 256 \
--balanced_weights \
--model_output ${output} \
--type ${mtype} \
--depth ${depth}
done
done
done
for depth in small
do
python main.py --mode train \
--train ${DATADIR}/currentData.csv \
--model ${RESDIR}/both_${depth}_staggered \
--epochs $EPOCHS \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 256 \
--balanced_weights \
--model_output both \
--type staggered \
--depth ${depth}
done
# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output ${output} --type ${mtype} --depth ${depth} --train ${DATADIR}/currentData.csv --model ${RESDIR}/${output}_${depth}_${mtype}
# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output client --type final --depth small --train /tmp/rk/data/currentData.csv --model /tmp/rk/results/paul3/client_final

run_model.sh Normal file
@ -0,0 +1,30 @@
#!/usr/bin/env bash
N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7
EPOCHS=10
for ((i = ${N1}; i <= ${N2}; i++))
do
python main.py --mode train \
--data ${DATADIR} \
--model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
--epochs ${EPOCHS} \
--embd 128 \
--filter_embd 256 --kernel_embd 8 --dense_embd 128 \
--domain_embd 32 \
--filter_main 32 --kernel_main 8 --dense_main 1024 \
--batch 128 \
--model_output ${OUTPUT} \
--type ${TYPE} \
--depth ${DEPTH} \
--gpu
done

run_model_rene.sh Normal file
@ -0,0 +1,30 @@
#!/usr/bin/env bash
N1=$1
N2=$2
OUTPUT=$3
DEPTH=$4
TYPE=$5
RESDIR=$6
mkdir -p /tmp/rk/${RESDIR}
DATADIR=$7
EPOCHS=10
for ((i = ${N1}; i <= ${N2}; i++))
do
python main.py --mode train \
--train ${DATADIR} \
--model ${RESDIR}/${OUTPUT}_${TYPE}_${i} \
--epochs ${EPOCHS} \
--embd 64 \
--filter_embd 128 --kernel_embd 5 --dense_embd 64 \
--domain_embd 16 \
--filter_main 32 --kernel_main 5 --dense_main 256 \
--batch 128 \
--model_output ${OUTPUT} \
--type ${TYPE} \
--depth ${DEPTH} \
--gpu
done

@ -1,10 +1,26 @@
#!/usr/bin/python2
import sys
import joblib
import numpy as np
import pandas as pd
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
df = df["data"]
df = pd.concat(df)
fn = sys.argv[1]
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/{}.joblib".format(fn))
df = pd.concat(df["data"])
df.reset_index(inplace=True)
df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
df.dropna(axis=0, how="any", inplace=True)
df.serverLabel = pd.to_numeric(df.serverLabel, errors='coerce')
df.duration = pd.to_numeric(df.duration, errors='coerce')
df.bytes_down = pd.to_numeric(df.bytes_down, errors='coerce')
df.bytes_up = pd.to_numeric(df.bytes_up, errors='coerce')
df.http_method = df.http_method.astype("category")
df.serverLabel = df.serverLabel.astype(np.bool)
df.virusTotalHits = df.virusTotalHits.astype(np.int8)
df.trustedHits = df.trustedHits.astype(np.int8)
df.to_csv("/tmp/rk/data/{}.csv".format(fn), encoding="utf-8")

server.py Normal file
@ -0,0 +1,116 @@
import logging
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint
import arguments
import dataset
import models
# create logger
import visualize
from arguments import get_model_args
from utils import exists_or_make_path, load_model
logger = logging.getLogger('cisco_logger')
args = arguments.parse()
def train_server_only(params):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
logger.info(f"Use command line arguments: {args}")
domain_tr, flow_tr, name_tr, client_tr, server_windows_tr = dataset.load_or_generate_h5data(args.data,
args.domain_length,
args.window)
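# the server model scores individual flows, so the window dimension is collapsed and
# every flow becomes an independent training sample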
domain_tr = domain_tr.value.reshape(-1, 40)
flow_tr = flow_tr.value.reshape(-1, 3)
server_tr = server_windows_tr.value.reshape(-1)
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
monitor='loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
custom_metrics = models.get_metric_functions()
model = models.get_server_model_by_params(params=params)
features = {"ipt_domains": domain_tr, "ipt_flows": flow_tr}
if args.model_output == "both":
labels = {"client": client_tr, "server": server_tr}
elif args.model_output == "client":
labels = {"client": client_tr}
elif args.model_output == "server":
labels = {"server": server_tr}
else:
raise ValueError("unknown model output")
logger.info("compile and train model")
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'] + custom_metrics)
model.summary()
model.fit(features, labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks)
def test_server_only():
logger.info("start test: load data")
domain_val, flow_val, _, _, _, _ = dataset.load_or_generate_raw_h5data(args.data, args.domain_length, args.window)
domain_val = domain_val.value.reshape(-1, 40)
flow_val = flow_val.value.reshape(-1, 3)
domain_encs, _ = dataset.load_or_generate_domains(args.data, args.domain_length)
for model_args in get_model_args(args):
results = {}
logger.info(f"process model {model_args['model_path']}")
embd_model, clf_model = load_model(model_args["clf_model"], custom_objects=models.get_custom_objects())
pred = clf_model.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
results["server_pred"] = pred
domain_embeddings = embd_model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
results["domain_embds"] = domain_embeddings
dataset.save_predictions(model_args["model_path"], results)
def vis_server():
def load_model(m, c):
from keras.models import load_model
clf = load_model(m, custom_objects=c)
embd = clf.layers[1]
return embd, clf
domain_raw, flow_raw, name_raw, hits_vt_raw, hits_trusted_raw, server_raw = dataset.load_or_generate_raw_h5data(
args.data, args.domain_length, args.window)
results = dataset.load_predictions(args.clf_model)
visualize.plot_clf()
visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server")
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_prc.pdf")
visualize.plot_clf()
visualize.plot_precision_recall(server_raw.flatten(), results["server_pred"].flatten(), "server")
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_prc.pdf")
visualize.plot_clf()
visualize.plot_roc_curve(server_raw.flatten(), results["server_pred"].flatten(), "server")
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_roc.pdf")

test.sh Normal file
@ -0,0 +1,12 @@
#!/usr/bin/env bash
RESDIR=$1
DATADIR=$2
for output in client both
do
python3 main.py --mode test --batch 1024 \
--models ${RESDIR}/${output}_*/ \
--test ${DATADIR}/futureData.csv \
--model_output ${output}
done

utils.py Normal file
@ -0,0 +1,46 @@
import os
from operator import itemgetter
import joblib
import numpy as np
from keras.models import load_model as load_keras_model
from sklearn.utils import class_weight
def exists_or_make_path(p):
if not os.path.exists(p):
os.makedirs(p)
def get_custom_class_weights(client, server):
return {
"client": class_weight.compute_class_weight('balanced', np.unique(client), client),
"server": class_weight.compute_class_weight('balanced', np.unique(server), server)
}
def get_custom_sample_weights(client, server):
return {
"client": class_weight.compute_sample_weight("balanced", client),
"server": class_weight.compute_sample_weight("balanced", server)
}
def load_ordered_hyperband_results(path):
results = joblib.load(path)
return sorted(results, key=itemgetter("loss"))
def load_model(path, custom_objects=None):
clf = load_keras_model(path, custom_objects)
try:
embd = clf.get_layer("domain_cnn").layer
except Exception:
# in some older versions the domain_cnn layer was not named explicitly;
# this fallback keeps those models loadable
try:
embd = clf.layers[1].layer
except Exception:
embd = clf.get_layer("domain_cnn")
return embd, clf

visualize.py Normal file
@ -0,0 +1,247 @@
import os
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
roc_auc_score, roc_curve
)
def scores(y_true):
for (path, dirnames, fnames) in os.walk("results/"):
for f in fnames:
if path[-1] == "1" and f.endswith("npy"):
y_pred = np.load(os.path.join(path, f)).flatten()
print(path)
tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))
tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))
fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))
fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / len(y_true)
f1_score = 2 * (precision * recall) / (precision + recall)
f05_score = (1 + 0.5 ** 2) * (precision * recall) / (0.5 ** 2 * precision + recall)
print(" precision:", precision)
print(" recall:", recall)
print(" accuracy:", accuracy)
print(" f1 score:", f1_score)
print(" f0.5 score:", f05_score)
def plot_clf():
plt.clf()
sns.set_context("paper")
sns.set_style("white")
def plot_save(path, dpi=600, set_size=True):
# plt.title(path)
fig = plt.gcf()
# fig.suptitle(path)
if set_size:
fig.set_size_inches(8, 4.5)
fig.savefig(path, dpi=dpi, bbox_inches='tight')
plt.close()
def plot_legend():
plt.legend()
def mathews_correlation_curve(y, y_pred):
pass
def plot_precision_recall(y, y_pred, label=""):
y = y.flatten()
y_pred = y_pred.flatten()
precision, recall, thresholds = precision_recall_curve(y, y_pred)
# decreasing_max_precision = np.maximum.accumulate(precision)[::-1]
# fig, ax = plt.subplots(1, 1)
# ax.hold(True)
score = fbeta_score(y, y_pred.round(), 1)
# prc_ap = average_precision_score(y, y_pred)
plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
# ax.step(recall[::-1], decreasing_max_precision, '-r')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
def calc_pr_mean(y, y_preds):
return calc_metrics_mean(y, y_preds, "prc")
def plot_mean_curve(x, ys, std, score, label):
plt.plot(x, ys, label=f"{label} - {score:5.4}")
plt.fill_between(x, ys - std, ys + std, alpha=0.1)
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
def plot_pr_mean(y, y_preds, label=""):
x = np.linspace(0, 1, 10000)
ys_mean, ys_std, score = calc_pr_mean(y, y_preds)
plot_mean_curve(x, ys_mean, ys_std, score, label)
plt.xlabel('Recall')
plt.ylabel('Precision')
def score_model(y, prediction):
y = y.flatten()
y_pred = prediction.flatten()
precision, recall, thresholds = precision_recall_curve(y, y_pred)
print(classification_report(y, y_pred.round()))
print("Area under PR curve", auc(recall, precision))
print("roc auc score", roc_auc_score(y, y_pred))
print("F1 Score", fbeta_score(y, y_pred.round(), 1))
print("F0.5 Score", fbeta_score(y, y_pred.round(), 0.5))
def plot_roc_curve(mask, prediction, label=""):
y = mask.flatten()
y_pred = prediction.flatten()
fpr, tpr, thresholds = roc_curve(y, y_pred)
roc_auc = auc(fpr, tpr)
plt.xscale('log')
plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
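# interpolate each run's PR/ROC curve onto a common 10000-point grid so curves from
# repeated runs can be averaged pointwise (mean and std per x value)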
def calc_metrics_mean(y, y_preds, metric):
appr = []
y = y.flatten()
for idx, y_pred in enumerate(y_preds):
y_pred = y_pred.flatten()
if metric == "prc":
precision, recall, thresholds = precision_recall_curve(y, y_pred)
appr.append(interpolate.interp1d(recall, precision))
elif metric == "roc":
fpr, tpr, thresholds = roc_curve(y, y_pred)
appr.append(interpolate.interp1d(fpr, tpr))
x = np.linspace(0, 1, 10000)
ys = np.vstack([f(x) for f in appr])
ys_mean = ys.mean(axis=0)
ys_std = ys.std(axis=0)
return ys_mean, ys_std, ys
def calc_roc_mean(y, y_preds):
return calc_metrics_mean(y, y_preds, "roc")
def plot_roc_mean(y, y_preds, label=""):
x = np.linspace(0, 1, 10000)
ys_mean, ys_std, score = calc_roc_mean(y, y_preds)
plt.xscale('log')
plot_mean_curve(x, ys_mean, ys_std, score, label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
def plot_confusion_matrix(y_true, y_pred, path,
normalize=False,
classes=("benign", "malicious"),
title='Confusion matrix',
cmap="Blues", dpi=600):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.clf()
cm = confusion_matrix(y_true, y_pred)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
thresh = cm.max() / 2.
for i, j in ((i, j) for i in range(cm.shape[0]) for j in range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig(path, dpi=dpi)
plt.close()
def plot_training_curve(logs, key, path, dpi=600):
plt.clf()
plt.plot(logs[f"{key}acc"], label="accuracy")
plt.plot(logs[f"{key}f1_score"], label="f1_score")
plt.plot(logs[f"val_{key}acc"], label="val_accuracy")
# plt.plot(logs[f"val_{key}f1_score"], label="val_f1_score")
plt.xlabel('epoch')
plt.ylabel('percentage')
plt.legend()
plt.savefig(path, dpi=dpi)
plt.close()
def plot_error_bars(results):
rates = []
for m, r in results.items():
if m == "all": continue
rates.append((r / r.sum(axis=0, keepdims=True)).flatten())
rates = pd.DataFrame(np.vstack(rates), columns=("TN", "FP", "FN", "TP"))
ax = rates.mean().plot.bar(yerr=rates.std())
for p in ax.patches:
ax.annotate(str(np.round(p.get_height(), 4)), (p.get_x(), 0.5))
def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
if method == "svd":
red = TruncatedSVD(n_components=2)
elif method == "tsne":
red = TSNE(n_components=2, verbose=2)
domain_reduced = red.fit_transform(domain_embedding)
print(red.explained_variance_ratio_)
# use if drawing only a subset of predictions
# idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
plt.scatter(domain_reduced[:, 0],
domain_reduced[:, 1],
c=(labels * (1, 2)).sum(1).astype(int),
cmap=plt.cm.plasma,
s=3,
alpha=0.2)
plt.colorbar()
plt.savefig(path, dpi=dpi)
def plot_model_as(model, path, shapes=True, layer_names=True):
from keras.utils.vis_utils import plot_model
plot_model(model, to_file=path, show_shapes=shapes, show_layer_names=layer_names)