diff --git a/arguments.py b/arguments.py
index 5ef1f7d..ee3e512 100644
--- a/arguments.py
+++ b/arguments.py
@@ -106,6 +106,7 @@ parser.add_argument("--out-prefix", action="store", dest="output_prefix",
 
 # parser.add_argument("--stop_early", action="store_true", dest="stop_early")
 parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
+parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
 parser.add_argument("--gpu", action="store_true", dest="gpu")
 parser.add_argument("--new_model", action="store_true", dest="new_model")
 
diff --git a/fancy.sh b/fancy.sh
index d40d7cb..cdb05ff 100644
--- a/fancy.sh
+++ b/fancy.sh
@@ -1,24 +1,25 @@
 #!/usr/bin/env bash
 
-RESDIR=$1
-DATADIR=$2
+# Usage: fancy.sh N1 N2 RESDIR DATADIR  (the loop covers i = N1..N2 inclusive; DATADIR is passed verbatim as --data)
+N1=$1
+N2=$2
+RESDIR=$3
+DATADIR=$4
 
-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final --data ${DATADIR}/futureData.csv --model_output both
-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered --data ${DATADIR}/futureData.csv --model_output both
-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final --data ${DATADIR}/futureData.csv --model_output client
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_inter --data ${DATADIR}/futureData.csv --model_output client
+for ((i = ${N1}; i <= ${N2}; i++))
+do
+    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
+    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
+    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
+    python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
+done
 
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_final --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_inter --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_final --data ${DATADIR}/futureData.csv --model_output client
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_inter --data ${DATADIR}/futureData.csv --model_output client
+python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
+python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
+python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
+python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
 
-#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-# --models ${RESDIR}/*_small_*/ --out-prefix ${RESDIR}/small
-
-#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-# --models ${RESDIR}/*_medium_*/ --out-prefix ${RESDIR}/medium
-
-python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-        --models ${RESDIR}/*/ --out-prefix ${RESDIR}/all
+python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
+python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
+python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
+python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
diff --git a/main.py b/main.py
index 9e8eb09..89580c3 100644
--- a/main.py
+++ b/main.py
@@ -57,7 +57,7 @@ if args.gpu:
 PARAMS = {
     "type": args.model_type,
     "depth": args.model_depth,
-    # "batch_size": 64,
+    "batch_size": args.batch_size,
     "window_size": args.window,
     "domain_length": args.domain_length,
     "flow_features": 3,
@@ -72,7 +72,6 @@ PARAMS = {
     'filter_main': args.filter_main,
     'dense_main': args.dense_main,
     'kernel_main': args.kernel_main,
-    'input_length': 40,
     'model_output': args.model_output
 }
 
@@ -101,7 +100,6 @@ def main_hyperband():
         "window_size": [args.window],
         "flow_features": [3],
         "domain_length": [args.domain_length],
-        'input_length': [40],
         # model params
         "embedding_size": [2 ** x for x in range(3, 7)],
         "filter_embedding": [2 ** x for x in range(1, 10)],
@@ -133,6 +131,11 @@ def main_hyperband():
     return results
 
 
+def train(parameters, features, labels):
+    """Stub: not implemented yet (currently a no-op that returns None)."""
+    pass
+
+
 def main_train(param=None):
     logger.info(f"Create model path {args.model_path}")
     exists_or_make_path(args.model_path)
@@ -473,7 +476,49 @@ def main_visualize_all():
     vis(args.output_prefix, dfs, df_paul, "user", "prc")
     logger.info("plot user roc curves")
     vis(args.output_prefix, dfs, df_paul, "user", "roc")
+
+
+def main_visualize_all_embds():
+    """Scatter-plot a 2-D TruncatedSVD projection of each model's domain embeddings into <out-prefix>_svd.png."""
+    import matplotlib.pyplot as plt
+    _, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
+                                                                                            args.data,
+                                                                                            args.domain_length,
+                                                                                            args.window)
 
+    def load_df(path):
+        res = dataset.load_predictions(path)
+        return res["domain_embds"]
+
+    dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
+
+    plt.clf()
+
+    from sklearn.decomposition import TruncatedSVD
+
+    def vis(ax, domain_embedding, labels):
+        red = TruncatedSVD(n_components=2)
+        # plot only a random sample of 5000 points (np.random.choice samples with replacement by default)
+        idx = np.random.choice(np.arange(len(domain_embedding)), 5000)
+        domain_embedding = domain_embedding[idx]
+        labels = labels[idx]
+        domain_reduced = red.fit_transform(domain_embedding)
+        ax.scatter(domain_reduced[:, 0],
+                   domain_reduced[:, 1],
+                   c=(labels * (1, 2)).sum(1).astype(int),
+                   cmap=plt.cm.plasma,
+                   s=3,
+                   alpha=0.1)
+
+    domain_encs, labels = dataset.load_or_generate_domains(args.data, args.domain_length)
+
+    fig, axes = plt.subplots(nrows=5, ncols=4)
+
+    for (model_name, embd), ax in zip(dfs, axes.flat):
+        logger.info(f"plot embedding for {model_name}")
+        vis(ax, embd, labels)
+
+    visualize.plot_save("{}_svd.png".format(args.output_prefix))
 
 
 import joblib
@@ -709,6 +754,7 @@ def vis_server():
     visualize.plot_legend()
     visualize.plot_save("results/server_model/windows_roc.pdf")
 
+
 def main():
     if "train" == args.mode:
         main_train()
@@ -730,6 +776,8 @@ def main():
         train_server_only()
     if "server_test" == args.mode:
         test_server_only()
+    if "embedding" == args.mode:
+        main_visualize_all_embds()
 
 
 if __name__ == "__main__":
diff --git a/models/__init__.py b/models/__init__.py
index 101e19a..3cd662b 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -1,5 +1,6 @@
 import keras.backend as K
 
+from models import deep1
 from models.renes_networks import selu
 from . import flat_2, pauls_networks, renes_networks
 
@@ -10,7 +11,6 @@ def get_models_by_params(params: dict):
     # network_type = params.get("type")
     network_depth = params.get("depth")
     embedding_size = params.get("embedding")
-    input_length = params.get("input_length")
     filter_embedding = params.get("filter_embedding")
     kernel_embedding = params.get("kernel_embedding")
     hidden_embedding = params.get("dense_embedding")
@@ -32,7 +32,7 @@ def get_models_by_params(params: dict):
         networks = renes_networks
     else:
         raise Exception("network not found")
-    embedding_model = networks.get_embedding(embedding_size, input_length, filter_embedding, kernel_embedding,
+    embedding_model = networks.get_embedding(embedding_size, domain_length, filter_embedding, kernel_embedding,
                                              hidden_embedding, 0.5)
 
     old_model = networks.get_model(0.25, flow_features, hidden_embedding, window_size, domain_length,
@@ -63,6 +63,8 @@ def get_server_model_by_params(params: dict):
     elif network_depth == "flat2":
         networks = flat_2
     elif network_depth == "deep1":
+        networks = deep1
+    elif network_depth == "deep2":
         networks = renes_networks
     else:
         raise Exception("network not found")
diff --git a/models/deep1.py b/models/deep1.py
new file mode 100644
index 0000000..87ce40c
--- /dev/null
+++ b/models/deep1.py
@@ -0,0 +1,89 @@
+from collections import namedtuple
+
+import keras
+from keras.engine import Input, Model as KerasModel
+from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
+
+import dataset
+
+Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])
+
+
+def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
+    """Build the domain encoder: Embedding, three relu Conv1D layers, global average pooling, relu Dense.
+
+    Only the first convolution uses ``kernel_size``; the other two use a fixed kernel size of 3.
+    ``drop_out`` is accepted but not used by this implementation.
+    """
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
+    y = Conv1D(filter_size, kernel_size=kernel_size, activation="relu")(y)
+    y = Conv1D(filter_size, kernel_size=3, activation="relu")(y)
+    y = Conv1D(filter_size, kernel_size=3, activation="relu")(y)
+    y = GlobalAveragePooling1D()(y)
+    y = Dense(hidden_dims, activation="relu")(y)
+    return KerasModel(x, y)
+
+
+def get_model(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+              dense_dim, cnn, model_output="both"):
+    """Build the window-level graph and return its tensors as a ``Model`` namedtuple.
+
+    ``cnn`` is applied to every domain of the window through ``TimeDistributed`` (layer name
+    ``domain_cnn``), concatenated with the flow features, and passed through Conv1D, global max
+    pooling, dropout and two Dense layers whose output feeds both sigmoid heads (client, server).
+    ``model_output`` is accepted but not used by this implementation.
+    """
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
+    # CNN processing small slices of the flow window
+    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation="relu", padding="same",
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(cnnDropout)(y)
+    y = Dense(dense_dim, activation="relu")(y)
+    y = Dense(dense_dim, activation="relu")(y)
+    out_client = Dense(1, activation='sigmoid', name="client")(y)
+    out_server = Dense(1, activation='sigmoid', name="server")(y)
+
+    return Model(ipt_domains, ipt_flows, out_client, out_server)
+
+
+def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                  dense_dim, cnn, model_output="both"):
+    """Build the variant in which the server branch feeds the client branch.
+
+    Two Dense layers on the merged features produce the ``server`` sigmoid output; the activations
+    of the second one (``dense_server``) are concatenated back onto the merged features before the
+    Conv1D / global max pooling / dropout / Dense stack that produces the ``client`` output.
+    ``model_output`` is accepted but not used. Returns a ``Model`` namedtuple of tensors.
+    """
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
+    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
+    y = Dense(dense_dim, activation="relu")(merged)
+    y = Dense(dense_dim,
+              activation="relu",
+              name="dense_server")(y)
+    out_server = Dense(1, activation="sigmoid", name="server")(y)
+    merged = keras.layers.concatenate([merged, y], -1)
+    # CNN processing small slices of the flow window
+    y = Conv1D(filters=cnn_dims,
+               kernel_size=kernel_size,
+               activation="relu",
+               padding="same",
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(dropout)(y)
+    y = Dense(dense_dim, activation="relu")(y)
+    y = Dense(dense_dim,
+              activation="relu",
+              name="dense_client")(y)
+    out_client = Dense(1, activation='sigmoid', name="client")(y)
+
+    return Model(ipt_domains, ipt_flows, out_client, out_server)
diff --git a/models/pauls_networks.py b/models/pauls_networks.py
index a2b0e85..9ac3567 100644
--- a/models/pauls_networks.py
+++ b/models/pauls_networks.py
@@ -95,6 +95,8 @@ def get_server_model(flow_features, domain_length, dense_dim, cnn):
     ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
     ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
     encoded = cnn(ipt_domains)
+    cnn.name = "domain_cnn"
+
     merged = keras.layers.concatenate([encoded, ipt_flows], -1)
     y = Dense(dense_dim,
               activation="relu",
diff --git a/utils.py b/utils.py
index 9fa6bf6..c6e83e8 100644
--- a/utils.py
+++ b/utils.py
@@ -13,16 +13,19 @@ def exists_or_make_path(p):
 
 
 def get_custom_class_weights(client, server):
-    client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
-    server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
+    """Return 'balanced' class weights for each label array, keyed by "client" and "server"."""
     return {
-        "client": client_class_weight,
-        "server": server_class_weight
+        "client": class_weight.compute_class_weight('balanced', np.unique(client), client),
+        "server": class_weight.compute_class_weight('balanced', np.unique(server), server)
     }
 
 
 def get_custom_sample_weights(client, server):
-    return class_weight.compute_sample_weight("balanced", np.vstack((client, server)).T)
+    """Return 'balanced' per-sample weights for each label array, keyed by "client" and "server"."""
+    return {
+        "client": class_weight.compute_sample_weight("balanced", client),
+        "server": class_weight.compute_sample_weight("balanced", server)
+    }
 
 
 def load_ordered_hyperband_results(path):
@@ -41,6 +44,5 @@ def load_model(path, custom_objects=None):
         embd = clf.layers[1].layer
     except Exception:
         embd = clf.get_layer("domain_cnn")
-
 
     return embd, clf