add bulk embedding visualization and deep1 network

2017-10-09 14:19:01 +02:00 · 2017-10-09 14:19:01 +02:00 · a686f147f0
commit a686f147f0
parent 33063f3081
7 changed files with 151 additions and 30 deletions
--- a/arguments.py
+++ b/arguments.py
@ -106,6 +106,7 @@ parser.add_argument("--out-prefix", action="store", dest="output_prefix",
 #
 parser.add_argument("--stop_early", action="store_true", dest="stop_early")
 parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
+parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
 parser.add_argument("--gpu", action="store_true", dest="gpu")
 parser.add_argument("--new_model", action="store_true", dest="new_model")

--- a/fancy.sh
+++ b/fancy.sh
@ -1,24 +1,24 @@
 #!/usr/bin/env bash

-RESDIR=$1
-DATADIR=$2
+# Positional arguments: first and last model index (inclusive), the directory
+# that contains the model directories and receives the outputs, and the data
+# path handed to main.py via --data. Abort with a usage message if one is missing.
+N1=${1:?usage: fancy.sh N1 N2 RESDIR DATADIR}
+N2=${2:?usage: fancy.sh N1 N2 RESDIR DATADIR}
+RESDIR=${3:?usage: fancy.sh N1 N2 RESDIR DATADIR}
+DATADIR=${4:?usage: fancy.sh N1 N2 RESDIR DATADIR}

-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final --data ${DATADIR}/futureData.csv --model_output both
-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered --data ${DATADIR}/futureData.csv --model_output both
-python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final --data ${DATADIR}/futureData.csv --model_output client
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_inter --data ${DATADIR}/futureData.csv --model_output client
+# Run main.py in "fancy" mode once per model variant for every index i in [N1, N2].
+# Path expansions are quoted so directories containing spaces or glob characters
+# are passed through as a single argument.
+for ((i = ${N1}; i <= ${N2}; i++))
+do
+    python3 main.py --mode fancy --batch 1024 --model "${RESDIR}/client_final_${i}" --data "${DATADIR}" --model_output client
+    python3 main.py --mode fancy --batch 1024 --model "${RESDIR}/both_final_${i}" --data "${DATADIR}" --model_output both
+    python3 main.py --mode fancy --batch 1024 --model "${RESDIR}/both_inter_${i}" --data "${DATADIR}" --model_output both
+    python3 main.py --mode fancy --batch 1024 --model "${RESDIR}/both_staggered_${i}" --data "${DATADIR}" --model_output both
+done

-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_final --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_inter --data ${DATADIR}/futureData.csv --model_output both
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_final --data ${DATADIR}/futureData.csv --model_output client
-#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_inter --data ${DATADIR}/futureData.csv --model_output client
+# "all_fancy" mode over all indexed directories of one model variant.
+# Only the variable part is quoted; the trailing glob must stay unquoted so the
+# shell still expands it to the individual model directories.
+python3 main.py --mode all_fancy --batch 1024 --models "${RESDIR}"/client_final_*/ --data "${DATADIR}" --model_output client --out-prefix "${RESDIR}/client_final"
+python3 main.py --mode all_fancy --batch 1024 --models "${RESDIR}"/both_final_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_final"
+python3 main.py --mode all_fancy --batch 1024 --models "${RESDIR}"/both_inter_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_inter"
+python3 main.py --mode all_fancy --batch 1024 --models "${RESDIR}"/both_staggered_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_staggered"

-#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-#                --models ${RESDIR}/*_small_*/ --out-prefix ${RESDIR}/small
-
-#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-#                --models ${RESDIR}/*_medium_*/ --out-prefix ${RESDIR}/medium
-
-python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
-                --models ${RESDIR}/*/ --out-prefix ${RESDIR}/all
+# "beta" mode over all indexed directories of one model variant.
+# Only the variable part is quoted; the trailing glob must stay unquoted so the
+# shell still expands it to the individual model directories.
+python3 main.py --mode beta --batch 1024 --models "${RESDIR}"/client_final_*/ --data "${DATADIR}" --model_output client --out-prefix "${RESDIR}/client_final"
+python3 main.py --mode beta --batch 1024 --models "${RESDIR}"/both_final_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_final"
+python3 main.py --mode beta --batch 1024 --models "${RESDIR}"/both_inter_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_inter"
+python3 main.py --mode beta --batch 1024 --models "${RESDIR}"/both_staggered_*/ --data "${DATADIR}" --model_output both --out-prefix "${RESDIR}/both_staggered"
--- a/main.py
+++ b/main.py
@ -57,7 +57,7 @@ if args.gpu:
 PARAMS = {
    "type": args.model_type,
    "depth": args.model_depth,
-    # "batch_size": 64,
+    "batch_size": args.batch_size,
    "window_size": args.window,
    "domain_length": args.domain_length,
    "flow_features": 3,
@ -72,7 +72,6 @@ PARAMS = {
    'filter_main': args.filter_main,
    'dense_main': args.dense_main,
    'kernel_main': args.kernel_main,
-    'input_length': 40,
    'model_output': args.model_output
 }

@ -101,7 +100,6 @@ def main_hyperband():
        "window_size": [args.window],
        "flow_features": [3],
        "domain_length": [args.domain_length],
-        'input_length': [40],
        # model params
        "embedding_size": [2 ** x for x in range(3, 7)],
        "filter_embedding": [2 ** x for x in range(1, 10)],
@ -133,6 +131,10 @@ def main_hyperband():
    return results


+def train(parameters, features, labels):
+    """Placeholder: not implemented yet; does nothing and returns ``None``."""
+    pass
+
+
 def main_train(param=None):
    logger.info(f"Create model path {args.model_path}")
    exists_or_make_path(args.model_path)
@ -473,7 +475,48 @@ def main_visualize_all():
    vis(args.output_prefix, dfs, df_paul, "user", "prc")
    logger.info("plot user roc curves")
    vis(args.output_prefix, dfs, df_paul, "user", "roc")
+
+
+def main_visualize_all_embds():
+    """Scatter a 2-D SVD projection of every selected model's domain embeddings.
+
+    For each entry of ``get_model_args(args)`` the ``"domain_embds"`` array is
+    read from the model's stored predictions, a random subset of its rows is
+    reduced to two components with ``TruncatedSVD`` and drawn into one cell of
+    a 5x4 subplot grid.  The figure is saved as
+    ``"<args.output_prefix>_svd.png"``.
+    """
+    import matplotlib.pyplot as plt
+    from sklearn.decomposition import TruncatedSVD
+
+    # NOTE(review): none of the values returned by this call are used below; it
+    # is kept only in case it has side effects (e.g. creating a cache file) —
+    # confirm and drop it if it has none.
+    dataset.load_or_generate_raw_h5data(args.data, args.data, args.domain_length, args.window)
    
+    def load_embds(path):
+        return dataset.load_predictions(path)["domain_embds"]
+
+    def vis(ax, domain_embedding, labels, max_points=5000):
+        """Project a random row subset to 2-D and scatter it on ``ax``."""
+        n_rows = len(domain_embedding)
+        # sample without replacement and never more rows than exist, so no
+        # row is drawn twice
+        idx = np.random.choice(n_rows, min(max_points, n_rows), replace=False)
+        reduced = TruncatedSVD(n_components=2).fit_transform(domain_embedding[idx])
+        ax.scatter(reduced[:, 0],
+                   reduced[:, 1],
+                   # assumes labels has two columns — TODO confirm; the weights
+                   # (1, 2) turn each row into a single colour index
+                   c=(labels[idx] * (1, 2)).sum(1).astype(int),
+                   cmap=plt.cm.plasma,
+                   s=3,
+                   alpha=0.1)
+
+    dfs = [(model_args["model_name"], load_embds(model_args["model_path"]))
+           for model_args in get_model_args(args)]
+
+    plt.clf()
+
+    # only the labels are needed here; the first return value is ignored
+    _, labels = dataset.load_or_generate_domains(args.data, args.domain_length)
+
+    _, axes = plt.subplots(nrows=5, ncols=4)
+    if len(dfs) > axes.size:
+        # zip() below stops at the last grid cell; make the truncation visible
+        logger.warning(f"only the first {axes.size} of {len(dfs)} models fit into the plot grid")
+
+    for (model_name, embd), ax in zip(dfs, axes.flat):
+        logger.info(f"plot embedding for {model_name}")
+        vis(ax, embd, labels)
+
+    # NOTE(review): a stray 600 used to be passed to str.format here, where it
+    # was silently ignored; presumably a dpi for plot_save was intended —
+    # confirm against visualize.plot_save before adding it.
+    visualize.plot_save("{}_svd.png".format(args.output_prefix))

 import joblib

@ -709,6 +752,7 @@ def vis_server():
    visualize.plot_legend()
    visualize.plot_save("results/server_model/windows_roc.pdf")

+
 def main():
    if "train" == args.mode:
        main_train()
@ -730,6 +774,8 @@ def main():
        train_server_only()
    if "server_test" == args.mode:
        test_server_only()
+    if "embedding" == args.mode:
+        main_visualize_all_embds()


 if __name__ == "__main__":
--- a/models/init.py
+++ b/models/init.py
@ -1,5 +1,6 @@
 import keras.backend as K

+from models import deep1
 from models.renes_networks import selu
 from . import flat_2, pauls_networks, renes_networks

@ -10,7 +11,6 @@ def get_models_by_params(params: dict):
    # network_type = params.get("type")
    network_depth = params.get("depth")
    embedding_size = params.get("embedding")
-    input_length = params.get("input_length")
    filter_embedding = params.get("filter_embedding")
    kernel_embedding = params.get("kernel_embedding")
    hidden_embedding = params.get("dense_embedding")
@ -32,7 +32,7 @@ def get_models_by_params(params: dict):
        networks = renes_networks
    else:
        raise Exception("network not found")
-    embedding_model = networks.get_embedding(embedding_size, input_length, filter_embedding, kernel_embedding,
+    embedding_model = networks.get_embedding(embedding_size, domain_length, filter_embedding, kernel_embedding,
                                             hidden_embedding, 0.5)

    old_model = networks.get_model(0.25, flow_features, hidden_embedding, window_size, domain_length,
@ -63,6 +63,8 @@ def get_server_model_by_params(params: dict):
    elif network_depth == "flat2":
        networks = flat_2
    elif network_depth == "deep1":
+        networks = deep1
+    elif network_depth == "deep2":
        networks = renes_networks
    else:
        raise Exception("network not found")
--- a/models/deep1.py
+++ b/models/deep1.py
@ -0,0 +1,70 @@
+from collections import namedtuple
+
+import keras
+from keras.engine import Input, Model as KerasModel
+from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
+
+import dataset
+
+Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])
+
+
+def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
+    """Build the domain encoder: embedding, three 1-D convolutions, pooling, dense.
+
+    The first convolution uses ``kernel_size``; the two that follow use a fixed
+    kernel of 3.  ``drop_out`` is accepted but not used by this architecture.
+
+    Returns a Keras model mapping an input of shape ``(input_length,)`` to the
+    output of the final ``hidden_dims``-unit dense layer.
+    """
+    ipt = Input(shape=(input_length,))
+    embedded = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(ipt)
+    conv = Conv1D(filter_size, kernel_size=kernel_size, activation="relu")(embedded)
+    for _ in range(2):
+        conv = Conv1D(filter_size, kernel_size=3, activation="relu")(conv)
+    pooled = GlobalAveragePooling1D()(conv)
+    encoded = Dense(hidden_dims, activation="relu")(pooled)
+    return KerasModel(ipt, encoded)
+
+
+def get_model(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+              dense_dim, cnn, model_output="both"):
+    """Wire the window-level network with a shared trunk and two sigmoid heads.
+
+    ``cnn`` is applied to every domain of the window via ``TimeDistributed``,
+    concatenated with the flow features, and passed through one convolution,
+    global max pooling, dropout and two dense layers.  Both the ``client`` and
+    the ``server`` output are computed from that same trunk.
+    ``model_output`` is accepted but not used here.
+
+    Returns a ``Model`` namedtuple holding the two input and two output tensors.
+    """
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    domain_codes = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    window = keras.layers.concatenate([domain_codes, ipt_flows], -1)
+
+    # convolution over the window axis
+    window_conv = Conv1D(filters=cnn_dims,
+                         kernel_size=kernel_size,
+                         activation="relu",
+                         padding="same",
+                         input_shape=(window_size, domain_features + flow_features))
+    trunk = window_conv(window)
+    # global max pooling removes the temporal dimension
+    trunk = GlobalMaxPooling1D()(trunk)
+    trunk = Dropout(cnnDropout)(trunk)
+    for _ in range(2):
+        trunk = Dense(dense_dim, activation="relu")(trunk)
+
+    out_client = Dense(1, activation='sigmoid', name="client")(trunk)
+    out_server = Dense(1, activation='sigmoid', name="server")(trunk)
+
+    return Model(in_domains=ipt_domains, in_flows=ipt_flows, out_client=out_client, out_server=out_server)
+
+
+def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                  dense_dim, cnn, model_output="both"):
+    """Wire the variant in which the server branch feeds the client branch.
+
+    The server output is computed from two dense layers applied to the merged
+    domain/flow tensor.  The activations of the second of those layers
+    (``dense_server``) are concatenated back onto the merged tensor before the
+    convolution, pooling, dropout and dense layers that produce the client
+    output.  ``model_output`` is accepted but not used here.
+
+    Returns a ``Model`` namedtuple holding the two input and two output tensors.
+    """
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    domain_codes = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
+    window = keras.layers.concatenate([domain_codes, ipt_flows], -1)
+
+    # server branch
+    server_hidden = Dense(dense_dim, activation="relu")(window)
+    server_hidden = Dense(dense_dim, activation="relu", name="dense_server")(server_hidden)
+    out_server = Dense(1, activation="sigmoid", name="server")(server_hidden)
+
+    # client branch sees the merged inputs plus the server branch activations
+    client_in = keras.layers.concatenate([window, server_hidden], -1)
+    window_conv = Conv1D(filters=cnn_dims,
+                         kernel_size=kernel_size,
+                         activation="relu",
+                         padding="same",
+                         input_shape=(window_size, domain_features + flow_features))
+    client_hidden = window_conv(client_in)
+    # global max pooling removes the temporal dimension
+    client_hidden = GlobalMaxPooling1D()(client_hidden)
+    client_hidden = Dropout(dropout)(client_hidden)
+    client_hidden = Dense(dense_dim, activation="relu")(client_hidden)
+    client_hidden = Dense(dense_dim, activation="relu", name="dense_client")(client_hidden)
+    out_client = Dense(1, activation='sigmoid', name="client")(client_hidden)
+
+    return Model(in_domains=ipt_domains, in_flows=ipt_flows, out_client=out_client, out_server=out_server)
--- a/models/pauls_networks.py
+++ b/models/pauls_networks.py
@ -95,6 +95,8 @@ def get_server_model(flow_features, domain_length, dense_dim, cnn):
    ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
    ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
    encoded = cnn(ipt_domains)
+    cnn.name = "domain_cnn"
+    
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    y = Dense(dense_dim,
              activation="relu",
--- a/utils.py
+++ b/utils.py
@ -13,16 +13,17 @@ def exists_or_make_path(p):


 def get_custom_class_weights(client, server):
+    """Return balanced class weights for the client and the server labels.
+
+    Each entry is the array produced by ``compute_class_weight`` in
+    ``'balanced'`` mode over the distinct values of the respective labels.
+    ``classes`` and ``y`` are passed by keyword because newer scikit-learn
+    releases no longer accept them positionally.
+    """
-    client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
-    server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
    return {
-        "client": client_class_weight,
-        "server": server_class_weight
+        "client": class_weight.compute_class_weight('balanced', classes=np.unique(client), y=client),
+        "server": class_weight.compute_class_weight('balanced', classes=np.unique(server), y=server)
    }


 def get_custom_sample_weights(client, server):
+    """Return balanced per-sample weights, computed separately per output.
+
+    The result is a dict with the keys ``"client"`` and ``"server"``, each
+    holding what ``compute_sample_weight("balanced", ...)`` returns for the
+    respective labels.
+    """
-    return class_weight.compute_sample_weight("balanced", np.vstack((client, server)).T)
+    return {
+        "client": class_weight.compute_sample_weight("balanced", client),
+        "server": class_weight.compute_sample_weight("balanced", server)
+    }


 def load_ordered_hyperband_results(path):
@ -41,6 +42,5 @@ def load_model(path, custom_objects=None):
            embd = clf.layers[1].layer
        except Exception:
            embd = clf.get_layer("domain_cnn")
-        
    
    return embd, clf