add bulk embedding visualization and deep1 network

René Knaebel 2017-10-09 14:19:01 +02:00
parent 33063f3081
commit a686f147f0
7 changed files with 151 additions and 30 deletions


@@ -106,6 +106,7 @@ parser.add_argument("--out-prefix", action="store", dest="output_prefix",
#
parser.add_argument("--stop_early", action="store_true", dest="stop_early")
parser.add_argument("--balanced_weights", action="store_true", dest="class_weights")
parser.add_argument("--sample_weights", action="store_true", dest="sample_weights")
parser.add_argument("--gpu", action="store_true", dest="gpu")
parser.add_argument("--new_model", action="store_true", dest="new_model")


@@ -1,24 +1,24 @@
#!/usr/bin/env bash
RESDIR=$1
DATADIR=$2
N1=$1
N2=$2
RESDIR=$3
DATADIR=$4
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final --data ${DATADIR}/futureData.csv --model_output both
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter --data ${DATADIR}/futureData.csv --model_output both
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered --data ${DATADIR}/futureData.csv --model_output both
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final --data ${DATADIR}/futureData.csv --model_output client
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_inter --data ${DATADIR}/futureData.csv --model_output client
for ((i = ${N1}; i <= ${N2}; i++))
do
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_final_${i} --data ${DATADIR} --model_output client
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_final_${i} --data ${DATADIR} --model_output both
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_inter_${i} --data ${DATADIR} --model_output both
python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_staggered_${i} --data ${DATADIR} --model_output both
done
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_final --data ${DATADIR}/futureData.csv --model_output both
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_inter --data ${DATADIR}/futureData.csv --model_output both
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_final --data ${DATADIR}/futureData.csv --model_output client
#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_inter --data ${DATADIR}/futureData.csv --model_output client
python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
python3 main.py --mode all_fancy --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered
#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
# --models ${RESDIR}/*_small_*/ --out-prefix ${RESDIR}/small
#python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
# --models ${RESDIR}/*_medium_*/ --out-prefix ${RESDIR}/medium
python3 main.py --mode all_fancy --batch 256 --data ${DATADIR}/futureData.csv \
--models ${RESDIR}/*/ --out-prefix ${RESDIR}/all
python3 main.py --mode beta --batch 1024 --models ${RESDIR}/client_final_*/ --data ${DATADIR} --model_output client --out-prefix ${RESDIR}/client_final
python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_final_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_final
python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_inter_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_inter
python3 main.py --mode beta --batch 1024 --models ${RESDIR}/both_staggered_*/ --data ${DATADIR} --model_output both --out-prefix ${RESDIR}/both_staggered

main.py

@@ -57,7 +57,7 @@ if args.gpu:
PARAMS = {
"type": args.model_type,
"depth": args.model_depth,
# "batch_size": 64,
"batch_size": args.batch_size,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
@@ -72,7 +72,6 @@ PARAMS = {
'filter_main': args.filter_main,
'dense_main': args.dense_main,
'kernel_main': args.kernel_main,
'input_length': 40,
'model_output': args.model_output
}
@@ -101,7 +100,6 @@ def main_hyperband():
"window_size": [args.window],
"flow_features": [3],
"domain_length": [args.domain_length],
'input_length': [40],
# model params
"embedding_size": [2 ** x for x in range(3, 7)],
"filter_embedding": [2 ** x for x in range(1, 10)],
@@ -133,6 +131,10 @@ def main_hyperband():
return results
def train(parameters, features, labels):
pass
def main_train(param=None):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
@@ -473,7 +475,48 @@ def main_visualize_all():
vis(args.output_prefix, dfs, df_paul, "user", "prc")
logger.info("plot user roc curves")
vis(args.output_prefix, dfs, df_paul, "user", "roc")
def main_visualize_all_embds():
import matplotlib.pyplot as plt
_, _, name_val, hits_vt, hits_trusted, server_val = dataset.load_or_generate_raw_h5data(args.data,
args.data,
args.domain_length,
args.window)
def load_df(path):
res = dataset.load_predictions(path)
return res["domain_embds"]
dfs = [(model_args["model_name"], load_df(model_args["model_path"])) for model_args in get_model_args(args)]
plt.clf()
from sklearn.decomposition import TruncatedSVD
def vis(ax, domain_embedding, labels):
red = TruncatedSVD(n_components=2)
# draw only a random subset of the predictions
idx = np.random.choice(np.arange(len(domain_embedding)), 5000)
domain_embedding = domain_embedding[idx]
labels = labels[idx]
domain_reduced = red.fit_transform(domain_embedding)
ax.scatter(domain_reduced[:, 0],
domain_reduced[:, 1],
c=(labels * (1, 2)).sum(1).astype(int),
cmap=plt.cm.plasma,
s=3,
alpha=0.1)
domain_encs, labels = dataset.load_or_generate_domains(args.data, args.domain_length)
fig, axes = plt.subplots(nrows=5, ncols=4)
for (model_name, embd), ax in zip(dfs, axes.flat):
logger.info(f"plot embedding for {model_name}")
vis(ax, embd, labels)
visualize.plot_save("{}_svd.png".format(args.output_prefix, 600))
import joblib
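A quick illustration (not part of the commit) of the colour encoding used in the scatter plot above: the (client, server) label pair is collapsed into a single integer class by weighting the columns with (1, 2).

import numpy as np

labels = np.array([[0, 0], [1, 0], [0, 1], [1, 1]])
print((labels * (1, 2)).sum(1).astype(int))  # -> [0 1 2 3], one class per (client, server) combination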
@@ -709,6 +752,7 @@ def vis_server():
visualize.plot_legend()
visualize.plot_save("results/server_model/windows_roc.pdf")
def main():
if "train" == args.mode:
main_train()
@@ -730,6 +774,8 @@ def main():
train_server_only()
if "server_test" == args.mode:
test_server_only()
if "embedding" == args.mode:
main_visualize_all_embds()
if __name__ == "__main__":


@@ -1,5 +1,6 @@
import keras.backend as K
from models import deep1
from models.renes_networks import selu
from . import flat_2, pauls_networks, renes_networks
@@ -10,7 +11,6 @@ def get_models_by_params(params: dict):
# network_type = params.get("type")
network_depth = params.get("depth")
embedding_size = params.get("embedding")
input_length = params.get("input_length")
filter_embedding = params.get("filter_embedding")
kernel_embedding = params.get("kernel_embedding")
hidden_embedding = params.get("dense_embedding")
@@ -32,7 +32,7 @@ def get_models_by_params(params: dict):
networks = renes_networks
else:
raise Exception("network not found")
embedding_model = networks.get_embedding(embedding_size, input_length, filter_embedding, kernel_embedding,
embedding_model = networks.get_embedding(embedding_size, domain_length, filter_embedding, kernel_embedding,
hidden_embedding, 0.5)
old_model = networks.get_model(0.25, flow_features, hidden_embedding, window_size, domain_length,
@@ -63,6 +63,8 @@ def get_server_model_by_params(params: dict):
elif network_depth == "flat2":
networks = flat_2
elif network_depth == "deep1":
networks = deep1
elif network_depth == "deep2":
networks = renes_networks
else:
raise Exception("network not found")

models/deep1.py (new file)

@@ -0,0 +1,70 @@
from collections import namedtuple
import keras
from keras.engine import Input, Model as KerasModel
from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, TimeDistributed
import dataset
Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"])
def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
y = Conv1D(filter_size, kernel_size=kernel_size, activation="relu")(y)
y = Conv1D(filter_size, kernel_size=3, activation="relu")(y)
y = Conv1D(filter_size, kernel_size=3, activation="relu")(y)
y = GlobalAveragePooling1D()(y)
y = Dense(hidden_dims, activation="relu")(y)
return KerasModel(x, y)
def get_model(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn, model_output="both"):
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# CNN processing small slices of the flow window
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation="relu", padding="same",
input_shape=(window_size, domain_features + flow_features))(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(dense_dim, activation="relu")(y)
y = Dense(dense_dim, activation="relu")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
out_server = Dense(1, activation='sigmoid', name="server")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim, cnn, model_output="both"):
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
encoded = TimeDistributed(cnn, name="domain_cnn")(ipt_domains)
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim, activation="relu")(merged)
y = Dense(dense_dim,
activation="relu",
name="dense_server")(y)
out_server = Dense(1, activation="sigmoid", name="server")(y)
merged = keras.layers.concatenate([merged, y], -1)
# CNN processing small slices of the flow window
y = Conv1D(filters=cnn_dims,
kernel_size=kernel_size,
activation="relu",
padding="same",
input_shape=(window_size, domain_features + flow_features))(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(dropout)(y)
y = Dense(dense_dim, activation="relu")(y)
y = Dense(dense_dim,
activation="relu",
name="dense_client")(y)
out_client = Dense(1, activation='sigmoid', name="client")(y)
return Model(ipt_domains, ipt_flows, out_client, out_server)
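For orientation, a minimal sketch (not part of the commit) of how these two builders are presumably wired together elsewhere in the repository: the character-level embedding CNN is built first, then passed as cnn into get_model, whose named outputs feed a two-headed Keras model. All hyperparameter values below are illustrative placeholders.

from keras.models import Model as KerasModel

from models import deep1

# illustrative hyperparameters, not taken from the commit
embedding_cnn = deep1.get_embedding(embedding_size=64, input_length=40,
                                    filter_size=128, kernel_size=3,
                                    hidden_dims=128)
net = deep1.get_model(cnnDropout=0.25, flow_features=3, domain_features=128,
                      window_size=10, domain_length=40, cnn_dims=128,
                      kernel_size=3, dense_dim=128, cnn=embedding_cnn)
clf = KerasModel(inputs=[net.in_domains, net.in_flows],
                 outputs=[net.out_client, net.out_server])
clf.compile(optimizer="adam", loss="binary_crossentropy")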


@@ -95,6 +95,8 @@ def get_server_model(flow_features, domain_length, dense_dim, cnn):
ipt_domains = Input(shape=(domain_length,), name="ipt_domains")
ipt_flows = Input(shape=(flow_features,), name="ipt_flows")
encoded = cnn(ipt_domains)
cnn.name = "domain_cnn"
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
y = Dense(dense_dim,
activation="relu",

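Presumably the point of naming the shared CNN here is the loader fallback in utils.load_model further below, which looks the embedding up by name when it is not wrapped in a TimeDistributed layer. A minimal sketch of that lookup, assuming a previously saved server-only model; the path is hypothetical.

from keras.models import load_model

clf = load_model("results/server_model/clf.h5")  # hypothetical path
domain_cnn = clf.get_layer("domain_cnn")         # found via the name set above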

@@ -13,16 +13,17 @@ def exists_or_make_path(p):
def get_custom_class_weights(client, server):
client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
return {
"client": client_class_weight,
"server": server_class_weight
"client": class_weight.compute_class_weight('balanced', np.unique(client), client),
"server": class_weight.compute_class_weight('balanced', np.unique(server), server)
}
def get_custom_sample_weights(client, server):
return class_weight.compute_sample_weight("balanced", np.vstack((client, server)).T)
return {
"client": class_weight.compute_sample_weight("balanced", client),
"server": class_weight.compute_sample_weight("balanced", server)
}
def load_ordered_hyperband_results(path):
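A minimal, self-contained sketch (not from the commit) of what the new per-output sample weights compute and how they could be fed to a two-headed Keras model; the random label arrays and the model in the commented fit() call are placeholders. Keras accepts a dict keyed by output name for sample_weight, matching the dict returned above.

import numpy as np
from sklearn.utils import class_weight

# placeholder labels for demonstration
client_labels = np.random.randint(0, 2, size=1000)
server_labels = np.random.randint(0, 2, size=1000)
weights = {
    "client": class_weight.compute_sample_weight("balanced", client_labels),
    "server": class_weight.compute_sample_weight("balanced", server_labels),
}
# passed to a two-headed model, e.g.:
# model.fit(x, {"client": client_labels, "server": server_labels},
#           sample_weight=weights, batch_size=1024)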
@@ -41,6 +42,5 @@ def load_model(path, custom_objects=None):
embd = clf.layers[1].layer
except Exception:
embd = clf.get_layer("domain_cnn")
return embd, clf