diff --git a/dataset.py b/dataset.py index 4369ac3..2cc2ffd 100644 --- a/dataset.py +++ b/dataset.py @@ -18,6 +18,10 @@ def get_character_dict(): return chars +def get_vocab_size(): + return len(chars) + 1 + + def encode_char(c): if c in chars: return chars[c] diff --git a/main.py b/main.py index ae14564..37568ed 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,6 @@ import pandas as pd import tensorflow as tf from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping from keras.models import load_model -from sklearn.utils import class_weight import arguments import dataset @@ -16,7 +15,7 @@ import models # create logger import visualize from dataset import load_or_generate_h5data -from utils import exists_or_make_path +from utils import exists_or_make_path, get_custom_class_weights logger = logging.getLogger('logger') logger.setLevel(logging.DEBUG) @@ -54,22 +53,39 @@ if args.gpu: config.gpu_options.allow_growth = True session = tf.Session(config=config) +# default parameter +PARAMS = { + "type": args.model_type, + "batch_size": 64, + "window_size": args.window, + "domain_length": args.domain_length, + "flow_features": 3, + # + 'dropout': 0.5, + 'domain_features': args.domain_embedding, + 'embedding_size': args.embedding, + 'filter_main': 64, + 'flow_features': 3, + # 'dense_main': 512, + 'dense_main': 64, + 'filter_embedding': args.hidden_char_dims, + 'hidden_embedding': args.domain_embedding, + 'kernel_embedding': 3, + 'kernels_main': 3, + 'input_length': 40 +} + def main_paul_best(): - char_dict = dataset.get_character_dict() pauls_best_params = models.pauls_networks.best_config - pauls_best_params["vocab_size"] = len(char_dict) + 1 main_train(pauls_best_params) def main_hyperband(): - char_dict = dataset.get_character_dict() - params = { # static params "type": ["paul"], "batch_size": [args.batch_size], - "vocab_size": [len(char_dict) + 1], "window_size": [10], "domain_length": [40], "flow_features": [3], @@ -96,50 +112,16 @@ def main_hyperband(): json.dump(results, open("hyperband.json")) -def get_custom_class_weights(client_tr, server_tr): - client = client_tr.value if type(client_tr) != np.ndarray else client_tr - server = server_tr.value if type(server_tr) != np.ndarray else server_tr - client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client) - server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server) - return { - "client": client_class_weight, - "server": server_class_weight - } - - -def main_train(param=None): +def main_train(param=None, train_new_model=False): exists_or_make_path(args.model_path) - char_dict = dataset.get_character_dict() domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data, args.domain_length, args.window) - # parameter - p = { - "type": args.model_type, - "batch_size": 64, - "window_size": args.window, - "domain_length": args.domain_length, - "flow_features": 3, - "vocab_size": len(char_dict) + 1, - # - 'dropout': 0.5, - 'domain_features': args.domain_embedding, - 'embedding_size': args.embedding, - 'filter_main': 64, - 'flow_features': 3, - # 'dense_main': 512, - 'dense_main': 64, - 'filter_embedding': args.hidden_char_dims, - 'hidden_embedding': args.domain_embedding, - 'kernel_embedding': 3, - 'kernels_main': 3, - 'input_length': 40 - } if not param: - param = p + param = PARAMS - embedding, model, _ = models.get_models_by_params(param) + embedding, model, new_model = models.get_models_by_params(param) embedding.summary() model.summary() logger.info("define callbacks") @@ -155,20 +137,26 @@ def main_train(param=None): verbose=False)) logger.info("compile model") custom_metrics = models.get_metric_functions() - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy'] + custom_metrics) server_tr = np.max(server_windows_tr, axis=1) if args.class_weights: logger.info("class weights: compute custom weights") - custom_class_weights = get_custom_class_weights(client_tr, server_tr) + custom_class_weights = get_custom_class_weights(client_tr.value, server_tr) logger.info(custom_class_weights) else: logger.info("class weights: set default") custom_class_weights = None logger.info("start training") + + if train_new_model: + server_tr = np.expand_dims(server_windows_tr, 2) + model = new_model + + model.compile(optimizer='adam', + loss='binary_crossentropy', + metrics=['accuracy'] + custom_metrics) + model.fit([domain_tr, flow_tr], [client_tr, server_tr], batch_size=args.batch_size, @@ -185,105 +173,29 @@ def main_test(): domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data, args.domain_length, args.window) clf = load_model(args.clf_model, custom_objects=models.get_metrics()) - # stats = clf.evaluate([domain_val, flow_val], - # [client_val, server_val], - # batch_size=args.batch_size) y_pred = clf.predict([domain_val, flow_val], batch_size=args.batch_size, verbose=1) np.save(args.future_prediction, y_pred) - char_dict = dataset.get_character_dict() - user_flow_df = dataset.get_user_flow_data(args.test_data) - domains = user_flow_df.domain.unique()[:-1] - - def get_domain_features_reduced(d): - return dataset.get_domain_features(d[0], char_dict, args.domain_length) - - domain_features = [] - for ds in domains: - domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds))) - - model = load_model(args.embedding_model) - domain_features = np.stack(domain_features).reshape((-1, 40)) - pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1) - - np.save("/tmp/rk/domains.npy", domains) - np.save("/tmp/rk/domain_features.npy", domain_features) - np.save("/tmp/rk/domain_embd.npy", pred) - - -def main_new_model(): - exists_or_make_path(args.model_path) - - char_dict = dataset.get_character_dict() - domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data, - args.domain_length, args.window) - - # parameter - p = { - "type": args.model_type, - "batch_size": 64, - "window_size": args.window, - "domain_length": args.domain_length, - "flow_features": 3, - "vocab_size": len(char_dict) + 1, - # - 'dropout': 0.5, - 'domain_features': args.domain_embedding, - 'embedding_size': args.embedding, - 'filter_main': 64, - 'flow_features': 3, - # 'dense_main': 512, - 'dense_main': 64, - 'filter_embedding': args.hidden_char_dims, - 'hidden_embedding': args.domain_embedding, - 'kernel_embedding': 3, - 'kernels_main': 3, - 'input_length': 40 - } - - embedding, _, model = models.get_models_by_params(p) - embedding.summary() - model.summary() - logger.info("define callbacks") - callbacks = [] - callbacks.append(ModelCheckpoint(filepath=args.clf_model, - monitor='val_loss', - verbose=False, - save_best_only=True)) - callbacks.append(CSVLogger(args.train_log)) - if args.stop_early: - callbacks.append(EarlyStopping(monitor='val_loss', - patience=5, - verbose=False)) - logger.info("compile model") - custom_metrics = models.get_metric_functions() - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy'] + custom_metrics) - - server_tr = np.max(server_windows_tr, axis=1) - - if args.class_weights: - logger.info("class weights: compute custom weights") - custom_class_weights = get_custom_class_weights(client_tr, server_tr) - logger.info(custom_class_weights) - else: - logger.info("class weights: set default") - custom_class_weights = None - logger.info("start training") - server_tr = np.expand_dims(server_windows_tr, 2) - model.fit([domain_tr, flow_tr], - [client_tr, server_tr], - batch_size=args.batch_size, - epochs=args.epochs, - callbacks=callbacks, - shuffle=True, - validation_split=0.2, - class_weight=custom_class_weights) - logger.info("save embedding") - embedding.save(args.embedding_model) + # char_dict = dataset.get_character_dict() + # user_flow_df = dataset.get_user_flow_data(args.test_data) + # domains = user_flow_df.domain.unique()[:-1] + # + # def get_domain_features_reduced(d): + # return dataset.get_domain_features(d[0], char_dict, args.domain_length) + # + # domain_features = [] + # for ds in domains: + # domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds))) + # + # model = load_model(args.embedding_model) + # domain_features = np.stack(domain_features).reshape((-1, 40)) + # pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1) + # + # np.save("/tmp/rk/domains.npy", domains) + # np.save("/tmp/rk/domain_features.npy", domain_features) + # np.save("/tmp/rk/domain_embd.npy", pred) def main_embedding(): @@ -360,7 +272,7 @@ def main(): if "data" in args.modes: main_data() if "train_new" in args.modes: - main_new_model() + main_train(train_new_model=True) if __name__ == "__main__": diff --git a/models/__init__.py b/models/__init__.py index 49ba780..c6d883a 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,6 +1,5 @@ import keras.backend as K -import dataset from . import pauls_networks from . import renes_networks @@ -9,7 +8,6 @@ def get_models_by_params(params: dict): # decomposing param section # mainly embedding model network_type = params.get("type") - vocab_size = len(dataset.get_character_dict()) + 1 embedding_size = params.get("embedding_size") input_length = params.get("input_length") filter_embedding = params.get("filter_embedding") @@ -26,8 +24,8 @@ def get_models_by_params(params: dict): dense_dim = params.get("dense_main") # create models networks = renes_networks if network_type == "rene" else pauls_networks - embedding_model = networks.get_embedding(vocab_size, embedding_size, input_length, - filter_embedding, kernel_embedding, hidden_embedding, drop_out=dropout) + embedding_model = networks.get_embedding(embedding_size, input_length, filter_embedding, kernel_embedding, + hidden_embedding, drop_out=dropout) predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length, filter_main, kernel_main, dense_dim, embedding_model) diff --git a/models/pauls_networks.py b/models/pauls_networks.py index fa234d1..45f6aa5 100644 --- a/models/pauls_networks.py +++ b/models/pauls_networks.py @@ -2,6 +2,8 @@ import keras from keras.engine import Input, Model from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed +import dataset + best_config = { "type": "paul", "batch_size": 64, @@ -24,11 +26,10 @@ best_config = { } -def get_embedding(vocab_size, embedding_size, input_length, - filters, kernel_size, hidden_dims, drop_out=0.5): +def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5): x = y = Input(shape=(input_length,)) - y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y) - y = Conv1D(filters, kernel_size, activation='relu')(y) + y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y) + y = Conv1D(filter_size, kernel_size, activation='relu')(y) y = GlobalMaxPooling1D()(y) y = Dropout(drop_out)(y) y = Dense(hidden_dims)(y) diff --git a/models/renes_networks.py b/models/renes_networks.py index 44f28bc..a9c3af9 100644 --- a/models/renes_networks.py +++ b/models/renes_networks.py @@ -3,11 +3,12 @@ from keras.engine import Input, Model from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, TimeDistributed, MaxPool1D, \ GlobalAveragePooling1D +import dataset -def get_embedding(vocab_size, embedding_size, input_length, - filter_size, kernel_size, hidden_dims, drop_out=0.5): + +def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5): x = y = Input(shape=(input_length,)) - y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y) + y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y) y = Conv1D(filter_size, kernel_size=5, activation='relu')(y) y = Conv1D(filter_size, kernel_size=3, activation='relu')(y) y = Conv1D(filter_size, kernel_size=3, activation='relu')(y) diff --git a/utils.py b/utils.py index fb9b783..187e9d3 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,18 @@ import os +import numpy as np +from sklearn.utils import class_weight + def exists_or_make_path(p): if not os.path.exists(p): os.makedirs(p) + + +def get_custom_class_weights(client, server): + client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client) + server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server) + return { + "client": client_class_weight, + "server": server_class_weight + }