diff --git a/.gitignore b/.gitignore
index 699803c..e834a02 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,4 +99,5 @@ ENV/
 *.tif
 *.joblib
 *.csv
-*.csv.gz
\ No newline at end of file
+*.csv.gz
+*.csv.tar.*
\ No newline at end of file
diff --git a/main.py b/main.py
index 5b0da5a..7d95719 100644
--- a/main.py
+++ b/main.py
@@ -37,9 +37,21 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 # parser.add_argument("--samples_val", action="store", dest="samples_val",
 #                     default=10000, type=int)
 #
-# parser.add_argument("--area", action="store", dest="area_size",
-#                     default=25, type=int)
-#
+parser.add_argument("--embd", action="store", dest="embedding",
+                    default=128, type=int)
+
+parser.add_argument("--hidden_char_dims", action="store", dest="hidden_char_dims",
+                    default=256, type=int)
+
+parser.add_argument("--window", action="store", dest="window",
+                    default=10, type=int)
+
+parser.add_argument("--domain_length", action="store", dest="domain_length",
+                    default=40, type=int)
+
+parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
+                    default=512, type=int)
+
 # parser.add_argument("--queue", action="store", dest="queue_size",
 #                     default=50, type=int)
 #
@@ -59,6 +71,7 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 
 args = parser.parse_args()
 
+
 # config = tf.ConfigProto(log_device_placement=True)
 # config.gpu_options.per_process_gpu_memory_fraction = 0.5
 # config.gpu_options.allow_growth = True
@@ -67,24 +80,17 @@ args = parser.parse_args()
 
 def main():
     # parameter
-    innerCNNFilters = 512
-    innerCNNKernelSize = 2
     cnnDropout = 0.5
     cnnHiddenDims = 1024
-    domainFeatures = 512
     flowFeatures = 3
     numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    embeddingSize = 100
-    kernel_size = 2
+    kernel_size = 3
     drop_out = 0.5
-    filters = 2
+    filters = 128
     hidden_dims = 100
     vocabSize = 40
     threshold = 3
     minFlowsPerUser = 10
-    numEpochs = 100
 
     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data()
@@ -92,7 +98,7 @@ def main():
     print("create training dataset")
     (X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
         user_flow_df, char_dict,
-        max_len=maxLen, window_size=windowSize)
+        max_len=args.domain_length, window_size=args.window)
     # make client labels discrete with 4 different values
     # TODO: use trusted_hits_tr for client classification too
     client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
@@ -104,11 +110,14 @@ def main():
     client_labels = client_labels[idx]
     server_labels = server_tr[idx]
 
-    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+    shared_cnn = models.get_embedding_network_rene(len(char_dict) + 1, args.embedding, args.domain_length,
+                                                   args.hidden_char_dims, args.domain_embedding, 0.5)
+    shared_cnn.summary()
 
-    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
-                               cnnHiddenDims, cnnDropout)
+    model = models.get_top_cnn_rene(cnnDropout, flowFeatures, args.domain_embedding,
+                                    args.window, args.domain_length, filters, kernel_size,
+                                    cnnHiddenDims, shared_cnn)
+    model.summary()
 
     model.compile(optimizer='adam',
                   loss='binary_crossentropy',
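For reference, the new flags store to differently named attributes (`--embd` fills `args.embedding`, `--domain_embd` fills `args.domain_embedding`), which is why `main()` above reads names that do not match the option strings. A standalone sketch of the wiring, runnable without the rest of main.py:

```python
# Sketch of the five new hyperparameter flags and their dest names, copied from
# the hunk above; parsing an empty argv yields the defaults that main() consumes.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--embd", action="store", dest="embedding", default=128, type=int)
parser.add_argument("--hidden_char_dims", action="store", dest="hidden_char_dims", default=256, type=int)
parser.add_argument("--window", action="store", dest="window", default=10, type=int)
parser.add_argument("--domain_length", action="store", dest="domain_length", default=40, type=int)
parser.add_argument("--domain_embd", action="store", dest="domain_embedding", default=512, type=int)

args = parser.parse_args([])
print(args.embedding, args.hidden_char_dims, args.window,
      args.domain_length, args.domain_embedding)  # 128 256 10 40 512
```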
diff --git a/models.py b/models.py
index a513ac0..f381310 100644
--- a/models.py
+++ b/models.py
@@ -1,10 +1,11 @@
 import keras
 from keras.engine import Input, Model
-from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed, MaxPool1D
 
 
-def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
-                   hidden_dims, drop_out):
+# designed by paul
+def get_embedding_network_paul(vocab_size, embedding_size, input_length, filters, kernel_size,
+                               hidden_dims, drop_out=0.5):
     x = y = Input(shape=(input_length,))
     y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
     y = Conv1D(filters, kernel_size, activation='relu')(y)
@@ -15,26 +16,65 @@ def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_siz
     return Model(x, y)
 
 
+def get_embedding_network_rene(vocab_size, embedding_size, input_length,
+                               hidden_char_dims, hidden_dims, drop_out=0.5):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=5, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
 def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                    filters, h1, h2, dropout, dense):
     pass
 
 
-def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
-    ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
+# designed by paul
+def get_top_cnn(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                dense_dim,
+                cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
     encoded = TimeDistributed(cnn)(ipt_domains)
-    ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
     merged = keras.layers.concatenate([encoded, ipt_flows], -1)
-    # add second cnn
-    y = Conv1D(filters,
+    # CNN processing small slices of the flow window
+    # TODO: add more layers?
+    y = Conv1D(cnn_dims,
                kernel_size,
                activation='relu',
-               input_shape=(windowSize, domainFeatures + numFeatures))(merged)
-    # TODO: why global pooling? -> 3D to 2D
-    # we use max pooling:
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
     y = GlobalMaxPooling1D()(y)
     y = Dropout(cnnDropout)(y)
-    y = Dense(cnnHiddenDims, activation='relu')(y)
+    y = Dense(dense_dim, activation='relu')(y)
+    y1 = Dense(2, activation='softmax', name="client")(y)
+    y2 = Dense(2, activation='softmax', name="server")(y)
+
+    return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+
+def get_top_cnn_rene(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                     dense_dim, cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    encoded = TimeDistributed(cnn)(ipt_domains)
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
+    # CNN processing small slices of the flow window
+    # TODO: add more layers?
+    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(cnnDropout)(y)
+    y = Dense(dense_dim, activation='relu')(y)
     y1 = Dense(2, activation='softmax', name="client")(y)
     y2 = Dense(2, activation='softmax', name="server")(y)
 
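A minimal smoke test for the two new builders, mirroring the call chain in main.py; the vocabulary size of 41 is an illustrative stand-in for `len(char_dict) + 1`, and all other values are the defaults introduced in this diff:

```python
# Hypothetical shape/compile check: build the rene embedding network, apply it
# per-domain via TimeDistributed inside get_top_cnn_rene, and compile both heads.
from models import get_embedding_network_rene, get_top_cnn_rene

VOCAB_SIZE = 41  # assumption: stands in for len(char_dict) + 1

domain_cnn = get_embedding_network_rene(VOCAB_SIZE, embedding_size=128, input_length=40,
                                        hidden_char_dims=256, hidden_dims=512, drop_out=0.5)
model = get_top_cnn_rene(cnnDropout=0.5, flow_features=3, domain_features=512,
                         window_size=10, domain_length=40, cnn_dims=128,
                         kernel_size=3, dense_dim=1024, cnn=domain_cnn)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()  # inputs (10, 40) and (10, 3); outputs "client" and "server"
```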
diff --git a/scripts/make_csv_dataset.py b/scripts/make_csv_dataset.py
index 3191925..2b2a92d 100644
--- a/scripts/make_csv_dataset.py
+++ b/scripts/make_csv_dataset.py
@@ -4,7 +4,6 @@ import joblib
 import pandas as pd
 
 df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
-df = df["data"]
-df = pd.concat(df)
+df = pd.concat(df["data"])
 df.reset_index(inplace=True)
-df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
+df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
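To sanity-check the export, the file can be read straight back; pandas infers gzip from the `.gz` suffix, so the explicit `compression` argument simply mirrors the writer above.

```python
# Hypothetical read-back of the exported dataset for a quick sanity check.
import pandas as pd

df = pd.read_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
print(df.shape)
print(df.columns.tolist())
```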