From bbd63fd1dad15e42daeff5f11279092727dc58c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Fri, 30 Jun 2017 10:12:20 +0200 Subject: [PATCH] separating logical sections into dataset, models and main. continued initial refactoring --- .gitignore | 5 +- cnnOnCnnParameterSelection.py => dataset.py | 107 +------------------- main.py | 68 +++++++++++++ models.py | 53 ++++++++++ scripts/make_csv_dataset.py | 6 ++ 5 files changed, 132 insertions(+), 107 deletions(-) rename cnnOnCnnParameterSelection.py => dataset.py (65%) create mode 100644 main.py create mode 100644 models.py create mode 100644 scripts/make_csv_dataset.py diff --git a/.gitignore b/.gitignore index e6863c9..699803c 100644 --- a/.gitignore +++ b/.gitignore @@ -96,4 +96,7 @@ ENV/ .DS_Store # data -*.tif \ No newline at end of file +*.tif +*.joblib +*.csv +*.csv.gz \ No newline at end of file diff --git a/cnnOnCnnParameterSelection.py b/dataset.py similarity index 65% rename from cnnOnCnnParameterSelection.py rename to dataset.py index a7bae38..6bcd700 100644 --- a/cnnOnCnnParameterSelection.py +++ b/dataset.py @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- import string -import keras import numpy as np import pandas as pd -from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation -from keras.models import Model -from keras.utils import np_utils from tqdm import tqdm @@ -21,18 +17,6 @@ def get_character_dict(): enumerate(string.ascii_lowercase + string.punctuation)) -def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size, - hidden_dims, drop_out): - x = y = Input(shape=(input_length,)) - y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y) - y = Conv1D(filters, kernel_size, activation='relu')(y) - y = GlobalMaxPooling1D()(y) - y = Dense(hidden_dims)(y) - y = Dropout(drop_out)(y) - y = Activation('relu')(y) - return Model(x, y) - - def get_user_chunks(dataFrame, windowSize=10, overlapping=False, maxLengthInSeconds=300): # print('maxLength: ' + str(maxLengthInSeconds)) @@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict): numCiscoFeatures = 30 try: ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])] - # print('cisco features: ' + str(ciscoFeatures)) # log transform ciscoFeatures = np.log1p(ciscoFeatures).astype(float) - # print('log transformed: ' + str(ciscoFeatures)) return ciscoFeatures.ravel() except: return np.zeros([numCiscoFeatures, ]).ravel() @@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind print("get chunks from user data frames") for i, user_flow in enumerate(get_flow_per_user(user_flow_df)): (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize, - overlapping=False, maxLengthInSeconds=maxLengthInSeconds) + overlapping=False, maxLengthInSeconds=-1) domainLists += domainListsTmp dfLists += dfListsTmp if i >= 10: @@ -193,90 +175,3 @@ def get_flow_per_user(df): users = df['user_hash'].unique().tolist() for user in users: yield df.loc[df.user_hash == user] - - -if __name__ == "__main__": - # parameter - innerCNNFilters = 512 - innerCNNKernelSize = 2 - cnnDropout = 0.5 - cnnHiddenDims = 1024 - domainFeatures = 512 - flowFeatures = 3 - numCiscoFeatures = 30 - windowSize = 10 - maxLen = 40 - embeddingSize = 100 - kernel_size = 2 - drop_out = 0.5 - filters = 2 - hidden_dims = 100 - vocabSize = 40 - threshold = 3 - minFlowsPerUser = 10 - numEpochs = 100 - maxLengthInSeconds = -1 - timesNeg = -1 - - char_dict = 
get_character_dict() - user_flow_df = get_user_flow_data() - - print("create training dataset") - (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows( - user_flow_df, char_dict, - maxLen=maxLen, threshold=threshold, windowSize=windowSize) - - pos_idx = np.where(y_tr == 1.0)[0] - neg_idx = np.where(y_tr == 0.0)[0] - - use_idx = np.concatenate((pos_idx, neg_idx)) - - y_tr = y_tr[use_idx] - # hits_tr = hits_tr[use_idx] - # names_tr = names_tr[use_idx] - for i in range(len(X_tr)): - X_tr[i] = X_tr[i][use_idx] - - # TODO: WTF? I don't get it... - sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen, - domainFeatures, kernel_size, domainFeatures, 0.5) - - inputList = [] - encodedList = [] - numFeatures = flowFeatures - for i in range(windowSize): - inputList.append(Input(shape=(maxLen,))) - encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model - inputList.append(Input(shape=(numFeatures,))) - - merge_layer_input = [] - for i in range(windowSize): - merge_layer_input.append(encodedList[i]) - merge_layer_input.append(inputList[(2 * i) + 1]) - - # We can then concatenate the two vectors: - merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) - reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector) - # add second cnn - cnn = Conv1D(filters, - kernel_size, - activation='relu', - input_shape=(windowSize, domainFeatures + numFeatures))(reshape) - # we use max pooling: - maxPool = GlobalMaxPooling1D()(cnn) - cnnDropout = Dropout(cnnDropout)(maxPool) - cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout) - cnnOutput = Dense(2, activation='softmax')(cnnDense) - - # We define a trainable model linking the - # tweet inputs to the predictions - model = Model(inputs=inputList, outputs=cnnOutput) - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - epochNumber = 0 - trainLabel = np_utils.to_categorical(y_tr, 2) - model.fit(x=X_tr, y=trainLabel, batch_size=128, - epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # , - # validation_data=(testData,testLabel)) diff --git a/main.py b/main.py new file mode 100644 index 0000000..b4f0ada --- /dev/null +++ b/main.py @@ -0,0 +1,68 @@ +import numpy as np +from keras.utils import np_utils + +import dataset +import models + + +def main(): + # parameter + innerCNNFilters = 512 + innerCNNKernelSize = 2 + cnnDropout = 0.5 + cnnHiddenDims = 1024 + domainFeatures = 512 + flowFeatures = 3 + numCiscoFeatures = 30 + windowSize = 10 + maxLen = 40 + embeddingSize = 100 + kernel_size = 2 + drop_out = 0.5 + filters = 2 + hidden_dims = 100 + vocabSize = 40 + threshold = 3 + minFlowsPerUser = 10 + numEpochs = 100 + timesNeg = -1 + + char_dict = dataset.get_character_dict() + user_flow_df = dataset.get_user_flow_data() + + print("create training dataset") + (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows( + user_flow_df, char_dict, + maxLen=maxLen, threshold=threshold, windowSize=windowSize) + + pos_idx = np.where(y_tr == 1.0)[0] + neg_idx = np.where(y_tr == 0.0)[0] + + use_idx = np.concatenate((pos_idx, neg_idx)) + + y_tr = y_tr[use_idx] + # hits_tr = hits_tr[use_idx] + # names_tr = names_tr[use_idx] + for i in range(len(X_tr)): + X_tr[i] = X_tr[i][use_idx] + + # TODO: WTF? I don't get it... 
+ shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen, + domainFeatures, kernel_size, domainFeatures, 0.5) + + model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, + cnnHiddenDims, cnnDropout) + + model.compile(optimizer='adam', + loss='binary_crossentropy', + metrics=['accuracy']) + + epochNumber = 0 + y_tr = np_utils.to_categorical(y_tr, 2) + model.fit(x=X_tr, y=y_tr, batch_size=128, + epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # , + # validation_data=(testData,testLabel)) + + +if __name__ == "__main__": + main() diff --git a/models.py b/models.py new file mode 100644 index 0000000..a1f8089 --- /dev/null +++ b/models.py @@ -0,0 +1,53 @@ +import keras +from keras.engine import Input, Model +from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape + + +def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size, + hidden_dims, drop_out): + x = y = Input(shape=(input_length,)) + y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y) + y = Conv1D(filters, kernel_size, activation='relu')(y) + y = GlobalMaxPooling1D()(y) + y = Dense(hidden_dims)(y) + y = Dropout(drop_out)(y) + y = Activation('relu')(y) + return Model(x, y) + + +def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures, + filters, h1, h2, dropout, dense): + pass + + +def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout): + inputList = [] + encodedList = [] + # TODO: ??? + for i in range(windowSize): + inputList.append(Input(shape=(maxLen,))) + encodedList.append(cnn(inputList[-1])) # add shared domain model + inputList.append(Input(shape=(numFeatures,))) + # TODO: ??? + merge_layer_input = [] + for i in range(windowSize): + merge_layer_input.append(encodedList[i]) + merge_layer_input.append(inputList[(2 * i) + 1]) + # We can then concatenate the two vectors: + merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) + reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector) + # add second cnn + cnn = Conv1D(filters, + kernel_size, + activation='relu', + input_shape=(windowSize, domainFeatures + numFeatures))(reshape) + # we use max pooling: + maxPool = GlobalMaxPooling1D()(cnn) + cnnDropout = Dropout(cnnDropout)(maxPool) + cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout) + cnnOutput = Dense(2, activation='softmax')(cnnDense) + + # We define a trainable model linking the + # tweet inputs to the predictions + model = Model(inputs=inputList, outputs=cnnOutput) + return model diff --git a/scripts/make_csv_dataset.py b/scripts/make_csv_dataset.py new file mode 100644 index 0000000..2f1b12b --- /dev/null +++ b/scripts/make_csv_dataset.py @@ -0,0 +1,6 @@ +#!/usr/bin/python2 + +import joblib + +datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib") +user_flows = datafile["data"]
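
Note (not part of the patch): the new scripts/make_csv_dataset.py only loads the joblib archive and binds `user_flows`; no CSV is actually written yet, even though the `*.csv` / `*.csv.gz` rules added to .gitignore suggest that is the intent. A minimal sketch of the missing export step, assuming `datafile["data"]` is a pandas DataFrame and using a hypothetical output filename:

```python
#!/usr/bin/python2
# Sketch only: assumes datafile["data"] is a pandas DataFrame; the output
# filename below is hypothetical and not part of the patch.
import joblib

datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
user_flows = datafile["data"]
user_flows.to_csv("user_flows.csv", index=False)  # plain CSV export of the flow table
```

Writing to a `.csv.gz` path with `compression="gzip"` (supported by newer pandas) would match the other new ignore rule.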
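
Note on the refactored model wiring (not part of the patch): `models.get_top_cnn` appends, for each of the `windowSize` positions, one domain input of shape `(maxLen,)` followed by one numeric flow input of shape `(numFeatures,)`, so the `inputList[(2 * i) + 1]` indexing flagged with `# TODO: ???` relies on that interleaved order. Accordingly, `model.fit(x=X_tr, ...)` in main.py expects `X_tr` to be a list of `2 * windowSize` arrays in the same order. A minimal illustration with random dummy data, assuming the default hyperparameters from `main()`:

```python
# Sketch only: dummy inputs illustrating the interleaved list expected by
# models.get_top_cnn / model.fit; values are random and not part of the patch.
import numpy as np

n_samples, windowSize, maxLen, flowFeatures = 32, 10, 40, 3

X_dummy = []
for i in range(windowSize):
    X_dummy.append(np.random.randint(0, 40, size=(n_samples, maxLen)))  # domain character indices
    X_dummy.append(np.random.rand(n_samples, flowFeatures))             # numeric flow features

# len(X_dummy) == 2 * windowSize, matching the Input layers created in
# get_top_cnn: [domain_0, flow_0, domain_1, flow_1, ...].
```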