From bbd63fd1dad15e42daeff5f11279092727dc58c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Fri, 30 Jun 2017 10:12:20 +0200 Subject: [PATCH] separating logical sections into dataset, models and main. continued initial refactoring --- .gitignore | 5 +- cnnOnCnnParameterSelection.py => dataset.py | 107 +------------------- main.py | 68 +++++++++++++ models.py | 53 ++++++++++ scripts/make_csv_dataset.py | 6 ++ 5 files changed, 132 insertions(+), 107 deletions(-) rename cnnOnCnnParameterSelection.py => dataset.py (65%) create mode 100644 main.py create mode 100644 models.py create mode 100644 scripts/make_csv_dataset.py diff --git a/.gitignore b/.gitignore index e6863c9..699803c 100644 --- a/.gitignore +++ b/.gitignore @@ -96,4 +96,7 @@ ENV/ .DS_Store # data -*.tif \ No newline at end of file +*.tif +*.joblib +*.csv +*.csv.gz \ No newline at end of file diff --git a/cnnOnCnnParameterSelection.py b/dataset.py similarity index 65% rename from cnnOnCnnParameterSelection.py rename to dataset.py index a7bae38..6bcd700 100644 --- a/cnnOnCnnParameterSelection.py +++ b/dataset.py @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- import string -import keras import numpy as np import pandas as pd -from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation -from keras.models import Model -from keras.utils import np_utils from tqdm import tqdm @@ -21,18 +17,6 @@ def get_character_dict(): enumerate(string.ascii_lowercase + string.punctuation)) -def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size, - hidden_dims, drop_out): - x = y = Input(shape=(input_length,)) - y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y) - y = Conv1D(filters, kernel_size, activation='relu')(y) - y = GlobalMaxPooling1D()(y) - y = Dense(hidden_dims)(y) - y = Dropout(drop_out)(y) - y = Activation('relu')(y) - return Model(x, y) - - def get_user_chunks(dataFrame, windowSize=10, overlapping=False, maxLengthInSeconds=300): # print('maxLength: ' + str(maxLengthInSeconds)) @@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict): numCiscoFeatures = 30 try: ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])] - # print('cisco features: ' + str(ciscoFeatures)) # log transform ciscoFeatures = np.log1p(ciscoFeatures).astype(float) - # print('log transformed: ' + str(ciscoFeatures)) return ciscoFeatures.ravel() except: return np.zeros([numCiscoFeatures, ]).ravel() @@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind print("get chunks from user data frames") for i, user_flow in enumerate(get_flow_per_user(user_flow_df)): (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize, - overlapping=False, maxLengthInSeconds=maxLengthInSeconds) + overlapping=False, maxLengthInSeconds=-1) domainLists += domainListsTmp dfLists += dfListsTmp if i >= 10: @@ -193,90 +175,3 @@ def get_flow_per_user(df): users = df['user_hash'].unique().tolist() for user in users: yield df.loc[df.user_hash == user] - - -if __name__ == "__main__": - # parameter - innerCNNFilters = 512 - innerCNNKernelSize = 2 - cnnDropout = 0.5 - cnnHiddenDims = 1024 - domainFeatures = 512 - flowFeatures = 3 - numCiscoFeatures = 30 - windowSize = 10 - maxLen = 40 - embeddingSize = 100 - kernel_size = 2 - drop_out = 0.5 - filters = 2 - hidden_dims = 100 - vocabSize = 40 - threshold = 3 - minFlowsPerUser = 10 - numEpochs = 100 - maxLengthInSeconds = -1 - timesNeg = -1 - - char_dict = 
get_character_dict() - user_flow_df = get_user_flow_data() - - print("create training dataset") - (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows( - user_flow_df, char_dict, - maxLen=maxLen, threshold=threshold, windowSize=windowSize) - - pos_idx = np.where(y_tr == 1.0)[0] - neg_idx = np.where(y_tr == 0.0)[0] - - use_idx = np.concatenate((pos_idx, neg_idx)) - - y_tr = y_tr[use_idx] - # hits_tr = hits_tr[use_idx] - # names_tr = names_tr[use_idx] - for i in range(len(X_tr)): - X_tr[i] = X_tr[i][use_idx] - - # TODO: WTF? I don't get it... - sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen, - domainFeatures, kernel_size, domainFeatures, 0.5) - - inputList = [] - encodedList = [] - numFeatures = flowFeatures - for i in range(windowSize): - inputList.append(Input(shape=(maxLen,))) - encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model - inputList.append(Input(shape=(numFeatures,))) - - merge_layer_input = [] - for i in range(windowSize): - merge_layer_input.append(encodedList[i]) - merge_layer_input.append(inputList[(2 * i) + 1]) - - # We can then concatenate the two vectors: - merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) - reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector) - # add second cnn - cnn = Conv1D(filters, - kernel_size, - activation='relu', - input_shape=(windowSize, domainFeatures + numFeatures))(reshape) - # we use max pooling: - maxPool = GlobalMaxPooling1D()(cnn) - cnnDropout = Dropout(cnnDropout)(maxPool) - cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout) - cnnOutput = Dense(2, activation='softmax')(cnnDense) - - # We define a trainable model linking the - # tweet inputs to the predictions - model = Model(inputs=inputList, outputs=cnnOutput) - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - epochNumber = 0 - trainLabel = np_utils.to_categorical(y_tr, 2) - model.fit(x=X_tr, y=trainLabel, batch_size=128, - epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # , - # validation_data=(testData,testLabel)) diff --git a/main.py b/main.py new file mode 100644 index 0000000..b4f0ada --- /dev/null +++ b/main.py @@ -0,0 +1,68 @@ +import numpy as np +from keras.utils import np_utils + +import dataset +import models + + +def main(): + # parameter + innerCNNFilters = 512 + innerCNNKernelSize = 2 + cnnDropout = 0.5 + cnnHiddenDims = 1024 + domainFeatures = 512 + flowFeatures = 3 + numCiscoFeatures = 30 + windowSize = 10 + maxLen = 40 + embeddingSize = 100 + kernel_size = 2 + drop_out = 0.5 + filters = 2 + hidden_dims = 100 + vocabSize = 40 + threshold = 3 + minFlowsPerUser = 10 + numEpochs = 100 + timesNeg = -1 + + char_dict = dataset.get_character_dict() + user_flow_df = dataset.get_user_flow_data() + + print("create training dataset") + (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows( + user_flow_df, char_dict, + maxLen=maxLen, threshold=threshold, windowSize=windowSize) + + pos_idx = np.where(y_tr == 1.0)[0] + neg_idx = np.where(y_tr == 0.0)[0] + + use_idx = np.concatenate((pos_idx, neg_idx)) + + y_tr = y_tr[use_idx] + # hits_tr = hits_tr[use_idx] + # names_tr = names_tr[use_idx] + for i in range(len(X_tr)): + X_tr[i] = X_tr[i][use_idx] + + # TODO: WTF? I don't get it... 
+ shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen, + domainFeatures, kernel_size, domainFeatures, 0.5) + + model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, + cnnHiddenDims, cnnDropout) + + model.compile(optimizer='adam', + loss='binary_crossentropy', + metrics=['accuracy']) + + epochNumber = 0 + y_tr = np_utils.to_categorical(y_tr, 2) + model.fit(x=X_tr, y=y_tr, batch_size=128, + epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # , + # validation_data=(testData,testLabel)) + + +if __name__ == "__main__": + main() diff --git a/models.py b/models.py new file mode 100644 index 0000000..a1f8089 --- /dev/null +++ b/models.py @@ -0,0 +1,53 @@ +import keras +from keras.engine import Input, Model +from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape + + +def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size, + hidden_dims, drop_out): + x = y = Input(shape=(input_length,)) + y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y) + y = Conv1D(filters, kernel_size, activation='relu')(y) + y = GlobalMaxPooling1D()(y) + y = Dense(hidden_dims)(y) + y = Dropout(drop_out)(y) + y = Activation('relu')(y) + return Model(x, y) + + +def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures, + filters, h1, h2, dropout, dense): + pass + + +def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout): + inputList = [] + encodedList = [] + # TODO: ??? + for i in range(windowSize): + inputList.append(Input(shape=(maxLen,))) + encodedList.append(cnn(inputList[-1])) # add shared domain model + inputList.append(Input(shape=(numFeatures,))) + # TODO: ??? + merge_layer_input = [] + for i in range(windowSize): + merge_layer_input.append(encodedList[i]) + merge_layer_input.append(inputList[(2 * i) + 1]) + # We can then concatenate the two vectors: + merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) + reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector) + # add second cnn + cnn = Conv1D(filters, + kernel_size, + activation='relu', + input_shape=(windowSize, domainFeatures + numFeatures))(reshape) + # we use max pooling: + maxPool = GlobalMaxPooling1D()(cnn) + cnnDropout = Dropout(cnnDropout)(maxPool) + cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout) + cnnOutput = Dense(2, activation='softmax')(cnnDense) + + # We define a trainable model linking the + # tweet inputs to the predictions + model = Model(inputs=inputList, outputs=cnnOutput) + return model diff --git a/scripts/make_csv_dataset.py b/scripts/make_csv_dataset.py new file mode 100644 index 0000000..2f1b12b --- /dev/null +++ b/scripts/make_csv_dataset.py @@ -0,0 +1,6 @@ +#!/usr/bin/python2 + +import joblib + +datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib") +user_flows = datafile["data"]
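
Note (not part of the patch): the new scripts/make_csv_dataset.py only loads the joblib archive and binds `user_flows`; no CSV is actually written yet, even though the `*.csv` / `*.csv.gz` rules added to .gitignore suggest that is the intent. A minimal sketch of the missing export step, assuming `datafile["data"]` is a pandas DataFrame and using a hypothetical output filename:

```python
#!/usr/bin/python2
# Sketch only: assumes datafile["data"] is a pandas DataFrame; the output
# filename below is hypothetical and not part of the patch.
import joblib

datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
user_flows = datafile["data"]
user_flows.to_csv("user_flows.csv", index=False)  # plain CSV export of the flow table
```

Writing to a `.csv.gz` path with `compression="gzip"` (supported by newer pandas) would match the other new ignore rule.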
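
Note on the refactored model wiring (not part of the patch): `models.get_top_cnn` appends, for each of the `windowSize` positions, one domain input of shape `(maxLen,)` followed by one numeric flow input of shape `(numFeatures,)`, so the `inputList[(2 * i) + 1]` indexing flagged with `# TODO: ???` relies on that interleaved order. Accordingly, `model.fit(x=X_tr, ...)` in main.py expects `X_tr` to be a list of `2 * windowSize` arrays in the same order. A minimal illustration with random dummy data, assuming the default hyperparameters from `main()`:

```python
# Sketch only: dummy inputs illustrating the interleaved list expected by
# models.get_top_cnn / model.fit; values are random and not part of the patch.
import numpy as np

n_samples, windowSize, maxLen, flowFeatures = 32, 10, 40, 3

X_dummy = []
for i in range(windowSize):
    X_dummy.append(np.random.randint(0, 40, size=(n_samples, maxLen)))  # domain character indices
    X_dummy.append(np.random.rand(n_samples, flowFeatures))             # numeric flow features

# len(X_dummy) == 2 * windowSize, matching the Input layers created in
# get_top_cnn: [domain_0, flow_0, domain_1, flow_1, ...].
```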