separating logical sections into dataset, models and main.

continued initial refactoring
René Knaebel committed 2 years ago
Branch: master
Parent commit: bbd63fd1da

5 changed files with 132 additions and 107 deletions
  1. .gitignore (+4, -1)
  2. dataset.py (+1, -106)
  3. main.py (+68, -0)
  4. models.py (+53, -0)
  5. scripts/make_csv_dataset.py (+6, -0)

.gitignore (+4, -1)

@@ -96,4 +96,7 @@ ENV/
 .DS_Store
 
 # data
-*.tif
+*.tif
+*.joblib
+*.csv
+*.csv.gz

cnnOnCnnParameterSelection.py → dataset.py (+1, -106)

@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
 import string
 
-import keras
 import numpy as np
 import pandas as pd
-from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
-from keras.models import Model
-from keras.utils import np_utils
 from tqdm import tqdm
 
 
@@ -21,18 +17,6 @@ def get_character_dict():
                 enumerate(string.ascii_lowercase + string.punctuation))
 
 
-def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
-            hidden_dims, drop_out):
-    x = y = Input(shape=(input_length,))
-    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
-    y = Conv1D(filters, kernel_size, activation='relu')(y)
-    y = GlobalMaxPooling1D()(y)
-    y = Dense(hidden_dims)(y)
-    y = Dropout(drop_out)(y)
-    y = Activation('relu')(y)
-    return Model(x, y)
-
-
 def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                     maxLengthInSeconds=300):
     # print('maxLength: ' + str(maxLengthInSeconds))
@@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
         ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        # print('cisco features: ' + str(ciscoFeatures))
         # log transform
         ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
-        # print('log transformed: ' + str(ciscoFeatures))
         return ciscoFeatures.ravel()
     except:
         return np.zeros([numCiscoFeatures, ]).ravel()
@@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind
     print("get chunks from user data frames")
     for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
         (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
-                                                       overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
+                                                       overlapping=False, maxLengthInSeconds=-1)
         domainLists += domainListsTmp
         dfLists += dfListsTmp
         if i >= 10:
@@ -193,90 +175,3 @@ def get_flow_per_user(df):
     users = df['user_hash'].unique().tolist()
     for user in users:
         yield df.loc[df.user_hash == user]
-
-
-if __name__ == "__main__":
-    # parameter
-    innerCNNFilters = 512
-    innerCNNKernelSize = 2
-    cnnDropout = 0.5
-    cnnHiddenDims = 1024
-    domainFeatures = 512
-    flowFeatures = 3
-    numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    embeddingSize = 100
-    kernel_size = 2
-    drop_out = 0.5
-    filters = 2
-    hidden_dims = 100
-    vocabSize = 40
-    threshold = 3
-    minFlowsPerUser = 10
-    numEpochs = 100
-    maxLengthInSeconds = -1
-    timesNeg = -1
-
-    char_dict = get_character_dict()
-    user_flow_df = get_user_flow_data()
-
-    print("create training dataset")
-    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
-        user_flow_df, char_dict,
-        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
-
-    pos_idx = np.where(y_tr == 1.0)[0]
-    neg_idx = np.where(y_tr == 0.0)[0]
-
-    use_idx = np.concatenate((pos_idx, neg_idx))
-
-    y_tr = y_tr[use_idx]
-    # hits_tr = hits_tr[use_idx]
-    # names_tr = names_tr[use_idx]
-    for i in range(len(X_tr)):
-        X_tr[i] = X_tr[i][use_idx]
-
-    # TODO: WTF? I don't get it...
-    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                           domainFeatures, kernel_size, domainFeatures, 0.5)
-
-    inputList = []
-    encodedList = []
-    numFeatures = flowFeatures
-    for i in range(windowSize):
-        inputList.append(Input(shape=(maxLen,)))
-        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
-        inputList.append(Input(shape=(numFeatures,)))
-
-    merge_layer_input = []
-    for i in range(windowSize):
-        merge_layer_input.append(encodedList[i])
-        merge_layer_input.append(inputList[(2 * i) + 1])
-
-    # We can then concatenate the two vectors:
-    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
-    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
-    # add second cnn
-    cnn = Conv1D(filters,
-                 kernel_size,
-                 activation='relu',
-                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
-    # we use max pooling:
-    maxPool = GlobalMaxPooling1D()(cnn)
-    cnnDropout = Dropout(cnnDropout)(maxPool)
-    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
-    cnnOutput = Dense(2, activation='softmax')(cnnDense)
-
-    # We define a trainable model linking the
-    # tweet inputs to the predictions
-    model = Model(inputs=inputList, outputs=cnnOutput)
-    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
-                  metrics=['accuracy'])
-
-    epochNumber = 0
-    trainLabel = np_utils.to_categorical(y_tr, 2)
-    model.fit(x=X_tr, y=trainLabel, batch_size=128,
-              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
-    # validation_data=(testData,testLabel))

main.py (+68, -0)

@@ -0,0 +1,68 @@
+import numpy as np
+from keras.utils import np_utils
+
+import dataset
+import models
+
+
+def main():
+    # parameter
+    innerCNNFilters = 512
+    innerCNNKernelSize = 2
+    cnnDropout = 0.5
+    cnnHiddenDims = 1024
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    threshold = 3
+    minFlowsPerUser = 10
+    numEpochs = 100
+    timesNeg = -1
+
+    char_dict = dataset.get_character_dict()
+    user_flow_df = dataset.get_user_flow_data()
+
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows(
+        user_flow_df, char_dict,
+        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
+
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
+
+    use_idx = np.concatenate((pos_idx, neg_idx))
+
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
+
+    # TODO: WTF? I don't get it...
+    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+
+    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
+                               cnnHiddenDims, cnnDropout)
+
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+
+    epochNumber = 0
+    y_tr = np_utils.to_categorical(y_tr, 2)
+    model.fit(x=X_tr, y=y_tr, batch_size=128,
+              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
+    # validation_data=(testData,testLabel))
+
+
+if __name__ == "__main__":
+    main()

models.py (+53, -0)

@@ -0,0 +1,53 @@
+import keras
+from keras.engine import Input, Model
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape
+
+
+def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                   hidden_dims, drop_out):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
+    y = Conv1D(filters, kernel_size, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
+def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
+                   filters, h1, h2, dropout, dense):
+    pass
+
+
+def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
+    inputList = []
+    encodedList = []
+    # TODO: ???
+    for i in range(windowSize):
+        inputList.append(Input(shape=(maxLen,)))
+        encodedList.append(cnn(inputList[-1]))  # add shared domain model
+        inputList.append(Input(shape=(numFeatures,)))
+    # TODO: ???
+    merge_layer_input = []
+    for i in range(windowSize):
+        merge_layer_input.append(encodedList[i])
+        merge_layer_input.append(inputList[(2 * i) + 1])
+    # We can then concatenate the two vectors:
+    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
+    # add second cnn
+    cnn = Conv1D(filters,
+                 kernel_size,
+                 activation='relu',
+                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
+    # we use max pooling:
+    maxPool = GlobalMaxPooling1D()(cnn)
+    cnnDropout = Dropout(cnnDropout)(maxPool)
+    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
+    cnnOutput = Dense(2, activation='softmax')(cnnDense)
+
+    # We define a trainable model linking the
+    # tweet inputs to the predictions
+    model = Model(inputs=inputList, outputs=cnnOutput)
+    return model

scripts/make_csv_dataset.py (+6, -0)

@@ -0,0 +1,6 @@
+#!/usr/bin/python2
+
+import joblib
+
+datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
+user_flows = datafile["data"]
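
Note: as committed, scripts/make_csv_dataset.py only loads the joblib archive; no CSV is written yet, even though the script name and the new *.csv / *.csv.gz ignore rules point that way. A minimal sketch of the missing export step, assuming user_flows is a pandas DataFrame and a pandas version whose to_csv accepts the compression argument (the output name user_flows.csv.gz is illustrative, not taken from the repository):

    # hypothetical continuation, not part of this commit
    user_flows.to_csv("user_flows.csv.gz", compression="gzip", index=False)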
