Browse Source

refactoring and adding standard files for .gitignore and .keep in data

master
René Knaebel 2 years ago
parent commit be273d9247

5 changed files with 382 additions and 325 deletions:

1. .gitignore (+99 -0)
2. ciscoProcessing.py (+1 -1)
3. cnnOnCnnParameterSelection.py (+207 -52)
4. data/.keep (+0 -0)
5. stackedNeuralModels.py (+75 -272)

.gitignore View File (+99 -0)

@@ -0,0 +1,99 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+.cache/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+# intelliJ
+.idea/
+
+# Apple?
+.DS_Store
+
+# data
+*.tif

ciscoProcessing.py View File (+1 -1)

@@ -17,7 +17,7 @@ import random
 from keras.models import model_from_json
 import time
 import re
-import mongoDBConnector as mongoDBConnector
+# import mongoDBConnector as mongoDBConnector
 import stackedNeuralModels as stackedNeuralModels
 from tqdm import tqdm
 
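
With the mongoDBConnector import commented out, ciscoProcessing.py no longer needs a MongoDB setup just to be imported; the remaining imports in this block are unchanged.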

cnnOnCnnParameterSelection.py View File (+207 -52)

@@ -1,20 +1,199 @@
 # -*- coding: utf-8 -*-
-import joblib
+import string
+
 import keras
 import numpy as np
-import tensorflow as tf
-from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
-from keras.layers import Input
+import pandas as pd
+from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
 from keras.models import Model
 from keras.utils import np_utils
 from tqdm import tqdm
 
-import stackedNeuralModels as stackedNeuralModels
 
-config = tf.ConfigProto(log_device_placement=True)
-config.gpu_options.per_process_gpu_memory_fraction = 0.5
-config.gpu_options.allow_growth = True
-session = tf.Session(config=config)
+# config = tf.ConfigProto(log_device_placement=True)
+# config.gpu_options.per_process_gpu_memory_fraction = 0.5
+# config.gpu_options.allow_growth = True
+# session = tf.Session(config=config)
+
+
+def get_character_dict():
+    return dict((char, idx) for (idx, char) in
+                enumerate(string.ascii_lowercase + string.punctuation))
+
+
+def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
+            hidden_dims, drop_out):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
+    y = Conv1D(filters, kernel_size, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
+def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
+                    maxLengthInSeconds=300):
+    # print('maxLength: ' + str(maxLengthInSeconds))
+    maxMilliSeconds = maxLengthInSeconds * 1000
+    outDomainLists = []
+    outDFFrames = []
+    if overlapping == False:
+        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
+            # print(curIDs)
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    else:
+        numBlocks = len(dataFrame) + 1 - windowSize
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[blockID:blockID + windowSize]
+            # print(curIDs)
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    return (outDomainLists, outDFFrames)
+
+
+def getFeatureVecForDomain(domain, characterDict, maxLen=40):
+    curFeature = np.zeros([maxLen, ])
+    for j in range(np.min([len(domain), maxLen])):
+        # print(j)
+        curCharacter = domain[-j]
+        if curCharacter in characterDict:
+            curFeature[j] = characterDict[curCharacter]
+    return curFeature
+
+
+def getFlowFeatures(curDataLine):
+    useKeys = ['duration', 'bytes_down', 'bytes_up']
+    curFeature = np.zeros([len(useKeys), ])
+    for i in range(len(useKeys)):
+        curKey = useKeys[i]
+        try:
+            curFeature[i] = np.log1p(curDataLine[curKey]).astype(float)
+        except:
+            pass
+    return curFeature
+
+
+def getCiscoFeatures(curDataLine, urlSIPDict):
+    numCiscoFeatures = 30
+    try:
+        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
+        # print('cisco features: ' + str(ciscoFeatures))
+        # log transform
+        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
+        # print('log transformed: ' + str(ciscoFeatures))
+        return ciscoFeatures.ravel()
+    except:
+        return np.zeros([numCiscoFeatures, ]).ravel()
+
+
+def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, windowSize=10):
+    domainLists = []
+    dfLists = []
+    print("get chunks from user data frames")
+    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
+        (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
+                                                       overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+        if i >= 10:
+            break
+
+    print("create training dataset")
+    return create_dataset_from_lists(
+        domainLists=domainLists, dfLists=dfLists, charachterDict=char_dict,
+        maxLen=maxLen, threshold=threshold,
+        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
+        windowSize=windowSize)
+
+
+def create_dataset_from_lists(domainLists, dfLists, charachterDict, maxLen, threshold=3,
+                              flagUseCiscoFeatures=False, urlSIPDIct=dict(),
+                              windowSize=10):
+    if 'hits' in dfLists[0].keys():
+        hitName = 'hits'
+    elif 'virusTotalHits' in dfLists[0].keys():
+        hitName = 'virusTotalHits'
+    numFlowFeatures = 3
+    numCiscoFeatures = 30
+    numFeatures = numFlowFeatures
+    if flagUseCiscoFeatures:
+        numFeatures += numCiscoFeatures
+    outputFeatures = []
+    label = []
+    hits = []
+    trainNames = []
+    for i in range(windowSize):
+        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
+        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
+
+    for i in tqdm(np.arange(len(domainLists)), miniters=10):
+        curCounter = 0
+        # print('len domainList: ' + str(len(domainLists[i])))
+        # print('len df: ' + str(len(dfLists[i])))
+        for j in range(np.min([windowSize, len(domainLists[i])])):
+            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
+            curCounter += 1
+            if flagUseCiscoFeatures:
+                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
+            else:
+                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
+            curCounter += 1
+        curLabel = 0.0
+        if np.max(dfLists[i][hitName]) >= threshold:
+            curLabel = 1.0
+        elif np.max(dfLists[i][hitName]) == -1:
+            curLabel = -1.0
+        elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
+            curLabel = -2.0
+        label.append(curLabel)
+        hits.append(np.max(dfLists[i][hitName]))
+        trainNames.append(np.unique(dfLists[i]['user_hash']))
+    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
+
+
+def get_user_flow_data():
+    # load train and test data from joblib
+    # created with createTrainDataMultipleTaskLearning.py
+    # rk: changed to csv file
+    trainDFs = pd.read_csv("data/rk_data.csv.gz")
+    trainDFs.drop("Unnamed: 0", 1, inplace=True)
+    trainDFs.set_index(keys=['user_hash'], drop=False, inplace=True)
+    users = trainDFs['user_hash'].unique().tolist()
+    u0 = trainDFs.loc[trainDFs.user_hash == users[0]]
+    return trainDFs
+
+
+def get_flow_per_user(df):
+    users = df['user_hash'].unique().tolist()
+    for user in users:
+        yield df.loc[df.user_hash == user]
+
 
 if __name__ == "__main__":
     # parameter
@@ -39,51 +218,28 @@ if __name__ == "__main__":
     maxLengthInSeconds = -1
     timesNeg = -1
 
-    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
-    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
+    char_dict = get_character_dict()
+    user_flow_df = get_user_flow_data()
 
-    if 'characterDict' not in locals():
-        characterDictPath = 'trainData/characterIDDict.joblib'
-        characterDict = joblib.load(characterDictPath)['characterIDDict']
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
+        user_flow_df, char_dict,
+        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
 
-    # load train and test data from joblib
-    # created with createTrainDataMultipleTaskLearning.py
-    if 'trainDFs' not in locals():
-        tmpLoad = joblib.load(trainDataPath)
-        trainDFs = tmpLoad['data']
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
 
-    if 'testDFs' not in locals():
-        tmpLoad = joblib.load(testDataPath)
+    use_idx = np.concatenate((pos_idx, neg_idx))
 
-    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen,
-                                                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
 
-    domainLists = []
-    dfLists = []
-    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
-        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
-                                                                                      windowSize=windowSize,
-                                                                                      overlapping=False,
-                                                                                      maxLengthInSeconds=maxLengthInSeconds)
-        domainLists += domainListsTmp
-        dfLists += dfListsTmp
-        if i == 100:
-            break
-
-    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
-        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
-        maxLen=maxLen, threshold=threshold,
-        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
-        windowSize=windowSize)
-
-    useIDs = np.where(testLabel == 1.0)[0]
-    useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
-
-    testLabel = testLabel[useIDs]
-    testHits = testHits[useIDs]
-    testNames = testNames[useIDs]
-    for i in range(len(testData)):
-        testData[i] = testData[i][useIDs]
+    # TODO: WTF? I don't get it...
+    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                           domainFeatures, kernel_size, domainFeatures, 0.5)
 
     inputList = []
     encodedList = []
@@ -102,7 +258,6 @@ if __name__ == "__main__":
     merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
     reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
     # add second cnn
-
     cnn = Conv1D(filters,
                  kernel_size,
                  activation='relu',
@@ -121,7 +276,7 @@ if __name__ == "__main__":
                   metrics=['accuracy'])
 
     epochNumber = 0
-    trainLabel = np_utils.to_categorical(testLabel, 2)
-    model.fit(x=testData, y=trainLabel,
+    trainLabel = np_utils.to_categorical(y_tr, 2)
+    model.fit(x=X_tr, y=trainLabel, batch_size=128,
               epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
     # validation_data=(testData,testLabel))

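
For orientation, a minimal sketch (not part of the commit) of how the helpers introduced in this file fit together. The hyperparameter values and the import are illustrative assumptions; the script wires these up with its own values in __main__:

    # Sketch only: exercises the new helper functions from this commit.
    # The hyperparameters below are made-up examples, not the script's values,
    # and the import assumes the file is importable from the repository root.
    import numpy as np
    from cnnOnCnnParameterSelection import (get_character_dict, get_cnn,
                                            getFeatureVecForDomain)

    char_dict = get_character_dict()  # maps a-z and punctuation to integer ids
    encoder = get_cnn(vocabSize=len(char_dict) + 1, embeddingSize=128,
                      input_length=40, filters=128, kernel_size=3,
                      hidden_dims=100, drop_out=0.5)

    # encode one domain name as a fixed-length vector of character ids
    vec = getFeatureVecForDomain("example.com", char_dict, maxLen=40)
    print(encoder.predict(vec[np.newaxis, :]).shape)  # -> (1, 100)

The shared encoder returned by get_cnn replaces the old stackedNeuralModels.getCNNWitoutLastLayerFunctional call and is applied once per domain slot in the window, as the unchanged inputList/encodedList loop below the diff shows.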
data/.keep View File (+0 -0)
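
Git tracks files, not directories, so the empty data/.keep placeholder is the standard way to make sure the data/ directory exists in a fresh checkout.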


stackedNeuralModels.py View File (+75 -272)

@@ -1,63 +1,52 @@
 # -*- coding: utf-8 -*-
-from keras.models import Sequential
-from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
-from keras.layers import Convolution1D
-import ciscoProcessing as ciscoProcessing
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import joblib
 import csv
 
-import keras
+import numpy as np
+from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
 from keras.layers import Input
 from keras.models import Model
-from keras.utils import np_utils
-
-from sklearn.metrics import precision_recall_curve
-from sklearn.metrics import auc, roc_curve
+from keras.models import Sequential
 from tqdm import tqdm
-import os
 
 
-def getCiscoFeatures(curDataLine,urlSIPDict):
+def getCiscoFeatures(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
         ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        #print('cisco features: ' + str(ciscoFeatures))
+        # print('cisco features: ' + str(ciscoFeatures))
         # log transform
-        ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
-        #print('log transformed: ' + str(ciscoFeatures))
+        ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
+        # print('log transformed: ' + str(ciscoFeatures))
         return ciscoFeatures.ravel()
     except:
-        return np.zeros([numCiscoFeatures,]).ravel()
+        return np.zeros([numCiscoFeatures, ]).ravel()
 
 
-
-def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
-                       hidden_dims,drop_out):
+def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                           hidden_dims, drop_out):
     model = Sequential()
     model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
                         input_length=input_length))
-
+
     model.add(Conv1D(filters,
                      kernel_size,
                      activation='relu'))
-
+
     # we use max pooling:
     model.add(GlobalMaxPooling1D())
-
+
     # We add a vanilla hidden layer:
     model.add(Dense(hidden_dims))
     model.add(Dropout(drop_out))
     model.add(Activation('relu'))
     return model
-
-def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size,
-                       hidden_dims,drop_out):
+
+
+def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                                    hidden_dims, drop_out):
     a = Input(shape=(input_length,))
-    embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a)
-    conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding)
+    embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a)
+    conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding)
     glob = GlobalMaxPooling1D()(conv1)
     dense = Dense(hidden_dims)(glob)
     drop = Dropout(drop_out)(dense)
@@ -65,55 +54,58 @@ def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters
     model = Model(a, model)
     return model
 
+
 def getFlowFeatureLayer(numFeatures):
     model = Sequential()
-    #slpModel.add(Dense(1, input_shape=(1,)))
+    # slpModel.add(Dense(1, input_shape=(1,)))
     model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
     return model
-
 
-def createCNNDataSet(domains,label,characterDict,maxLen=40):
+
+def createCNNDataSet(domains, label, characterDict, maxLen=40):
     # process domains in reverse order
-    outFeature = np.zeros([len(domains),maxLen])
-    outLabel = np.zeros([len(domains),])
+    outFeature = np.zeros([len(domains), maxLen])
+    outLabel = np.zeros([len(domains), ])
     for i in range(len(domains)):
         domain = domains[i]
         curLabel = label[i]
-        curFeature = np.zeros([maxLen,])
-       # print(domain + ' ' + str(len(domain)))
-        for j in range(np.min([len(domain),maxLen])):
-            #print(j)
+        curFeature = np.zeros([maxLen, ])
+        # print(domain + ' ' + str(len(domain)))
+        for j in range(np.min([len(domain), maxLen])):
+            # print(j)
             curCharacter = domain[-j]
             if curCharacter in characterDict:
                 curFeature[j] = characterDict[curCharacter]
         outFeature[i] = curFeature
         outLabel[i] = curLabel
-    return (outFeature,outLabel)
+    return (outFeature, outLabel)
 
-def getFeatureVecForDomain(domain,characterDict,maxLen=40):
-    curFeature = np.zeros([maxLen,])
-    for j in range(np.min([len(domain),maxLen])):
-        #print(j)
+
+def getFeatureVecForDomain(domain, characterDict, maxLen=40):
+    curFeature = np.zeros([maxLen, ])
+    for j in range(np.min([len(domain), maxLen])):
+        # print(j)
        curCharacter = domain[-j]
         if curCharacter in characterDict:
             curFeature[j] = characterDict[curCharacter]
     return curFeature
-
+
+
 def getFlowFeatures(curDataLine):
-    useKeys = ['duration','bytes_down','bytes_up']
-    curFeature = np.zeros([len(useKeys),])
+    useKeys = ['duration', 'bytes_down', 'bytes_up']
+    curFeature = np.zeros([len(useKeys), ])
     for i in range(len(useKeys)):
         curKey = useKeys[i]
         try:
-            curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32')
+            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
         except:
             pass
     return curFeature
-
-
-def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
+
+
+def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
                                maxLengthInSeconds=300):
-    #print('maxLength: ' + str(maxLengthInSeconds))
+    # print('maxLength: ' + str(maxLengthInSeconds))
     maxMilliSeconds = maxLengthInSeconds * 1000
     outDomainLists = []
     outDFFrames = []
@@ -121,8 +113,8 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
         numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
         userIDs = np.arange(len(dataFrame))
         for blockID in np.arange(numBlocks):
-            curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)]
-            #print(curIDs)
+            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
+            # print(curIDs)
             useData = dataFrame.iloc[curIDs]
             curDomains = useData['domain']
             if maxLengthInSeconds != -1:
@@ -138,8 +130,8 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
         numBlocks = len(dataFrame) + 1 - windowSize
         userIDs = np.arange(len(dataFrame))
         for blockID in np.arange(numBlocks):
-            curIDs = userIDs[blockID:blockID+windowSize]
-            #print(curIDs)
+            curIDs = userIDs[blockID:blockID + windowSize]
+            # print(curIDs)
             useData = dataFrame.iloc[curIDs]
             curDomains = useData['domain']
             if maxLengthInSeconds != -1:
@@ -151,11 +143,11 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
                     curDomains = useData['domain']
             outDomainLists.append(list(curDomains))
             outDFFrames.append(useData)
-    return (outDomainLists,outDFFrames)
-
-
-def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
-                    flagUseCiscoFeatures=False,urlSIPDIct=dict,
+    return (outDomainLists, outDFFrames)
+
+
+def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
+                    flagUseCiscoFeatures=False, urlSIPDIct=dict,
                     windowSize=10):
     if 'hits' in dfLists[0].keys():
         hitName = 'hits'
@@ -171,21 +163,21 @@ def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
     hits = []
     trainNames = []
     for i in range(windowSize):
-        outputFeatures.append(np.zeros([len(domainLists),maxLen]))
-        outputFeatures.append(np.zeros([len(domainLists),numFeatures]))
-
+        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
+        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
+
     for i in tqdm(np.arange(len(domainLists)), miniters=10):
         curCounter = 0
-        #print('len domainList: ' + str(len(domainLists[i])))
-        #print('len df: ' + str(len(dfLists[i])))
-        for j in range(np.min([windowSize,len(domainLists[i])])):
-            outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen)
+        # print('len domainList: ' + str(len(domainLists[i])))
+        # print('len df: ' + str(len(dfLists[i])))
+        for j in range(np.min([windowSize, len(domainLists[i])])):
+            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
             curCounter += 1
-            if flagUseCiscoFeatures:
-                outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
-                outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct)
+            if flagUseCiscoFeatures:
+                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
             else:
-                outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
             curCounter += 1
         curLabel = 0.0
         if np.max(dfLists[i][hitName]) >= threshold:
@@ -198,215 +190,26 @@ def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
         hits.append(np.max(dfLists[i][hitName]))
         trainNames.append(np.unique(dfLists[i]['user_hash']))
     return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
-
-
+
+
 def transformStringListToNumpyArray(listString):
-    listString = listString.replace('[','').replace(']','')
-    return np.array(listString.split(','),dtype='float32')
-
+    listString = listString.replace('[', '').replace(']', '')
+    return np.array(listString.split(','), dtype='float32')
+
+
 def getCiscoFeatureDict(csvPathList):
     outDict = dict()
     for path in tqdm(csvPathList, miniters=1):
-        fobj = open(path,'r')
-        csvReader = csv.DictReader(fobj,delimiter=',')
+        fobj = open(path, 'r')
+        csvReader = csv.DictReader(fobj, delimiter=',')
         for row in csvReader:
             urlSIPString = row['Domain'] + row['ServerIP']
             ciscoFeatures = row['CiscoFeature']
             outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
-            #if len(outDict) % 10000 == 0:
+            # if len(outDict) % 10000 == 0:
             #    print('numbers in dict: ' + str(len(outDict)))
     return outDict
 
-
+
 if __name__ == "__main__":
-
-    # get data
-    trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
-                          'trainData/joblib2016-07-annomalous-stg-new/09/',
-                          'trainData/joblib2016-07-annomalous-stg-new/08/',
-                          'trainData/joblib2016-07-annomalous-stg-new/07/',
-                          'trainData/joblib2016-07-annomalous-stg-new/06/']
-
-    testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\
-                'trainData/joblib2016-09-annomalous-stg-new/08/',\
-                'trainData/joblib2016-09-annomalous-stg-new/09/',\
-                'trainData/joblib2016-09-annomalous-stg-new/10/',\
-                'trainData/joblib2016-09-annomalous-stg-new/11/',\
-                'trainData/joblib2016-09-annomalous-stg-new/12/',\
-                'trainData/joblib2016-09-annomalous-stg-new/13/',\
-                'trainData/joblib2016-09-annomalous-stg-new/14/']
-
-    trainCiscoFeatureCSVPaths   = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
-
-    testCiscoFeatureCSVPaths    = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
-                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
-
-    # parameter
-    numNegPerDay = 5000
-    numEpochs = 10
-    domainFeatures  = 512
-    flowFeatures    = 3
-    numCiscoFeatures= 30
-    windowSize      = 10
-    maxLen = 40
-
-    lstmUnits = 32
-    lstmDenseSize = 128
-    embeddingSize = 100
-    kernel_size = 2
-    drop_out = 0.5
-    filters = 2
-    hidden_dims = 100
-    vocabSize = 40
-    flagUseCiscoFeatures = True
-    threshold = 3
-    resultStoreDir = 'results/201705/'
-    if flagUseCiscoFeatures:
-        resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
-        resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
-    else:
-        resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
-        resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
-    flagRedo = True
-
-
-    if flagUseCiscoFeatures:
-        if 'trainCiscoFeatureDict' not in locals():
-            trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
-
-        if 'testCiscoFeatureDict' not in locals():
-            testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
-    else:
-        trainCiscoFeatureDict = dict()
-        testCiscoFeatureDict = dict()
-
-    if flagRedo or not os.path.exists(resultStorePath):
-        if 'characterDict' not in locals():
-            characterDictPath = 'trainData/characterIDDict.joblib'
-            characterDict = joblib.load(characterDictPath)['characterIDDict']
-
-
-        print('create train data')
-        if 'dataFrameList' not in locals():
-            (dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
-                            trainDirsUserLevel,numNegPerDay = numNegPerDay)
-            maxHits = []
-            for i in range(len(dataFrameList)):
-                maxHits.append(np.max(dataFrameList[i]['hits']))
-
-        print('create test data')
-        # validation error
-        if 'testDataFrameList' not in locals():
-            (testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
-                            [testDirsUserLevel[0]],numNegPerDay = numNegPerDay)
-            maxHits = []
-            for i in range(len(testDataFrameList)):
-                maxHits.append(np.max(testDataFrameList[i]['hits']))
-
-        sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
-
-        inputList = []
-        encodedList = []
-        numFeatures = flowFeatures
-        if flagUseCiscoFeatures:
-            numFeatures += numCiscoFeatures
-        for i in range(windowSize):
-            inputList.append(Input(shape=(maxLen,)))
-            encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
-            inputList.append(Input(shape=(numFeatures,)))
-
-        merge_layer_input = []
-        for i in range(windowSize):
-            merge_layer_input.append(encodedList[i])
-            merge_layer_input.append(inputList[(2*i)+1])
-
-
-        # We can then concatenate the two vectors:
-        merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
-        reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
-        lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
-        dense = Dense(lstmDenseSize, activation='relu')(lstm)
-        dropout = Dropout(0.5)(dense)
-        # And add a logistic regression on top
-        predictions = Dense(2, activation='softmax')(dropout)
-
-        # We define a trainable model linking the
-        # tweet inputs to the predictions
-        model = Model(inputs=inputList, outputs=predictions)
-
-        model.compile(optimizer='adam',
-                      loss='binary_crossentropy',
-                      metrics=['accuracy'])
-
-
-        # get train data
-        domainLists = []
-        dfLists = []
-        for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
-            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False)
-            domainLists += domainListsTmp
-            dfLists += dfListsTmp
-
-        (trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict,
-                maxLen,threshold = threshold,
-                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict)
-        useIDs = np.where(trainHits == 0)[0]
-        useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]])
-        for i in range(len(trainData)):
-            trainData[i] = np.array(trainData[i])[useIDs]
-        trainLabel = trainLabel[useIDs]
-        trainHits = trainHits[useIDs]
-        trainNames = trainNames[useIDs]
-
-        # get test data
-        domainLists = []
-        dfLists = []
-        for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
-            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False)
-            domainLists += domainListsTmp
-            dfLists += dfListsTmp
-
-        (testData,testLabel,testHits,testNames) = createTrainData(domainLists,dfLists,characterDict,
-                maxLen,threshold = threshold,
-                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict)
-        useIDs = np.where(testHits == 0)[0]
-        useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]])
-        for i in range(len(testData)):
-            testData[i] = np.array(testData[i])[useIDs]
-        testLabel = testLabel[useIDs]
-        testHits = testHits[useIDs]
-        testNames = testNames[useIDs]
-
-        numPos = len(np.where(trainLabel == 1.0)[0])
-        numNeg = len(np.where(trainLabel == 0.0)[0])
-        print('major class: ' + str(float(numNeg) / float(numNeg + numPos)))
-        lstmLabel = np_utils.to_categorical(trainLabel, 2)
-        lstmTestLabel = np_utils.to_categorical(testLabel, 2)
-        trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel))
-
-
-        # save lstm model
-        ciscoProcessing.save_model(model,resultModelPath+'.json',
-            resultModelPath + '.h5')
-
-        # classify train and test
-        trainScores = model.predict(trainData)[:,1]
-        testScores = model.predict(testData)[:,1]
-
-        joblib.dump({'testLabel':testLabel,
-                     'testHits':testHits,
-                     'testNames':testNames,
-                     'testScores':testScores,
-                     'trainLabel':trainLabel,
-                     'trainScores':trainScores},resultStorePath,compress=3)
-
+    pass
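
One behavioural note on the parsing code kept in this file: transformStringListToNumpyArray turns the bracketed string stored in the 'CiscoFeature' CSV column into a float32 vector, which getCiscoFeatureDict then keys by domain plus server IP. A quick sketch with a made-up input:

    # Sketch only: the input literal is a made-up example, not real feature data.
    transformStringListToNumpyArray('[1.0, 2.5, 3.0]')
    # -> array([1. , 2.5, 3. ], dtype=float32)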
