Browse Source

separating logical sections into dataset, models and main.

continued initial refactoring
master
René Knaebel 4 years ago
parent
commit
bbd63fd1da
  1. 5
      .gitignore
  2. 107
      dataset.py
  3. 68
      main.py
  4. 53
      models.py
  5. 6
      scripts/make_csv_dataset.py

5
.gitignore

@ -96,4 +96,7 @@ ENV/
.DS_Store
# data
*.tif
*.tif
*.joblib
*.csv
*.csv.gz

107
cnnOnCnnParameterSelection.py → dataset.py

@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
import string
import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm
@ -21,18 +17,6 @@ def get_character_dict():
enumerate(string.ascii_lowercase + string.punctuation))
def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dense(hidden_dims)(y)
y = Dropout(drop_out)(y)
y = Activation('relu')(y)
return Model(x, y)
def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
# print('maxLength: ' + str(maxLengthInSeconds))
@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# print('cisco features: ' + str(ciscoFeatures))
# log transform
ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
# print('log transformed: ' + str(ciscoFeatures))
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind
print("get chunks from user data frames")
for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
(domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
overlapping=False, maxLengthInSeconds=-1)
domainLists += domainListsTmp
dfLists += dfListsTmp
if i >= 10:
@ -193,90 +175,3 @@ def get_flow_per_user(df):
users = df['user_hash'].unique().tolist()
for user in users:
yield df.loc[df.user_hash == user]
if __name__ == "__main__":
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
maxLengthInSeconds = -1
timesNeg = -1
char_dict = get_character_dict()
user_flow_df = get_user_flow_data()
print("create training dataset")
(X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
user_flow_df, char_dict,
maxLen=maxLen, threshold=threshold, windowSize=windowSize)
pos_idx = np.where(y_tr == 1.0)[0]
neg_idx = np.where(y_tr == 0.0)[0]
use_idx = np.concatenate((pos_idx, neg_idx))
y_tr = y_tr[use_idx]
# hits_tr = hits_tr[use_idx]
# names_tr = names_tr[use_idx]
for i in range(len(X_tr)):
X_tr[i] = X_tr[i][use_idx]
# TODO: WTF? I don't get it...
sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
inputList = []
encodedList = []
numFeatures = flowFeatures
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2 * i) + 1])
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
# add second cnn
cnn = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
# we use max pooling:
maxPool = GlobalMaxPooling1D()(cnn)
cnnDropout = Dropout(cnnDropout)(maxPool)
cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
cnnOutput = Dense(2, activation='softmax')(cnnDense)
# We define a trainable model linking the
# tweet inputs to the predictions
model = Model(inputs=inputList, outputs=cnnOutput)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
epochNumber = 0
trainLabel = np_utils.to_categorical(y_tr, 2)
model.fit(x=X_tr, y=trainLabel, batch_size=128,
epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # ,
# validation_data=(testData,testLabel))

68
main.py

@ -0,0 +1,68 @@
import numpy as np
from keras.utils import np_utils
import dataset
import models
def main():
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
timesNeg = -1
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data()
print("create training dataset")
(X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows(
user_flow_df, char_dict,
maxLen=maxLen, threshold=threshold, windowSize=windowSize)
pos_idx = np.where(y_tr == 1.0)[0]
neg_idx = np.where(y_tr == 0.0)[0]
use_idx = np.concatenate((pos_idx, neg_idx))
y_tr = y_tr[use_idx]
# hits_tr = hits_tr[use_idx]
# names_tr = names_tr[use_idx]
for i in range(len(X_tr)):
X_tr[i] = X_tr[i][use_idx]
# TODO: WTF? I don't get it...
shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
cnnHiddenDims, cnnDropout)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
epochNumber = 0
y_tr = np_utils.to_categorical(y_tr, 2)
model.fit(x=X_tr, y=y_tr, batch_size=128,
epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # ,
# validation_data=(testData,testLabel))
if __name__ == "__main__":
main()

53
models.py

@ -0,0 +1,53 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape
def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dense(hidden_dims)(y)
y = Dropout(drop_out)(y)
y = Activation('relu')(y)
return Model(x, y)
def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
filters, h1, h2, dropout, dense):
pass
def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
inputList = []
encodedList = []
# TODO: ???
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(cnn(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
# TODO: ???
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2 * i) + 1])
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
# add second cnn
cnn = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
# we use max pooling:
maxPool = GlobalMaxPooling1D()(cnn)
cnnDropout = Dropout(cnnDropout)(maxPool)
cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
cnnOutput = Dense(2, activation='softmax')(cnnDense)
# We define a trainable model linking the
# tweet inputs to the predictions
model = Model(inputs=inputList, outputs=cnnOutput)
return model

6
scripts/make_csv_dataset.py

@ -0,0 +1,6 @@
#!/usr/bin/python2
import joblib
datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
user_flows = datafile["data"]
Loading…
Cancel
Save