Separate logical sections into dataset, models, and main.

Continued initial refactoring.
René Knaebel 2017-06-30 10:12:20 +02:00
parent be273d9247
commit bbd63fd1da
5 changed files with 132 additions and 107 deletions

.gitignore

@@ -96,4 +96,7 @@ ENV/
 .DS_Store
 # data
 *.tif
+*.joblib
+*.csv
+*.csv.gz

dataset.py

@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
 import string
-import keras
 import numpy as np
 import pandas as pd
-from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
-from keras.models import Model
-from keras.utils import np_utils
 from tqdm import tqdm
@@ -21,18 +17,6 @@ def get_character_dict():
                 enumerate(string.ascii_lowercase + string.punctuation))
-
-
-def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
-            hidden_dims, drop_out):
-    x = y = Input(shape=(input_length,))
-    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
-    y = Conv1D(filters, kernel_size, activation='relu')(y)
-    y = GlobalMaxPooling1D()(y)
-    y = Dense(hidden_dims)(y)
-    y = Dropout(drop_out)(y)
-    y = Activation('relu')(y)
-    return Model(x, y)
 
 
 def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                     maxLengthInSeconds=300):
     # print('maxLength: ' + str(maxLengthInSeconds))
@@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
         ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        # print('cisco features: ' + str(ciscoFeatures))
         # log transform
         ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
-        # print('log transformed: ' + str(ciscoFeatures))
         return ciscoFeatures.ravel()
     except:
         return np.zeros([numCiscoFeatures, ]).ravel()
@@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind
     print("get chunks from user data frames")
     for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
         (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
-                                                       overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
+                                                       overlapping=False, maxLengthInSeconds=-1)
         domainLists += domainListsTmp
         dfLists += dfListsTmp
         if i >= 10:
@@ -193,90 +175,3 @@ def get_flow_per_user(df):
     users = df['user_hash'].unique().tolist()
     for user in users:
         yield df.loc[df.user_hash == user]
-
-
-if __name__ == "__main__":
-    # parameter
-    innerCNNFilters = 512
-    innerCNNKernelSize = 2
-    cnnDropout = 0.5
-    cnnHiddenDims = 1024
-    domainFeatures = 512
-    flowFeatures = 3
-    numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    embeddingSize = 100
-    kernel_size = 2
-    drop_out = 0.5
-    filters = 2
-    hidden_dims = 100
-    vocabSize = 40
-    threshold = 3
-    minFlowsPerUser = 10
-    numEpochs = 100
-    maxLengthInSeconds = -1
-    timesNeg = -1
-
-    char_dict = get_character_dict()
-    user_flow_df = get_user_flow_data()
-
-    print("create training dataset")
-    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
-        user_flow_df, char_dict,
-        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
-
-    pos_idx = np.where(y_tr == 1.0)[0]
-    neg_idx = np.where(y_tr == 0.0)[0]
-
-    use_idx = np.concatenate((pos_idx, neg_idx))
-
-    y_tr = y_tr[use_idx]
-    # hits_tr = hits_tr[use_idx]
-    # names_tr = names_tr[use_idx]
-    for i in range(len(X_tr)):
-        X_tr[i] = X_tr[i][use_idx]
-
-    # TODO: WTF? I don't get it...
-    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                           domainFeatures, kernel_size, domainFeatures, 0.5)
-
-    inputList = []
-    encodedList = []
-    numFeatures = flowFeatures
-    for i in range(windowSize):
-        inputList.append(Input(shape=(maxLen,)))
-        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
-        inputList.append(Input(shape=(numFeatures,)))
-
-    merge_layer_input = []
-    for i in range(windowSize):
-        merge_layer_input.append(encodedList[i])
-        merge_layer_input.append(inputList[(2 * i) + 1])
-
-    # We can then concatenate the two vectors:
-    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
-    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
-
-    # add second cnn
-    cnn = Conv1D(filters,
-                 kernel_size,
-                 activation='relu',
-                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
-    # we use max pooling:
-    maxPool = GlobalMaxPooling1D()(cnn)
-    cnnDropout = Dropout(cnnDropout)(maxPool)
-    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
-    cnnOutput = Dense(2, activation='softmax')(cnnDense)
-
-    # We define a trainable model linking the
-    # tweet inputs to the predictions
-    model = Model(inputs=inputList, outputs=cnnOutput)
-    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
-                  metrics=['accuracy'])
-
-    epochNumber = 0
-    trainLabel = np_utils.to_categorical(y_tr, 2)
-    model.fit(x=X_tr, y=trainLabel, batch_size=128,
-              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
-    # validation_data=(testData,testLabel))
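
Note: the @@ -117,7 +99,7 @@ hunk above hard-codes maxLengthInSeconds=-1 inside create_dataset_from_flows rather than forwarding a caller-supplied value, so the time-based cutoff in get_user_chunks stays disabled but is no longer configurable; accordingly, the maxLengthInSeconds variable from the removed __main__ block does not reappear in main.py below.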

main.py

@@ -0,0 +1,68 @@
+import numpy as np
+from keras.utils import np_utils
+
+import dataset
+import models
+
+
+def main():
+    # parameter
+    innerCNNFilters = 512
+    innerCNNKernelSize = 2
+    cnnDropout = 0.5
+    cnnHiddenDims = 1024
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    threshold = 3
+    minFlowsPerUser = 10
+    numEpochs = 100
+    timesNeg = -1
+
+    char_dict = dataset.get_character_dict()
+    user_flow_df = dataset.get_user_flow_data()
+
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows(
+        user_flow_df, char_dict,
+        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
+
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
+
+    use_idx = np.concatenate((pos_idx, neg_idx))
+
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
+
+    # TODO: WTF? I don't get it...
+    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+
+    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
+                               cnnHiddenDims, cnnDropout)
+
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+
+    epochNumber = 0
+    y_tr = np_utils.to_categorical(y_tr, 2)
+    model.fit(x=X_tr, y=y_tr, batch_size=128,
+              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
+    # validation_data=(testData,testLabel))
+
+
+if __name__ == "__main__":
+    main()
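
Note on the label handling in main(): np_utils.to_categorical (part of Keras) expands the 0/1 label vector into one-hot rows, which is the layout the two-unit softmax head built by models.get_top_cnn expects. A minimal sketch:

import numpy as np
from keras.utils import np_utils

y = np.array([0., 1., 1., 0.])
print(np_utils.to_categorical(y, 2))
# -> [[1. 0.], [0. 1.], [0. 1.], [1. 0.]]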

models.py

@@ -0,0 +1,53 @@
+import keras
+from keras.engine import Input, Model
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape
+
+
+def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                   hidden_dims, drop_out):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
+    y = Conv1D(filters, kernel_size, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
+def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
+                   filters, h1, h2, dropout, dense):
+    pass
+
+
+def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
+    inputList = []
+    encodedList = []
+    # TODO: ???
+    for i in range(windowSize):
+        inputList.append(Input(shape=(maxLen,)))
+        encodedList.append(cnn(inputList[-1]))  # add shared domain model
+        inputList.append(Input(shape=(numFeatures,)))
+    # TODO: ???
+    merge_layer_input = []
+    for i in range(windowSize):
+        merge_layer_input.append(encodedList[i])
+        merge_layer_input.append(inputList[(2 * i) + 1])
+    # We can then concatenate the two vectors:
+    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
+    # add second cnn
+    cnn = Conv1D(filters,
+                 kernel_size,
+                 activation='relu',
+                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
+    # we use max pooling:
+    maxPool = GlobalMaxPooling1D()(cnn)
+    cnnDropout = Dropout(cnnDropout)(maxPool)
+    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
+    cnnOutput = Dense(2, activation='softmax')(cnnDense)
+    # We define a trainable model linking the
+    # tweet inputs to the predictions
+    model = Model(inputs=inputList, outputs=cnnOutput)
+    return model
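
Note on the TODOs in get_top_cnn: the model takes 2 * windowSize inputs, alternating one (maxLen,) encoded-domain window with one (numFeatures,) flow-feature vector, so any X handed to fit or predict must be a list ordered the same way. A minimal sketch with toy data (batch size 8 and vocabulary size 41 are illustrative assumptions; the remaining values mirror main.py):

import numpy as np

import models

windowSize, maxLen, flowFeatures = 10, 40, 3
shared = models.get_shared_cnn(41, 100, maxLen, 512, 2, 512, 0.5)
top = models.get_top_cnn(shared, flowFeatures, maxLen, windowSize,
                         512, 2, 2, 1024, 0.5)

# inputs alternate: domain window 0, flow features 0, domain window 1, ...
batch = []
for _ in range(windowSize):
    batch.append(np.random.randint(0, 41, size=(8, maxLen)))  # encoded domain names
    batch.append(np.random.rand(8, flowFeatures))             # numeric flow features

print(top.predict(batch).shape)  # -> (8, 2)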


@@ -0,0 +1,6 @@
+#!/usr/bin/python2
+
+import joblib
+
+datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
+user_flows = datafile["data"]
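
Note: this scratch script assumes currentData.joblib was written by joblib.dump as a dict holding the flow table under the "data" key. A hedged round-trip sketch (the producer side is an assumption not shown in this commit; the column names come from dataset.py and the path is shortened):

import joblib
import pandas as pd

# assumed producer side, elsewhere in the pipeline:
frame = pd.DataFrame(columns=["user_hash", "domain", "server_ip"])
joblib.dump({"data": frame}, "currentData.joblib")

# consumer side, as in the script above:
user_flows = joblib.load("currentData.joblib")["data"]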