
Separate logical sections into dataset, models, and main.

Continued initial refactoring.

master
René Knaebel, 2 years ago
parent commit bbd63fd1da
5 changed files with 132 additions and 107 deletions:

  1. .gitignore (+4, -1)
  2. dataset.py (+1, -106)
  3. main.py (+68, -0)
  4. models.py (+53, -0)
  5. scripts/make_csv_dataset.py (+6, -0)

.gitignore (+4, -1)

@@ -96,4 +96,7 @@ ENV/
.DS_Store

# data
*.tif
*.joblib
*.csv
*.csv.gz

cnnOnCnnParameterSelection.py → dataset.py

@@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
import string

import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm


@@ -21,18 +17,6 @@ def get_character_dict():
                enumerate(string.ascii_lowercase + string.punctuation))


def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
            hidden_dims, drop_out):
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(x, y)


def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                    maxLengthInSeconds=300):
    # print('maxLength: ' + str(maxLengthInSeconds))
@@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # print('cisco features: ' + str(ciscoFeatures))
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
        # print('log transformed: ' + str(ciscoFeatures))
        return ciscoFeatures.ravel()
    except:
        return np.zeros([numCiscoFeatures, ]).ravel()
@@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind
print("get chunks from user data frames")
for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
(domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
overlapping=False, maxLengthInSeconds=-1)
domainLists += domainListsTmp
dfLists += dfListsTmp
if i >= 10:
@@ -193,90 +175,3 @@ def get_flow_per_user(df):
    users = df['user_hash'].unique().tolist()
    for user in users:
        yield df.loc[df.user_hash == user]


if __name__ == "__main__":
    # parameter
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100
    maxLengthInSeconds = -1
    timesNeg = -1

    char_dict = get_character_dict()
    user_flow_df = get_user_flow_data()

    print("create training dataset")
    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
        user_flow_df, char_dict,
        maxLen=maxLen, threshold=threshold, windowSize=windowSize)

    pos_idx = np.where(y_tr == 1.0)[0]
    neg_idx = np.where(y_tr == 0.0)[0]

    use_idx = np.concatenate((pos_idx, neg_idx))

    y_tr = y_tr[use_idx]
    # hits_tr = hits_tr[use_idx]
    # names_tr = names_tr[use_idx]
    for i in range(len(X_tr)):
        X_tr[i] = X_tr[i][use_idx]

    # TODO: WTF? I don't get it...
    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
                           domainFeatures, kernel_size, domainFeatures, 0.5)

    inputList = []
    encodedList = []
    numFeatures = flowFeatures
    for i in range(windowSize):
        inputList.append(Input(shape=(maxLen,)))
        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
        inputList.append(Input(shape=(numFeatures,)))

    merge_layer_input = []
    for i in range(windowSize):
        merge_layer_input.append(encodedList[i])
        merge_layer_input.append(inputList[(2 * i) + 1])

    # We can then concatenate the two vectors:
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
    # add second cnn
    cnn = Conv1D(filters,
                 kernel_size,
                 activation='relu',
                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
    # we use max pooling:
    maxPool = GlobalMaxPooling1D()(cnn)
    cnnDropout = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
    cnnOutput = Dense(2, activation='softmax')(cnnDense)

    # We define a trainable model linking the
    # window inputs to the predictions
    model = Model(inputs=inputList, outputs=cnnOutput)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    epochNumber = 0
    trainLabel = np_utils.to_categorical(y_tr, 2)
    model.fit(x=X_tr, y=trainLabel, batch_size=128,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)
    # validation_data=(testData, testLabel))
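
Side note on get_flow_per_user above: it is equivalent, up to user ordering, to a pandas groupby over user_hash. A minimal self-contained sketch (the toy frame and its values are illustrative, not from the commit):

import pandas as pd

# toy frame standing in for user_flow_df; values are made up
df = pd.DataFrame({'user_hash': ['a', 'a', 'b'],
                   'domain': ['x.com', 'y.com', 'z.com']})

# one sub-frame per unique user, as get_flow_per_user yields them
for user, flows in df.groupby('user_hash'):
    print(user, len(flows))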

main.py (+68, -0)

@@ -0,0 +1,68 @@
import numpy as np
from keras.utils import np_utils

import dataset
import models


def main():
    # parameter
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100
    timesNeg = -1

    char_dict = dataset.get_character_dict()
    user_flow_df = dataset.get_user_flow_data()

    print("create training dataset")
    (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows(
        user_flow_df, char_dict,
        maxLen=maxLen, threshold=threshold, windowSize=windowSize)

    pos_idx = np.where(y_tr == 1.0)[0]
    neg_idx = np.where(y_tr == 0.0)[0]

    use_idx = np.concatenate((pos_idx, neg_idx))

    y_tr = y_tr[use_idx]
    # hits_tr = hits_tr[use_idx]
    # names_tr = names_tr[use_idx]
    for i in range(len(X_tr)):
        X_tr[i] = X_tr[i][use_idx]

    # TODO: WTF? I don't get it...
    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
                                       domainFeatures, kernel_size, domainFeatures, 0.5)

    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
                               cnnHiddenDims, cnnDropout)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    epochNumber = 0
    y_tr = np_utils.to_categorical(y_tr, 2)
    model.fit(x=X_tr, y=y_tr, batch_size=128,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)
    # validation_data=(testData, testLabel))


if __name__ == "__main__":
main()
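
Note that use_idx above concatenates every positive and negative index, so the selection is currently a reordering no-op, and timesNeg is defined but never used. A hedged sketch of what timesNeg-based negative subsampling might look like (an assumption about intent, not code from this commit):

import numpy as np

# toy labels standing in for y_tr; timesNeg mirrors the unused parameter
y_tr = np.array([1.0, 0.0, 0.0, 0.0, 1.0, 0.0])
timesNeg = 1

rng = np.random.RandomState(42)  # fixed seed, chosen here for reproducibility
pos_idx = np.where(y_tr == 1.0)[0]
neg_idx = np.where(y_tr == 0.0)[0]

if timesNeg > 0:
    # hypothetical: keep at most timesNeg negatives per positive
    n_neg = min(len(neg_idx), timesNeg * len(pos_idx))
    neg_idx = rng.choice(neg_idx, size=n_neg, replace=False)

use_idx = np.concatenate((pos_idx, neg_idx))  # timesNeg <= 0: keep everything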

models.py (+53, -0)

@@ -0,0 +1,53 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape


def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
                   hidden_dims, drop_out):
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(x, y)


def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                   filters, h1, h2, dropout, dense):
    pass


def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
    inputList = []
    encodedList = []
    # TODO: ??? -- builds one domain input and one flow-feature input per window position
    for i in range(windowSize):
        inputList.append(Input(shape=(maxLen,)))
        encodedList.append(cnn(inputList[-1]))  # add shared domain model
        inputList.append(Input(shape=(numFeatures,)))
    # TODO: ??? -- interleaves the encoded domains with their flow-feature inputs
    merge_layer_input = []
    for i in range(windowSize):
        merge_layer_input.append(encodedList[i])
        merge_layer_input.append(inputList[(2 * i) + 1])
    # We can then concatenate the two vectors:
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
    # add second cnn
    cnn = Conv1D(filters,
                 kernel_size,
                 activation='relu',
                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
    # we use max pooling:
    maxPool = GlobalMaxPooling1D()(cnn)
    cnnDropout = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
    cnnOutput = Dense(2, activation='softmax')(cnnDense)

    # We define a trainable model linking the
    # window inputs to the predictions
    model = Model(inputs=inputList, outputs=cnnOutput)
    return model
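
The resulting model takes 2 * windowSize inputs, alternating a domain sequence (maxLen integer indices) and a flow-feature vector (numFeatures floats) per window position, so whatever is passed to model.fit must follow the same order. A small sketch of that layout with random stand-in data (sizes are the defaults from main.py; n_samples is illustrative):

import numpy as np

windowSize, maxLen, numFeatures, n_samples = 10, 40, 3, 8  # example sizes

X = []
for i in range(windowSize):
    X.append(np.random.randint(0, 40, size=(n_samples, maxLen)))  # domain input i
    X.append(np.random.rand(n_samples, numFeatures))              # flow features i

# X now matches inputList's alternating construction in get_top_cnn,
# so model.fit(x=X, ...) lines each array up with the right Input layer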

scripts/make_csv_dataset.py (+6, -0)

@@ -0,0 +1,6 @@
#!/usr/bin/python2

import joblib

datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
user_flows = datafile["data"]
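
As committed, the script stops after extracting user_flows; given its name and the new *.csv / *.csv.gz entries in .gitignore, a CSV export was presumably intended next. A hedged sketch of that missing step, assuming user_flows is a pandas DataFrame (the output filename is illustrative):

# assumption: user_flows is a pandas DataFrame; filename is illustrative
user_flows.to_csv("user_flows.csv.gz", compression="gzip", index=False)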
