added new networks for domain embedding and classification task

René Knaebel 2017-07-05 17:37:08 +02:00
parent 59c1176e85
commit 3862dce975
4 changed files with 82 additions and 33 deletions

.gitignore (vendored, 3 changes)

@@ -99,4 +99,5 @@ ENV/
 *.tif
 *.joblib
 *.csv
 *.csv.gz
+*.csv.tar.*

main.py (43 changes)

@@ -37,9 +37,21 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 # parser.add_argument("--samples_val", action="store", dest="samples_val",
 #                     default=10000, type=int)
 #
-# parser.add_argument("--area", action="store", dest="area_size",
-#                     default=25, type=int)
-#
+parser.add_argument("--embd", action="store", dest="embedding",
+                    default=128, type=int)
+
+parser.add_argument("--hidden_char_dims", action="store", dest="hidden_char_dims",
+                    default=256, type=int)
+
+parser.add_argument("--window", action="store", dest="window",
+                    default=10, type=int)
+
+parser.add_argument("--domain_length", action="store", dest="domain_length",
+                    default=40, type=int)
+
+parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
+                    default=512, type=int)
+
 # parser.add_argument("--queue", action="store", dest="queue_size",
 #                     default=50, type=int)
 #
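
Worth noting in the hunk above: the new options use dest= values that differ from the flag spelling, so --embd is read back as args.embedding and --domain_embd as args.domain_embedding. A minimal standalone sketch of this behaviour (the flag values here are made up, not from the commit):

    import argparse

    # Same flag/dest pairing as the new options in main.py.
    parser = argparse.ArgumentParser()
    parser.add_argument("--embd", action="store", dest="embedding", default=128, type=int)
    parser.add_argument("--domain_embd", action="store", dest="domain_embedding", default=512, type=int)

    args = parser.parse_args(["--embd", "64", "--domain_embd", "256"])
    assert args.embedding == 64 and args.domain_embedding == 256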
@@ -59,6 +71,7 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 args = parser.parse_args()

 # config = tf.ConfigProto(log_device_placement=True)
 # config.gpu_options.per_process_gpu_memory_fraction = 0.5
 # config.gpu_options.allow_growth = True
@@ -67,24 +80,17 @@ args = parser.parse_args()
 def main():

     # parameter
-    innerCNNFilters = 512
-    innerCNNKernelSize = 2
     cnnDropout = 0.5
     cnnHiddenDims = 1024
-    domainFeatures = 512
     flowFeatures = 3
     numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    embeddingSize = 100
-    kernel_size = 2
+    kernel_size = 3
     drop_out = 0.5
-    filters = 2
+    filters = 128
     hidden_dims = 100
     vocabSize = 40
     threshold = 3
     minFlowsPerUser = 10
-    numEpochs = 100

     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data()
@@ -92,7 +98,7 @@ def main():
     print("create training dataset")
     (X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
         user_flow_df, char_dict,
-        max_len=maxLen, window_size=windowSize)
+        max_len=args.domain_length, window_size=args.window)
     # make client labels discrete with 4 different values
     # TODO: use trusted_hits_tr for client classification too
     client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
@@ -104,11 +110,14 @@ def main():
     client_labels = client_labels[idx]
     server_labels = server_tr[idx]

-    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+    shared_cnn = models.get_embedding_network_rene(len(char_dict) + 1, args.embedding, args.domain_length,
+                                                   args.hidden_char_dims, args.domain_embedding, 0.5)
+    shared_cnn.summary()

-    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
-                               cnnHiddenDims, cnnDropout)
+    model = models.get_top_cnn_rene(cnnDropout, flowFeatures, args.domain_embedding,
+                                    args.window, args.domain_length, filters, kernel_size,
+                                    cnnHiddenDims, shared_cnn)
+    model.summary()

     model.compile(optimizer='adam',
                   loss='binary_crossentropy',
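
Since get_top_cnn_rene returns a model with two named softmax heads (client and server), the single loss='binary_crossentropy' is applied to both outputs. An equivalent, more explicit compile call would look like the sketch below; the metrics argument is illustrative, as the commit's compile call is truncated in this view:

    # One loss per named output head; behaves like the single-string form above.
    model.compile(optimizer='adam',
                  loss={'client': 'binary_crossentropy',
                        'server': 'binary_crossentropy'},
                  metrics=['accuracy'])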

models.py

@@ -1,10 +1,11 @@
 import keras
 from keras.engine import Input, Model
-from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed, MaxPool1D


-def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
-                   hidden_dims, drop_out):
+# designed by paul
+def get_embedding_network_paul(vocab_size, embedding_size, input_length, filters, kernel_size,
+                               hidden_dims, drop_out=0.5):
     x = y = Input(shape=(input_length,))
     y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
     y = Conv1D(filters, kernel_size, activation='relu')(y)
@@ -15,26 +16,65 @@ def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_siz
     return Model(x, y)


+def get_embedding_network_rene(vocab_size, embedding_size, input_length,
+                               hidden_char_dims, hidden_dims, drop_out=0.5):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocab_size, output_dim=embedding_size, mask_zero=True)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=5, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
 def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                    filters, h1, h2, dropout, dense):
     pass


-def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
-    ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
+# designed by paul
+def get_top_cnn(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                dense_dim,
+                cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
     encoded = TimeDistributed(cnn)(ipt_domains)
-    ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
     merged = keras.layers.concatenate([encoded, ipt_flows], -1)
-    # add second cnn
-    y = Conv1D(filters,
+    # CNN processing small slides of flow windows
+    # TODO: add more layers?
+    y = Conv1D(cnn_dims,
                kernel_size,
                activation='relu',
-               input_shape=(windowSize, domainFeatures + numFeatures))(merged)
-    # TODO: why global pooling? -> 3D to 2D
-    # we use max pooling:
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
     y = GlobalMaxPooling1D()(y)
     y = Dropout(cnnDropout)(y)
-    y = Dense(cnnHiddenDims, activation='relu')(y)
+    y = Dense(dense_dim, activation='relu')(y)
+    y1 = Dense(2, activation='softmax', name="client")(y)
+    y2 = Dense(2, activation='softmax', name="server")(y)
+    return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+
+def get_top_cnn_rene(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                     dense_dim, cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    encoded = TimeDistributed(cnn)(ipt_domains)
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
+    # CNN processing small slides of flow windows
+    # TODO: add more layers?
+    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(cnnDropout)(y)
+    y = Dense(dense_dim, activation='relu')(y)
     y1 = Dense(2, activation='softmax', name="client")(y)
     y2 = Dense(2, activation='softmax', name="server")(y)
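
To see how the two new networks compose, here is a minimal sketch of building the full classifier with the commit's own defaults (--embd 128, --hidden_char_dims 256, --domain_length 40, --domain_embd 512, window 10, filters 128, kernel_size 3, cnnHiddenDims 1024); the vocabulary size of 41 stands in for len(char_dict) + 1 and is an assumption. As a review remark, mask_zero=True on an Embedding feeding Conv1D layers may raise a masking error on some Keras versions:

    import models

    # Shared per-domain character CNN: (domain_length,) -> (domain_embd,)
    embd = models.get_embedding_network_rene(vocab_size=41, embedding_size=128,
                                             input_length=40, hidden_char_dims=256,
                                             hidden_dims=512)
    # Top CNN over the window of embedded domains plus flow features,
    # with two softmax heads ("client" and "server").
    model = models.get_top_cnn_rene(cnnDropout=0.5, flow_features=3, domain_features=512,
                                    window_size=10, domain_length=40, cnn_dims=128,
                                    kernel_size=3, dense_dim=1024, cnn=embd)
    model.summary()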

(fourth changed file; name not shown in this view)

@@ -4,7 +4,6 @@ import joblib
 import pandas as pd

 df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
-df = df["data"]
-df = pd.concat(df)
+df = pd.concat(df["data"])
 df.reset_index(inplace=True)
-df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
+df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
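
For completeness, a sketch of reading the exported file back; read_csv infers gzip compression from the .gz suffix, and index_col=0 is an assumption that drops the integer index written alongside the data by to_csv:

    import pandas as pd

    # Round-trip check for the export above.
    df = pd.read_csv("/tmp/rk/full_future_dataset.csv.gz", index_col=0)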