# ma_cisco_malware/cnnOnCnnParameterSelection.py
# -*- coding: utf-8 -*-
import joblib
import keras
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm

import stackedNeuralModels

# Limit TensorFlow GPU memory usage and register the session with Keras,
# so the configuration below actually applies to the Keras backend.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)

if __name__ == "__main__":
    # parameters
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100
    maxLengthInSeconds = -1
    timesNeg = -1

    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
    # only reload the character dictionary if it is not already in memory
    if 'characterDict' not in locals():
        characterDictPath = 'trainData/characterIDDict.joblib'
        characterDict = joblib.load(characterDictPath)['characterIDDict']

    # load train and test data from joblib
    # created with createTrainDataMultipleTaskLearning.py
    if 'trainDFs' not in locals():
        tmpLoad = joblib.load(trainDataPath)
        trainDFs = tmpLoad['data']

    if 'testDFs' not in locals():
        tmpLoad = joblib.load(testDataPath)
        testDFs = tmpLoad['data']

    # shared CNN that encodes a single domain name into a feature vector
    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen,
                                                                       domainFeatures, kernel_size, domainFeatures, 0.5)
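
    # For reference, a minimal sketch of what the shared per-domain encoder is
    # assumed to look like (embedding -> 1D convolution -> global max pooling
    # -> dense projection). The actual architecture lives in
    # stackedNeuralModels.getCNNWitoutLastLayerFunctional and may differ.
    def sketchSharedDomainCNN(vocabSize, embeddingSize, maxLen, domainFeatures,
                              kernelSize, hiddenDims, dropout):
        from keras.layers import Embedding
        inp = Input(shape=(maxLen,))
        emb = Embedding(input_dim=vocabSize, output_dim=embeddingSize,
                        input_length=maxLen)(inp)
        conv = Conv1D(domainFeatures, kernelSize, activation='relu')(emb)
        pool = GlobalMaxPooling1D()(conv)
        hidden = Dense(hiddenDims, activation='relu')(pool)
        out = Dropout(dropout)(hidden)
        return Model(inputs=inp, outputs=out)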

    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
                                                                                      windowSize=windowSize,
                                                                                      overlapping=False,
                                                                                      maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
        if i == 100:  # only use the first 101 users for the parameter search
            break
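    # Each entry of domainLists is assumed to hold the domain names of one
    # user window and the matching entry of dfLists the corresponding
    # flow-feature rows; see getChunksFromUserDataFrame for the details.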

    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
        windowSize=windowSize)

    # keep only windows with a definite label (1.0 malicious, 0.0 benign)
    useIDs = np.where(testLabel == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for i in range(len(testData)):
        testData[i] = testData[i][useIDs]
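
    # Quick sanity check on the class balance after the filtering above;
    # labels are assumed to be 1.0 for malicious and 0.0 for benign windows.
    print('windows: %d, malicious: %d' % (len(testLabel), int(np.sum(testLabel == 1.0))))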

    # build windowSize pairs of inputs: one domain-character input (encoded by
    # the shared CNN) followed by one flow-feature input per time step
    inputList = []
    encodedList = []
    numFeatures = flowFeatures
    for i in range(windowSize):
        inputList.append(Input(shape=(maxLen,)))
        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
        inputList.append(Input(shape=(numFeatures,)))
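    # inputList now alternates domain and flow inputs,
    # [domain_0, flow_0, domain_1, flow_1, ...], so the flow input of
    # step i sits at index (2 * i) + 1.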

    merge_layer_input = []
    for i in range(windowSize):
        merge_layer_input.append(encodedList[i])
        merge_layer_input.append(inputList[(2 * i) + 1])

    # concatenate all per-step vectors and reshape the result into a sequence
    # of windowSize steps with (domainFeatures + numFeatures) features each
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)

    # add a second CNN over the sequence of per-step feature vectors
    cnn = Conv1D(filters,
                 kernel_size,
                 activation='relu')(reshape)
    # we use global max pooling over the time dimension:
    maxPool = GlobalMaxPooling1D()(cnn)
    # cnnDropout above holds the dropout rate (a float), so the resulting
    # tensor gets its own name instead of shadowing it
    cnnDropoutLayer = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropoutLayer)
    cnnOutput = Dense(2, activation='softmax')(cnnDense)

    # define the trainable model linking the domain and flow inputs
    # to the predictions
    model = Model(inputs=inputList, outputs=cnnOutput)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
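    # print the resulting architecture for inspection
    model.summary()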

    epochNumber = 0
    trainLabel = np_utils.to_categorical(testLabel, 2)
    model.fit(x=testData, y=trainLabel,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)
    # validation_data=(testData, testLabel))
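
    # A minimal evaluation sketch on the training windows (assumption: the
    # argmax over the two softmax outputs recovers the binary label used above).
    predictions = model.predict(x=testData)
    predictedLabels = np.argmax(predictions, axis=1)
    print('training accuracy: %.4f' % float(np.mean(predictedLabels == testLabel)))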