ma_cisco_malware/cnnOnCnnParameterSelection.py
2017-06-29 09:19:36 +02:00

128 lines
4.6 KiB
Python

# -*- coding: utf-8 -*-
import joblib
import keras
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm
import stackedNeuralModels as stackedNeuralModels
# TensorFlow session setup, executed at import time.
# The configuration enables device-placement logging and sets two GPU options:
# a per-process memory fraction of 0.5 and allow_growth.
# NOTE(review): `session` is not referenced again in the visible part of this
# file and is not explicitly handed to Keras here -- confirm that is intended.
config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.5,
        allow_growth=True,
    ),
)
session = tf.Session(config=config)
if __name__ == "__main__":
    # Script entry point: builds a two-level CNN ("CNN on CNN") and fits it for
    # one epoch.  A shared model from stackedNeuralModels encodes each of the
    # `windowSize` domain inputs; each encoding is concatenated with that
    # position's flow-feature input, the result is reshaped to one row per
    # window position, and a second Conv1D stack ends in a 2-way softmax.
    #
    # The `'name' not in locals()` guards let the script be re-run in the same
    # interactive namespace without reloading the large joblib archives.

    # ---- parameters -------------------------------------------------------
    # NOTE(review): innerCNNFilters, innerCNNKernelSize, numCiscoFeatures,
    # drop_out, hidden_dims, vocabSize, minFlowsPerUser, numEpochs and timesNeg
    # are not referenced anywhere below in this script; they are kept unchanged
    # in case they are inspected interactively -- confirm whether the second
    # CNN was meant to use the innerCNN* values.
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100
    maxLengthInSeconds = -1
    timesNeg = -1
    # Index of the last user data frame that is turned into chunks; the loop
    # below stops after processing this index (i.e. at most 101 users).
    lastUserIndex = 100

    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'

    # ---- data loading -----------------------------------------------------
    if 'characterDict' not in locals():
        characterDictPath = 'trainData/characterIDDict.joblib'
        characterDict = joblib.load(characterDictPath)['characterIDDict']

    # load train and test data from joblib
    # created with createTrainDataMultipleTaskLearning.py
    if 'trainDFs' not in locals():
        tmpLoad = joblib.load(trainDataPath)
        trainDFs = tmpLoad['data']
    if 'testDFs' not in locals():
        tmpLoad = joblib.load(testDataPath)
        # Fix: the original loaded the archive but never bound `testDFs`, so
        # the data was discarded and this guard could never become false.
        # Assumes the test archive uses the same 'data' key as the training
        # archive (both are said above to come from the same script) --
        # TODO confirm.
        testDFs = tmpLoad['data']

    # Shared per-domain model; arguments are passed positionally exactly as
    # before because the helper's signature is not visible from this file.
    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(
        len(characterDict) + 1, embeddingSize, maxLen,
        domainFeatures, kernel_size, domainFeatures, 0.5)

    # ---- build windows of flows per user ----------------------------------
    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(
            trainDFs[i],
            windowSize=windowSize,
            overlapping=False,
            maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
        if i == lastUserIndex:
            break

    # NOTE: despite the `test*` names, these arrays are derived from the
    # training data frames above and are what the model is fitted on.
    # (`charachterDict` / `urlSIPDIct` are the helper's own keyword spellings.)
    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
        windowSize=windowSize)

    # Keep only samples labelled exactly 1.0 or 0.0 (positives first, then
    # negatives) and apply the same selection to every input array.
    useIDs = np.where(testLabel == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for idx, inputArray in enumerate(testData):
        testData[idx] = inputArray[useIDs]

    # ---- model definition -------------------------------------------------
    # inputList alternates [domain_0, flow_0, domain_1, flow_1, ...]; the
    # creation order of the layers is kept identical to the original.
    inputList = []
    encodedList = []
    numFeatures = flowFeatures
    for i in range(windowSize):
        inputList.append(Input(shape=(maxLen,)))
        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
        inputList.append(Input(shape=(numFeatures,)))

    # Interleave each encoded domain with the flow-feature input of the same
    # window position (the flow inputs sit at the odd indices of inputList).
    merge_layer_input = []
    for encodedDomain, flowInput in zip(encodedList, inputList[1::2]):
        merge_layer_input.append(encodedDomain)
        merge_layer_input.append(flowInput)

    # Concatenate everything into one vector, then reshape it into one row per
    # window position.  The target shape requires the shared model to emit
    # `domainFeatures` values per domain.
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)

    # add second cnn
    cnn = Conv1D(filters,
                 kernel_size,
                 activation='relu',
                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
    # we use max pooling:
    maxPool = GlobalMaxPooling1D()(cnn)
    # The dropout output gets its own name so that the `cnnDropout` rate
    # defined above is no longer overwritten by a tensor.
    cnnDropoutOut = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropoutOut)
    cnnOutput = Dense(2, activation='softmax')(cnnDense)

    # Define the trainable model linking the per-window domain and flow
    # inputs to the 2-class prediction.
    model = Model(inputs=inputList, outputs=cnnOutput)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # ---- training ---------------------------------------------------------
    # Runs exactly one epoch (from `epochNumber` to `epochNumber + 1`) without
    # validation data.
    epochNumber = 0
    trainLabel = np_utils.to_categorical(testLabel, 2)
    model.fit(x=testData, y=trainLabel,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)