reformat code

This commit is contained in:
René Knaebel 2017-06-29 09:19:36 +02:00
parent 24d677e101
commit 87b927cdc9

View File

@ -1,43 +1,31 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import joblib
import keras
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm from tqdm import tqdm
import tensorflow as tf import stackedNeuralModels as stackedNeuralModels
config = tf.ConfigProto(log_device_placement=True) config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5 config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True config.gpu_options.allow_growth = True
session = tf.Session(config=config) session = tf.Session(config=config)
from pymongo import MongoClient if __name__ == "__main__":
import joblib
import pickle
import numpy as np
import ciscoProcessing as ciscoProcessing
import stackedNeuralModels as stackedNeuralModels
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
from keras.layers import Convolution1D
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
if __name__ == "__main__":
# parameter # parameter
innerCNNFilters = 512 innerCNNFilters = 512
innerCNNKernelSize = 2 innerCNNKernelSize = 2
cnnDropout = 0.5 cnnDropout = 0.5
cnnHiddenDims = 1024 cnnHiddenDims = 1024
domainFeatures = 512 domainFeatures = 512
flowFeatures = 3 flowFeatures = 3
numCiscoFeatures=30 numCiscoFeatures = 30
windowSize = 10 windowSize = 10
maxLen = 40 maxLen = 40
embeddingSize = 100 embeddingSize = 100
kernel_size = 2 kernel_size = 2
@ -50,82 +38,81 @@ if __name__ == "__main__":
numEpochs = 100 numEpochs = 100
maxLengthInSeconds = -1 maxLengthInSeconds = -1
timesNeg = -1 timesNeg = -1
trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib' trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib' testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
if 'characterDict' not in locals(): if 'characterDict' not in locals():
characterDictPath = 'trainData/characterIDDict.joblib' characterDictPath = 'trainData/characterIDDict.joblib'
characterDict = joblib.load(characterDictPath)['characterIDDict'] characterDict = joblib.load(characterDictPath)['characterIDDict']
# load train and test data from joblib # load train and test data from joblib
# created with createTrainDataMultipleTaskLearning.py # created with createTrainDataMultipleTaskLearning.py
if 'trainDFs' not in locals(): if 'trainDFs' not in locals():
tmpLoad = joblib.load(trainDataPath) tmpLoad = joblib.load(trainDataPath)
trainDFs = tmpLoad['data'] trainDFs = tmpLoad['data']
if 'testDFs' not in locals(): if 'testDFs' not in locals():
tmpLoad = joblib.load(testDataPath) tmpLoad = joblib.load(testDataPath)
sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen,
sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5) domainFeatures, kernel_size, domainFeatures, 0.5)
domainLists = [] domainLists = []
dfLists = [] dfLists = []
for i in tqdm(np.arange(len(trainDFs)), miniters=10): for i in tqdm(np.arange(len(trainDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i], (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds) windowSize=windowSize,
overlapping=False,
maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp domainLists += domainListsTmp
dfLists += dfListsTmp dfLists += dfListsTmp
if i == 100: if i == 100:
break break
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData( (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict, domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
maxLen=maxLen,threshold = threshold, maxLen=maxLen, threshold=threshold,
flagUseCiscoFeatures=False,urlSIPDIct=dict(), flagUseCiscoFeatures=False, urlSIPDIct=dict(),
windowSize=windowSize) windowSize=windowSize)
useIDs = np.where(testLabel == 1.0)[0] useIDs = np.where(testLabel == 1.0)[0]
useIDs = np.concatenate([useIDs,np.where(testLabel == 0.0)[0]]) useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
testLabel = testLabel[useIDs] testLabel = testLabel[useIDs]
testHits = testHits[useIDs] testHits = testHits[useIDs]
testNames = testNames[useIDs] testNames = testNames[useIDs]
for i in range(len(testData)): for i in range(len(testData)):
testData[i] = testData[i][useIDs] testData[i] = testData[i][useIDs]
inputList = [] inputList = []
encodedList = [] encodedList = []
numFeatures = flowFeatures numFeatures = flowFeatures
for i in range(windowSize): for i in range(windowSize):
inputList.append(Input(shape=(maxLen,))) inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,))) inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = [] merge_layer_input = []
for i in range(windowSize): for i in range(windowSize):
merge_layer_input.append(encodedList[i]) merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2*i)+1]) merge_layer_input.append(inputList[(2 * i) + 1])
# We can then concatenate the encoded domain and flow feature vectors of all windows: # We can then concatenate the encoded domain and flow feature vectors of all windows:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector) reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
# add second cnn # add second cnn
cnn = Conv1D(filters, cnn = Conv1D(filters,
kernel_size, kernel_size,
activation='relu', activation='relu',
input_shape=(windowSize,domainFeatures+numFeatures))(reshape) input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
# we use max pooling: # we use max pooling:
maxPool = GlobalMaxPooling1D()(cnn) maxPool = GlobalMaxPooling1D()(cnn)
cnnDropout = Dropout(cnnDropout)(maxPool) cnnDropout = Dropout(cnnDropout)(maxPool)
cnnDense = Dense(cnnHiddenDims,activation='relu')(cnnDropout) cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
cnnOutput = Dense(2,activation='softmax')(cnnDense) cnnOutput = Dense(2, activation='softmax')(cnnDense)
# We define a trainable model linking the # We define a trainable model linking the
# domain and flow inputs to the predictions # domain and flow inputs to the predictions
model = Model(inputs=inputList, outputs=cnnOutput) model = Model(inputs=inputList, outputs=cnnOutput)
@ -133,11 +120,8 @@ if __name__ == "__main__":
loss='binary_crossentropy', loss='binary_crossentropy',
metrics=['accuracy']) metrics=['accuracy'])
epochNumber = 0
epochNumber= 0
trainLabel = np_utils.to_categorical(testLabel, 2) trainLabel = np_utils.to_categorical(testLabel, 2)
model.fit(x=testData, y=trainLabel, model.fit(x=testData, y=trainLabel,
epochs=epochNumber + 1,shuffle=True,initial_epoch=epochNumber)#, epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # ,
#validation_data=(testData,testLabel)) # validation_data=(testData,testLabel))