diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e6863c9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,99 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +.cache/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + +# intelliJ +.idea/ + +# Apple? +.DS_Store + +# data +*.tif \ No newline at end of file diff --git a/ciscoProcessing.py b/ciscoProcessing.py index edbc542..33f4d98 100644 --- a/ciscoProcessing.py +++ b/ciscoProcessing.py @@ -17,7 +17,7 @@ import random from keras.models import model_from_json import time import re -import mongoDBConnector as mongoDBConnector +# import mongoDBConnector as mongoDBConnector import stackedNeuralModels as stackedNeuralModels from tqdm import tqdm diff --git a/cnnOnCnnParameterSelection.py b/cnnOnCnnParameterSelection.py index 5fe8fb4..a7bae38 100644 --- a/cnnOnCnnParameterSelection.py +++ b/cnnOnCnnParameterSelection.py @@ -1,20 +1,199 @@ # -*- coding: utf-8 -*- -import joblib +import string + import keras import numpy as np -import tensorflow as tf -from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape -from keras.layers import Input +import pandas as pd +from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation from keras.models import Model from keras.utils import np_utils from tqdm import tqdm -import stackedNeuralModels as stackedNeuralModels -config = tf.ConfigProto(log_device_placement=True) -config.gpu_options.per_process_gpu_memory_fraction = 0.5 -config.gpu_options.allow_growth = True -session = tf.Session(config=config) +# config = tf.ConfigProto(log_device_placement=True) +# config.gpu_options.per_process_gpu_memory_fraction = 0.5 +# config.gpu_options.allow_growth = True +# session = tf.Session(config=config) + + +def get_character_dict(): + return dict((char, idx) for (idx, char) in + enumerate(string.ascii_lowercase + string.punctuation)) + + +def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size, + hidden_dims, drop_out): + x = y = Input(shape=(input_length,)) + y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y) + y = Conv1D(filters, kernel_size, activation='relu')(y) + y = GlobalMaxPooling1D()(y) + y = Dense(hidden_dims)(y) + y = Dropout(drop_out)(y) + y = Activation('relu')(y) + return Model(x, y) + + +def get_user_chunks(dataFrame, windowSize=10, overlapping=False, + maxLengthInSeconds=300): + # print('maxLength: ' + str(maxLengthInSeconds)) + maxMilliSeconds = maxLengthInSeconds * 1000 + 
outDomainLists = []
+    outDFFrames = []
+    if not overlapping:
+        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
+            # print(curIDs)
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                # np.where returns a tuple of index arrays; unpack the first one
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    else:
+        numBlocks = len(dataFrame) + 1 - windowSize
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[blockID:blockID + windowSize]
+            # print(curIDs)
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                # np.where returns a tuple of index arrays; unpack the first one
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    return (outDomainLists, outDFFrames)
+
+
+def getFeatureVecForDomain(domain, characterDict, maxLen=40):
+    curFeature = np.zeros([maxLen, ])
+    for j in range(np.min([len(domain), maxLen])):
+        # NOTE: domain[-j] mirrors stackedNeuralModels: j == 0 yields the
+        # first character, j >= 1 counts backwards from the last one
+        curCharacter = domain[-j]
+        if curCharacter in characterDict:
+            curFeature[j] = characterDict[curCharacter]
+    return curFeature
+
+
+def getFlowFeatures(curDataLine):
+    useKeys = ['duration', 'bytes_down', 'bytes_up']
+    curFeature = np.zeros([len(useKeys), ])
+    for i in range(len(useKeys)):
+        curKey = useKeys[i]
+        try:
+            curFeature[i] = np.log1p(curDataLine[curKey]).astype(float)
+        except (KeyError, TypeError, ValueError):
+            # leave the feature at 0 if the field is missing or malformed
+            pass
+    return curFeature
+
+
+def getCiscoFeatures(curDataLine, urlSIPDict):
+    numCiscoFeatures = 30
+    try:
+        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
+        # print('cisco features: ' + str(ciscoFeatures))
+        # log transform
+        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
+        # print('log transformed: ' + str(ciscoFeatures))
+        return ciscoFeatures.ravel()
+    except KeyError:
+        # unseen (domain, server_ip) pairs fall back to a zero vector
+        return np.zeros([numCiscoFeatures, ]).ravel()
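+
+
+# Windowing example (hypothetical data, illustration only):
+#
+#   df = pd.DataFrame({'domain': ['a.com', 'b.com', 'c.com', 'd.com',
+#                                 'e.com', 'f.com', 'g.com'],
+#                      'timeStamp': np.arange(7) * 1000.0})
+#   domains, frames = get_user_chunks(df, windowSize=3, overlapping=False,
+#                                     maxLengthInSeconds=-1)
+#   # -> 3 chunks covering row indices [0, 1, 2], [3, 4, 5], [6];
+#   #    overlapping=True would give 7 + 1 - 3 = 5 sliding windows instead.
+#   # Each chunk comes back twice: as a list of domains and as the
+#   # matching DataFrame slice.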
+
+
+def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3,
+                              windowSize=10, maxLengthInSeconds=-1):
+    domainLists = []
+    dfLists = []
+    print("get chunks from user data frames")
+    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
+        (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
+                                                       overlapping=False,
+                                                       maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+        # NOTE: prototyping shortcut: only the first few users are processed
+        if i >= 10:
+            break
+
+    print("create training dataset")
+    return create_dataset_from_lists(
+        domainLists=domainLists, dfLists=dfLists, characterDict=char_dict,
+        maxLen=maxLen, threshold=threshold,
+        flagUseCiscoFeatures=False, urlSIPDict=dict(),
+        windowSize=windowSize)
+
+
+def create_dataset_from_lists(domainLists, dfLists, characterDict, maxLen, threshold=3,
+                              flagUseCiscoFeatures=False, urlSIPDict=dict(),
+                              windowSize=10):
+    if 'hits' in dfLists[0].keys():
+        hitName = 'hits'
+    elif 'virusTotalHits' in dfLists[0].keys():
+        hitName = 'virusTotalHits'
+    else:
+        raise ValueError("expected a 'hits' or 'virusTotalHits' column")
+    numFlowFeatures = 3
+    numCiscoFeatures = 30
+    numFeatures = numFlowFeatures
+    if flagUseCiscoFeatures:
+        numFeatures += numCiscoFeatures
+    outputFeatures = []
+    label = []
+    hits = []
+    trainNames = []
+    for i in range(windowSize):
+        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
+        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
+
+    for i in tqdm(np.arange(len(domainLists)), miniters=10):
+        curCounter = 0
+        for j in range(np.min([windowSize, len(domainLists[i])])):
+            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], characterDict, maxLen)
+            curCounter += 1
+            if flagUseCiscoFeatures:
+                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDict)
+            else:
+                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
+            curCounter += 1
+        curLabel = 0.0
+        if np.max(dfLists[i][hitName]) >= threshold:
+            curLabel = 1.0
+        elif np.max(dfLists[i][hitName]) == -1:
+            curLabel = -1.0
+        elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
+            curLabel = -2.0
+        label.append(curLabel)
+        hits.append(np.max(dfLists[i][hitName]))
+        trainNames.append(np.unique(dfLists[i]['user_hash']))
+    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
+
+
+def get_user_flow_data():
+    # load the flow data from csv
+    # (was: joblib dumps created by createTrainDataMultipleTaskLearning.py)
+    trainDFs = pd.read_csv("data/rk_data.csv.gz")
+    trainDFs.drop("Unnamed: 0", axis=1, inplace=True)
+    trainDFs.set_index(keys=['user_hash'], drop=False, inplace=True)
+    return trainDFs
+
+
+def get_flow_per_user(df):
+    users = df['user_hash'].unique().tolist()
+    for user in users:
+        yield df.loc[df.user_hash == user]
+
 
 if __name__ == "__main__":
     # parameter
@@ -39,51 +218,28 @@ if __name__ == "__main__":
     maxLengthInSeconds = -1
     timesNeg = -1
 
-    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
-    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
+    char_dict = get_character_dict()
+    user_flow_df = get_user_flow_data()
 
-    if 'characterDict' not in locals():
-        characterDictPath = 'trainData/characterIDDict.joblib'
-        characterDict = joblib.load(characterDictPath)['characterIDDict']
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
+        user_flow_df, char_dict, maxLen=maxLen, threshold=threshold,
+        windowSize=windowSize, maxLengthInSeconds=maxLengthInSeconds)
 
-    # load train and test data from joblib
-    # created with createTrainDataMultipleTaskLearning.py
-    if 'trainDFs' not in locals():
-        tmpLoad = joblib.load(trainDataPath)
-        trainDFs = tmpLoad['data']
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
 
-    if 'testDFs' not in locals():
-        tmpLoad = joblib.load(testDataPath)
+    use_idx = np.concatenate((pos_idx, neg_idx))
 
-    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen,
-                                                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
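+    # At this point (hypothetical shapes, assuming 1000 windows survive the
+    # pos/neg filter): X_tr is a list of 2 * windowSize arrays that alternate
+    # a (1000, maxLen) character matrix and a (1000, 3) flow-feature matrix
+    # per window slot, and y_tr holds one 0.0/1.0 label per window, which is
+    # exactly the multi-input layout the shared-CNN model built below expects.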
-    domainLists = []
-    dfLists = []
-    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
-        (domainListsTmp,
-         dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
-                                                                      windowSize=windowSize,
-                                                                      overlapping=False,
-                                                                      maxLengthInSeconds=maxLengthInSeconds)
-        domainLists += domainListsTmp
-        dfLists += dfListsTmp
-        if i == 100:
-            break
-
-    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
-        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
-        maxLen=maxLen, threshold=threshold,
-        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
-        windowSize=windowSize)
-
-    useIDs = np.where(testLabel == 1.0)[0]
-    useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
-
-    testLabel = testLabel[useIDs]
-    testHits = testHits[useIDs]
-    testNames = testNames[useIDs]
-    for i in range(len(testData)):
-        testData[i] = testData[i][useIDs]
+    # TODO: double-check these arguments: domainFeatures is passed both as
+    # the number of conv filters and as the hidden layer size of the shared CNN
+    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                           domainFeatures, kernel_size, domainFeatures, 0.5)
 
     inputList = []
     encodedList = []
@@ -102,7 +258,6 @@ if __name__ == "__main__":
     merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
     reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
     # add second cnn
-
     cnn = Conv1D(filters,
                  kernel_size,
                  activation='relu',
@@ -121,7 +276,7 @@ if __name__ == "__main__":
                   metrics=['accuracy'])
 
     epochNumber = 0
-    trainLabel = np_utils.to_categorical(testLabel, 2)
-    model.fit(x=testData, y=trainLabel,
+    trainLabel = np_utils.to_categorical(y_tr, 2)
+    model.fit(x=X_tr, y=trainLabel,
               batch_size=128, epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
     # validation_data=(testData,testLabel))
diff --git a/data/.keep b/data/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/stackedNeuralModels.py b/stackedNeuralModels.py
index 23045a8..c6fd5da 100644
--- a/stackedNeuralModels.py
+++ b/stackedNeuralModels.py
@@ -1,63 +1,52 @@
 # -*- coding: utf-8 -*-
-from keras.models import Sequential
-from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
-from keras.layers import Convolution1D
-import ciscoProcessing as ciscoProcessing
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import joblib
 import csv
-import keras
+
+import numpy as np
+from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
 from keras.layers import Input
 from keras.models import Model
-from keras.utils import np_utils
-
-from sklearn.metrics import precision_recall_curve
-from sklearn.metrics import auc, roc_curve
+from keras.models import Sequential
 from tqdm import tqdm
-import os
 
-def getCiscoFeatures(curDataLine,urlSIPDict):
+
+def getCiscoFeatures(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
         ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        #print('cisco features: ' + str(ciscoFeatures))
+        # print('cisco features: ' + str(ciscoFeatures))
         # log transform
-        ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
-        #print('log transformed: ' + str(ciscoFeatures))
+        ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
+        # print('log transformed: ' + str(ciscoFeatures))
         return ciscoFeatures.ravel()
     except:
-        return np.zeros([numCiscoFeatures,]).ravel()
+        return np.zeros([numCiscoFeatures, ]).ravel()
 
-
-def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
-                           hidden_dims,drop_out):
+def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                           hidden_dims, drop_out):
     model = 
Sequential() model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize, input_length=input_length)) - + model.add(Conv1D(filters, kernel_size, activation='relu')) - + # we use max pooling: model.add(GlobalMaxPooling1D()) - + # We add a vanilla hidden layer: model.add(Dense(hidden_dims)) model.add(Dropout(drop_out)) model.add(Activation('relu')) return model - -def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size, - hidden_dims,drop_out): + + +def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size, + hidden_dims, drop_out): a = Input(shape=(input_length,)) - embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a) - conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding) + embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a) + conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding) glob = GlobalMaxPooling1D()(conv1) dense = Dense(hidden_dims)(glob) drop = Dropout(drop_out)(dense) @@ -65,55 +54,58 @@ def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters model = Model(a, model) return model + def getFlowFeatureLayer(numFeatures): model = Sequential() - #slpModel.add(Dense(1, input_shape=(1,))) + # slpModel.add(Dense(1, input_shape=(1,))) model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,))) return model - -def createCNNDataSet(domains,label,characterDict,maxLen=40): + +def createCNNDataSet(domains, label, characterDict, maxLen=40): # process domains in reverse order - outFeature = np.zeros([len(domains),maxLen]) - outLabel = np.zeros([len(domains),]) + outFeature = np.zeros([len(domains), maxLen]) + outLabel = np.zeros([len(domains), ]) for i in range(len(domains)): domain = domains[i] curLabel = label[i] - curFeature = np.zeros([maxLen,]) - # print(domain + ' ' + str(len(domain))) - for j in range(np.min([len(domain),maxLen])): - #print(j) + curFeature = np.zeros([maxLen, ]) + # print(domain + ' ' + str(len(domain))) + for j in range(np.min([len(domain), maxLen])): + # print(j) curCharacter = domain[-j] if curCharacter in characterDict: curFeature[j] = characterDict[curCharacter] outFeature[i] = curFeature outLabel[i] = curLabel - return (outFeature,outLabel) + return (outFeature, outLabel) -def getFeatureVecForDomain(domain,characterDict,maxLen=40): - curFeature = np.zeros([maxLen,]) - for j in range(np.min([len(domain),maxLen])): - #print(j) + +def getFeatureVecForDomain(domain, characterDict, maxLen=40): + curFeature = np.zeros([maxLen, ]) + for j in range(np.min([len(domain), maxLen])): + # print(j) curCharacter = domain[-j] if curCharacter in characterDict: curFeature[j] = characterDict[curCharacter] return curFeature - + + def getFlowFeatures(curDataLine): - useKeys = ['duration','bytes_down','bytes_up'] - curFeature = np.zeros([len(useKeys),]) + useKeys = ['duration', 'bytes_down', 'bytes_up'] + curFeature = np.zeros([len(useKeys), ]) for i in range(len(useKeys)): curKey = useKeys[i] try: - curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32') + curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32') except: pass return curFeature - - -def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False, + + +def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False, maxLengthInSeconds=300): - #print('maxLength: ' + str(maxLengthInSeconds)) + # print('maxLength: ' + str(maxLengthInSeconds)) maxMilliSeconds = maxLengthInSeconds * 1000 outDomainLists = [] outDFFrames = [] 
@@ -121,8 +113,8 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False, numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize))) userIDs = np.arange(len(dataFrame)) for blockID in np.arange(numBlocks): - curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)] - #print(curIDs) + curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)] + # print(curIDs) useData = dataFrame.iloc[curIDs] curDomains = useData['domain'] if maxLengthInSeconds != -1: @@ -138,8 +130,8 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False, numBlocks = len(dataFrame) + 1 - windowSize userIDs = np.arange(len(dataFrame)) for blockID in np.arange(numBlocks): - curIDs = userIDs[blockID:blockID+windowSize] - #print(curIDs) + curIDs = userIDs[blockID:blockID + windowSize] + # print(curIDs) useData = dataFrame.iloc[curIDs] curDomains = useData['domain'] if maxLengthInSeconds != -1: @@ -151,11 +143,11 @@ def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False, curDomains = useData['domain'] outDomainLists.append(list(curDomains)) outDFFrames.append(useData) - return (outDomainLists,outDFFrames) - - -def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3, - flagUseCiscoFeatures=False,urlSIPDIct=dict, + return (outDomainLists, outDFFrames) + + +def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3, + flagUseCiscoFeatures=False, urlSIPDIct=dict, windowSize=10): if 'hits' in dfLists[0].keys(): hitName = 'hits' @@ -171,21 +163,21 @@ def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3, hits = [] trainNames = [] for i in range(windowSize): - outputFeatures.append(np.zeros([len(domainLists),maxLen])) - outputFeatures.append(np.zeros([len(domainLists),numFeatures])) - + outputFeatures.append(np.zeros([len(domainLists), maxLen])) + outputFeatures.append(np.zeros([len(domainLists), numFeatures])) + for i in tqdm(np.arange(len(domainLists)), miniters=10): curCounter = 0 - #print('len domainList: ' + str(len(domainLists[i]))) - #print('len df: ' + str(len(dfLists[i]))) - for j in range(np.min([windowSize,len(domainLists[i])])): - outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen) + # print('len domainList: ' + str(len(domainLists[i]))) + # print('len df: ' + str(len(dfLists[i]))) + for j in range(np.min([windowSize, len(domainLists[i])])): + outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen) curCounter += 1 - if flagUseCiscoFeatures: - outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j]) - outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct) + if flagUseCiscoFeatures: + outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j]) + outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct) else: - outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j]) + outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j]) curCounter += 1 curLabel = 0.0 if np.max(dfLists[i][hitName]) >= threshold: @@ -198,215 +190,26 @@ def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3, hits.append(np.max(dfLists[i][hitName])) trainNames.append(np.unique(dfLists[i]['user_hash'])) return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames)) - - + + def transformStringListToNumpyArray(listString): - 
listString = listString.replace('[','').replace(']','') - return np.array(listString.split(','),dtype='float32') - + listString = listString.replace('[', '').replace(']', '') + return np.array(listString.split(','), dtype='float32') + + def getCiscoFeatureDict(csvPathList): outDict = dict() for path in tqdm(csvPathList, miniters=1): - fobj = open(path,'r') - csvReader = csv.DictReader(fobj,delimiter=',') + fobj = open(path, 'r') + csvReader = csv.DictReader(fobj, delimiter=',') for row in csvReader: urlSIPString = row['Domain'] + row['ServerIP'] ciscoFeatures = row['CiscoFeature'] outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures) - #if len(outDict) % 10000 == 0: + # if len(outDict) % 10000 == 0: # print('numbers in dict: ' + str(len(outDict))) return outDict - + if __name__ == "__main__": - - # get data - trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/', - 'trainData/joblib2016-07-annomalous-stg-new/09/', - 'trainData/joblib2016-07-annomalous-stg-new/08/', - 'trainData/joblib2016-07-annomalous-stg-new/07/', - 'trainData/joblib2016-07-annomalous-stg-new/06/'] - - testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\ - 'trainData/joblib2016-09-annomalous-stg-new/08/',\ - 'trainData/joblib2016-09-annomalous-stg-new/09/',\ - 'trainData/joblib2016-09-annomalous-stg-new/10/',\ - 'trainData/joblib2016-09-annomalous-stg-new/11/',\ - 'trainData/joblib2016-09-annomalous-stg-new/12/',\ - 'trainData/joblib2016-09-annomalous-stg-new/13/',\ - 'trainData/joblib2016-09-annomalous-stg-new/14/'] - - trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv'] - - testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv', - 'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv'] - - # parameter - numNegPerDay = 5000 - numEpochs = 10 - domainFeatures = 512 - flowFeatures = 3 - numCiscoFeatures= 30 - windowSize = 10 - maxLen = 40 - - lstmUnits = 32 - lstmDenseSize = 128 - embeddingSize = 100 - kernel_size = 2 - drop_out = 0.5 - filters = 2 - hidden_dims = 100 - vocabSize = 40 - flagUseCiscoFeatures = True - threshold = 3 - resultStoreDir = 'results/201705/' - if flagUseCiscoFeatures: - resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib' - resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) - else: - resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib' - resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) - flagRedo = True - - - if flagUseCiscoFeatures: - if 'trainCiscoFeatureDict' not in locals(): - trainCiscoFeatureDict = 
getCiscoFeatureDict(trainCiscoFeatureCSVPaths) - - if 'testCiscoFeatureDict' not in locals(): - testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths) - else: - trainCiscoFeatureDict = dict() - testCiscoFeatureDict = dict() - - if flagRedo or not os.path.exists(resultStorePath): - if 'characterDict' not in locals(): - characterDictPath = 'trainData/characterIDDict.joblib' - characterDict = joblib.load(characterDictPath)['characterIDDict'] - - - print('create train data') - if 'dataFrameList' not in locals(): - (dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\ - trainDirsUserLevel,numNegPerDay = numNegPerDay) - maxHits = [] - for i in range(len(dataFrameList)): - maxHits.append(np.max(dataFrameList[i]['hits'])) - - print('create test data') - # validation error - if 'testDataFrameList' not in locals(): - (testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\ - [testDirsUserLevel[0]],numNegPerDay = numNegPerDay) - maxHits = [] - for i in range(len(testDataFrameList)): - maxHits.append(np.max(testDataFrameList[i]['hits'])) - - sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5) - - inputList = [] - encodedList = [] - numFeatures = flowFeatures - if flagUseCiscoFeatures: - numFeatures += numCiscoFeatures - for i in range(windowSize): - inputList.append(Input(shape=(maxLen,))) - encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model - inputList.append(Input(shape=(numFeatures,))) - - merge_layer_input = [] - for i in range(windowSize): - merge_layer_input.append(encodedList[i]) - merge_layer_input.append(inputList[(2*i)+1]) - - - # We can then concatenate the two vectors: - merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1) - reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector) - lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape) - dense = Dense(lstmDenseSize, activation='relu')(lstm) - dropout = Dropout(0.5)(dense) - # And add a logistic regression on top - predictions = Dense(2, activation='softmax')(dropout) - - # We define a trainable model linking the - # tweet inputs to the predictions - model = Model(inputs=inputList, outputs=predictions) - - model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=['accuracy']) - - - # get train data - domainLists = [] - dfLists = [] - for i in tqdm(np.arange(len(dataFrameList)), miniters=10): - (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False) - domainLists += domainListsTmp - dfLists += dfListsTmp - - (trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict, - maxLen,threshold = threshold, - flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict) - useIDs = np.where(trainHits == 0)[0] - useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]]) - for i in range(len(trainData)): - trainData[i] = np.array(trainData[i])[useIDs] - trainLabel = trainLabel[useIDs] - trainHits = trainHits[useIDs] - trainNames = trainNames[useIDs] - - # get test data - domainLists = [] - dfLists = [] - for i in tqdm(np.arange(len(testDataFrameList)), miniters=10): - (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False) - domainLists += domainListsTmp - dfLists += dfListsTmp - - (testData,testLabel,testHits,testNames) = 
createTrainData(domainLists,dfLists,characterDict, - maxLen,threshold = threshold, - flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict) - useIDs = np.where(testHits == 0)[0] - useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]]) - for i in range(len(testData)): - testData[i] = np.array(testData[i])[useIDs] - testLabel = testLabel[useIDs] - testHits = testHits[useIDs] - testNames = testNames[useIDs] - - numPos = len(np.where(trainLabel == 1.0)[0]) - numNeg = len(np.where(trainLabel == 0.0)[0]) - print('major class: ' + str(float(numNeg) / float(numNeg + numPos))) - lstmLabel = np_utils.to_categorical(trainLabel, 2) - lstmTestLabel = np_utils.to_categorical(testLabel, 2) - trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel)) - - - # save lstm model - ciscoProcessing.save_model(model,resultModelPath+'.json', - resultModelPath + '.h5') - - # classify train and test - trainScores = model.predict(trainData)[:,1] - testScores = model.predict(testData)[:,1] - - joblib.dump({'testLabel':testLabel, - 'testHits':testHits, - 'testNames':testNames, - 'testScores':testScores, - 'trainLabel':trainLabel, - 'trainScores':trainScores},resultStorePath,compress=3) - \ No newline at end of file + pass
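+    # Minimal usage sketch (hypothetical inputs; the actual training entry
+    # point now lives in cnnOnCnnParameterSelection.py):
+    #
+    #   domains, frames = getChunksFromUserDataFrame(userDF, windowSize=10,
+    #                                                overlapping=False)
+    #   data, labels, hits, names = createTrainData(domains, frames,
+    #                                               characterDict, maxLen=40,
+    #                                               threshold=3)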