diff --git a/ciscoProcessing.py b/ciscoProcessing.py
new file mode 100644
index 0000000..edbc542
--- /dev/null
+++ b/ciscoProcessing.py
@@ -0,0 +1,846 @@
+# -*- coding: utf-8 -*-
+import sys
+sys.path.append('..')
+sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
+import os
+import numpy as np
+import joblib
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import LSTM
+from keras.layers import Dropout
+import csv
+import pandas as pd
+import random
+from keras.models import model_from_json
+import time
+import re
+import mongoDBConnector as mongoDBConnector
+import stackedNeuralModels as stackedNeuralModels
+from tqdm import tqdm
+
+
+def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
+    # check server-ip
+    if curSIP in sipSet:
+        return 1.0
+    # check second-level domain
+    splitDomain = curDomain.split('.')
+    if len(splitDomain) >= 2:
+        curSLD = splitDomain[-2] + '.' + splitDomain[-1]
+    else:
+        curSLD = curDomain
+    if curSLD in sldSet:
+        return 1.0
+    # check full domain and second-level domain against the host set
+    if curDomain in hostSet or curSLD in hostSet:
+        return 1.0
+    return 0.0
+
+def getSubSample(useDir,numUser,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 timesNeg=-1,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+    curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                 useDir=useDir,collectionName=collectionName,
+                 metaCollectionName=metaCollectionName,
+                 numUser=numUser,minFlowsPerUser=minFlowsPerUser)
+
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(curDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
+            windowSize=windowSize)
+
+    useIDs = np.where(np.array(testLabel) == 1.0)[0]
+    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
+
+    if timesNeg != -1:
+        # cap the negatives at timesNeg times the number of positives
+        posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
+        negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
+        if len(negIDs) > len(posIDs) * timesNeg:
+            negIDs = np.random.permutation(negIDs)
+            negIDs = negIDs[0:len(posIDs) * timesNeg]
+            negIDs = useIDs[negIDs]
+            posIDs = useIDs[posIDs]
+            useIDs = np.concatenate([negIDs,posIDs])
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+    return (testData,testLabel,testHits,testNames)
+
+
+def getSubSampleAllPositiveUsers(useDir,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 numNegUser=10000,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+
+    curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                 useDir=useDir,collectionName=collectionName,
+                 metaCollectionName=metaCollectionName,
+                 numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(curDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
+            windowSize=windowSize)
+
+    useIDs = np.where(np.array(testLabel) == 1.0)[0]
+    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
+
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+    return (testData,testLabel,testHits,testNames)
+
+def sequenceGenerator(useDir,numUser,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 timesNeg=-1,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+    while 1:
+        (testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
+                windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
+                maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
+                urlSIPDIct=urlSIPDIct,characterDict=characterDict,
+                maxLengthInSeconds=maxLengthInSeconds,
+                timesNeg=timesNeg,
+                mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                collectionName=collectionName,metaCollectionName=metaCollectionName)
+        testLabel = np_utils.to_categorical(testLabel, 2)
+        yield (testData, testLabel)
+
+
+def sequenceGeneratorTest(data,label):
+    while 1:
+        yield (data, label)
+
+# three different modes:
+#   mode == 'correct'   -> do not permute the ordering
+#   mode == 'permutate' -> permute the ordering
+#   mode == 'sort'      -> sort the flows by sent bytes
+# NOTE: placeholder only, the modes are not implemented yet
+def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
+    return True
+
+def getMalwareClassDict(path):
+    outDict = dict()
+    for line in open(path):
+        lineSplit = line.strip().split('\t')
+        if len(lineSplit) == 3:
+            outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
+    return outDict
+
+def applyLower(inStr):
+    try:
+        return inStr.lower()
+    except:
+        return inStr
+
+def logTransformData(inputMatrix):
+    # element-wise log(1 + x); returns the input unchanged if it is not numeric
+    try:
+        return np.log1p(np.array(inputMatrix,dtype='float64'))
+    except:
+        return inputMatrix
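`logTransformData` compresses the heavy-tailed flow features (durations, byte counts) with log(1 + x). A quick sanity check of the transform, with made-up byte counts:

```python
import numpy as np

# hypothetical flow rows: [duration, bytes_down, bytes_up]
flows = np.array([[0.0, 1500.0, 200.0],
                  [12.0, 1048576.0, 4096.0]])
print(np.log1p(flows))
# [[ 0.         7.3138867  5.3033049]
#  [ 2.5649494 13.8629446  8.3180103]]
```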
+
+def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),\
+            hostDict=dict(),sipDict=dict(),vtDF = dict(),
+            flagReturnDomains=False):
+    if len(dataFrame) == 0:
+        return ([],-1)
+    if 'flowFeatures' in parameter:
+        flowFeatures = parameter['flowFeatures']
+    else:
+        flowFeatures = ['duration','bytes_down','bytes_up']
+    # extract flow features
+    data = dataFrame[flowFeatures].values
+    # get time-gap feature (difference to the previous flow, clipped at zero)
+    timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
+    timeStampsPre = np.zeros([len(timeStamps),])
+    timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
+    diffTimeStamps = timeStamps - timeStampsPre
+    diffTimeStamps[0] = 0.0
+    negIDs = np.where(diffTimeStamps < 0.0)[0]
+    diffTimeStamps[negIDs] = 0.0
+    diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
+    data = np.hstack([data,diffTimeStamps])
+    # log transform
+    data = logTransformData(data)
+    if 'urlFeature' in dataFrame:
+        urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
+        for i in range(len(dataFrame)):
+            urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
+        data = np.hstack([data,urlFeatures])
+    # cisco features, indexed per row (host block first, then server-ip block)
+    numCiscoFeature = 30
+    ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
+    if len(hostDict) > 0:
+        for i in range(len(dataFrame)):
+            row = dataFrame.iloc[i]
+            curHost = extractHost(row['domain'])
+            if curHost in hostDict:
+                ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
+    if len(sipDict) > 0:
+        for i in range(len(dataFrame)):
+            row = dataFrame.iloc[i]
+            curSIP = row['server_ip']
+            if curSIP in sipDict:
+                ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
+    data = np.hstack([data,ciscoFeatures])
+    if len(vtDF) != 0:
+        vtHashSet = set(vtDF['hash'])
+        hitNums = []
+        hashes = dataFrame['anyConnect_hash']
+        for curHash in hashes:
+            try:
+                if curHash.lower() in vtHashSet:
+                    curID = np.where(vtDF['hash'] == curHash.lower())[0]
+                    if len(curID) >= 1:
+                        curID = curID[0]
+                        hitNums.append(float(vtDF.iloc[curID]['hits']))
+                    else:
+                        hitNums.append(-1.0)
+                else:
+                    hitNums.append(-1.0)
+            except:
+                hitNums.append(-1.0)
+        maxHits = np.max(hitNums)
+    else:
+        if 'hits' in dataFrame:
+            maxHits = np.max(dataFrame['hits'])
+        else:
+            maxHits = -1
+    label = np.max(dataFrame['label'])
+    if flagReturnDomains:
+        return (data,label,maxHits,dataFrame['domain'])
+    else:
+        return (data,label,maxHits)
+
+
+def getDomainChunksByUser(data,useUserName,blockSize):
+    outData = []
+    outLabel = []
+    useDataAll = data[data['user_hash'] == useUserName]
+    userIDs = np.arange(len(useDataAll))
+    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
+    for blockID in range(numBlocks):
+        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
+        useData = useDataAll.iloc[curIDs]
+        curDomains = useData['domain']
+        curLabel = np.max(useData['label'])
+        outData.append(curDomains)
+        outLabel.append(curLabel)
+    return (outData,outLabel)
+
+
+def getChunksByUser(data,useUserName,blockSize,parameter=dict(),\
+        hostDict=dict(),sipDict=dict(),vtDF=dict(),flagOnlyOneUser = False,
+        flagReturnDomains=False):
+    outData = []
+    outLabel = []
+    outHits = []
+    outDomains = []
+    if flagOnlyOneUser:
+        useDataAll = data
+    else:
+        useDataAll = data[data['user_hash'] == useUserName]
+    userIDs = np.arange(len(useDataAll))
+    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
+    for blockID in range(numBlocks):
+        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
+        useData = useDataAll.iloc[curIDs]
+        if flagReturnDomains:
+            (curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,\
+                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
+        else:
+            (curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,\
+                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
+        outData.append(curTrainData)
+        outLabel.append(curLabel)
+        outHits.append(curMaxHits)
+        if flagReturnDomains:
+            outDomains.append(curDomains)
+    if flagReturnDomains:
+        return (outData,outLabel,outHits,outDomains)
+    else:
+        return (outData,outLabel,outHits)
+
+
+def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
+    nb_classes = 2
+    model = Sequential()
+    model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
+    model.add(Dense(denseSize, activation='relu'))
+    model.add(Dropout(0.5))
+    model.add(Dense(nb_classes, activation='softmax'))
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam', metrics=['accuracy'])
+    # number of params:
+    # params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
+    return model
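The parameter formula quoted in `getLSTMModel` can be checked against `model.summary()`; an LSTM has four gates, each with input weights, recurrent weights, and a bias. A quick check with the default sizes above:

```python
# params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
input_dim, lstmUnits = 103, 10
params = 4 * ((input_dim + 1) * lstmUnits + lstmUnits ** 2)
print(params)  # 4560, the count reported for the LSTM layer by model.summary()
```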
+
+
+def getCiscoURLFeatureForRow(row):
+    sortKeys = list(row.keys())
+    sortKeys.sort()
+    featureVec = np.zeros([len(sortKeys)-1,])
+    counter = 0
+    for keyName in sortKeys:
+        if 'key' in keyName:
+            continue
+        try:
+            featureVec[counter] = float(row[keyName])
+        except:
+            featureVec[counter] = 0.0
+        counter += 1
+    featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
+    return featureVec
+
+
+def getCiscoFeatureDictForHost(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        curHost = extractHost(curHost)
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' host features collected')
+    return hostDict
+
+def getCiscoFeatureDictForSIP(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' sip features collected')
+    return hostDict
+
+def getCiscoFeatureDictForSLD(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' sld features collected')
+    return hostDict
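The three loaders above differ only in the log message and in whether the key is normalized with `extractHost`. A possible consolidation (a sketch, not part of the patch; `keyFun` and `logName` are names introduced here):

```python
def getCiscoFeatureDictGeneric(headerPath, dataPath, keyFun=lambda k: k, logName='features'):
    # shared implementation of the host/SIP/SLD variants above
    header = ['key'] + [line.strip() for line in open(headerPath)]
    outDict = dict()
    with open(dataPath, 'r') as fobj:
        for counter, row in enumerate(csv.DictReader(fobj, fieldnames=header, delimiter='\t'), 1):
            outDict[keyFun(row['key'])] = getCiscoURLFeatureForRow(row)
            if counter % 10000 == 0:
                print(str(counter) + ' ' + logName + ' collected')
    return outDict

# e.g.: hostDict = getCiscoFeatureDictGeneric(hdrPath, dataPath, keyFun=extractHost, logName='host features')
```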
+
+
+def extractHost(domain):
+    # reduce a domain to its second-level part, e.g. 'a.b.example.com' -> 'example.com'
+    curHostSplit = domain.split('.')
+    try:
+        curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
+        return curHost
+    except:
+        return domain
+
+
+def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
+    for dirID in range(len(trainDirs)):
+        curDir = trainDirs[dirID]
+        curFiles = os.listdir(curDir)
+        dayJoblibCounter = 0
+        for curFile in curFiles:
+            curFile = curDir + curFile
+            if curFile.endswith('.joblib'):
+                curData = joblib.load(curFile)
+                if dayJoblibCounter == 0:
+                    dayData = curData
+                else:
+                    dayData = dayData.append(curData,ignore_index=True)
+                dayJoblibCounter += 1
+                print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) +')')
+        # keep only users with at least minFlowsPerUser flows
+        if minFlowsPerUser != -1:
+            grouped = dayData.groupby('user_hash')
+            useUsers = set()
+            for grouping in grouped:
+                numFlowsCurUser = len(grouping[1])
+                userLabel = np.max(grouping[1]['label'])
+                if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
+                    useUsers.add(grouping[0])
+            # get ids
+            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
+            dayData = dayData.iloc[userIDs]
+            dayData = dayData.reset_index(drop=True)
+        if numTimesPos != -1:
+            # cap the negative users at numTimesPos times the number of positive users
+            grouped = dayData.groupby('user_hash')
+            curUserLabel = []
+            curUserNames = []
+            for grouping in grouped:
+                numFlowsCurUser = len(grouping[1])
+                userLabel = np.max(grouping[1]['label'])
+                curUserLabel.append(userLabel)
+                curUserNames.append(grouping[1]['user_hash'].values[0])
+            posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
+            negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
+            maxNegLabel = len(posIDs) * numTimesPos
+            if len(negIDs) > maxNegLabel:
+                np.random.seed(1)
+                np.random.shuffle(negIDs)
+                negIDs = negIDs[0:maxNegLabel]
+            useIDs = np.concatenate([posIDs,negIDs])
+            useUsers = np.array(curUserNames)[useIDs]
+            useUsers = set(useUsers)
+            # get ids
+            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
+            dayData = dayData.iloc[userIDs]
+            dayData = dayData.reset_index(drop=True)
+        if dirID == 0:
+            allData = dayData
+        else:
+            allData = allData.append(dayData,ignore_index=True)
+    return allData
+
+def tokenizeDomain(domain,n=3):
+    domain = domain.replace('https://','')
+    domain = domain.replace('www.','')
+    domain = domain.replace('/','')
+    # reverse the domain so the n-grams are anchored at the TLD
+    domain = domain[::-1]
+    # overlapping n-grams (step size 1)
+    outList = [domain[i:i+n] for i in range(0, len(domain), 1)]
+    return outList
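`tokenizeDomain` reverses the domain before slicing, so the n-grams are anchored at the TLD end; the step size of 1 makes them overlap, and the last slices come out shorter than n:

```python
print(tokenizeDomain('www.example.com'))
# ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma', 'max', 'axe', 'xe', 'e']
```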
+
+
+def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
+    uniqueTrainUser = np.unique(allData['user_hash'])
+    userLabel = []
+    for curTrainUser in uniqueTrainUser:
+        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
+        curLabel = np.max(allData.iloc[userIDs]['label'])
+        userLabel.append(curLabel)
+    userLabel = np.array(userLabel)
+    posUser = np.where(userLabel == 1.0)[0]
+    negUser = np.where(userLabel == 0.0)[0]
+
+    if numNeg != -1:
+        if len(negUser) > numNeg:
+            np.random.shuffle(negUser)
+            negUser = negUser[0:numNeg]
+
+    useUser = np.concatenate([posUser,negUser])
+    counter = 0
+    trainDomains = []
+    trainBlockLabel = []
+    trainNames = []
+    for uID in range(len(useUser)):
+        curTrainUser = uniqueTrainUser[useUser[uID]]
+        (curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
+        for i in range(len(curUserLabel)):
+            trainNames.append(curTrainUser)
+        trainDomains += curUserData
+        trainBlockLabel += curUserLabel
+        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
+        counter += 1
+    return (trainDomains,trainBlockLabel,trainNames)
+
+def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),\
+        hostDict=dict(),sipDict = dict(),vtLabelPath=''):
+    if vtLabelPath != '':
+        vtDF = pd.read_csv(vtLabelPath,sep='\t')
+    else:
+        vtDF = dict()
+    uniqueTrainUser = np.unique(allData['user_hash'])
+    userLabel = []
+    for curTrainUser in uniqueTrainUser:
+        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
+        curLabel = np.max(allData.iloc[userIDs]['label'])
+        userLabel.append(curLabel)
+    userLabel = np.array(userLabel)
+    posUser = np.where(userLabel == 1.0)[0]
+    negUser = np.where(userLabel == 0.0)[0]
+
+    if numNeg != -1:
+        if len(negUser) > numNeg:
+            np.random.shuffle(negUser)
+            negUser = negUser[0:numNeg]
+
+    useUser = np.concatenate([posUser,negUser])
+    counter = 0
+    trainBlocks = []
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    for uID in range(len(useUser)):
+        curTrainUser = uniqueTrainUser[useUser[uID]]
+        (curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,\
+            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF = vtDF)
+        for i in range(len(curUserLabel)):
+            trainNames.append(curTrainUser)
+        trainBlocks += curUserData
+        trainBlockLabel += curUserLabel
+        trainBlockHits += curHits
+        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
+        counter += 1
+    paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
+    #paddedData = paddedData[:,:,featureTypeDict[useFeatureType]]
+    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
+
+
+def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
+                hostDict=dict(),sipDict=dict(),
+                vtLabelPath='',maxFlowsPerUser = 50000):
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    parameterDict = dict()
+    numBlocksToInitialize = 10000
+    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
+    overallCounter = 0
+    startTime = time.time()
+    for uID in range(len(joblibPaths)):
+        curSavePath = joblibPaths[uID]
+        curData = joblib.load(curSavePath)['dataFrame']
+        if len(curData) < minFlowsPerUser:
+            continue
+        (curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,\
+            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser = True)
+        curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
+        if (curPaddedData.shape[0] > maxFlowsPerUser):
+            curPaddedData = curPaddedData[0:maxFlowsPerUser]
+            curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
+            curHits = list(np.array(curHits)[0:maxFlowsPerUser])
+        for i in range(len(curPaddedData)):
+            trainNames.append(curSavePath)
+        trainBlockLabel += curUserLabel
+        trainBlockHits += curHits
+        #curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
+        # grow the preallocated array on demand
+        numCurInstances = curPaddedData.shape[0]
+        while overallCounter+numCurInstances > paddedData.shape[0]:
+            paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
+        paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
+        overallCounter += numCurInstances
+        if uID % 1000 == 0:
+            elapsedTime = time.time() - startTime
+            startTime = time.time()
+            print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
+    paddedData = paddedData[0:overallCounter]
+    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
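Both functions above rely on `pad_sequences` to left-pad every block to exactly `blockSize` flows, so short blocks get all-zero rows at the front. A minimal illustration with two made-up blocks:

```python
import numpy as np
from keras.preprocessing.sequence import pad_sequences

blocks = [np.ones([3, 4]), np.ones([10, 4])]        # 3 and 10 flows, 4 features each
padded = pad_sequences(blocks, maxlen=10, dtype='float32')
print(padded.shape)          # (2, 10, 4)
print(padded[0, :7].sum())   # 0.0 -- the short block is zero-padded at the front
```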
+
+def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
+                blockSize = 10,hostDict=dict(),sipDict=dict(),
+                seed =1,flagSkipNoLabelUser=False,
+                vtLabelPath='',maxFlowsPerUser = 50000,
+                flagReturnDomains=False):
+    if vtLabelPath != '':
+        vtDF = pd.read_csv(vtLabelPath,sep='\t')
+    else:
+        vtDF = dict()
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    trainBlockDomains = []
+    parameterDict = dict()
+    numBlocksToInitialize = 10000
+    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
+    overallCounter = 0
+    for curDirID in range(len(trainDirs)):
+        curDir = trainDirs[curDirID]
+        curLabelFile = curDir + 'data_label.joblib'
+        labelData = joblib.load(curLabelFile)
+        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
+        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
+        random.seed(seed)
+        random.shuffle(negIDs)
+        useIDs = np.concatenate([posIDs,negIDs])
+        counter = 0
+        negCounter = 0
+        startTime = time.time()
+        for uID in range(len(useIDs)):
+            curID = useIDs[uID]
+            curUserName = labelData['usernames'][curID]
+            curSavePath = curDir + str(curUserName) + '.joblib'
+            curData = joblib.load(curSavePath)['dataFrame']
+            if flagSkipNoLabelUser:
+                curUserLabel = np.max(curData['label'])
+                if curUserLabel == -1.0:
+                    continue
+            if len(curData) < minFlowsPerUser:
+                continue
+            if numNegPerDay != -1:
+                if negCounter > numNegPerDay:
+                    break
+            if flagReturnDomains:
+                (curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,\
+                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
+                    flagReturnDomains=flagReturnDomains)
+            else:
+                (curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,\
+                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
+                    flagReturnDomains=flagReturnDomains)
+                curDomains = []
+            curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
+            if (curPaddedData.shape[0] > maxFlowsPerUser):
+                curPaddedData = curPaddedData[0:maxFlowsPerUser]
+                curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
+                curHits = list(np.array(curHits)[0:maxFlowsPerUser])
+                if flagReturnDomains:
+                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
+            for i in range(len(curPaddedData)):
+                trainNames.append(curUserName)
+            trainBlockLabel += curUserLabel
+            trainBlockHits += curHits
+            trainBlockDomains += curDomains
+            # grow the preallocated array on demand
+            numCurInstances = curPaddedData.shape[0]
+            while overallCounter+numCurInstances > paddedData.shape[0]:
+                paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
+            paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
+            overallCounter += numCurInstances
+            if (counter+1) % 1000 == 0:
+                elapsedTime = time.time() - startTime
+                print('processed ' + str(counter+1) + ' users of ' +\
+                      str(len(useIDs)) + ' with ' + str(len(curData['label'])) +\
+                      ' flows [dir ' + str(curDirID+1) + ' of ' +\
+                      str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
+                startTime = time.time()
+            if np.max(np.array(curUserLabel)) == 0.0:
+                negCounter += 1
+            counter += 1
+    paddedData = paddedData[0:overallCounter]
+    if flagReturnDomains:
+        return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
+    else:
+        return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
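For training without materializing a full epoch in memory, the `sequenceGenerator` defined earlier can be wired into Keras directly; a sketch, assuming `model` is one of the window models from this repository and `mongoParams` holds your MongoDB connection settings (both names are placeholders):

```python
gen = sequenceGenerator(useDir='2016-07/10/', numUser=64,
                        characterDict=characterDict, **mongoParams)
model.fit_generator(gen, steps_per_epoch=50, epochs=10)
```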
+
+def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000,
+                seed = 1):
+    dataFrameList = []
+    for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
+        curDir = trainDirs[curDirID]
+        curLabelFile = curDir + 'data_label.joblib'
+        labelData = joblib.load(curLabelFile)
+        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
+        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
+        random.seed(seed)
+        random.shuffle(negIDs)
+        if len(negIDs) >= numNegPerDay:
+            negIDs = negIDs[0:numNegPerDay]
+        useIDs = np.concatenate([posIDs,negIDs])
+        for uID in range(len(useIDs)):
+            curID = useIDs[uID]
+            curUserName = labelData['usernames'][curID]
+            curSavePath = curDir + str(curUserName) + '.joblib'
+            curData = joblib.load(curSavePath)['dataFrame']
+            dataFrameList.append(curData)
+    return dataFrameList
+
+
+def checkDomainForSecondLevelDomain(inDomain,sldDomainDict):
+    if not isinstance(inDomain, str):
+        return False
+    splitDomain = inDomain.split('.')
+    if len(splitDomain) <= 2:
+        return False
+    sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
+    return sldDomain in sldDomainDict
+
+def save_model(model,jsonPath,h5Path):
+    # saving model architecture
+    json_model = model.to_json()
+    open(jsonPath, 'w').write(json_model)
+    # saving weights
+    model.save_weights(h5Path, overwrite=True)
+
+def load_model(jsonPath,h5Path):
+    # loading model architecture and weights
+    model = model_from_json(open(jsonPath).read())
+    model.load_weights(h5Path)
+    return model
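A round-trip sketch for the two helpers above (paths are made up); the reloaded model can score immediately, but would need to be compiled again before further training:

```python
save_model(model, 'results/lstm.json', 'results/lstm.h5')
restored = load_model('results/lstm.json', 'results/lstm.h5')
scores = restored.predict(testData)[:, 1]   # column 1 = P(malicious)
```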
+
+
+def getResultsFromSavedJoblibFile(joblibFiles,threshold=3):
+    testUserScores = []
+    testUserLabel = []
+    testLabel = []
+    testScores = []
+    testNames = []
+    for joblibPath in joblibFiles:
+        print('process: ' + joblibPath)
+        tmpJoblib = joblib.load(joblibPath)
+        if 'testBlockScores' in tmpJoblib.keys():
+            curTestBlockScores = tmpJoblib['testBlockScores']
+            for i in range(len(curTestBlockScores)):
+                if i == 0:
+                    curTestScores = curTestBlockScores[i]
+                else:
+                    curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]])
+            curTestHits = np.array(tmpJoblib['blockHits'])
+            curTestScores = np.array(curTestScores)
+            curTestLabel = np.ones([len(curTestScores),]) * -1.0
+            curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
+            curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
+            curTestNames = tmpJoblib['testNames']
+        else:
+            curTestHits = tmpJoblib['testHits']
+            curTestScores = tmpJoblib['testScores']
+            curTestLabel = tmpJoblib['testLabel']
+            curTestNames = tmpJoblib['testNames']
+
+        # keep only clear negatives (0 hits) and clear positives (>= threshold hits)
+        useIDs = np.where(curTestHits >= threshold)[0]
+        useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]])
+        curTestScoresT = curTestScores[useIDs]
+        curTestLabelT = curTestLabel[useIDs]
+        if len(testScores) == 0:
+            testScores = curTestScoresT
+            testLabel = curTestLabelT
+        else:
+            testScores = np.concatenate([testScores,curTestScoresT])
+            testLabel = np.concatenate([testLabel,curTestLabelT])
+
+        if 'testBlockScores' in tmpJoblib.keys():
+            tmpScores = np.array(tmpJoblib['testScores'])
+            tmpHits = np.array(tmpJoblib['testHits'])
+            tmpLabel = np.ones([len(tmpHits),])*-1
+            tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
+            tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
+            useIDs = np.where(tmpLabel == 1.0)[0]
+            useIDs = np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]])
+            testUserLabel += list(np.array(tmpLabel)[useIDs])
+            testUserScores += list(np.array(tmpScores)[useIDs])
+        else:
+            # aggregate block scores to user level via the maximum
+            uniqueTestNames = list(np.unique(curTestNames))
+            for testName in uniqueTestNames:
+                curIDs = np.where(curTestNames == testName)[0]
+                curMaxHits = np.max(curTestHits[curIDs])
+                if curMaxHits > 0 and curMaxHits < threshold:
+                    continue
+                if curMaxHits >= threshold:
+                    testUserLabel.append(1.0)
+                else:
+                    testUserLabel.append(0.0)
+                curScore = np.max(curTestScores[curIDs])
+                testUserScores.append(curScore)
+                testNames.append(testName)
+    testUserScores = np.array(testUserScores)
+    testUserLabel = np.array(testUserLabel)
+    testNames = np.array(testNames)
+    return (testUserScores,testUserLabel,testLabel,testScores,testNames)
+
+def checkIfIP(host):
+    ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
+    return re.search(ipMask, host) is not None
+
+# GLOBAL VALUES
+numCiscoFeatures = 30
+featureTypeDict = {'neural':np.arange(4,104,1),\
+                   'packet':np.array([0,1,2,3]),\
+                   'neural+packet':np.arange(0,104,1),\
+                   'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\
+                   'cisco':np.arange(104,104+(2*numCiscoFeatures),1)}
+
+globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
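`featureTypeDict` maps feature-group names to column indices of the padded tensors, which is what the commented-out `paddedData[:,:,featureTypeDict[useFeatureType]]` lines above would select. A sketch, assuming `paddedData` comes from `loadDataSetFromJoblibPerUser`:

```python
neuralOnly = paddedData[:, :, featureTypeDict['neural']]   # 100 domain-CNN columns
packetOnly = paddedData[:, :, featureTypeDict['packet']]   # duration/bytes/time-gap
print(neuralOnly.shape, packetOnly.shape)                  # (n, blockSize, 100) (n, blockSize, 4)
```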
diff --git a/cnnOnCnnParameterSelection.py b/cnnOnCnnParameterSelection.py
new file mode 100644
index 0000000..21a5f9d
--- /dev/null
+++ b/cnnOnCnnParameterSelection.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+from tqdm import tqdm
+
+import tensorflow as tf
+config = tf.ConfigProto(log_device_placement=True)
+config.gpu_options.per_process_gpu_memory_fraction = 0.5
+config.gpu_options.allow_growth = True
+session = tf.Session(config=config)
+
+from pymongo import MongoClient
+import joblib
+import pickle
+import numpy as np
+
+import ciscoProcessing as ciscoProcessing
+import stackedNeuralModels as stackedNeuralModels
+
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import auc, roc_curve
+import matplotlib.pyplot as plt
+
+import keras
+from keras.models import Sequential
+from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
+from keras.layers import Convolution1D
+from keras.layers import Input
+from keras.models import Model
+from keras.utils import np_utils
+
+
+if __name__ == "__main__":
+    # parameter
+    innerCNNFilters = 512
+    innerCNNKernelSize = 2
+    cnnDropout = 0.5
+    cnnHiddenDims = 1024
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    threshold = 3
+    minFlowsPerUser = 10
+    numEpochs = 100
+    maxLengthInSeconds = -1
+    timesNeg = -1
+
+    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
+    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
+
+    if 'characterDict' not in locals():
+        characterDictPath = 'trainData/characterIDDict.joblib'
+        characterDict = joblib.load(characterDictPath)['characterIDDict']
+
+    # load train and test data from joblib
+    # created with createTrainDataMultipleTaskLearning.py
+    if 'trainDFs' not in locals():
+        tmpLoad = joblib.load(trainDataPath)
+        trainDFs = tmpLoad['data']
+
+    if 'testDFs' not in locals():
+        tmpLoad = joblib.load(testDataPath)
+        testDFs = tmpLoad['data']
+
+    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
+
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+        if i == 100:
+            # debug cap: only use the first 100 users
+            break
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=False,urlSIPDIct=dict(),
+            windowSize=windowSize)
+
+    # keep only clear positives and clear negatives
+    useIDs = np.where(testLabel == 1.0)[0]
+    useIDs = np.concatenate([useIDs,np.where(testLabel == 0.0)[0]])
+
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+
+    # build the model: one shared domain CNN per window slot plus flow features
+    inputList = []
+    encodedList = []
+    numFeatures = flowFeatures
+    for i in range(windowSize):
+        inputList.append(Input(shape=(maxLen,)))
+        encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
+        inputList.append(Input(shape=(numFeatures,)))
+
+    merge_layer_input = []
+    for i in range(windowSize):
+        merge_layer_input.append(encodedList[i])
+        merge_layer_input.append(inputList[(2*i)+1])
+
+    # concatenate the domain and flow vectors of all window slots:
+    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+    reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
+    # add the second cnn
+    cnn = Conv1D(filters,
+                 kernel_size,
+                 activation='relu',
+                 input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
+    # we use max pooling:
+    maxPool = GlobalMaxPooling1D()(cnn)
+    cnnDropoutLayer = Dropout(cnnDropout)(maxPool)
+    cnnDense = Dense(cnnHiddenDims,activation='relu')(cnnDropoutLayer)
+    cnnOutput = Dense(2,activation='softmax')(cnnDense)
+
+    # define a trainable model linking the window inputs to the predictions
+    model = Model(inputs=inputList, outputs=cnnOutput)
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+
+    epochNumber = 0
+    trainLabel = np_utils.to_categorical(testLabel, 2)
+    model.fit(x=testData, y=trainLabel,
+              epochs=epochNumber + 1,shuffle=True,initial_epoch=epochNumber)#,
+              #validation_data=(testData,testLabel))
\ No newline at end of file
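The model above takes 2*windowSize separate inputs, alternating one character tensor and one numeric tensor per window slot; `createTrainData` returns `testData` as a list in exactly that order, which is why it can be passed to `fit` directly. For the parameters used here:

```python
# testData layout for windowSize=10, maxLen=40, flowFeatures=3:
#   testData[0]  -> characters of domain 1,   shape (numSamples, 40)
#   testData[1]  -> flow features of flow 1,  shape (numSamples, 3)
#   ...
#   testData[18] -> characters of domain 10
#   testData[19] -> flow features of flow 10
for i, x in enumerate(testData):
    print(i, x.shape)
```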
diff --git a/stackedNeuralModels.py b/stackedNeuralModels.py
new file mode 100644
index 0000000..23045a8
--- /dev/null
+++ b/stackedNeuralModels.py
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+from keras.models import Sequential
+from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
+from keras.layers import Convolution1D
+import ciscoProcessing as ciscoProcessing
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import joblib
+import csv
+
+import keras
+from keras.layers import Input
+from keras.models import Model
+from keras.utils import np_utils
+
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import auc, roc_curve
+from tqdm import tqdm
+import os
+
+
+def getCiscoFeatures(curDataLine,urlSIPDict):
+    numCiscoFeatures = 30
+    try:
+        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
+        # log transform
+        ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
+        return ciscoFeatures.ravel()
+    except:
+        # unknown domain/server-ip pair -> all-zero feature vector
+        return np.zeros([numCiscoFeatures,]).ravel()
+
+
+def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
+                           hidden_dims,drop_out):
+    model = Sequential()
+    model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
+                        input_length=input_length))
+    model.add(Conv1D(filters,
+                     kernel_size,
+                     activation='relu'))
+    # we use max pooling:
+    model.add(GlobalMaxPooling1D())
+    # we add a vanilla hidden layer:
+    model.add(Dense(hidden_dims))
+    model.add(Dropout(drop_out))
+    model.add(Activation('relu'))
+    return model
+
+def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size,
+                                    hidden_dims,drop_out):
+    a = Input(shape=(input_length,))
+    embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a)
+    conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding)
+    glob = GlobalMaxPooling1D()(conv1)
+    dense = Dense(hidden_dims)(glob)
+    drop = Dropout(drop_out)(dense)
+    model = Activation('relu')(drop)
+    model = Model(a, model)
+    return model
+
+def getFlowFeatureLayer(numFeatures):
+    # identity layer that just passes the numeric flow features through
+    model = Sequential()
+    model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
+    return model
+
+
+def createCNNDataSet(domains,label,characterDict,maxLen=40):
+    # encode domains in reverse order, i.e. position 0 holds the last character
+    outFeature = np.zeros([len(domains),maxLen])
+    outLabel = np.zeros([len(domains),])
+    for i in range(len(domains)):
+        domain = domains[i]
+        curLabel = label[i]
+        curFeature = np.zeros([maxLen,])
+        for j in range(np.min([len(domain),maxLen])):
+            curCharacter = domain[-(j + 1)]
+            if curCharacter in characterDict:
+                curFeature[j] = characterDict[curCharacter]
+        outFeature[i] = curFeature
+        outLabel[i] = curLabel
+    return (outFeature,outLabel)
+
+def getFeatureVecForDomain(domain,characterDict,maxLen=40):
+    # encode a single domain in reverse order (see createCNNDataSet)
+    curFeature = np.zeros([maxLen,])
+    for j in range(np.min([len(domain),maxLen])):
+        curCharacter = domain[-(j + 1)]
+        if curCharacter in characterDict:
+            curFeature[j] = characterDict[curCharacter]
+    return curFeature
+
+def getFlowFeatures(curDataLine):
+    useKeys = ['duration','bytes_down','bytes_up']
+    curFeature = np.zeros([len(useKeys),])
+    for i in range(len(useKeys)):
+        curKey = useKeys[i]
+        try:
+            curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32')
+        except:
+            pass
+    return curFeature
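`getFeatureVecForDomain` encodes a domain right-to-left into character IDs (after the index fix above, position 0 holds the final character). With a toy character dictionary:

```python
toyDict = {'a': 1, 'b': 2, 'c': 3, '.': 4}   # hypothetical characterDict
print(getFeatureVecForDomain('ab.ca', toyDict, maxLen=8))
# [1. 3. 4. 2. 1. 0. 0. 0.]  -- 'a','c','.','b','a' read from the end, zero-padded
```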
+
+
+def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
+                               maxLengthInSeconds=300):
+    maxMilliSeconds = maxLengthInSeconds * 1000
+    outDomainLists = []
+    outDFFrames = []
+    if overlapping == False:
+        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)]
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                # drop flows arriving more than maxLengthInSeconds after the first flow of the window
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    else:
+        numBlocks = len(dataFrame) + 1 - windowSize
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[blockID:blockID+windowSize]
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    return (outDomainLists,outDFFrames)
+
+
+def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
+                    flagUseCiscoFeatures=False,urlSIPDIct=dict(),
+                    windowSize=10):
+    if 'hits' in dfLists[0].keys():
+        hitName = 'hits'
+    elif 'virusTotalHits' in dfLists[0].keys():
+        hitName = 'virusTotalHits'
+    numFlowFeatures = 3
+    numCiscoFeatures = 30
+    numFeatures = numFlowFeatures
+    if flagUseCiscoFeatures:
+        numFeatures += numCiscoFeatures
+    outputFeatures = []
+    label = []
+    hits = []
+    trainNames = []
+    # one character tensor and one numeric tensor per window slot
+    for i in range(windowSize):
+        outputFeatures.append(np.zeros([len(domainLists),maxLen]))
+        outputFeatures.append(np.zeros([len(domainLists),numFeatures]))
+
+    for i in tqdm(np.arange(len(domainLists)), miniters=10):
+        curCounter = 0
+        for j in range(np.min([windowSize,len(domainLists[i])])):
+            outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen)
+            curCounter += 1
+            if flagUseCiscoFeatures:
+                outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct)
+            else:
+                outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j])
+            curCounter += 1
+        # block label: 1.0 for >= threshold hits, 0.0 for zero hits,
+        # -1.0 for unlabeled, -2.0 for hits between 1 and threshold-1
+        curLabel = 0.0
+        if np.max(dfLists[i][hitName]) >= threshold:
+            curLabel = 1.0
+        elif np.max(dfLists[i][hitName]) == -1:
+            curLabel = -1.0
+        elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
+            curLabel = -2.0
+        label.append(curLabel)
+        hits.append(np.max(dfLists[i][hitName]))
+        trainNames.append(np.unique(dfLists[i]['user_hash']))
+    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
+
+
+def transformStringListToNumpyArray(listString):
+    listString = listString.replace('[','').replace(']','')
+    return np.array(listString.split(','),dtype='float32')
+
+def getCiscoFeatureDict(csvPathList):
+    outDict = dict()
+    for path in tqdm(csvPathList, miniters=1):
+        fobj = open(path,'r')
+        csvReader = csv.DictReader(fobj,delimiter=',')
+        for row in csvReader:
+            urlSIPString = row['Domain'] + row['ServerIP']
+            ciscoFeatures = row['CiscoFeature']
+            outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
+    return outDict
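`getCiscoFeatureDict` keys each row by the concatenation of domain and server IP, with the feature vector serialized as a bracketed string. A sketch of the expected row format (values made up):

```python
# CSV columns: Domain,ServerIP,CiscoFeature
row = {'Domain': 'example.com', 'ServerIP': '10.0.0.1', 'CiscoFeature': '[0.1,0.0,2.5]'}
key = row['Domain'] + row['ServerIP']                      # 'example.com10.0.0.1'
vec = transformStringListToNumpyArray(row['CiscoFeature'])
print(key, vec)                                            # example.com10.0.0.1 [0.1 0.  2.5]
```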
+
+
+if __name__ == "__main__":
+
+    # get data
+    trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
+                          'trainData/joblib2016-07-annomalous-stg-new/09/',
+                          'trainData/joblib2016-07-annomalous-stg-new/08/',
+                          'trainData/joblib2016-07-annomalous-stg-new/07/',
+                          'trainData/joblib2016-07-annomalous-stg-new/06/']
+
+    testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/08/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/09/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/10/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/11/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/12/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/13/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/14/']
+
+    trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
+
+    testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
+
+    # parameter
+    numNegPerDay = 5000
+    numEpochs = 10
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+
+    lstmUnits = 32
+    lstmDenseSize = 128
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    flagUseCiscoFeatures = True
+    threshold = 3
+    resultStoreDir = 'results/201705/'
+    if flagUseCiscoFeatures:
+        resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
+        resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
+    else:
+        resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
+        resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
+    flagRedo = True
+
+    if flagUseCiscoFeatures:
+        if 'trainCiscoFeatureDict' not in locals():
+            trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
+        if 'testCiscoFeatureDict' not in locals():
+            testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
+    else:
+        trainCiscoFeatureDict = dict()
+        testCiscoFeatureDict = dict()
+
+    if flagRedo or not os.path.exists(resultStorePath):
+        if 'characterDict' not in locals():
+            characterDictPath = 'trainData/characterIDDict.joblib'
+            characterDict = joblib.load(characterDictPath)['characterIDDict']
+
+        print('create train data')
+        if 'dataFrameList' not in locals():
+            (dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
+                trainDirsUserLevel,numNegPerDay = numNegPerDay)
+            maxHits = []
+            for i in range(len(dataFrameList)):
+                maxHits.append(np.max(dataFrameList[i]['hits']))
+
+        print('create test data')
+        # validation error
+        if 'testDataFrameList' not in locals():
+            (testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
+                [testDirsUserLevel[0]],numNegPerDay = numNegPerDay)
+            maxHits = []
+            for i in range(len(testDataFrameList)):
+                maxHits.append(np.max(testDataFrameList[i]['hits']))
+
+        sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
+
+        inputList = []
+        encodedList = []
+        numFeatures = flowFeatures
+        if flagUseCiscoFeatures:
+            numFeatures += numCiscoFeatures
+        for i in range(windowSize):
+            inputList.append(Input(shape=(maxLen,)))
+            encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
+            inputList.append(Input(shape=(numFeatures,)))
+
+        merge_layer_input = []
+        for i in range(windowSize):
+            merge_layer_input.append(encodedList[i])
+            merge_layer_input.append(inputList[(2*i)+1])
+
+        # concatenate the domain and flow vectors of all window slots:
+        merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+        reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
+        lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
+        dense = Dense(lstmDenseSize, activation='relu')(lstm)
+        dropout = Dropout(0.5)(dense)
+        # and add a logistic regression on top
+        predictions = Dense(2, activation='softmax')(dropout)
+
+        # define a trainable model linking the window inputs to the predictions
+        model = Model(inputs=inputList, outputs=predictions)
+        model.compile(optimizer='adam',
+                      loss='binary_crossentropy',
+                      metrics=['accuracy'])
+
+        # get train data
+        domainLists = []
+        dfLists = []
+        for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
+            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False)
+            domainLists += domainListsTmp
+            dfLists += dfListsTmp
+
+        (trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict,
+                maxLen,threshold = threshold,
+                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict)
+        # keep only clear negatives (0 hits) and clear positives (>= threshold hits)
+        useIDs = np.where(trainHits == 0)[0]
+        useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]])
+        for i in range(len(trainData)):
+            trainData[i] = np.array(trainData[i])[useIDs]
+        trainLabel = trainLabel[useIDs]
+        trainHits = trainHits[useIDs]
+        trainNames = trainNames[useIDs]
+
+        # get test data
+        domainLists = []
+        dfLists = []
+        for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
+            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False)
+            domainLists += domainListsTmp
+            dfLists += dfListsTmp
+
+        (testData,testLabel,testHits,testNames) = createTrainData(domainLists,dfLists,characterDict,
+                maxLen,threshold = threshold,
+                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict)
+        useIDs = np.where(testHits == 0)[0]
+        useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]])
+        for i in range(len(testData)):
+            testData[i] = np.array(testData[i])[useIDs]
+        testLabel = testLabel[useIDs]
+        testHits = testHits[useIDs]
+        testNames = testNames[useIDs]
+
+        numPos = len(np.where(trainLabel == 1.0)[0])
+        numNeg = len(np.where(trainLabel == 0.0)[0])
+        print('major class: ' + str(float(numNeg) / float(numNeg + numPos)))
+        lstmLabel = np_utils.to_categorical(trainLabel, 2)
+        lstmTestLabel = np_utils.to_categorical(testLabel, 2)
+        trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel))
+
+        # save lstm model
+        ciscoProcessing.save_model(model,resultModelPath+'.json',
+                                   resultModelPath + '.h5')
+
+        # classify train and test data
+        trainScores = model.predict(trainData)[:,1]
+        testScores = model.predict(testData)[:,1]
+
+        joblib.dump({'testLabel':testLabel,
+                     'testHits':testHits,
+                     'testNames':testNames,
+                     'testScores':testScores,
+                     'trainLabel':trainLabel,
+                     'trainScores':trainScores},resultStorePath,compress=3)
\ No newline at end of file
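The dumped joblib file can be evaluated offline with the sklearn metrics already imported at the top of the file; a sketch, assuming `resultStorePath` points at a finished run:

```python
import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, auc

stored = joblib.load(resultStorePath)
fpr, tpr, _ = roc_curve(stored['testLabel'], stored['testScores'])
precision, recall, _ = precision_recall_curve(stored['testLabel'], stored['testScores'])
print('ROC AUC: ' + str(auc(fpr, tpr)))
```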