# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
import os
import numpy as np
import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import csv
import pandas as pd
import random
from keras.models import model_from_json
import time
import re
import mongoDBConnector as mongoDBConnector  # required by the getSubSample* functions below
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm


def getCiscoDomainLabel(curDomain, curSIP, hostSet, sipSet, sldSet):
    # check server-ip
    if curSIP in sipSet:
        return 1.0
    # check second-level domain
    splitDomain = curDomain.split('.')
    if len(splitDomain) >= 2:
        curSLD = splitDomain[-2] + '.' + splitDomain[-1]
    else:
        curSLD = curDomain
    if curSLD in sldSet:
        return 1.0
    # check the full domain, falling back to its second-level domain
    if curDomain in hostSet or curSLD in hostSet:
        return 1.0
    return 0.0


def getSubSample(useDir, numUser, threshold=3,
                 windowSize=10, minFlowsPerUser=10,
                 maxLen=40, flagUseCiscoFeatures=False,
                 urlSIPDIct=dict(), characterDict=dict(),
                 maxLengthInSeconds=-1,
                 timesNeg=-1,
                 mongoHost='', mongoPort=0, dbName='',
                 collectionName='', metaCollectionName=''):
    curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost, mongoPort=mongoPort, dbName=dbName,
                                                useDir=useDir, collectionName=collectionName,
                                                metaCollectionName=metaCollectionName,
                                                numUser=numUser, minFlowsPerUser=minFlowsPerUser)
    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(curDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(
            curDFs[i], windowSize=windowSize, overlapping=False,
            maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=urlSIPDIct,
        windowSize=windowSize)
    useIDs = np.where(np.array(testLabel) == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
    if timesNeg != -1:
        # keep at most timesNeg negative windows per positive window
        posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
        negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
        if len(negIDs) > len(posIDs) * timesNeg:
            negIDs = np.random.permutation(negIDs)
            negIDs = negIDs[0:len(posIDs) * timesNeg]
        negIDs = useIDs[negIDs]
        posIDs = useIDs[posIDs]
        useIDs = np.concatenate([negIDs, posIDs])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for i in range(len(testData)):
        testData[i] = testData[i][useIDs]
    return (testData, testLabel, testHits, testNames)
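
# A minimal sketch of the timesNeg downsampling performed above, using
# made-up labels; this helper is illustrative only and not part of the
# original pipeline:
def _demoTimesNegSampling():
    testLabel = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    timesNeg = 2
    posIDs = np.where(testLabel == 1.0)[0]
    negIDs = np.where(testLabel == 0.0)[0]
    if len(negIDs) > len(posIDs) * timesNeg:
        # keep at most timesNeg randomly chosen negatives per positive
        negIDs = np.random.permutation(negIDs)[0:len(posIDs) * timesNeg]
    useIDs = np.concatenate([negIDs, posIDs])
    print(useIDs)  # two randomly chosen negative indices plus the positive index
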
def getSubSampleAllPositiveUsers(useDir, threshold=3,
                                 windowSize=10, minFlowsPerUser=10,
                                 maxLen=40, flagUseCiscoFeatures=False,
                                 urlSIPDIct=dict(), characterDict=dict(),
                                 maxLengthInSeconds=-1,
                                 numNegUser=10000,
                                 mongoHost='', mongoPort=0, dbName='',
                                 collectionName='', metaCollectionName=''):
    curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost, mongoPort=mongoPort, dbName=dbName,
                                                           useDir=useDir, collectionName=collectionName,
                                                           metaCollectionName=metaCollectionName,
                                                           numNegUser=numNegUser, minFlowsPerUser=minFlowsPerUser)
    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(curDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(
            curDFs[i], windowSize=windowSize, overlapping=False,
            maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=urlSIPDIct,
        windowSize=windowSize)
    useIDs = np.where(np.array(testLabel) == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for i in range(len(testData)):
        testData[i] = testData[i][useIDs]
    return (testData, testLabel, testHits, testNames)


def sequenceGenerator(useDir, numUser, threshold=3,
                      windowSize=10, minFlowsPerUser=10,
                      maxLen=40, flagUseCiscoFeatures=False,
                      urlSIPDIct=dict(), characterDict=dict(),
                      maxLengthInSeconds=-1,
                      timesNeg=-1,
                      mongoHost='', mongoPort=0, dbName='',
                      collectionName='', metaCollectionName=''):
    while 1:
        (testData, testLabel, testHits, testNames) = getSubSample(
            useDir, numUser, threshold=threshold,
            windowSize=windowSize, minFlowsPerUser=minFlowsPerUser,
            maxLen=maxLen, flagUseCiscoFeatures=flagUseCiscoFeatures,
            urlSIPDIct=urlSIPDIct, characterDict=characterDict,
            maxLengthInSeconds=maxLengthInSeconds,
            timesNeg=timesNeg,
            mongoHost=mongoHost, mongoPort=mongoPort, dbName=dbName,
            collectionName=collectionName, metaCollectionName=metaCollectionName)
        testLabel = np_utils.to_categorical(testLabel, 2)
        yield (testData, testLabel)


def sequenceGeneratorTest(data, label):
    while 1:
        yield (data, label)


# three different modes:
#   mode == 'correct'   -> do not permute or touch the ordering
#   mode == 'permutate' -> permute the ordering
#   mode == 'sort'      -> sort the flows by sent bytes
# NOTE: this generator is an unimplemented stub.
def dataGenerator(trainData, trainLabel, numTimesPos, mode='correct'):
    return True
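
# A minimal sketch of what the three modes above could look like, assuming
# trainData has shape [numBlocks, blockSize, numFeatures] and that the sent
# bytes (bytes_up) feature sits at column index 2; both are assumptions, not
# facts confirmed by the stub above:
def _demoDataGeneratorModes(trainData, mode='correct', bytesUpIndex=2):
    if mode == 'permutate':
        # shuffle the flow order inside every block, in place
        for block in trainData:
            np.random.shuffle(block)
    elif mode == 'sort':
        # order the flows in every block by the assumed sent-bytes column
        for i in range(trainData.shape[0]):
            order = np.argsort(trainData[i][:, bytesUpIndex])
            trainData[i] = trainData[i][order]
    # mode == 'correct' leaves the ordering untouched
    return trainData
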
def getMalwareClassDict(path):
    outDict = dict()
    for line in open(path):
        lineSplit = line.strip().split('\t')
        if len(lineSplit) == 3:
            outDict[lineSplit[0]] = (lineSplit[1], lineSplit[2])
    return outDict


def applyLower(inStr):
    try:
        return inStr.lower()
    except:
        return inStr


def logTransformData(inputMatrix):
    try:
        return np.log1p(np.array(inputMatrix, dtype='float64'))
    except:
        return inputMatrix


def getTrainMatrixLabelFromDataFrame(dataFrame, parameter=dict(),
                                     hostDict=dict(), sipDict=dict(), vtDF=dict(),
                                     flagReturnDomains=False):
    if len(dataFrame) == 0:
        # return an empty result matching the regular return signature
        if flagReturnDomains:
            return ([], -1, -1, [])
        return ([], -1, -1)
    if 'flowFeatures' in parameter:
        flowFeatures = parameter['flowFeatures']
    else:
        flowFeatures = ['duration', 'bytes_down', 'bytes_up']
    # extract flow features
    data = dataFrame[flowFeatures].values
    # get time-gap feature: difference between consecutive time stamps
    timeStamps = np.array(dataFrame['timeStamp'].values, dtype='float32')
    timeStampsPre = np.zeros([len(timeStamps), ])
    timeStampsPre[1:] = timeStamps[0:len(timeStamps) - 1]
    diffTimeStamps = timeStamps - timeStampsPre
    diffTimeStamps[0] = 0.0
    negIDs = np.where(diffTimeStamps < 0.0)[0]
    diffTimeStamps[negIDs] = 0.0
    diffTimeStamps = np.reshape(diffTimeStamps, [len(diffTimeStamps), 1])
    data = np.hstack([data, diffTimeStamps])
    # log transform
    data = logTransformData(data)
    if 'urlFeature' in dataFrame:
        urlFeatures = np.zeros([len(dataFrame), len(dataFrame.iloc[0]['urlFeature'])])
        for i in range(len(dataFrame)):
            urlFeatures[i, :] = dataFrame.iloc[i]['urlFeature']
        data = np.hstack([data, urlFeatures])
    # cisco features: host features in the first half, server-ip features in the second
    numCiscoFeature = 30
    ciscoFeatures = np.zeros([data.shape[0], 2 * numCiscoFeature])
    if len(hostDict) > 0:
        for i in range(len(dataFrame)):
            row = dataFrame.iloc[i]
            curHost = extractHost(row['domain'])
            if curHost in hostDict:
                ciscoFeatures[i, 0:numCiscoFeature] = hostDict[curHost]
    if len(sipDict) > 0:
        for i in range(len(dataFrame)):
            row = dataFrame.iloc[i]
            curSIP = row['server_ip']
            if curSIP in sipDict:
                ciscoFeatures[i, numCiscoFeature:] = sipDict[curSIP]
    data = np.hstack([data, ciscoFeatures])
    if len(vtDF) != 0:
        # look up VirusTotal hits for every flow's process hash
        vtHashSet = set(vtDF['hash'])
        hitNums = []
        hashes = dataFrame['anyConnect_hash']
        for curHash in hashes:
            try:
                if curHash.lower() in vtHashSet:
                    curID = np.where(vtDF['hash'] == curHash.lower())[0]
                    if len(curID) >= 1:
                        curID = curID[0]
                        hitNums.append(float(vtDF.iloc[curID]['hits']))
                    else:
                        hitNums.append(-1.0)
                else:
                    hitNums.append(-1.0)
            except:
                hitNums.append(-1.0)
        maxHits = np.max(hitNums)
    else:
        if 'hits' in dataFrame:
            maxHits = np.max(dataFrame['hits'])
        else:
            maxHits = -1
    label = np.max(dataFrame['label'])
    if flagReturnDomains:
        return (data, label, maxHits, dataFrame['domain'])
    else:
        return (data, label, maxHits)


def getDomainChunksByUser(data, useUserName, blockSize):
    outData = []
    outLabel = []
    useDataAll = data[data['user_hash'] == useUserName]
    userIDs = np.arange(len(useDataAll))
    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
    for blockID in range(numBlocks):
        curIDs = userIDs[(blockID * blockSize):((blockID + 1) * blockSize)]
        useData = useDataAll.iloc[curIDs]
        curDomains = useData['domain']
        curLabel = np.max(useData['label'])
        outData.append(curDomains)
        outLabel.append(curLabel)
    return (outData, outLabel)


def getChunksByUser(data, useUserName, blockSize, parameter=dict(),
                    hostDict=dict(), sipDict=dict(), vtDF=dict(),
                    flagOnlyOneUser=False, flagReturnDomains=False):
    outData = []
    outLabel = []
    outHits = []
    outDomains = []
    if flagOnlyOneUser:
        useDataAll = data
    else:
        useDataAll = data[data['user_hash'] == useUserName]
    userIDs = np.arange(len(useDataAll))
    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
    for blockID in range(numBlocks):
        curIDs = userIDs[(blockID * blockSize):((blockID + 1) * blockSize)]
        useData = useDataAll.iloc[curIDs]
        if flagReturnDomains:
            (curTrainData, curLabel, curMaxHits, curDomains) = getTrainMatrixLabelFromDataFrame(
                useData, parameter, hostDict, sipDict, vtDF=vtDF, flagReturnDomains=flagReturnDomains)
        else:
            (curTrainData, curLabel, curMaxHits) = getTrainMatrixLabelFromDataFrame(
                useData, parameter, hostDict, sipDict, vtDF=vtDF, flagReturnDomains=flagReturnDomains)
        outData.append(curTrainData)
        outLabel.append(curLabel)
        outHits.append(curMaxHits)
        if flagReturnDomains:
            outDomains.append(curDomains)
    if flagReturnDomains:
        return (outData, outLabel, outHits, outDomains)
    else:
        return (outData, outLabel, outHits)


def getLSTMModel(blockSize=10, input_dim=103, lstmUnits=10, denseSize=128):
    nb_classes = 2
    model = Sequential()
    model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
    model.add(Dense(denseSize, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # number of LSTM params:
    # params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
    return model
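
# Worked example for the parameter formula above, using the default arguments
# of getLSTMModel (input_dim=103, lstmUnits=10):
#   params = 4 * ((103 + 1) * 10 + 10^2) = 4 * (1040 + 100) = 4560
# which should match model.summary() for the LSTM layer in this Keras version.
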
def getCiscoURLFeatureForRow(row):
    sortKeys = list(row.keys())
    sortKeys.sort()
    featureVec = np.zeros([len(sortKeys) - 1, ])
    counter = 0
    for keyName in sortKeys:
        if 'key' in keyName:
            continue
        try:
            featureVec[counter] = float(row[keyName])
        except:
            featureVec[counter] = 0.0
        counter += 1
    # replace NaNs by 0.0
    featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
    return featureVec


def getCiscoFeatureDictForHost(headerPath, dataPath):
    # get header
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    hostDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        curHost = row['key']
        curHost = extractHost(curHost)
        hostDict[curHost] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' host features collected')
    return hostDict


def getCiscoFeatureDictForSIP(headerPath, dataPath):
    # get header
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    hostDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        curHost = row['key']
        hostDict[curHost] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' sip features collected')
    return hostDict


def getCiscoFeatureDictForSLD(headerPath, dataPath):
    # get header
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    hostDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        curHost = row['key']
        hostDict[curHost] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' sld features collected')
    return hostDict


def extractHost(domain):
    # reduce a full domain to its second-level domain, e.g. 'a.b.example.com' -> 'example.com'
    curHostSplit = domain.split('.')
    try:
        curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
        return curHost
    except:
        return domain
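
# Hypothetical usage of the loaders above; the file paths are placeholders,
# not paths from the original project. Both files are expected to be
# tab-separated, with one feature name per line in the header file:
def _demoLoadCiscoFeatures():
    hostDict = getCiscoFeatureDictForHost('host_header.txt', 'host_features.tsv')
    sipDict = getCiscoFeatureDictForSIP('sip_header.txt', 'sip_features.tsv')
    print(extractHost('a.b.example.com'))  # -> 'example.com'
    return (hostDict, sipDict)
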
def loadDataSetFromJoblib(trainDirs, minFlowsPerUser=10, numTimesPos=20):
    for dirID in range(len(trainDirs)):
        curDir = trainDirs[dirID]
        curFiles = os.listdir(curDir)
        dayJoblibCounter = 0
        for curFile in curFiles:
            curFile = curDir + curFile
            if curFile.endswith('.joblib'):
                curData = joblib.load(curFile)
                if dayJoblibCounter == 0:
                    dayData = curData
                else:
                    dayData = dayData.append(curData, ignore_index=True)
                dayJoblibCounter += 1
                print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) + ')')
        # keep only users with at least minFlowsPerUser flows and a valid label
        if minFlowsPerUser != -1:
            grouped = dayData.groupby('user_hash')
            useUsers = set()
            for grouping in grouped:
                numFlowsCurUser = len(grouping[1])
                userLabel = np.max(grouping[1]['label'])
                if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
                    useUsers.add(grouping[0])
            # get ids
            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
            dayData = dayData.iloc[userIDs]
            dayData = dayData.reset_index(drop=True)
        # keep at most numTimesPos negative users per positive user
        if numTimesPos != -1:
            grouped = dayData.groupby('user_hash')
            curUserLabel = []
            curUserNames = []
            for grouping in grouped:
                numFlowsCurUser = len(grouping[1])
                userLabel = np.max(grouping[1]['label'])
                curUserLabel.append(userLabel)
                curUserNames.append(grouping[1]['user_hash'].values[0])
            posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
            negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
            maxNegLabel = len(posIDs) * numTimesPos
            if len(negIDs) > maxNegLabel:
                np.random.seed(1)
                np.random.shuffle(negIDs)
                negIDs = negIDs[0:maxNegLabel]
            useIDs = np.concatenate([posIDs, negIDs])
            useUsers = set(np.array(curUserNames)[useIDs])
            # get ids
            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
            dayData = dayData.iloc[userIDs]
            dayData = dayData.reset_index(drop=True)
        if dirID == 0:
            allData = dayData
        else:
            allData = allData.append(dayData, ignore_index=True)
    return allData


def tokenizeDomain(domain, n=3):
    domain = domain.replace('https://', '')
    domain = domain.replace('www.', '')
    domain = domain.replace('/', '')
    # reverse the domain so that the top-level domain comes first
    domain = domain[::-1]
    # overlapping n-grams (the trailing grams are shorter than n)
    outList = [domain[i:i + n] for i in range(0, len(domain), 1)]
    return outList
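
# Example output of tokenizeDomain (values follow directly from the code
# above): 'www.example.com' is stripped to 'example.com', reversed to
# 'moc.elpmaxe', and split into overlapping trigrams:
#   tokenizeDomain('www.example.com') ->
#   ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma', 'max', 'axe', 'xe', 'e']
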
def getDomainsInWindowData(allData, numNeg=-1, blockSize=10):
    uniqueTrainUser = np.unique(allData['user_hash'])
    userLabel = []
    for curTrainUser in uniqueTrainUser:
        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
        curLabel = np.max(allData.iloc[userIDs]['label'])
        userLabel.append(curLabel)
    userLabel = np.array(userLabel)
    posUser = np.where(userLabel == 1.0)[0]
    negUser = np.where(userLabel == 0.0)[0]
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser, negUser])
    counter = 0
    trainDomains = []
    trainBlockLabel = []
    trainNames = []
    for uID in range(len(useUser)):
        curTrainUser = uniqueTrainUser[useUser[uID]]
        (curUserData, curUserLabel) = getDomainChunksByUser(allData, curTrainUser, blockSize)
        for i in range(len(curUserLabel)):
            trainNames.append(curTrainUser)
        trainDomains += curUserData
        trainBlockLabel += curUserLabel
        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
        counter += 1
    return (trainDomains, trainBlockLabel, trainNames)


def getPaddedData(allData, numNeg=-1, blockSize=10, parameterDict=dict(),
                  hostDict=dict(), sipDict=dict(), vtLabelPath=''):
    if vtLabelPath != '':
        vtDF = pd.read_csv(vtLabelPath, sep='\t')
    else:
        vtDF = dict()
    uniqueTrainUser = np.unique(allData['user_hash'])
    userLabel = []
    for curTrainUser in uniqueTrainUser:
        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
        curLabel = np.max(allData.iloc[userIDs]['label'])
        userLabel.append(curLabel)
    userLabel = np.array(userLabel)
    posUser = np.where(userLabel == 1.0)[0]
    negUser = np.where(userLabel == 0.0)[0]
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser, negUser])
    counter = 0
    trainBlocks = []
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    for uID in range(len(useUser)):
        curTrainUser = uniqueTrainUser[useUser[uID]]
        (curUserData, curUserLabel, curHits) = getChunksByUser(
            allData, curTrainUser, blockSize,
            parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF)
        for i in range(len(curUserLabel)):
            trainNames.append(curTrainUser)
        trainBlocks += curUserData
        trainBlockLabel += curUserLabel
        trainBlockHits += curHits
        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
        counter += 1
    # pad every block to exactly blockSize flows
    paddedData = pad_sequences(trainBlocks, maxlen=blockSize, dtype='float32')
    return (paddedData, trainBlockLabel, trainNames, trainBlockHits)


def createTrainDataFromJoblibsPerUser(joblibPaths, minFlowsPerUser=10, blockSize=10,
                                      hostDict=dict(), sipDict=dict(),
                                      vtLabelPath='', maxFlowsPerUser=50000):
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    parameterDict = dict()
    # preallocate a buffer and grow it on demand
    numBlocksToInitialize = 10000
    paddedData = np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])
    overallCounter = 0
    startTime = time.time()
    for uID in range(len(joblibPaths)):
        curSavePath = joblibPaths[uID]
        curData = joblib.load(curSavePath)['dataFrame']
        if len(curData) < minFlowsPerUser:
            continue
        (curUserData, curUserLabel, curHits) = getChunksByUser(
            curData, '', blockSize,
            parameter=parameterDict, hostDict=hostDict, sipDict=sipDict,
            vtDF=dict(), flagOnlyOneUser=True)
        curPaddedData = pad_sequences(curUserData, maxlen=blockSize, dtype='float32')
        if curPaddedData.shape[0] > maxFlowsPerUser:
            curPaddedData = curPaddedData[0:maxFlowsPerUser]
            curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
            curHits = list(np.array(curHits)[0:maxFlowsPerUser])
        for i in range(len(curPaddedData)):
            trainNames.append(curSavePath)
        trainBlockLabel += curUserLabel
        trainBlockHits += curHits
        numCurInstances = curPaddedData.shape[0]
        # grow the buffer until the new instances fit
        while overallCounter + numCurInstances > paddedData.shape[0]:
            paddedData = np.vstack([paddedData, np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])])
        paddedData[overallCounter:overallCounter + numCurInstances, :] = curPaddedData
        overallCounter += numCurInstances
        if uID % 1000 == 0:
            elapsedTime = time.time() - startTime
            startTime = time.time()
            print(str(uID + 1) + ' user processed [' + str(elapsedTime) + ']')
    # trim the buffer to the instances actually filled in
    paddedData = paddedData[0:overallCounter]
    return (paddedData, trainBlockLabel, trainNames, trainBlockHits)
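
# A small illustration of the padding used above, with made-up numbers:
# blocks with fewer than blockSize flows are zero-padded at the front by
# Keras' pad_sequences, so all blocks share the shape [blockSize, numFeatures]:
def _demoPadding(blockSize=4):
    blocks = [np.ones([2, 3]), np.ones([4, 3])]  # 2 and 4 flows, 3 features each
    padded = pad_sequences(blocks, maxlen=blockSize, dtype='float32')
    print(padded.shape)      # (2, 4, 3)
    print(padded[0, 0, :])   # [0. 0. 0.] -> zero padding in front of the short block
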
def loadDataSetFromJoblibPerUser(trainDirs, minFlowsPerUser=10, numNegPerDay=50000,
                                 blockSize=10, hostDict=dict(), sipDict=dict(),
                                 seed=1, flagSkipNoLabelUser=False,
                                 vtLabelPath='', maxFlowsPerUser=50000,
                                 flagReturnDomains=False):
    if vtLabelPath != '':
        vtDF = pd.read_csv(vtLabelPath, sep='\t')
    else:
        vtDF = dict()
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    trainBlockDomains = []
    parameterDict = dict()
    # preallocate a buffer and grow it on demand
    numBlocksToInitialize = 10000
    paddedData = np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])
    overallCounter = 0
    for curDirID in range(len(trainDirs)):
        curDir = trainDirs[curDirID]
        curLabelFile = curDir + 'data_label.joblib'
        labelData = joblib.load(curLabelFile)
        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
        random.seed(seed)
        random.shuffle(negIDs)
        useIDs = np.concatenate([posIDs, negIDs])
        counter = 0
        negCounter = 0
        startTime = time.time()
        for uID in range(len(useIDs)):
            curID = useIDs[uID]
            curUserName = labelData['usernames'][curID]
            curSavePath = curDir + str(curUserName) + '.joblib'
            curData = joblib.load(curSavePath)['dataFrame']
            if flagSkipNoLabelUser:
                curUserLabel = np.max(curData['label'])
                if curUserLabel == -1.0:
                    continue
            if len(curData) < minFlowsPerUser:
                continue
            if numNegPerDay != -1:
                if negCounter > numNegPerDay:
                    break
            if flagReturnDomains:
                (curUserData, curUserLabel, curHits, curDomains) = getChunksByUser(
                    curData, curUserName, blockSize,
                    parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF,
                    flagReturnDomains=flagReturnDomains)
            else:
                (curUserData, curUserLabel, curHits) = getChunksByUser(
                    curData, curUserName, blockSize,
                    parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF,
                    flagReturnDomains=flagReturnDomains)
            curPaddedData = pad_sequences(curUserData, maxlen=blockSize, dtype='float32')
            if curPaddedData.shape[0] > maxFlowsPerUser:
                curPaddedData = curPaddedData[0:maxFlowsPerUser]
                curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
                curHits = list(np.array(curHits)[0:maxFlowsPerUser])
                if flagReturnDomains:
                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
            for i in range(len(curPaddedData)):
                trainNames.append(curUserName)
            trainBlockLabel += curUserLabel
            trainBlockHits += curHits
            if flagReturnDomains:
                trainBlockDomains += curDomains
            numCurInstances = curPaddedData.shape[0]
            # grow the buffer until the new instances fit
            while overallCounter + numCurInstances > paddedData.shape[0]:
                paddedData = np.vstack([paddedData, np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])])
            paddedData[overallCounter:overallCounter + numCurInstances, :] = curPaddedData
            overallCounter += numCurInstances
            if (counter + 1) % 1000 == 0:
                elapsedTime = time.time() - startTime
                print('processed ' + str(counter + 1) + ' users of ' +
                      str(len(useIDs)) + ' with ' + str(len(curData['label'])) +
                      ' flows [dir ' + str(curDirID + 1) + ' of ' +
                      str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
                startTime = time.time()
            if np.max(np.array(curUserLabel)) == 0.0:
                negCounter += 1
            counter += 1
    paddedData = paddedData[0:overallCounter]
    if flagReturnDomains:
        return (paddedData, trainBlockLabel, trainNames, trainBlockHits, trainBlockDomains)
    else:
        return (paddedData, trainBlockLabel, trainNames, trainBlockHits)


def loadRawDataSetFromJoblibPerUser(trainDirs, numNegPerDay=2000, seed=1):
    dataFrameList = []
    for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
        curDir = trainDirs[curDirID]
        curLabelFile = curDir + 'data_label.joblib'
        labelData = joblib.load(curLabelFile)
        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
        random.seed(seed)
        random.shuffle(negIDs)
        if len(negIDs) >= numNegPerDay:
            negIDs = negIDs[0:numNegPerDay]
        useIDs = np.concatenate([posIDs, negIDs])
        for uID in range(len(useIDs)):
            curID = useIDs[uID]
            curUserName = labelData['usernames'][curID]
            curSavePath = curDir + str(curUserName) + '.joblib'
            curData = joblib.load(curSavePath)['dataFrame']
            dataFrameList.append(curData)
    return dataFrameList
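
# Hypothetical end-to-end usage of the loaders above; the directory names
# are placeholders, each directory is assumed to contain data_label.joblib
# plus one <user_hash>.joblib per user (as the code above expects), and the
# fit() call assumes the Keras 1 API used elsewhere in this module:
def _demoLoadAndTrain():
    trainDirs = ['/data/day1/', '/data/day2/']
    (paddedData, label, names, hits) = loadDataSetFromJoblibPerUser(
        trainDirs, minFlowsPerUser=10, blockSize=10)
    model = getLSTMModel(blockSize=10, input_dim=paddedData.shape[2])
    model.fit(paddedData, np_utils.to_categorical(label, 2), nb_epoch=1)
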
def checkDomainForSecondLevelDomain(inDomain, sldDomainDict):
    if not 'str' in str(type(inDomain)):
        return False
    splitDomain = inDomain.split('.')
    if len(splitDomain) <= 2:
        return False
    sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
    if sldDomain in sldDomainDict:
        return True
    else:
        return False


def save_model(model, jsonPath, h5Path):
    # save architecture
    json_model = model.to_json()
    open(jsonPath, 'w').write(json_model)
    # save weights
    model.save_weights(h5Path, overwrite=True)


def load_model(jsonPath, h5Path):
    # load architecture, then weights
    model = model_from_json(open(jsonPath).read())
    model.load_weights(h5Path)
    return model


def getResultsFromSavedJoblibFile(joblibFiles, threshold=3):
    testUserScores = []
    testUserLabel = []
    testLabel = []
    testScores = []
    testNames = []
    for joblibPath in joblibFiles:
        print('process: ' + joblibPath)
        tmpJoblib = joblib.load(joblibPath)
        if 'testBlockScores' in tmpJoblib.keys():
            curTestBlockScores = tmpJoblib['testBlockScores']
            for i in range(len(curTestBlockScores)):
                if i == 0:
                    curTestScores = curTestBlockScores[i]
                else:
                    curTestScores = np.concatenate([curTestScores, curTestBlockScores[i]])
            curTestHits = np.array(tmpJoblib['blockHits'])
            curTestScores = np.array(curTestScores)
            # hits == 0 -> negative, hits >= threshold -> positive, in between -> unlabeled (-1)
            curTestLabel = np.ones([len(curTestScores), ]) * -1.0
            curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
            curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
            curTestNames = tmpJoblib['testNames']
        else:
            curTestHits = tmpJoblib['testHits']
            curTestScores = tmpJoblib['testScores']
            curTestLabel = tmpJoblib['testLabel']
            curTestNames = tmpJoblib['testNames']
        useIDs = np.where(curTestHits >= threshold)[0]
        useIDs = np.concatenate([useIDs, np.where(curTestHits == 0.0)[0]])
        curTestScoresT = curTestScores[useIDs]
        curTestLabelT = curTestLabel[useIDs]
        if len(testScores) == 0:
            testScores = curTestScoresT
            testLabel = curTestLabelT
        else:
            testScores = np.concatenate([testScores, curTestScoresT])
            testLabel = np.concatenate([testLabel, curTestLabelT])
        if 'testBlockScores' in tmpJoblib.keys():
            tmpScores = np.array(tmpJoblib['testScores'])
            tmpHits = np.array(tmpJoblib['testHits'])
            tmpLabel = np.ones([len(tmpHits), ]) * -1
            tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
            tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
            useIDs = np.where(tmpLabel == 1.0)[0]
            useIDs = np.concatenate([useIDs, np.where(tmpLabel == 0.0)[0]])
            testUserLabel += list(np.array(tmpLabel)[useIDs])
            testUserScores += list(np.array(tmpScores)[useIDs])
        else:
            # aggregate block scores to one score and one label per user
            uniqueTestNames = list(np.unique(curTestNames))
            for testName in uniqueTestNames:
                curIDs = np.where(np.array(curTestNames) == testName)[0]
                curMaxHits = np.max(curTestHits[curIDs])
                if curMaxHits > 0 and curMaxHits < threshold:
                    continue
                if curMaxHits >= threshold:
                    testUserLabel.append(1.0)
                else:
                    testUserLabel.append(0.0)
                curScore = np.max(curTestScores[curIDs])
                testUserScores.append(curScore)
                testNames.append(testName)
    testUserScores = np.array(testUserScores)
    testUserLabel = np.array(testUserLabel)
    testNames = np.array(testNames)
    return (testUserScores, testUserLabel, testLabel, testScores, testNames)


def checkIfIP(host):
    ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    if re.search(ipMask, host) is not None:
        return True
    else:
        return False
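
# Expected behaviour of the two helpers above (values follow from the code):
#   checkIfIP('192.168.0.1')  -> True
#   checkIfIP('example.com')  -> False
#   checkDomainForSecondLevelDomain('a.b.example.com', {'example.com'}) -> True
#   checkDomainForSecondLevelDomain('example.com', {'example.com'})     -> False  (two labels or fewer)
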
# GLOBAL VALUES
# feature layout, as implied by the index ranges below: columns 0-3 packet
# features, 4-103 neural (URL) features, 104-163 cisco host/server-ip features
numCiscoFeatures = 30
featureTypeDict = {'neural': np.arange(4, 104, 1),
                   'packet': np.array([0, 1, 2, 3]),
                   'neural+packet': np.arange(0, 104, 1),
                   'neural+packet+cisco': np.arange(0, 104 + (2 * numCiscoFeatures), 1),
                   'cisco': np.arange(104, 104 + (2 * numCiscoFeatures), 1)}
globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
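

# Sanity check for the feature layout; the numbers are derived from the
# dictionary above (4 packet + 100 neural + 2*30 cisco features = 164):
def _demoFeatureLayout():
    assert globalNumFeatures == 164
    assert len(featureTypeDict['cisco']) == 2 * numCiscoFeatures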