From deac7f9e580cad6b7d6ffb0c7c4df9e1d2516c01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Knaebel?=
Date: Fri, 30 Jun 2017 17:21:03 +0200
Subject: [PATCH] Remove old files from Paul

---
 ciscoProcessing.py     | 846 -----------------------------------------
 stackedNeuralModels.py | 215 ----------
 2 files changed, 1061 deletions(-)
 delete mode 100644 ciscoProcessing.py
 delete mode 100644 stackedNeuralModels.py

diff --git a/ciscoProcessing.py b/ciscoProcessing.py
deleted file mode 100644
index 33f4d98..0000000
--- a/ciscoProcessing.py
+++ /dev/null
@@ -1,846 +0,0 @@
-# -*- coding: utf-8 -*-
-import sys
-sys.path.append('..')
-sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
-import os
-import numpy as np
-import joblib
-from keras.preprocessing.sequence import pad_sequences
-from keras.utils import np_utils
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import LSTM
-from keras.layers import Dropout
-import csv
-import pandas as pd
-import random
-from keras.models import model_from_json
-import time
-import re
-# required by getSubSample and getSubSampleAllPositiveUsers below
-import mongoDBConnector as mongoDBConnector
-import stackedNeuralModels as stackedNeuralModels
-from tqdm import tqdm
-
-
-def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
-    # check server-ip
-    if curSIP in sipSet:
-        return 1.0
-    # check second-level domain
-    splitDomain = curDomain.split('.')
-    if len(splitDomain) >= 2:
-        curSLD = splitDomain[-2] + '.' + splitDomain[-1]
-    else:
-        curSLD = curDomain
-    if curSLD in sldSet:
-        return 1.0
-    # check full domain and its second-level domain against the host set
-    if curDomain in hostSet or curSLD in hostSet:
-        return 1.0
-    return 0.0
-
-
-def getSubSample(useDir,numUser,threshold=3,
-                 windowSize=10,minFlowsPerUser=10,
-                 maxLen=40,flagUseCiscoFeatures=False,
-                 urlSIPDIct=dict(),characterDict=dict(),
-                 maxLengthInSeconds=-1,
-                 timesNeg=-1,
-                 mongoHost='',mongoPort=0,dbName='',
-                 collectionName='',metaCollectionName=''):
-    curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
-                                                useDir=useDir,collectionName=collectionName,
-                                                metaCollectionName=metaCollectionName,
-                                                numUser=numUser,minFlowsPerUser=minFlowsPerUser)
-    domainLists = []
-    dfLists = []
-    for i in tqdm(np.arange(len(curDFs)), miniters=10):
-        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
-            windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
-        domainLists += domainListsTmp
-        dfLists += dfListsTmp
-
-    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
-        domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
-        maxLen=maxLen,threshold=threshold,
-        flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
-        windowSize=windowSize)
-
-    # keep only windows with a definite label (1.0 or 0.0)
-    useIDs = np.where(np.array(testLabel) == 1.0)[0]
-    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
-
-    # optionally down-sample negatives to at most timesNeg times the positives
-    if timesNeg != -1:
-        posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
-        negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
-        if len(negIDs) > len(posIDs) * timesNeg:
-            negIDs = np.random.permutation(negIDs)
-            negIDs = negIDs[0:len(posIDs) * timesNeg]
-        negIDs = useIDs[negIDs]
-        posIDs = useIDs[posIDs]
-        useIDs = np.concatenate([negIDs,posIDs])
-
-    testLabel = testLabel[useIDs]
-    testHits = testHits[useIDs]
-    testNames = testNames[useIDs]
-    for i in range(len(testData)):
-        testData[i] = testData[i][useIDs]
-    return (testData,testLabel,testHits,testNames)
-
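# [Editor's sketch, not part of the original patch.] The timesNeg branch in
# getSubSample above performs class-ratio subsampling: keep every positive
# window and at most timesNeg times as many negatives. A minimal,
# self-contained sketch of that step, assuming numpy-style 0.0/1.0 labels;
# the function name and fixed seed are illustrative, not from the original:

import numpy as np

def subsampleNegatives(labels, timesNeg, seed=1):
    # Keep all positives; cap negatives at timesNeg * #positives.
    labels = np.asarray(labels)
    posIDs = np.where(labels == 1.0)[0]
    negIDs = np.where(labels == 0.0)[0]
    if timesNeg != -1 and len(negIDs) > len(posIDs) * timesNeg:
        rng = np.random.RandomState(seed)
        negIDs = rng.permutation(negIDs)[0:len(posIDs) * timesNeg]
    return np.concatenate([posIDs, negIDs])

# usage: useIDs = subsampleNegatives(testLabel, timesNeg)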
-def getSubSampleAllPositiveUsers(useDir,threshold=3,
-                                 windowSize=10,minFlowsPerUser=10,
-                                 maxLen=40,flagUseCiscoFeatures=False,
-                                 urlSIPDIct=dict(),characterDict=dict(),
-                                 maxLengthInSeconds=-1,
-                                 numNegUser=10000,
-                                 mongoHost='',mongoPort=0,dbName='',
-                                 collectionName='',metaCollectionName=''):
-    curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
-                                                           useDir=useDir,collectionName=collectionName,
-                                                           metaCollectionName=metaCollectionName,
-                                                           numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
-    domainLists = []
-    dfLists = []
-    for i in tqdm(np.arange(len(curDFs)), miniters=10):
-        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
-            windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
-        domainLists += domainListsTmp
-        dfLists += dfListsTmp
-
-    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
-        domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
-        maxLen=maxLen,threshold=threshold,
-        flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
-        windowSize=windowSize)
-
-    # keep only windows with a definite label (1.0 or 0.0)
-    useIDs = np.where(np.array(testLabel) == 1.0)[0]
-    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
-
-    testLabel = testLabel[useIDs]
-    testHits = testHits[useIDs]
-    testNames = testNames[useIDs]
-    for i in range(len(testData)):
-        testData[i] = testData[i][useIDs]
-    return (testData,testLabel,testHits,testNames)
-
-
-def sequenceGenerator(useDir,numUser,threshold=3,
-                      windowSize=10,minFlowsPerUser=10,
-                      maxLen=40,flagUseCiscoFeatures=False,
-                      urlSIPDIct=dict(),characterDict=dict(),
-                      maxLengthInSeconds=-1,
-                      timesNeg=-1,
-                      mongoHost='',mongoPort=0,dbName='',
-                      collectionName='',metaCollectionName=''):
-    while True:
-        (testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
-            windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
-            maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
-            urlSIPDIct=urlSIPDIct,characterDict=characterDict,
-            maxLengthInSeconds=maxLengthInSeconds,
-            timesNeg=timesNeg,
-            mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
-            collectionName=collectionName,metaCollectionName=metaCollectionName)
-        testLabel = np_utils.to_categorical(testLabel, 2)
-        yield (testData, testLabel)
-
-
-def sequenceGeneratorTest(data,label):
-    while True:
-        yield (data, label)
-
-
-# Stub, not yet implemented. Three modes are intended:
-#   mode == 'correct'   -> don't permute or touch the ordering
-#   mode == 'permutate' -> permute the ordering
-#   mode == 'sort'      -> sort the flows by sent bytes
-def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
-    return True
-
-
-def getMalwareClassDict(path):
-    outDict = dict()
-    for line in open(path):
-        lineSplit = line.strip().split('\t')
-        if len(lineSplit) == 3:
-            outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
-    return outDict
-
-
-def applyLower(inStr):
-    try:
-        return inStr.lower()
-    except:
-        return inStr
-
-
-def logTransformData(inputMatrix):
-    try:
-        return np.log1p(np.array(inputMatrix,dtype='float64'))
-    except:
-        return inputMatrix
-
-
-def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),
-                                     hostDict=dict(),sipDict=dict(),vtDF=dict(),
-                                     flagReturnDomains=False):
-    if len(dataFrame) == 0:
-        return ([],-1)
-    if 'flowFeatures' in parameter:
-        flowFeatures = parameter['flowFeatures']
-    else:
-        flowFeatures = ['duration','bytes_down','bytes_up']
-    # extract flow features
-    data =
dataFrame[flowFeatures].values
-    # get time-gap feature
-    timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
-    timeStampsPre = np.zeros([len(timeStamps),])
-    timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
-    diffTimeStamps = timeStamps - timeStampsPre
-    diffTimeStamps[0] = 0.0
-    negIDs = np.where(diffTimeStamps < 0.0)[0]
-    diffTimeStamps[negIDs] = 0.0
-    diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
-    data = np.hstack([data,diffTimeStamps])
-    # log transform
-    data = logTransformData(data)
-    if 'urlFeature' in dataFrame:
-        urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
-        for i in range(len(dataFrame)):
-            urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
-        data = np.hstack([data,urlFeatures])
-    # cisco features, one block per host and one per server ip
-    numCiscoFeature = 30
-    ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
-    if len(hostDict) > 0:
-        for i in range(len(dataFrame)):
-            row = dataFrame.iloc[i]
-            curHost = extractHost(row['domain'])
-            if curHost in hostDict:
-                ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
-    if len(sipDict) > 0:
-        for i in range(len(dataFrame)):
-            row = dataFrame.iloc[i]
-            curSIP = row['server_ip']
-            if curSIP in sipDict:
-                ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
-    data = np.hstack([data,ciscoFeatures])
-    if len(vtDF) != 0:
-        vtHashSet = set(vtDF['hash'])
-        hitNums = []
-        hashes = dataFrame['anyConnect_hash']
-        for curHash in hashes:
-            try:
-                if curHash.lower() in vtHashSet:
-                    curID = np.where(vtDF['hash'] == curHash.lower())[0]
-                    if len(curID) >= 1:
-                        curID = curID[0]
-                        hitNums.append(float(vtDF.iloc[curID]['hits']))
-                    else:
-                        hitNums.append(-1.0)
-                else:
-                    hitNums.append(-1.0)
-            except:
-                hitNums.append(-1.0)
-        maxHits = np.max(hitNums)
-    else:
-        if 'hits' in dataFrame:
-            maxHits = np.max(dataFrame['hits'])
-        else:
-            maxHits = -1
-    label = np.max(dataFrame['label'])
-    if flagReturnDomains:
-        return (data,label,maxHits,dataFrame['domain'])
-    else:
-        return (data,label,maxHits)
-
-
-def getDomainChunksByUser(data,useUserName,blockSize):
-    outData = []
-    outLabel = []
-    useDataAll = data[data['user_hash'] == useUserName]
-    userIDs = np.arange(len(useDataAll))
-    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
-    for blockID in range(numBlocks):
-        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
-        useData = useDataAll.iloc[curIDs]
-        curDomains = useData['domain']
-        curLabel = np.max(useData['label'])
-        outData.append(curDomains)
-        outLabel.append(curLabel)
-    return (outData,outLabel)
-
-
-def getChunksByUser(data,useUserName,blockSize,parameter=dict(),
-                    hostDict=dict(),sipDict=dict(),vtDF=dict(),flagOnlyOneUser=False,
-                    flagReturnDomains=False):
-    outData = []
-    outLabel = []
-    outHits = []
-    outDomains = []
-    if flagOnlyOneUser:
-        useDataAll = data
-    else:
-        useDataAll = data[data['user_hash'] == useUserName]
-    userIDs = np.arange(len(useDataAll))
-    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
-    for blockID in range(numBlocks):
-        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
-        useData = useDataAll.iloc[curIDs]
-        if flagReturnDomains:
-            (curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,
-                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
-        else:
-            (curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,
-                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
-        outData.append(curTrainData)
-        outLabel.append(curLabel)
-        outHits.append(curMaxHits)
-        if flagReturnDomains:
-            outDomains.append(curDomains)
-    if flagReturnDomains:
-        return (outData,outLabel,outHits,outDomains)
-    else:
-        return (outData,outLabel,outHits)
-
-
-def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
-    nb_classes = 2
-    model = Sequential()
-    model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
-    model.add(Dense(denseSize, activation='relu'))
-    model.add(Dropout(0.5))
-    model.add(Dense(nb_classes, activation='softmax'))
-    model.compile(loss='binary_crossentropy',
-                  optimizer='adam', metrics=['accuracy'])
-    # number of LSTM parameters:
-    # params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
-    return model
-
-
-def getCiscoURLFeatureForRow(row):
-    sortKeys = list(row.keys())
-    sortKeys.sort()
-    featureVec = np.zeros([len(sortKeys)-1,])
-    counter = 0
-    for keyName in sortKeys:
-        if 'key' in keyName:
-            continue
-        try:
-            featureVec[counter] = float(row[keyName])
-        except:
-            featureVec[counter] = 0.0
-        counter += 1
-    featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
-    return featureVec
-
-
-def getCiscoFeatureDictForHost(headerPath,dataPath):
-    # read the header file, one column name per line
-    header = []
-    for line in open(headerPath):
-        header.append(line.strip())
-    header = ['key'] + header
-    fobj = open(dataPath,'r')
-    csvReader = csv.DictReader(fobj,fieldnames=header,delimiter='\t')
-    hostDict = dict()
-    counter = 0
-    for row in csvReader:
-        featureVec = getCiscoURLFeatureForRow(row)
-        curHost = extractHost(row['key'])
-        hostDict[curHost] = featureVec
-        counter += 1
-        if counter % 10000 == 0:
-            print(str(counter) + ' host features collected')
-    return hostDict
-
-
-def getCiscoFeatureDictForSIP(headerPath,dataPath):
-    # read the header file, one column name per line
-    header = []
-    for line in open(headerPath):
-        header.append(line.strip())
-    header = ['key'] + header
-    fobj = open(dataPath,'r')
-    csvReader = csv.DictReader(fobj,fieldnames=header,delimiter='\t')
-    sipDict = dict()
-    counter = 0
-    for row in csvReader:
-        featureVec = getCiscoURLFeatureForRow(row)
-        sipDict[row['key']] = featureVec
-        counter += 1
-        if counter % 10000 == 0:
-            print(str(counter) + ' sip features collected')
-    return sipDict
-
-
-def getCiscoFeatureDictForSLD(headerPath,dataPath):
-    # read the header file, one column name per line
-    header = []
-    for line in open(headerPath):
-        header.append(line.strip())
-    header = ['key'] + header
-    fobj = open(dataPath,'r')
-    csvReader = csv.DictReader(fobj,fieldnames=header,delimiter='\t')
-    sldDict = dict()
-    counter = 0
-    for row in csvReader:
-        featureVec = getCiscoURLFeatureForRow(row)
-        sldDict[row['key']] = featureVec
-        counter += 1
-        if counter % 10000 == 0:
-            print(str(counter) + ' sld features collected')
-    return sldDict
-
-
-def extractHost(domain):
-    curHostSplit = domain.split('.')
-    try:
-        curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
-        return curHost
-    except:
-        return domain
-
-
-def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
-    for dirID in range(len(trainDirs)):
-        curDir = trainDirs[dirID]
-        curFiles = os.listdir(curDir)
-        dayJoblibCounter = 0
-        for curFile in curFiles:
-            curFile = curDir + curFile
-            if curFile.endswith('.joblib'):
-                curData = joblib.load(curFile)
-                if dayJoblibCounter == 0:
-                    dayData = curData
-                else:
-                    dayData = dayData.append(curData,ignore_index=True)
-                dayJoblibCounter += 1
-                print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) + ')')
-        # keep only users with at least minFlowsPerUser flows and a known label
-        if minFlowsPerUser != -1:
-            grouped = dayData.groupby('user_hash')
-            useUsers = set()
-            for grouping in grouped:
-                numFlowsCurUser = len(grouping[1])
-                userLabel = np.max(grouping[1]['label'])
-                if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
-                    useUsers.add(grouping[0])
-            # get ids
-            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
-            dayData = dayData.iloc[userIDs]
-            dayData = dayData.reset_index(drop=True)
-        # cap negatives at numTimesPos times the number of positive users
-        if numTimesPos != -1:
-            grouped = dayData.groupby('user_hash')
-            curUserLabel = []
-            curUserNames = []
-            for grouping in grouped:
-                userLabel = np.max(grouping[1]['label'])
-                curUserLabel.append(userLabel)
-                curUserNames.append(grouping[1]['user_hash'].values[0])
-            posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
-            negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
-            maxNegLabel = len(posIDs) * numTimesPos
-            if len(negIDs) > maxNegLabel:
-                np.random.seed(1)
-                np.random.shuffle(negIDs)
-                negIDs = negIDs[0:maxNegLabel]
-            useIDs = np.concatenate([posIDs,negIDs])
-            useUsers = set(np.array(curUserNames)[useIDs])
-            # get ids
-            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
-            dayData = dayData.iloc[userIDs]
-            dayData = dayData.reset_index(drop=True)
-        if dirID == 0:
-            allData = dayData
-        else:
-            allData = allData.append(dayData,ignore_index=True)
-    return allData
-
-
-def tokenizeDomain(domain,n=3):
-    domain = domain.replace('https://','')
-    domain = domain.replace('www.','')
-    domain = domain.replace('/','')
-    # reverse the domain and split it into overlapping n-grams
-    domain = domain[::-1]
-    return [domain[i:i+n] for i in range(0, len(domain), 1)]
-
-
-def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
-    uniqueTrainUser = np.unique(allData['user_hash'])
-    userLabel = []
-    for curTrainUser in uniqueTrainUser:
-        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
-        curLabel = np.max(allData.iloc[userIDs]['label'])
-        userLabel.append(curLabel)
-    userLabel = np.array(userLabel)
-    posUser = np.where(userLabel == 1.0)[0]
-    negUser = np.where(userLabel == 0.0)[0]
-    if numNeg != -1:
-        if len(negUser) > numNeg:
-            np.random.shuffle(negUser)
-            negUser = negUser[0:numNeg]
-    useUser = np.concatenate([posUser,negUser])
-    counter = 0
-    trainDomains = []
-    trainBlockLabel = []
-    trainNames = []
-    for uID in range(len(useUser)):
-        curTrainUser = uniqueTrainUser[useUser[uID]]
-        (curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
-        for i in range(len(curUserLabel)):
-            trainNames.append(curTrainUser)
-        trainDomains += curUserData
-        trainBlockLabel += curUserLabel
-        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
-        counter += 1
-    return (trainDomains,trainBlockLabel,trainNames)
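# [Editor's sketch, not part of the original patch.] getDomainChunksByUser and
# getDomainsInWindowData above cut each user's flow sequence into consecutive
# blocks of blockSize and label every block with the maximum flow label it
# contains. A minimal sketch of that blocking step; the names are illustrative:

import numpy as np

def blockSequence(values, labels, blockSize=10):
    # Split values into blocks of blockSize; a block is positive if any
    # flow inside it is positive (max over the block's labels).
    blocks, blockLabels = [], []
    numBlocks = int(np.ceil(float(len(values)) / float(blockSize)))
    for b in range(numBlocks):
        lo, hi = b * blockSize, (b + 1) * blockSize
        blocks.append(values[lo:hi])
        blockLabels.append(np.max(labels[lo:hi]))
    return blocks, blockLabels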
-
-
-def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),
-                  hostDict=dict(),sipDict=dict(),vtLabelPath=''):
-    if vtLabelPath != '':
-        vtDF = pd.read_csv(vtLabelPath,sep='\t')
-    else:
-        vtDF = dict()
-    uniqueTrainUser = np.unique(allData['user_hash'])
-    userLabel = []
-    for curTrainUser in uniqueTrainUser:
-        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
-        curLabel = np.max(allData.iloc[userIDs]['label'])
-        userLabel.append(curLabel)
-    userLabel = np.array(userLabel)
-    posUser = np.where(userLabel == 1.0)[0]
-    negUser = np.where(userLabel == 0.0)[0]
-    if numNeg != -1:
-        if len(negUser) > numNeg:
-            np.random.shuffle(negUser)
-            negUser = negUser[0:numNeg]
-    useUser = np.concatenate([posUser,negUser])
-    counter = 0
-    trainBlocks = []
-    trainBlockLabel = []
-    trainNames = []
-    trainBlockHits = []
-    for uID in range(len(useUser)):
-        curTrainUser = uniqueTrainUser[useUser[uID]]
-        (curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,
-            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF)
-        for i in range(len(curUserLabel)):
-            trainNames.append(curTrainUser)
-        trainBlocks += curUserData
-        trainBlockLabel += curUserLabel
-        trainBlockHits += curHits
-        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
-        counter += 1
-    paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
-    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
-
-
-def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
-                                      hostDict=dict(),sipDict=dict(),
-                                      vtLabelPath='',maxFlowsPerUser = 50000):
-    trainBlockLabel = []
-    trainNames = []
-    trainBlockHits = []
-    parameterDict = dict()
-    numBlocksToInitialize = 10000
-    # pre-allocate the output array and grow it in chunks of numBlocksToInitialize
-    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
-    overallCounter = 0
-    startTime = time.time()
-    for uID in range(len(joblibPaths)):
-        curSavePath = joblibPaths[uID]
-        curData = joblib.load(curSavePath)['dataFrame']
-        if len(curData) < minFlowsPerUser:
-            continue
-        (curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,
-            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser=True)
-        curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
-        if curPaddedData.shape[0] > maxFlowsPerUser:
-            curPaddedData = curPaddedData[0:maxFlowsPerUser]
-            curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
-            curHits = list(np.array(curHits)[0:maxFlowsPerUser])
-        for i in range(len(curPaddedData)):
-            trainNames.append(curSavePath)
-        trainBlockLabel += curUserLabel
-        trainBlockHits += curHits
-        numCurInstances = curPaddedData.shape[0]
-        while overallCounter+numCurInstances > paddedData.shape[0]:
-            paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
-        paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
-        overallCounter += numCurInstances
-        if uID % 1000 == 0:
-            elapsedTime = time.time() - startTime
-            startTime = time.time()
-            print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
-    paddedData = paddedData[0:overallCounter]
-    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
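# [Editor's sketch, not part of the original patch.]
# createTrainDataFromJoblibsPerUser above (and loadDataSetFromJoblibPerUser
# below) fill a pre-allocated numpy buffer and grow it in fixed chunks via
# np.vstack instead of appending row by row. A minimal sketch of that
# amortized-growth pattern, factored out; the helper name and chunk size are
# illustrative, not from the original code:

import numpy as np

def appendRows(buf, used, rows, growBy=10000):
    # Grow buf by whole chunks until the new rows fit, then copy them in.
    while used + len(rows) > buf.shape[0]:
        buf = np.vstack([buf, np.zeros((growBy,) + buf.shape[1:], dtype=buf.dtype)])
    buf[used:used + len(rows)] = rows
    return buf, used + len(rows)

# usage:
#   buf = np.zeros([10000, blockSize, numFeatures]); used = 0
#   buf, used = appendRows(buf, used, curPaddedData)
#   buf = buf[0:used]  # trim the unused tail at the end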
-
-
-def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
-                                 blockSize = 10,hostDict=dict(),sipDict=dict(),
-                                 seed =1,flagSkipNoLabelUser=False,
-                                 vtLabelPath='',maxFlowsPerUser = 50000,
-                                 flagReturnDomains=False):
-    if vtLabelPath != '':
-        vtDF = pd.read_csv(vtLabelPath,sep='\t')
-    else:
-        vtDF = dict()
-    trainBlockLabel = []
-    trainNames = []
-    trainBlockHits = []
-    trainBlockDomains = []
-    parameterDict = dict()
-    numBlocksToInitialize = 10000
-    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
-    overallCounter = 0
-    for curDirID in range(len(trainDirs)):
-        curDir = trainDirs[curDirID]
-        curLabelFile = curDir + 'data_label.joblib'
-        labelData = joblib.load(curLabelFile)
-        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
-        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
-        random.seed(seed)
-        random.shuffle(negIDs)
-        useIDs = np.concatenate([posIDs,negIDs])
-        counter = 0
-        negCounter = 0
-        startTime = time.time()
-        for uID in range(len(useIDs)):
-            curID = useIDs[uID]
-            curUserName = labelData['usernames'][curID]
-            curSavePath = curDir + str(curUserName) + '.joblib'
-            curData = joblib.load(curSavePath)['dataFrame']
-            if flagSkipNoLabelUser:
-                curUserLabel = np.max(curData['label'])
-                if curUserLabel == -1.0:
-                    continue
-            if len(curData) < minFlowsPerUser:
-                continue
-            if numNegPerDay != -1:
-                if negCounter > numNegPerDay:
-                    break
-            if flagReturnDomains:
-                (curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,
-                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,
-                    flagReturnDomains=flagReturnDomains)
-            else:
-                (curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,
-                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,
-                    flagReturnDomains=flagReturnDomains)
-            curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
-            if curPaddedData.shape[0] > maxFlowsPerUser:
-                curPaddedData = curPaddedData[0:maxFlowsPerUser]
-                curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
-                curHits = list(np.array(curHits)[0:maxFlowsPerUser])
-                if flagReturnDomains:
-                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
-            for i in range(len(curPaddedData)):
-                trainNames.append(curUserName)
-            trainBlockLabel += curUserLabel
-            trainBlockHits += curHits
-            if flagReturnDomains:
-                trainBlockDomains += curDomains
-            numCurInstances = curPaddedData.shape[0]
-            while overallCounter+numCurInstances > paddedData.shape[0]:
-                paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
-            paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
-            overallCounter += numCurInstances
-            if (counter+1) % 1000 == 0:
-                elapsedTime = time.time() - startTime
-                print('processed ' + str(counter+1) + ' users of ' +
-                      str(len(useIDs)) + ' with ' + str(len(curData['label'])) +
-                      ' flows [dir ' + str(curDirID+1) + ' of ' +
-                      str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
-                startTime = time.time()
-            if np.max(np.array(curUserLabel)) == 0.0:
-                negCounter += 1
-            counter += 1
-    paddedData = paddedData[0:overallCounter]
-    if flagReturnDomains:
-        return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
-    else:
-        return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
-
-
-def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000,
seed = 1): - dataFrameList = [] - overallCounter = 0 - from tqdm import tqdm - for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1): - curDir = trainDirs[curDirID] - curLabelFile = curDir + 'data_label.joblib' - labelData = joblib.load(curLabelFile) - posIDs = np.where(np.array(labelData['label']) == 1.0)[0] - negIDs = np.where(np.array(labelData['label']) == 0.0)[0] - random.seed(seed) - random.shuffle(negIDs) - if len(negIDs) >= numNegPerDay: - negIDs = negIDs[0:numNegPerDay] - useIDs = np.concatenate([posIDs,negIDs]) - for uID in range(len(useIDs)): - curID = useIDs[uID] - curUserName = labelData['usernames'][curID] - curSavePath = curDir + str(curUserName) + '.joblib' - curData = joblib.load(curSavePath)['dataFrame'] - dataFrameList.append(curData) - overallCounter += 1 - return dataFrameList - - -def checkDomainForSecondLevelDomain(inDomain,sldDomainDict): - if not 'str' in str(type(inDomain)): - return False - splitDomain = inDomain.split('.') - if len(splitDomain) <= 2: - return False - sldDomain = splitDomain[-2] + '.' + splitDomain[-1] - if sldDomain in sldDomainDict: - return True - else: - return False - ''' - out = False - for sldDomain in sldDomainDict: - if inDomain.endswith(sldDomain): - out = True - break - return out - ''' - -def save_model(model,jsonPath,h5Path): - # saving model - json_model = model.to_json() - open(jsonPath, 'w').write(json_model) - # saving weights - model.save_weights(h5Path, overwrite=True) - -def load_model(jsonPath,h5Path): - # loading model - model = model_from_json(open(jsonPath).read()) - model.load_weights(h5Path) - return model - - -def getResultsFromSavedJoblibFile(joblibFiles,threshold=3): - testUserScores = [] - testUserLabel = [] - testLabel = [] - testScores = [] - testNames = [] - for joblibPath in joblibFiles: - print('process: ' + joblibPath) - tmpJoblib = joblib.load(joblibPath) - if 'testBlockScores' in tmpJoblib.keys(): - curTestBlockScores = tmpJoblib['testBlockScores'] - for i in range(len(curTestBlockScores)): - if i == 0: - curTestScores = curTestBlockScores[i] - else: - curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]]) - curTestHits = tmpJoblib['blockHits'] - curTestHits = np.array(curTestHits) - curTestScores = np.array(curTestScores) - curTestLabel = np.ones([len(curTestScores),]) * -1.0 - curTestLabel[np.where(curTestHits == 0)[0]] = 0.0 - curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0 - curTestNames = tmpJoblib['testNames'] - else: - curTestHits = tmpJoblib['testHits'] - curTestScores = tmpJoblib['testScores'] - curTestLabel = tmpJoblib['testLabel'] - curTestNames = tmpJoblib['testNames'] - - useIDs = np.where(curTestHits >= threshold)[0] - useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]]) - # old code - #useIDs = np.where(tmpJoblib['testLabel'] == 1.0)[0] - #useIDs = np.concatenate([useIDs,np.where(tmpJoblib['testLabel'] == 0.0)[0]]) - curTestScoresT = curTestScores[useIDs] - curTestLabelT = curTestLabel[useIDs] - if len(testScores) == 0: - testScores = curTestScoresT - testLabel = curTestLabelT - else: - testScores = np.concatenate([testScores,curTestScoresT]) - testLabel = np.concatenate([testLabel,curTestLabelT]) - - if 'testBlockScores' in tmpJoblib.keys(): - tmpScores = np.array(tmpJoblib['testScores']) - tmpHits = np.array(tmpJoblib['testHits']) - tmpLabel = np.ones([len(tmpHits),])*-1 - tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0 - tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0 - useIDs = np.where(tmpLabel == 1.0)[0] - useIDs = 
np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]]) - testUserLabel += list(np.array(tmpLabel)[useIDs]) - testUserScores += list(np.array(tmpScores)[useIDs]) - else: - # get user label - uniqueTestNames = list(np.unique(curTestNames)) - for testName in uniqueTestNames: - curIDs = np.where(curTestNames == testName)[0] - curMaxHits = np.max(curTestHits[curIDs]) - if curMaxHits > 0 and curMaxHits < threshold: - continue - if curMaxHits >= threshold: - testUserLabel.append(1.0) - else: - testUserLabel.append(0.0) - curScore = np.max(curTestScores[curIDs]) - testUserScores.append(curScore) - testNames.append(testName) - testUserScores = np.array(testUserScores) - testUserLabel = np.array(testUserLabel) - testNames = np.array(testNames) - return (testUserScores,testUserLabel,testLabel,testScores,testNames) - -def checkIfIP(host): - ipMask = '^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$' - if re.search(ipMask, host) is not None: - return True - else: - return False - -# GLOBAL VALUES -numCiscoFeatures = 30 -featureTypeDict = {'neural':np.arange(4,104,1),\ - 'packet':np.array([0,1,2,3]),\ - 'neural+packet':np.arange(0,104,1),\ - 'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\ - 'cisco':np.arange(104,104+(2*numCiscoFeatures),1)} - -globalNumFeatures = len(featureTypeDict['neural+packet+cisco']) diff --git a/stackedNeuralModels.py b/stackedNeuralModels.py deleted file mode 100644 index c6fd5da..0000000 --- a/stackedNeuralModels.py +++ /dev/null @@ -1,215 +0,0 @@ -# -*- coding: utf-8 -*- -import csv - -import numpy as np -from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda -from keras.layers import Input -from keras.models import Model -from keras.models import Sequential -from tqdm import tqdm - - -def getCiscoFeatures(curDataLine, urlSIPDict): - numCiscoFeatures = 30 - try: - ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])] - # print('cisco features: ' + str(ciscoFeatures)) - # log transform - ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32') - # print('log transformed: ' + str(ciscoFeatures)) - return ciscoFeatures.ravel() - except: - return np.zeros([numCiscoFeatures, ]).ravel() - - -def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size, - hidden_dims, drop_out): - model = Sequential() - model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize, - input_length=input_length)) - - model.add(Conv1D(filters, - kernel_size, - activation='relu')) - - # we use max pooling: - model.add(GlobalMaxPooling1D()) - - # We add a vanilla hidden layer: - model.add(Dense(hidden_dims)) - model.add(Dropout(drop_out)) - model.add(Activation('relu')) - return model - - -def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size, - hidden_dims, drop_out): - a = Input(shape=(input_length,)) - embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a) - conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding) - glob = GlobalMaxPooling1D()(conv1) - dense = Dense(hidden_dims)(glob) - drop = Dropout(drop_out)(dense) - model = Activation('relu')(drop) - model = Model(a, model) - return model - - -def getFlowFeatureLayer(numFeatures): - model = Sequential() - # slpModel.add(Dense(1, input_shape=(1,))) - model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,))) - return model - - -def 
createCNNDataSet(domains, label, characterDict, maxLen=40):
-    # encode domains in reverse order, last character first
-    outFeature = np.zeros([len(domains), maxLen])
-    outLabel = np.zeros([len(domains), ])
-    for i in range(len(domains)):
-        domain = domains[i]
-        curLabel = label[i]
-        curFeature = np.zeros([maxLen, ])
-        for j in range(np.min([len(domain), maxLen])):
-            curCharacter = domain[-(j + 1)]
-            if curCharacter in characterDict:
-                curFeature[j] = characterDict[curCharacter]
-        outFeature[i] = curFeature
-        outLabel[i] = curLabel
-    return (outFeature, outLabel)
-
-
-def getFeatureVecForDomain(domain, characterDict, maxLen=40):
-    # encode the domain in reverse order, last character first
-    curFeature = np.zeros([maxLen, ])
-    for j in range(np.min([len(domain), maxLen])):
-        curCharacter = domain[-(j + 1)]
-        if curCharacter in characterDict:
-            curFeature[j] = characterDict[curCharacter]
-    return curFeature
-
-
-def getFlowFeatures(curDataLine):
-    useKeys = ['duration', 'bytes_down', 'bytes_up']
-    curFeature = np.zeros([len(useKeys), ])
-    for i in range(len(useKeys)):
-        curKey = useKeys[i]
-        try:
-            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
-        except:
-            pass
-    return curFeature
-
-
-def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
-                               maxLengthInSeconds=300):
-    maxMilliSeconds = maxLengthInSeconds * 1000
-    outDomainLists = []
-    outDFFrames = []
-    if not overlapping:
-        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
-        userIDs = np.arange(len(dataFrame))
-        for blockID in np.arange(numBlocks):
-            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
-            useData = dataFrame.iloc[curIDs]
-            curDomains = useData['domain']
-            if maxLengthInSeconds != -1:
-                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
-                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
-                if len(underTimeOutIDs) != len(curIDs):
-                    curIDs = curIDs[underTimeOutIDs]
-                    useData = dataFrame.iloc[curIDs]
-                    curDomains = useData['domain']
-            outDomainLists.append(list(curDomains))
-            outDFFrames.append(useData)
-    else:
-        numBlocks = len(dataFrame) + 1 - windowSize
-        userIDs = np.arange(len(dataFrame))
-        for blockID in np.arange(numBlocks):
-            curIDs = userIDs[blockID:blockID + windowSize]
-            useData = dataFrame.iloc[curIDs]
-            curDomains = useData['domain']
-            if maxLengthInSeconds != -1:
-                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
-                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
-                if len(underTimeOutIDs) != len(curIDs):
-                    curIDs = curIDs[underTimeOutIDs]
-                    useData = dataFrame.iloc[curIDs]
-                    curDomains = useData['domain']
-            outDomainLists.append(list(curDomains))
-            outDFFrames.append(useData)
-    return (outDomainLists, outDFFrames)
-
-
-def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
-                    flagUseCiscoFeatures=False, urlSIPDIct=dict(),
-                    windowSize=10):
-    if 'hits' in dfLists[0].keys():
-        hitName = 'hits'
-    elif 'virusTotalHits' in dfLists[0].keys():
-        hitName = 'virusTotalHits'
-    numFlowFeatures = 3
-    numCiscoFeatures = 30
-    numFeatures = numFlowFeatures
-    if flagUseCiscoFeatures:
-        numFeatures += numCiscoFeatures
-    outputFeatures = []
-    label = []
-    hits = []
-    trainNames = []
-    # two inputs per window position: domain characters and flow features
-    for i in range(windowSize):
-        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
-        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
-
-    for i in
tqdm(np.arange(len(domainLists)), miniters=10): - curCounter = 0 - # print('len domainList: ' + str(len(domainLists[i]))) - # print('len df: ' + str(len(dfLists[i]))) - for j in range(np.min([windowSize, len(domainLists[i])])): - outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen) - curCounter += 1 - if flagUseCiscoFeatures: - outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j]) - outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct) - else: - outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j]) - curCounter += 1 - curLabel = 0.0 - if np.max(dfLists[i][hitName]) >= threshold: - curLabel = 1.0 - elif np.max(dfLists[i][hitName]) == -1: - curLabel = -1.0 - elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold: - curLabel = -2.0 - label.append(curLabel) - hits.append(np.max(dfLists[i][hitName])) - trainNames.append(np.unique(dfLists[i]['user_hash'])) - return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames)) - - -def transformStringListToNumpyArray(listString): - listString = listString.replace('[', '').replace(']', '') - return np.array(listString.split(','), dtype='float32') - - -def getCiscoFeatureDict(csvPathList): - outDict = dict() - for path in tqdm(csvPathList, miniters=1): - fobj = open(path, 'r') - csvReader = csv.DictReader(fobj, delimiter=',') - for row in csvReader: - urlSIPString = row['Domain'] + row['ServerIP'] - ciscoFeatures = row['CiscoFeature'] - outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures) - # if len(outDict) % 10000 == 0: - # print('numbers in dict: ' + str(len(outDict))) - return outDict - - -if __name__ == "__main__": - pass
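# [Editor's sketch, not part of the original patch.] createTrainData encodes
# each domain in a window as a fixed-length integer vector via a
# character-to-index dictionary, reading the domain from its last character
# backwards (see getFeatureVecForDomain). A minimal usage sketch; the
# character dictionary and names below are illustrative, not from the
# original code:

import numpy as np

def encodeDomain(domain, characterDict, maxLen=40):
    # Position 0 holds the last character, position 1 the second-to-last,
    # and so on; unknown characters and unused positions stay 0 (padding).
    curFeature = np.zeros([maxLen, ])
    for j in range(min(len(domain), maxLen)):
        curCharacter = domain[-(j + 1)]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature

characterDict = {c: i + 1 for i, c in enumerate('abcdefghijklmnopqrstuvwxyz0123456789.-')}
print(encodeDomain('example.com', characterDict))  # starts 13, 15, 3 for 'm', 'o', 'c'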