# -*- coding: utf-8 -*-
# ma_cisco_malware/ciscoProcessing.py
import sys
sys.path.append('..')
sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
import os
import numpy as np
import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import csv
import pandas as pd
import random
from keras.models import model_from_json
import time
import re
import mongoDBConnector  # used by getSubSample and getSubSampleAllPositiveUsers
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm
def getCiscoDomainLabel(curDomain, curSIP, hostSet, sipSet, sldSet):
    # check server-ip
    if curSIP in sipSet:
        return 1.0
    # check second-level domain
    splitDomain = curDomain.split('.')
    if len(splitDomain) >= 2:
        curSLD = splitDomain[-2] + '.' + splitDomain[-1]
    else:
        curSLD = curDomain
    if curSLD in sldSet:
        return 1.0
    # check the full domain and its second-level domain against the host set
    if curDomain in hostSet or curSLD in hostSet:
        return 1.0
    return 0.0
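
# Illustrative usage (hypothetical values): the second-level-domain fallback
# means any subdomain of a listed SLD is labeled positive.
def _demoGetCiscoDomainLabel():
    hostSet = {'evil.example.com'}
    sipSet = {'10.0.0.1'}
    sldSet = {'bad.net'}
    assert getCiscoDomainLabel('sub.bad.net', '8.8.8.8', hostSet, sipSet, sldSet) == 1.0
    assert getCiscoDomainLabel('good.org', '8.8.8.8', hostSet, sipSet, sldSet) == 0.0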
def getSubSample(useDir, numUser, threshold=3,
                 windowSize=10, minFlowsPerUser=10,
                 maxLen=40, flagUseCiscoFeatures=False,
                 urlSIPDIct=dict(), characterDict=dict(),
                 maxLengthInSeconds=-1,
                 timesNeg=-1,
                 mongoHost='', mongoPort=0, dbName='',
                 collectionName='', metaCollectionName=''):
    curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost, mongoPort=mongoPort, dbName=dbName,
                                                useDir=useDir, collectionName=collectionName,
                                                metaCollectionName=metaCollectionName,
                                                numUser=numUser, minFlowsPerUser=minFlowsPerUser)
    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(curDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(
            curDFs[i], windowSize=windowSize, overlapping=False,
            maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=urlSIPDIct,
        windowSize=windowSize)
    useIDs = np.where(np.array(testLabel) == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
    if timesNeg != -1:
        posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
        negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
        if len(negIDs) > len(posIDs) * timesNeg:
            negIDs = np.random.permutation(negIDs)
            negIDs = negIDs[0:len(posIDs) * timesNeg]
        negIDs = useIDs[negIDs]
        posIDs = useIDs[posIDs]
        useIDs = np.concatenate([negIDs, posIDs])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for i in range(len(testData)):
        testData[i] = testData[i][useIDs]
    return (testData, testLabel, testHits, testNames)
def getSubSampleAllPositiveUsers(useDir, threshold=3,
                                 windowSize=10, minFlowsPerUser=10,
                                 maxLen=40, flagUseCiscoFeatures=False,
                                 urlSIPDIct=dict(), characterDict=dict(),
                                 maxLengthInSeconds=-1,
                                 numNegUser=10000,
                                 mongoHost='', mongoPort=0, dbName='',
                                 collectionName='', metaCollectionName=''):
    curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost, mongoPort=mongoPort,
                                                           dbName=dbName, useDir=useDir,
                                                           collectionName=collectionName,
                                                           metaCollectionName=metaCollectionName,
                                                           numNegUser=numNegUser,
                                                           minFlowsPerUser=minFlowsPerUser)
    domainLists = []
    dfLists = []
    for i in tqdm(np.arange(len(curDFs)), miniters=10):
        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(
            curDFs[i], windowSize=windowSize, overlapping=False,
            maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=urlSIPDIct,
        windowSize=windowSize)
    useIDs = np.where(np.array(testLabel) == 1.0)[0]
    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
    testLabel = testLabel[useIDs]
    testHits = testHits[useIDs]
    testNames = testNames[useIDs]
    for i in range(len(testData)):
        testData[i] = testData[i][useIDs]
    return (testData, testLabel, testHits, testNames)
def sequenceGenerator(useDir, numUser, threshold=3,
                      windowSize=10, minFlowsPerUser=10,
                      maxLen=40, flagUseCiscoFeatures=False,
                      urlSIPDIct=dict(), characterDict=dict(),
                      maxLengthInSeconds=-1,
                      timesNeg=-1,
                      mongoHost='', mongoPort=0, dbName='',
                      collectionName='', metaCollectionName=''):
    while True:
        (testData, testLabel, testHits, testNames) = getSubSample(useDir, numUser, threshold=threshold,
            windowSize=windowSize, minFlowsPerUser=minFlowsPerUser,
            maxLen=maxLen, flagUseCiscoFeatures=flagUseCiscoFeatures,
            urlSIPDIct=urlSIPDIct, characterDict=characterDict,
            maxLengthInSeconds=maxLengthInSeconds,
            timesNeg=timesNeg,
            mongoHost=mongoHost, mongoPort=mongoPort, dbName=dbName,
            collectionName=collectionName, metaCollectionName=metaCollectionName)
        testLabel = np_utils.to_categorical(testLabel, 2)
        yield (testData, testLabel)

def sequenceGeneratorTest(data, label):
    while True:
        yield (data, label)
# three modes for the ordering of the flows within a window:
#   mode == 'correct'   -> don't permute or touch the ordering
#   mode == 'permutate' -> permute the ordering
#   mode == 'sort'      -> sort the flows by sent bytes
# NOTE: this is an unimplemented stub; a sketch of the described behavior
# follows below.
def dataGenerator(trainData, trainLabel, numTimesPos, mode='correct'):
    return True
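
# A minimal sketch of the generator described above, under the assumption that
# trainData has shape [numWindows, windowSize, numFeatures] and that the
# sent-bytes ('bytes_up') feature sits at column index 2, as in the default
# flowFeatures list. Function and parameter names are illustrative, not part
# of the original code.
def dataGeneratorSketch(trainData, trainLabel, mode='correct', bytesUpCol=2):
    while True:
        batch = np.array(trainData, dtype='float32')
        if mode == 'permutate':
            # permute the flow ordering independently within every window
            for w in range(batch.shape[0]):
                batch[w] = batch[w][np.random.permutation(batch.shape[1])]
        elif mode == 'sort':
            # sort the flows of every window by the sent-bytes feature
            for w in range(batch.shape[0]):
                batch[w] = batch[w][np.argsort(batch[w][:, bytesUpCol])]
        # mode == 'correct' leaves the ordering untouched
        yield (batch, np_utils.to_categorical(trainLabel, 2))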
def getMalwareClassDict(path):
    outDict = dict()
    for line in open(path):
        lineSplit = line.strip().split('\t')
        if len(lineSplit) == 3:
            outDict[lineSplit[0]] = (lineSplit[1], lineSplit[2])
    return outDict
def applyLower(inStr):
    try:
        return inStr.lower()
    except AttributeError:
        # non-string input (e.g. NaN) is returned unchanged
        return inStr

def logTransformData(inputMatrix):
    # log(1 + x) transform; non-numeric input is returned unchanged
    try:
        return np.log1p(np.array(inputMatrix, dtype='float64'))
    except (ValueError, TypeError):
        return inputMatrix
def getTrainMatrixLabelFromDataFrame(dataFrame, parameter=dict(),
                                     hostDict=dict(), sipDict=dict(), vtDF=dict(),
                                     flagReturnDomains=False):
    if len(dataFrame) == 0:
        return ([], -1)
    if 'flowFeatures' in parameter:
        flowFeatures = parameter['flowFeatures']
    else:
        flowFeatures = ['duration', 'bytes_down', 'bytes_up']
    # extract flow features
    data = dataFrame[flowFeatures].values
    # time-gap feature: seconds since the previous flow, clipped at 0
    timeStamps = np.array(dataFrame['timeStamp'].values, dtype='float32')
    timeStampsPre = np.zeros([len(timeStamps), ])
    timeStampsPre[1:] = timeStamps[0:len(timeStamps) - 1]
    diffTimeStamps = timeStamps - timeStampsPre
    diffTimeStamps[0] = 0.0
    negIDs = np.where(diffTimeStamps < 0.0)[0]
    diffTimeStamps[negIDs] = 0.0
    diffTimeStamps = np.reshape(diffTimeStamps, [len(diffTimeStamps), 1])
    data = np.hstack([data, diffTimeStamps])
    # log transform
    data = logTransformData(data)
    if 'urlFeature' in dataFrame:
        urlFeatures = np.zeros([len(dataFrame), len(dataFrame.iloc[0]['urlFeature'])])
        for i in range(len(dataFrame)):
            urlFeatures[i, :] = dataFrame.iloc[i]['urlFeature']
        data = np.hstack([data, urlFeatures])
    # cisco features: host-based block followed by server-ip-based block
    numCiscoFeature = 30
    ciscoFeatures = np.zeros([data.shape[0], 2 * numCiscoFeature])
    if len(hostDict) > 0:
        for i in range(len(dataFrame)):
            curHost = extractHost(dataFrame.iloc[i]['domain'])
            if curHost in hostDict:
                ciscoFeatures[i, 0:numCiscoFeature] = hostDict[curHost]
    if len(sipDict) > 0:
        for i in range(len(dataFrame)):
            curSIP = dataFrame.iloc[i]['server_ip']
            if curSIP in sipDict:
                ciscoFeatures[i, numCiscoFeature:] = sipDict[curSIP]
    data = np.hstack([data, ciscoFeatures])
    # VirusTotal hits: look up the per-user hash, -1.0 if unknown
    if len(vtDF) != 0:
        vtHashSet = set(vtDF['hash'])
        hitNums = []
        hashes = dataFrame['anyConnect_hash']
        for curHash in hashes:
            try:
                if curHash.lower() in vtHashSet:
                    curID = np.where(vtDF['hash'] == curHash.lower())[0]
                    if len(curID) >= 1:
                        curID = curID[0]
                        hitNums.append(float(vtDF.iloc[curID]['hits']))
                    else:
                        hitNums.append(-1.0)
                else:
                    hitNums.append(-1.0)
            except AttributeError:
                # non-string hash (e.g. NaN)
                hitNums.append(-1.0)
        maxHits = np.max(hitNums)
    else:
        if 'hits' in dataFrame:
            maxHits = np.max(dataFrame['hits'])
        else:
            maxHits = -1
    label = np.max(dataFrame['label'])
    if flagReturnDomains:
        return (data, label, maxHits, dataFrame['domain'])
    else:
        return (data, label, maxHits)
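
# Illustrative call (hypothetical data): a two-flow DataFrame with the columns
# this function reads. Without 'urlFeature', 'hits', or dictionary arguments,
# the result is 4 log-transformed flow features plus 60 zero-filled cisco
# columns per flow, label 0.0 and maxHits -1.
def _demoGetTrainMatrix():
    demoDF = pd.DataFrame({'duration': [0.5, 1.2],
                           'bytes_down': [1000.0, 50.0],
                           'bytes_up': [200.0, 40.0],
                           'timeStamp': [0.0, 3.0],
                           'domain': ['example.com', 'cdn.example.com'],
                           'server_ip': ['1.2.3.4', '1.2.3.4'],
                           'label': [0.0, 0.0]})
    (data, label, maxHits) = getTrainMatrixLabelFromDataFrame(demoDF)
    assert data.shape == (2, 64) and label == 0.0 and maxHits == -1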
def getDomainChunksByUser(data, useUserName, blockSize):
    outData = []
    outLabel = []
    useDataAll = data[data['user_hash'] == useUserName]
    userIDs = np.arange(len(useDataAll))
    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
    for blockID in range(numBlocks):
        curIDs = userIDs[(blockID * blockSize):((blockID + 1) * blockSize)]
        useData = useDataAll.iloc[curIDs]
        curDomains = useData['domain']
        curLabel = np.max(useData['label'])
        outData.append(curDomains)
        outLabel.append(curLabel)
    return (outData, outLabel)
def getChunksByUser(data, useUserName, blockSize, parameter=dict(),
                    hostDict=dict(), sipDict=dict(), vtDF=dict(), flagOnlyOneUser=False,
                    flagReturnDomains=False):
    outData = []
    outLabel = []
    outHits = []
    outDomains = []
    if flagOnlyOneUser:
        useDataAll = data
    else:
        useDataAll = data[data['user_hash'] == useUserName]
    userIDs = np.arange(len(useDataAll))
    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
    for blockID in range(numBlocks):
        curIDs = userIDs[(blockID * blockSize):((blockID + 1) * blockSize)]
        useData = useDataAll.iloc[curIDs]
        if flagReturnDomains:
            (curTrainData, curLabel, curMaxHits, curDomains) = getTrainMatrixLabelFromDataFrame(
                useData, parameter, hostDict, sipDict, vtDF=vtDF, flagReturnDomains=flagReturnDomains)
        else:
            (curTrainData, curLabel, curMaxHits) = getTrainMatrixLabelFromDataFrame(
                useData, parameter, hostDict, sipDict, vtDF=vtDF, flagReturnDomains=flagReturnDomains)
        outData.append(curTrainData)
        outLabel.append(curLabel)
        outHits.append(curMaxHits)
        if flagReturnDomains:
            outDomains.append(curDomains)
    if flagReturnDomains:
        return (outData, outLabel, outHits, outDomains)
    else:
        return (outData, outLabel, outHits)
def getLSTMModel(blockSize=10, input_dim=103, lstmUnits=10, denseSize=128):
    nb_classes = 2
    model = Sequential()
    model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
    model.add(Dense(denseSize, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    # number of LSTM params:
    # params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
    return model
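
# Worked example of the parameter formula above for the defaults
# (input_dim=103, lstmUnits=10): each of the four LSTM gates holds an input
# weight matrix, a recurrent weight matrix and a bias, so
#   params = 4 * ((103 + 1) * 10 + 10 ** 2) = 4 * (1040 + 100) = 4560,
# which should match model.layers[0].count_params() for the model returned
# by getLSTMModel().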
def getCiscoURLFeatureForRow(row):
    sortKeys = list(row.keys())
    sortKeys.sort()
    featureVec = np.zeros([len(sortKeys) - 1, ])
    counter = 0
    for keyName in sortKeys:
        if 'key' in keyName:
            continue
        try:
            featureVec[counter] = float(row[keyName])
        except (ValueError, TypeError):
            featureVec[counter] = 0.0
        counter += 1
    featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
    return featureVec
def getCiscoFeatureDictForHost(headerPath, dataPath):
    # read the header file (one column name per line)
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    hostDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        curHost = extractHost(row['key'])
        hostDict[curHost] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' host features collected')
    return hostDict

def getCiscoFeatureDictForSIP(headerPath, dataPath):
    # read the header file (one column name per line)
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    sipDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        sipDict[row['key']] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' sip features collected')
    return sipDict

def getCiscoFeatureDictForSLD(headerPath, dataPath):
    # read the header file (one column name per line)
    header = []
    for line in open(headerPath):
        header.append(line.strip())
    header = ['key'] + header
    fobj = open(dataPath, 'r')
    csvReader = csv.DictReader(fobj, fieldnames=header, delimiter='\t')
    sldDict = dict()
    counter = 0
    for row in csvReader:
        featureVec = getCiscoURLFeatureForRow(row)
        sldDict[row['key']] = featureVec
        counter += 1
        if counter % 10000 == 0:
            print(str(counter) + ' sld features collected')
    return sldDict
def extractHost(domain):
    # reduce a domain to its second-level domain (last two labels)
    curHostSplit = domain.split('.')
    try:
        return curHostSplit[-2] + '.' + curHostSplit[-1]
    except IndexError:
        return domain
def loadDataSetFromJoblib(trainDirs, minFlowsPerUser=10, numTimesPos=20):
    for dirID in range(len(trainDirs)):
        curDir = trainDirs[dirID]
        curFiles = os.listdir(curDir)
        dayJoblibCounter = 0
        for curFile in curFiles:
            curFile = curDir + curFile
            if curFile.endswith('.joblib'):
                curData = joblib.load(curFile)
                if dayJoblibCounter == 0:
                    dayData = curData
                else:
                    dayData = dayData.append(curData, ignore_index=True)
                dayJoblibCounter += 1
                print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) + ')')
        # use only users with at least minFlowsPerUser flows and a definite label
        if minFlowsPerUser != -1:
            grouped = dayData.groupby('user_hash')
            useUsers = set()
            for grouping in grouped:
                numFlowsCurUser = len(grouping[1])
                userLabel = np.max(grouping[1]['label'])
                if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
                    useUsers.add(grouping[0])
            # get ids
            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
            dayData = dayData.iloc[userIDs]
            dayData = dayData.reset_index(drop=True)
        # cap the negative users at numTimesPos times the positive users
        if numTimesPos != -1:
            grouped = dayData.groupby('user_hash')
            curUserLabel = []
            curUserNames = []
            for grouping in grouped:
                userLabel = np.max(grouping[1]['label'])
                curUserLabel.append(userLabel)
                curUserNames.append(grouping[1]['user_hash'].values[0])
            posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
            negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
            maxNegLabel = len(posIDs) * numTimesPos
            if len(negIDs) > maxNegLabel:
                np.random.seed(1)
                np.random.shuffle(negIDs)
                negIDs = negIDs[0:maxNegLabel]
            useIDs = np.concatenate([posIDs, negIDs])
            useUsers = set(np.array(curUserNames)[useIDs])
            # get ids
            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
            dayData = dayData.iloc[userIDs]
            dayData = dayData.reset_index(drop=True)
        if dirID == 0:
            allData = dayData
        else:
            allData = allData.append(dayData, ignore_index=True)
    return allData
def tokenizeDomain(domain, n=3):
    domain = domain.replace('https://', '')
    domain = domain.replace('www.', '')
    domain = domain.replace('/', '')
    # reverse the domain so that the TLD comes first
    domain = domain[::-1]
    # overlapping n-grams with stride 1
    return [domain[i:i + n] for i in range(0, len(domain), 1)]
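
# Example: tokenizeDomain('www.example.com') reverses 'example.com' to
# 'moc.elpmaxe' and yields the overlapping trigrams
#   ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma', 'max', 'axe', 'xe', 'e']
# (the final entries are the shorter tails of the stride-1 slicing).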
def getDomainsInWindowData(allData, numNeg=-1, blockSize=10):
    uniqueTrainUser = np.unique(allData['user_hash'])
    userLabel = []
    for curTrainUser in uniqueTrainUser:
        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
        curLabel = np.max(allData.iloc[userIDs]['label'])
        userLabel.append(curLabel)
    userLabel = np.array(userLabel)
    posUser = np.where(userLabel == 1.0)[0]
    negUser = np.where(userLabel == 0.0)[0]
    # optionally subsample the negative users
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser, negUser])
    counter = 0
    trainDomains = []
    trainBlockLabel = []
    trainNames = []
    for uID in range(len(useUser)):
        curTrainUser = uniqueTrainUser[useUser[uID]]
        (curUserData, curUserLabel) = getDomainChunksByUser(allData, curTrainUser, blockSize)
        for i in range(len(curUserLabel)):
            trainNames.append(curTrainUser)
        trainDomains += curUserData
        trainBlockLabel += curUserLabel
        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
        counter += 1
    return (trainDomains, trainBlockLabel, trainNames)
def getPaddedData(allData, numNeg=-1, blockSize=10, parameterDict=dict(),
                  hostDict=dict(), sipDict=dict(), vtLabelPath=''):
    if vtLabelPath != '':
        vtDF = pd.read_csv(vtLabelPath, sep='\t')
    else:
        vtDF = dict()
    uniqueTrainUser = np.unique(allData['user_hash'])
    userLabel = []
    for curTrainUser in uniqueTrainUser:
        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
        curLabel = np.max(allData.iloc[userIDs]['label'])
        userLabel.append(curLabel)
    userLabel = np.array(userLabel)
    posUser = np.where(userLabel == 1.0)[0]
    negUser = np.where(userLabel == 0.0)[0]
    # optionally subsample the negative users
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser, negUser])
    counter = 0
    trainBlocks = []
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    for uID in range(len(useUser)):
        curTrainUser = uniqueTrainUser[useUser[uID]]
        (curUserData, curUserLabel, curHits) = getChunksByUser(allData, curTrainUser, blockSize,
            parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF)
        for i in range(len(curUserLabel)):
            trainNames.append(curTrainUser)
        trainBlocks += curUserData
        trainBlockLabel += curUserLabel
        trainBlockHits += curHits
        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
        counter += 1
    paddedData = pad_sequences(trainBlocks, maxlen=blockSize, dtype='float32')
    return (paddedData, trainBlockLabel, trainNames, trainBlockHits)
def createTrainDataFromJoblibsPerUser(joblibPaths, minFlowsPerUser=10, blockSize=10,
                                      hostDict=dict(), sipDict=dict(),
                                      vtLabelPath='', maxFlowsPerUser=50000):
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    parameterDict = dict()
    numBlocksToInitialize = 10000
    paddedData = np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])
    overallCounter = 0
    startTime = time.time()
    for uID in range(len(joblibPaths)):
        curSavePath = joblibPaths[uID]
        curData = joblib.load(curSavePath)['dataFrame']
        if len(curData) < minFlowsPerUser:
            continue
        (curUserData, curUserLabel, curHits) = getChunksByUser(curData, '', blockSize,
            parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=dict(),
            flagOnlyOneUser=True)
        curPaddedData = pad_sequences(curUserData, maxlen=blockSize, dtype='float32')
        if curPaddedData.shape[0] > maxFlowsPerUser:
            curPaddedData = curPaddedData[0:maxFlowsPerUser]
            curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
            curHits = list(np.array(curHits)[0:maxFlowsPerUser])
        for i in range(len(curPaddedData)):
            trainNames.append(curSavePath)
        trainBlockLabel += curUserLabel
        trainBlockHits += curHits
        numCurInstances = curPaddedData.shape[0]
        while overallCounter + numCurInstances > paddedData.shape[0]:
            paddedData = np.vstack([paddedData, np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])])
        paddedData[overallCounter:overallCounter + numCurInstances, :] = curPaddedData
        overallCounter += numCurInstances
        if uID % 1000 == 0:
            elapsedTime = time.time() - startTime
            startTime = time.time()
            print(str(uID + 1) + ' user processed [' + str(elapsedTime) + ']')
    paddedData = paddedData[0:overallCounter]
    return (paddedData, trainBlockLabel, trainNames, trainBlockHits)
def loadDataSetFromJoblibPerUser(trainDirs, minFlowsPerUser=10, numNegPerDay=50000,
                                 blockSize=10, hostDict=dict(), sipDict=dict(),
                                 seed=1, flagSkipNoLabelUser=False,
                                 vtLabelPath='', maxFlowsPerUser=50000,
                                 flagReturnDomains=False):
    if vtLabelPath != '':
        vtDF = pd.read_csv(vtLabelPath, sep='\t')
    else:
        vtDF = dict()
    trainBlockLabel = []
    trainNames = []
    trainBlockHits = []
    trainBlockDomains = []
    parameterDict = dict()
    numBlocksToInitialize = 10000
    paddedData = np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])
    overallCounter = 0
    for curDirID in range(len(trainDirs)):
        curDir = trainDirs[curDirID]
        curLabelFile = curDir + 'data_label.joblib'
        labelData = joblib.load(curLabelFile)
        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
        random.seed(seed)
        random.shuffle(negIDs)
        useIDs = np.concatenate([posIDs, negIDs])
        counter = 0
        negCounter = 0
        startTime = time.time()
        for uID in range(len(useIDs)):
            curID = useIDs[uID]
            curUserName = labelData['usernames'][curID]
            curSavePath = curDir + str(curUserName) + '.joblib'
            curData = joblib.load(curSavePath)['dataFrame']
            if flagSkipNoLabelUser:
                curUserLabel = np.max(curData['label'])
                if curUserLabel == -1.0:
                    continue
            if len(curData) < minFlowsPerUser:
                continue
            if numNegPerDay != -1:
                if negCounter > numNegPerDay:
                    break
            if flagReturnDomains:
                (curUserData, curUserLabel, curHits, curDomains) = getChunksByUser(curData, curUserName,
                    blockSize, parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF,
                    flagReturnDomains=flagReturnDomains)
            else:
                (curUserData, curUserLabel, curHits) = getChunksByUser(curData, curUserName,
                    blockSize, parameter=parameterDict, hostDict=hostDict, sipDict=sipDict, vtDF=vtDF,
                    flagReturnDomains=flagReturnDomains)
            curPaddedData = pad_sequences(curUserData, maxlen=blockSize, dtype='float32')
            if curPaddedData.shape[0] > maxFlowsPerUser:
                curPaddedData = curPaddedData[0:maxFlowsPerUser]
                curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
                curHits = list(np.array(curHits)[0:maxFlowsPerUser])
                if flagReturnDomains:
                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
            for i in range(len(curPaddedData)):
                trainNames.append(curUserName)
            trainBlockLabel += curUserLabel
            trainBlockHits += curHits
            if flagReturnDomains:
                trainBlockDomains += curDomains
            numCurInstances = curPaddedData.shape[0]
            while overallCounter + numCurInstances > paddedData.shape[0]:
                paddedData = np.vstack([paddedData, np.zeros([numBlocksToInitialize, blockSize, globalNumFeatures])])
            paddedData[overallCounter:overallCounter + numCurInstances, :] = curPaddedData
            overallCounter += numCurInstances
            if (counter + 1) % 1000 == 0:
                elapsedTime = time.time() - startTime
                print('processed ' + str(counter + 1) + ' users of ' +
                      str(len(useIDs)) + ' with ' + str(len(curData['label'])) +
                      ' flows [dir ' + str(curDirID + 1) + ' of ' +
                      str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
                startTime = time.time()
            if np.max(np.array(curUserLabel)) == 0.0:
                negCounter += 1
            counter += 1
    paddedData = paddedData[0:overallCounter]
    if flagReturnDomains:
        return (paddedData, trainBlockLabel, trainNames, trainBlockHits, trainBlockDomains)
    else:
        return (paddedData, trainBlockLabel, trainNames, trainBlockHits)
def loadRawDataSetFromJoblibPerUser(trainDirs, numNegPerDay=2000, seed=1):
    dataFrameList = []
    for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
        curDir = trainDirs[curDirID]
        curLabelFile = curDir + 'data_label.joblib'
        labelData = joblib.load(curLabelFile)
        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
        random.seed(seed)
        random.shuffle(negIDs)
        if len(negIDs) >= numNegPerDay:
            negIDs = negIDs[0:numNegPerDay]
        useIDs = np.concatenate([posIDs, negIDs])
        for uID in range(len(useIDs)):
            curID = useIDs[uID]
            curUserName = labelData['usernames'][curID]
            curSavePath = curDir + str(curUserName) + '.joblib'
            curData = joblib.load(curSavePath)['dataFrame']
            dataFrameList.append(curData)
    return dataFrameList
def checkDomainForSecondLevelDomain(inDomain, sldDomainDict):
    if not isinstance(inDomain, str):
        return False
    splitDomain = inDomain.split('.')
    if len(splitDomain) <= 2:
        return False
    sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
    return sldDomain in sldDomainDict

# earlier suffix-matching variant, kept for reference:
'''
out = False
for sldDomain in sldDomainDict:
    if inDomain.endswith(sldDomain):
        out = True
        break
return out
'''
def save_model(model, jsonPath, h5Path):
    # save the architecture as JSON and the weights as HDF5
    json_model = model.to_json()
    with open(jsonPath, 'w') as fobj:
        fobj.write(json_model)
    model.save_weights(h5Path, overwrite=True)

def load_model(jsonPath, h5Path):
    # load the architecture, then the weights
    with open(jsonPath) as fobj:
        model = model_from_json(fobj.read())
    model.load_weights(h5Path)
    return model
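
# Illustrative round trip (hypothetical paths): architecture and weights are
# stored in separate files, and both are needed to restore the model. Note
# that a model restored via model_from_json must be compiled again before
# further training.
def _demoSaveLoadModel():
    model = getLSTMModel()
    save_model(model, 'lstm_model.json', 'lstm_model.h5')
    return load_model('lstm_model.json', 'lstm_model.h5')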
def getResultsFromSavedJoblibFile(joblibFiles, threshold=3):
    testUserScores = []
    testUserLabel = []
    testLabel = []
    testScores = []
    testNames = []
    for joblibPath in joblibFiles:
        print('process: ' + joblibPath)
        tmpJoblib = joblib.load(joblibPath)
        if 'testBlockScores' in tmpJoblib.keys():
            curTestBlockScores = tmpJoblib['testBlockScores']
            for i in range(len(curTestBlockScores)):
                if i == 0:
                    curTestScores = curTestBlockScores[i]
                else:
                    curTestScores = np.concatenate([curTestScores, curTestBlockScores[i]])
            curTestHits = np.array(tmpJoblib['blockHits'])
            curTestScores = np.array(curTestScores)
            # derive the block labels from the hit counts
            curTestLabel = np.ones([len(curTestScores), ]) * -1.0
            curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
            curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
            curTestNames = tmpJoblib['testNames']
        else:
            curTestHits = tmpJoblib['testHits']
            curTestScores = tmpJoblib['testScores']
            curTestLabel = tmpJoblib['testLabel']
            curTestNames = tmpJoblib['testNames']
        # keep blocks that are clearly negative (0 hits) or clearly positive (>= threshold hits)
        useIDs = np.where(curTestHits >= threshold)[0]
        useIDs = np.concatenate([useIDs, np.where(curTestHits == 0.0)[0]])
        curTestScoresT = curTestScores[useIDs]
        curTestLabelT = curTestLabel[useIDs]
        if len(testScores) == 0:
            testScores = curTestScoresT
            testLabel = curTestLabelT
        else:
            testScores = np.concatenate([testScores, curTestScoresT])
            testLabel = np.concatenate([testLabel, curTestLabelT])
        if 'testBlockScores' in tmpJoblib.keys():
            tmpScores = np.array(tmpJoblib['testScores'])
            tmpHits = np.array(tmpJoblib['testHits'])
            tmpLabel = np.ones([len(tmpHits), ]) * -1
            tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
            tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
            useIDs = np.where(tmpLabel == 1.0)[0]
            useIDs = np.concatenate([useIDs, np.where(tmpLabel == 0.0)[0]])
            testUserLabel += list(np.array(tmpLabel)[useIDs])
            testUserScores += list(np.array(tmpScores)[useIDs])
        else:
            # aggregate block scores to user level via the maximum score
            uniqueTestNames = list(np.unique(curTestNames))
            for testName in uniqueTestNames:
                curIDs = np.where(curTestNames == testName)[0]
                curMaxHits = np.max(curTestHits[curIDs])
                # skip users whose hit count is ambiguous (between 0 and threshold)
                if curMaxHits > 0 and curMaxHits < threshold:
                    continue
                if curMaxHits >= threshold:
                    testUserLabel.append(1.0)
                else:
                    testUserLabel.append(0.0)
                curScore = np.max(curTestScores[curIDs])
                testUserScores.append(curScore)
                testNames.append(testName)
    testUserScores = np.array(testUserScores)
    testUserLabel = np.array(testUserLabel)
    testNames = np.array(testNames)
    return (testUserScores, testUserLabel, testLabel, testScores, testNames)
def checkIfIP(host):
    # matches dotted-quad IPv4 addresses with octets 0-255
    ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.' \
             r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    return re.search(ipMask, host) is not None
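
# Examples: checkIfIP('192.168.0.1') -> True, checkIfIP('256.1.1.1') -> False
# (octets above 255 are rejected), checkIfIP('example.com') -> False.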
# GLOBAL VALUES
numCiscoFeatures = 30
featureTypeDict = {'neural': np.arange(4, 104, 1),
                   'packet': np.array([0, 1, 2, 3]),
                   'neural+packet': np.arange(0, 104, 1),
                   'neural+packet+cisco': np.arange(0, 104 + (2 * numCiscoFeatures), 1),
                   'cisco': np.arange(104, 104 + (2 * numCiscoFeatures), 1)}
globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
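
# Feature-column layout assumed by featureTypeDict, matching the order in
# which getTrainMatrixLabelFromDataFrame stacks the columns:
#   0..3     'packet': duration, bytes_down, bytes_up, time gap
#   4..103   'neural': 100 URL-based features (the 'urlFeature' column)
#   104..163 'cisco' : 30 host-based + 30 server-ip-based features
# so globalNumFeatures == 164.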