removed old files from paul

René Knaebel 2017-06-30 17:21:03 +02:00
parent d19036a611
commit deac7f9e58
2 changed files with 0 additions and 1061 deletions

View File

@@ -1,846 +0,0 @@
# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
import os
import numpy as np
import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import csv
import pandas as pd
import random
from keras.models import model_from_json
import time
import re
# required by getSubSample and getSubSampleAllPositiveUsers below
import mongoDBConnector as mongoDBConnector
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm
def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
# check server-ip
if curSIP in sipSet:
return 1.0
# check second level domain
splitDomain = curDomain.split('.')
if len(splitDomain) >= 2:
curSLD = splitDomain[-2] + '.' + splitDomain[-1]
else:
curSLD = curDomain
if curSLD in sldSet:
return 1.0
# check domain or its second-level domain
if curDomain in hostSet or curSLD in hostSet:
return 1.0
return 0.0
def getSubSample(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numUser=numUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
if timesNeg != -1:
posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
if len(negIDs) > len(posIDs) * timesNeg:
negIDs = np.random.permutation(negIDs)
negIDs = negIDs[0:len(posIDs) * timesNeg]
negIDs = useIDs[negIDs]
posIDs = useIDs[posIDs]
useIDs = np.concatenate([negIDs,posIDs])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def getSubSampleAllPositiveUsers(useDir,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
numNegUser=10000,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def sequenceGenerator(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
while 1:
(testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
urlSIPDIct=urlSIPDIct,characterDict=characterDict,
maxLengthInSeconds=maxLengthInSeconds,
timesNeg=timesNeg,
mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
collectionName=collectionName,metaCollectionName=metaCollectionName)
testLabel = np_utils.to_categorical(testLabel, 2)
#print(testData.shape)
yield (testData, testLabel)
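# Hypothetical usage sketch (not from this repository; argument values are
# placeholders): sequenceGenerator is an endless generator, so it can feed
# Keras' fit_generator directly, e.g.
#   gen = sequenceGenerator('/some/dir/', numUser=100, characterDict=charDict,
#                           mongoHost='localhost', mongoPort=27017, dbName='flows',
#                           collectionName='flows', metaCollectionName='meta')
#   model.fit_generator(gen, steps_per_epoch=10, epochs=5)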
def sequenceGeneratorTest(data,label):
while 1:
yield (data, label)
# three different modes
# if mode == 'correct' -> dont permute or touch the ordering
# if mode == 'permutate' -> permute the ordering
# if mode == 'sort' -> sort the flows by sent bytes
def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
return True
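# dataGenerator above was left as a stub; the following is a minimal sketch
# (an assumption, not the original implementation) of what the three modes
# described in the comment could look like. trainData is assumed to be a 3D
# array [numBlocks, windowSize, numFeatures] with bytes_up in column 2, the
# layout produced by getTrainMatrixLabelFromDataFrame below.
def dataGeneratorSketch(trainData, trainLabel, mode='correct'):
    while 1:
        data = np.array(trainData, copy=True)
        if mode == 'permutate':
            # permute the flow ordering inside every window
            for i in range(data.shape[0]):
                data[i] = data[i][np.random.permutation(data.shape[1])]
        elif mode == 'sort':
            # sort the flows of every window by sent bytes (column 2)
            for i in range(data.shape[0]):
                data[i] = data[i][np.argsort(data[i][:, 2])]
        yield (data, np_utils.to_categorical(trainLabel, 2))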
def getMalwareClassDict(path):
outDict = dict()
# file() was removed in Python 3; use open()
for line in open(path):
lineSplit = line.strip().split('\t')
if len(lineSplit) == 3:
outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
return outDict
def applyLower(inStr):
try:
return inStr.lower()
except:
return inStr
def logTransformData(inputMatrix):
# delete timestamps
try:
return np.log1p(np.array(inputMatrix,dtype='float64'))
except:
return inputMatrix
def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),\
hostDict=dict(),sipDict=dict(),vtDF = dict(),
flagReturnDomains=False):
if len(dataFrame) == 0:
return ([],-1)
if 'flowFeatures' in parameter:
flowFeatures = parameter['flowFeatures']
else:
flowFeatures = ['duration','bytes_down','bytes_up']
# extract flow features
data = dataFrame[flowFeatures].values
# get time-gap feature
timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
timeStampsPre = np.zeros([len(timeStamps),])
timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
diffTimeStamps = timeStamps - timeStampsPre
diffTimeStamps[0] = 0.0
negIDs = np.where(diffTimeStamps < 0.0)[0]
diffTimeStamps[negIDs] = 0.0
diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
data = np.hstack([data,diffTimeStamps])
# log transform
data = logTransformData(data)
if 'urlFeature' in dataFrame:
urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
for i in range(len(dataFrame)):
urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
data = np.hstack([data,urlFeatures])
# cisco feature
numCiscoFeature = 30
ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
if len(hostDict) > 0:
for i in range(len(dataFrame)):
row = dataFrame.iloc[i]
curHost = extractHost(row['domain'])
if curHost in hostDict:
# index by row i; the original wrote to a never-incremented counter, so every hit landed in row 0
ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
if len(sipDict) > 0:
for i in range(len(dataFrame)):
row = dataFrame.iloc[i]
curSIP = row['server_ip']
if curSIP in sipDict:
ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
data = np.hstack([data,ciscoFeatures])
if len(vtDF) != 0:
vtHashSet = set(vtDF['hash'])
hitNums = []
hashes = dataFrame['anyConnect_hash']
for curHash in hashes:
#print(vtDF.keys())
try:
if curHash.lower() in vtHashSet:
curID = np.where(vtDF['hash'] == curHash.lower())[0]
if len(curID) >= 1:
curID = curID[0]
hitNums.append(float(vtDF.iloc[curID]['hits']))
else:
hitNums.append(-1.0)
else:
hitNums.append(-1.0)
except:
hitNums.append(-1.0)
maxHits = np.max(hitNums)
else:
if 'hits' in dataFrame:
maxHits = np.max(dataFrame['hits'])
else:
maxHits = -1
label = np.max(dataFrame['label'])
if flagReturnDomains:
return (data,label,maxHits,dataFrame['domain'])
else:
return (data,label,maxHits)
def getDomainChunksByUser(data,useUserName,blockSize):
outData = []
outLabel = []
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
curDomains = useData['domain']
curLabel = np.max(useData['label'])
outData.append(curDomains)
outLabel.append(curLabel)
return (outData,outLabel)
def getChunksByUser(data,useUserName,blockSize,parameter=dict(),\
hostDict=dict(),sipDict=dict(), vtDF=dict(), flagOnlyOneUser=False,
flagReturnDomains=False):
outData = []
outLabel = []
outHits = []
outDomains = []
if flagOnlyOneUser:
useDataAll = data
else:
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
if flagReturnDomains:
(curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
else:
(curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
outData.append(curTrainData)
outLabel.append(curLabel)
outHits.append(curMaxHits)
if flagReturnDomains:
outDomains.append(curDomains)
if flagReturnDomains:
return (outData,outLabel,outHits,outDomains)
else:
return (outData,outLabel,outHits)
def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
nb_classes = 2
model = Sequential()
# Keras 2 replaced LSTM's input_dim/input_length arguments with input_shape
model.add(LSTM(lstmUnits, input_shape=(blockSize, input_dim)))
model.add(Dense(denseSize, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam', metrics=['accuracy'])
# number of params:
# params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
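# e.g. for input_dim=103 and lstmUnits=10: 4 * ((103 + 1) * 10 + 10**2) = 4560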
return model
def getCiscoURLFeatureForRow(row):
sortKeys = list(row.keys())
sortKeys.sort()
featureVec = np.zeros([len(sortKeys)-1,])
counter = 0
for keyName in sortKeys:
if 'key' in keyName:
continue
try:
featureVec[counter] = float(row[keyName])
except:
featureVec[counter] = 0.0
counter += 1
featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
return featureVec
def getCiscoFeatureDictForHost(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
curHost = extractHost(curHost)
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' host features collected')
return hostDict
def getCiscoFeatureDictForSIP(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sip features collected')
return hostDict
def getCiscoFeatureDictForSLD(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sld features collected')
return hostDict
def extractHost(domain):
curHostSplit = domain.split('.')
try:
curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
return curHost
except:
return domain
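# e.g. extractHost('a.b.example.com') -> 'example.com'; a bare host with no
# dot triggers the IndexError branch and is returned unchanged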
def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
for dirID in range(len(trainDirs)):
curDir = trainDirs[dirID]
curFiles = os.listdir(curDir)
dayJoblibCounter = 0
for curFile in curFiles:
curFile = curDir + curFile
if curFile.endswith('.joblib'):
curData = joblib.load(curFile)
if dayJoblibCounter == 0:
dayData = curData
else:
dayData = dayData.append(curData,ignore_index=True)
dayJoblibCounter += 1
print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) +')')
# use flows with min minFlowsPerUser flows
if minFlowsPerUser != -1:
grouped = dayData.groupby('user_hash')
useUsers = set()
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
useUsers.add(grouping[0])
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if numTimesPos != -1:
grouped = dayData.groupby('user_hash')
curUserLabel = []
curUserNames = []
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
curUserLabel.append(userLabel)
curUserNames.append(grouping[1]['user_hash'].values[0])
posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
maxNegLabel = len(posIDs) * numTimesPos
if len(negIDs) > maxNegLabel:
np.random.seed(1)
np.random.shuffle(negIDs)
negIDs = negIDs[0:maxNegLabel]
useIDs = np.concatenate([posIDs,negIDs])
useUsers = np.array(curUserNames)[useIDs]
useUsers = set(useUsers)
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if dirID == 0:
allData = dayData
else:
allData = allData.append(dayData,ignore_index=True)
return allData
def tokenizeDomain(domain,n=3):
domain = domain.replace('https://','')
domain = domain.replace('www.','')
domain = domain.replace('/','')
# reverse domain
domain = domain[::-1]
# overlapping n-grams over the reversed domain
outList = [domain[i:i + n] for i in range(len(domain))]
return outList
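# e.g. tokenizeDomain('www.example.com') strips the prefix, reverses to
# 'moc.elpmaxe' and yields ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma',
# 'max', 'axe', 'xe', 'e']; note the trailing grams are shorter than n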
def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
if numNeg != -1:
if len(negUser) > numNeg:
np.random.shuffle(negUser)
# subsample the shuffled array; the original sliced the unshuffled negIDs, silently undoing the shuffle
negUser = negUser[0:numNeg]
useUser = np.concatenate([posUser,negUser])
counter = 0
trainDomains = []
trainBlockLabel = []
trainNames = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainDomains += curUserData
trainBlockLabel += curUserLabel
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
return (trainDomains,trainBlockLabel,trainNames)
def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),\
hostDict=dict(),sipDict = dict(),vtLabelPath=''):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
if numNeg != -1:
if len(negUser) > numNeg:
np.random.shuffle(negUser)
# subsample the shuffled array; the original sliced the unshuffled negIDs, silently undoing the shuffle
negUser = negUser[0:numNeg]
useUser = np.concatenate([posUser,negUser])
counter = 0
trainBlocks = []
trainBlockLabel = []
trainNames = []
trainBlockHits = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF = vtDF)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainBlocks += curUserData
trainBlockLabel += curUserLabel
trainBlockHits += curHits
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
#paddedData = paddedData[:,:,featureTypeDict[useFeatureType]]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
hostDict=dict(),sipDict=dict(),
vtLabelPath='',maxFlowsPerUser = 50000):
trainBlockLabel = []
trainNames = []
trainBlockHits = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
startTime = time.time()
for uID in range(len(joblibPaths)):
curSavePath = joblibPaths[uID]
curData = joblib.load(curSavePath)['dataFrame']
if len(curData) < minFlowsPerUser:
continue
#curUserName = np.unique(curData['user_hash'])[0]
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser = True)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
for i in range(len(curPaddedData)):
trainNames.append(curSavePath)
trainBlockLabel += curUserLabel
trainBlockHits += curHits
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
if uID % 1000 == 0:
elapsedTime = time.time() - startTime
startTime = time.time()
print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
paddedData = paddedData[0:overallCounter]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
blockSize = 10,hostDict=dict(),sipDict=dict(),
seed =1,flagSkipNoLabelUser=False,
vtLabelPath='',maxFlowsPerUser = 50000,
flagReturnDomains=False):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
trainBlockLabel = []
trainNames = []
trainBlockHits = []
trainBlockDomains = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
for curDirID in range(len(trainDirs)):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
useIDs = np.concatenate([posIDs,negIDs])
counter = 0
negCounter = 0
startTime = time.time()
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
if flagSkipNoLabelUser:
curUserLabel = np.max(curData['label'])
if curUserLabel == -1.0:
continue
if len(curData) < minFlowsPerUser:
continue
if numNegPerDay != -1:
if negCounter > numNegPerDay:
break
if flagReturnDomains:
(curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
else:
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
if flagReturnDomains:
curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
for i in range(len(curPaddedData)):
trainNames.append(curUserName)
trainBlockLabel += curUserLabel
trainBlockHits += curHits
if flagReturnDomains:
# curDomains only exists when flagReturnDomains is set; the original appended unconditionally and raised a NameError otherwise
trainBlockDomains += curDomains
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
#print('num of instances: ' + str(numCurInstances))
if (counter+1) % 1000 == 0:
elapsedTime = time.time() - startTime
print('processed ' + str(counter+1) + ' users of ' +\
str(len(useIDs)) + ' with ' + str(len(curData['label'])) +\
' flows [dir ' + str(curDirID+1) + ' of ' +\
str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
startTime = time.time()
if np.max(np.array(curUserLabel)) == 0.0:
negCounter += 1
counter+= 1
paddedData = paddedData[0:overallCounter]
if flagReturnDomains:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
else:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000, seed = 1):
dataFrameList = []
overallCounter = 0
for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
if len(negIDs) >= numNegPerDay:
negIDs = negIDs[0:numNegPerDay]
useIDs = np.concatenate([posIDs,negIDs])
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
dataFrameList.append(curData)
overallCounter += 1
return dataFrameList
def checkDomainForSecondLevelDomain(inDomain,sldDomainDict):
if not 'str' in str(type(inDomain)):
return False
splitDomain = inDomain.split('.')
if len(splitDomain) <= 2:
return False
sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
if sldDomain in sldDomainDict:
return True
else:
return False
'''
out = False
for sldDomain in sldDomainDict:
if inDomain.endswith(sldDomain):
out = True
break
return out
'''
def save_model(model,jsonPath,h5Path):
# saving model
json_model = model.to_json()
with open(jsonPath, 'w') as fobj:
fobj.write(json_model)
# saving weights
model.save_weights(h5Path, overwrite=True)
def load_model(jsonPath,h5Path):
# loading model
model = model_from_json(open(jsonPath).read())
model.load_weights(h5Path)
return model
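# Hypothetical round trip (paths are placeholders, not from this repository):
#   save_model(model, 'model.json', 'model.h5')
#   model = load_model('model.json', 'model.h5')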
def getResultsFromSavedJoblibFile(joblibFiles,threshold=3):
testUserScores = []
testUserLabel = []
testLabel = []
testScores = []
testNames = []
for joblibPath in joblibFiles:
print('process: ' + joblibPath)
tmpJoblib = joblib.load(joblibPath)
if 'testBlockScores' in tmpJoblib.keys():
curTestBlockScores = tmpJoblib['testBlockScores']
for i in range(len(curTestBlockScores)):
if i == 0:
curTestScores = curTestBlockScores[i]
else:
curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]])
curTestHits = tmpJoblib['blockHits']
curTestHits = np.array(curTestHits)
curTestScores = np.array(curTestScores)
curTestLabel = np.ones([len(curTestScores),]) * -1.0
curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
curTestNames = tmpJoblib['testNames']
else:
curTestHits = tmpJoblib['testHits']
curTestScores = tmpJoblib['testScores']
curTestLabel = tmpJoblib['testLabel']
curTestNames = tmpJoblib['testNames']
useIDs = np.where(curTestHits >= threshold)[0]
useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]])
# old code
#useIDs = np.where(tmpJoblib['testLabel'] == 1.0)[0]
#useIDs = np.concatenate([useIDs,np.where(tmpJoblib['testLabel'] == 0.0)[0]])
curTestScoresT = curTestScores[useIDs]
curTestLabelT = curTestLabel[useIDs]
if len(testScores) == 0:
testScores = curTestScoresT
testLabel = curTestLabelT
else:
testScores = np.concatenate([testScores,curTestScoresT])
testLabel = np.concatenate([testLabel,curTestLabelT])
if 'testBlockScores' in tmpJoblib.keys():
tmpScores = np.array(tmpJoblib['testScores'])
tmpHits = np.array(tmpJoblib['testHits'])
tmpLabel = np.ones([len(tmpHits),])*-1
tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
useIDs = np.where(tmpLabel == 1.0)[0]
useIDs = np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]])
testUserLabel += list(np.array(tmpLabel)[useIDs])
testUserScores += list(np.array(tmpScores)[useIDs])
else:
# get user label
uniqueTestNames = list(np.unique(curTestNames))
for testName in uniqueTestNames:
curIDs = np.where(curTestNames == testName)[0]
curMaxHits = np.max(curTestHits[curIDs])
if curMaxHits > 0 and curMaxHits < threshold:
continue
if curMaxHits >= threshold:
testUserLabel.append(1.0)
else:
testUserLabel.append(0.0)
curScore = np.max(curTestScores[curIDs])
testUserScores.append(curScore)
testNames.append(testName)
testUserScores = np.array(testUserScores)
testUserLabel = np.array(testUserLabel)
testNames = np.array(testNames)
return (testUserScores,testUserLabel,testLabel,testScores,testNames)
def checkIfIP(host):
ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
if re.search(ipMask, host) is not None:
return True
else:
return False
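# e.g. checkIfIP('192.168.0.1') -> True, checkIfIP('example.com') -> False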
# GLOBAL VALUES
numCiscoFeatures = 30
featureTypeDict = {'neural':np.arange(4,104,1),\
'packet':np.array([0,1,2,3]),\
'neural+packet':np.arange(0,104,1),\
'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\
'cisco':np.arange(104,104+(2*numCiscoFeatures),1)}
globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
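# Assumed column layout behind these index ranges (matching
# getTrainMatrixLabelFromDataFrame above): columns 0-3 are the packet-level
# features (duration, bytes_down, bytes_up, inter-flow time gap), columns
# 4-103 the 100-dimensional neural URL feature, and columns 104-163 the
# per-host and per-server-IP Cisco features (2 * 30)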

View File

@@ -1,215 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import numpy as np
from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from tqdm import tqdm
def getCiscoFeatures(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# print('cisco features: ' + str(ciscoFeatures))
# log transform
ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
# print('log transformed: ' + str(ciscoFeatures))
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
model = Sequential()
model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
input_length=input_length))
model.add(Conv1D(filters,
kernel_size,
activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(drop_out))
model.add(Activation('relu'))
return model
def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
a = Input(shape=(input_length,))
embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a)
conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding)
glob = GlobalMaxPooling1D()(conv1)
dense = Dense(hidden_dims)(glob)
drop = Dropout(drop_out)(dense)
model = Activation('relu')(drop)
model = Model(a, model)
return model
def getFlowFeatureLayer(numFeatures):
model = Sequential()
# slpModel.add(Dense(1, input_shape=(1,)))
model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
return model
def createCNNDataSet(domains, label, characterDict, maxLen=40):
# process domains in reverse order
outFeature = np.zeros([len(domains), maxLen])
outLabel = np.zeros([len(domains), ])
for i in range(len(domains)):
domain = domains[i]
curLabel = label[i]
curFeature = np.zeros([maxLen, ])
# print(domain + ' ' + str(len(domain)))
for j in range(np.min([len(domain), maxLen])):
# take the (j+1)-th character from the end; the original domain[-j] read the *first* character when j == 0
curCharacter = domain[-(j + 1)]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
outFeature[i] = curFeature
outLabel[i] = curLabel
return (outFeature, outLabel)
def getFeatureVecForDomain(domain, characterDict, maxLen=40):
curFeature = np.zeros([maxLen, ])
for j in range(np.min([len(domain), maxLen])):
# take the (j+1)-th character from the end; the original domain[-j] read the *first* character when j == 0
curCharacter = domain[-(j + 1)]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
return curFeature
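# e.g. with characterDict = {'a': 1, 'b': 2, 'c': 3} and maxLen=5, the domain
# 'abc' is encoded back-to-front as [3., 2., 1., 0., 0.]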
def getFlowFeatures(curDataLine):
useKeys = ['duration', 'bytes_down', 'bytes_up']
curFeature = np.zeros([len(useKeys), ])
for i in range(len(useKeys)):
curKey = useKeys[i]
try:
curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
except:
pass
return curFeature
def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
# print('maxLength: ' + str(maxLengthInSeconds))
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if not overlapping:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
# np.where returns a tuple; take the index array (the original compared len() of the tuple, which is always 1)
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID + windowSize]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
# np.where returns a tuple; take the index array (the original compared len() of the tuple, which is always 1)
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
return (outDomainLists, outDFFrames)
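# e.g. 25 flows with windowSize=10 yield 3 non-overlapping chunks (10, 10 and
# 5 flows) or, with overlapping=True, 25 + 1 - 10 = 16 sliding chunks (before
# the maxLengthInSeconds cut)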
def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
flagUseCiscoFeatures=False, urlSIPDIct=dict(),
windowSize=10):
if 'hits' in dfLists[0].keys():
hitName = 'hits'
elif 'virusTotalHits' in dfLists[0].keys():
hitName = 'virusTotalHits'
else:
# fail early instead of hitting a NameError further down
raise ValueError("expected a 'hits' or 'virusTotalHits' column in the data frames")
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
outputFeatures = []
label = []
hits = []
trainNames = []
for i in range(windowSize):
outputFeatures.append(np.zeros([len(domainLists), maxLen]))
outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
for i in tqdm(np.arange(len(domainLists)), miniters=10):
curCounter = 0
# print('len domainList: ' + str(len(domainLists[i])))
# print('len df: ' + str(len(dfLists[i])))
for j in range(np.min([windowSize, len(domainLists[i])])):
outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
curCounter += 1
if flagUseCiscoFeatures:
outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
else:
outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
curCounter += 1
curLabel = 0.0
if np.max(dfLists[i][hitName]) >= threshold:
curLabel = 1.0
elif np.max(dfLists[i][hitName]) == -1:
curLabel = -1.0
elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
curLabel = -2.0
label.append(curLabel)
hits.append(np.max(dfLists[i][hitName]))
trainNames.append(np.unique(dfLists[i]['user_hash']))
return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
listString = listString.replace('[', '').replace(']', '')
return np.array(listString.split(','), dtype='float32')
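# e.g. transformStringListToNumpyArray('[1.0,2.5,3]') -> array([1. , 2.5, 3. ], dtype=float32)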
def getCiscoFeatureDict(csvPathList):
outDict = dict()
for path in tqdm(csvPathList, miniters=1):
fobj = open(path, 'r')
csvReader = csv.DictReader(fobj, delimiter=',')
for row in csvReader:
urlSIPString = row['Domain'] + row['ServerIP']
ciscoFeatures = row['CiscoFeature']
outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
# if len(outDict) % 10000 == 0:
# print('numbers in dict: ' + str(len(outDict)))
return outDict
if __name__ == "__main__":
pass