Added Paul's scripts

commit 24d677e101
parent 3121729464
d069717 2017-06-27 20:29:19 +02:00
3 changed files with 1401 additions and 0 deletions

ciscoProcessing.py (new file, 846 lines)

@@ -0,0 +1,846 @@
# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
import os
import numpy as np
import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import csv
import pandas as pd
import random
from keras.models import model_from_json
import time
import re
import mongoDBConnector as mongoDBConnector
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm
def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
# check server-ip
if curSIP in sipSet:
return 1.0
# check second level domain
splitDomain = curDomain.split('.')
if len(splitDomain) >= 2:
curSLD = splitDomain[-2] + '.' + splitDomain[-1]
else:
curSLD = curDomain
if curSLD in sldSet:
return 1.0
    # check full domain and its second level domain against the host set
    if curDomain in hostSet or curSLD in hostSet:
        return 1.0
    return 0.0
def getSubSample(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numUser=numUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
if timesNeg != -1:
posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
if len(negIDs) > len(posIDs) * timesNeg:
negIDs = np.random.permutation(negIDs)
negIDs = negIDs[0:len(posIDs) * timesNeg]
negIDs = useIDs[negIDs]
posIDs = useIDs[posIDs]
useIDs = np.concatenate([negIDs,posIDs])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def getSubSampleAllPositiveUsers(useDir,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
numNegUser=10000,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def sequenceGenerator(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
    while True:
(testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
urlSIPDIct=urlSIPDIct,characterDict=characterDict,
maxLengthInSeconds=maxLengthInSeconds,
timesNeg=timesNeg,
mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
collectionName=collectionName,metaCollectionName=metaCollectionName)
testLabel = np_utils.to_categorical(testLabel, 2)
#print(testData.shape)
yield (testData, testLabel)
def sequenceGeneratorTest(data,label):
    while True:
        yield (data, label)
# three different modes:
#   mode == 'correct'   -> do not permute or touch the ordering
#   mode == 'permutate' -> permute the ordering
#   mode == 'sort'      -> sort the flows by sent bytes
def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
    # stub: none of the three modes is implemented yet
    return True
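# A minimal sketch (not part of the original code) of how the three modes
# described above could look. It assumes trainData is the padded 3-D array
# produced by getPaddedData ([N, blockSize, features]) and that column 2 holds
# the log-transformed bytes_up ("sent bytes") feature; sorting whole windows
# by total sent bytes is one plausible reading of the 'sort' mode.
def dataGeneratorSketch(trainData, trainLabel, mode='correct'):
    while True:
        order = np.arange(len(trainLabel))
        if mode == 'permutate':
            order = np.random.permutation(order)
        elif mode == 'sort':
            # order the windows by their total sent bytes
            order = np.argsort(np.sum(trainData[:, :, 2], axis=1))
        yield (trainData[order],
               np_utils.to_categorical(np.array(trainLabel)[order], 2))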
def getMalwareClassDict(path):
outDict = dict()
    for line in open(path):
lineSplit = line.strip().split('\t')
if len(lineSplit) == 3:
outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
return outDict
def applyLower(inStr):
    try:
        return inStr.lower()
    except AttributeError:
        # not a string (e.g. NaN) -- return unchanged
        return inStr
def logTransformData(inputMatrix):
    try:
        return np.log1p(np.array(inputMatrix,dtype='float64'))
    except (TypeError, ValueError):
        # non-numeric input -- return unchanged
        return inputMatrix
def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),\
hostDict=dict(),sipDict=dict(),vtDF = dict(),
flagReturnDomains=False):
if len(dataFrame) == 0:
return ([],-1)
if 'flowFeatures' in parameter:
flowFeatures = parameter['flowFeatures']
else:
flowFeatures = ['duration','bytes_down','bytes_up']
# extract flow features
data = dataFrame[flowFeatures].values
# get time-gap feature
timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
timeStampsPre = np.zeros([len(timeStamps),])
timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
diffTimeStamps = timeStamps - timeStampsPre
diffTimeStamps[0] = 0.0
negIDs = np.where(diffTimeStamps < 0.0)[0]
diffTimeStamps[negIDs] = 0.0
diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
data = np.hstack([data,diffTimeStamps])
# log transform
data = logTransformData(data)
if 'urlFeature' in dataFrame:
urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
for i in range(len(dataFrame)):
urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
data = np.hstack([data,urlFeatures])
# cisco feature
numCiscoFeature = 30
ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
    if len(hostDict) > 0:
        for i in range(len(dataFrame)):
            row = dataFrame.iloc[i]
            curHost = extractHost(row['domain'])
            if curHost in hostDict:
                # index by row position i (the original 'counter' was never incremented,
                # so every match overwrote row 0)
                ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
    if len(sipDict) > 0:
        for i in range(len(dataFrame)):
            row = dataFrame.iloc[i]
            curSIP = row['server_ip']
            if curSIP in sipDict:
                ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
data = np.hstack([data,ciscoFeatures])
if len(vtDF) != 0:
vtHashSet = set(vtDF['hash'])
hitNums = []
hashes = dataFrame['anyConnect_hash']
for curHash in hashes:
#print(vtDF.keys())
try:
if curHash.lower() in vtHashSet:
curID = np.where(vtDF['hash'] == curHash.lower())[0]
if len(curID) >= 1:
curID = curID[0]
hitNums.append(float(vtDF.iloc[curID]['hits']))
else:
hitNums.append(-1.0)
else:
hitNums.append(-1.0)
except:
hitNums.append(-1.0)
maxHits = np.max(hitNums)
else:
if 'hits' in dataFrame:
maxHits = np.max(dataFrame['hits'])
else:
maxHits = -1
label = np.max(dataFrame['label'])
if flagReturnDomains:
return (data,label,maxHits,dataFrame['domain'])
else:
return (data,label,maxHits)
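# Feature layout produced above: columns 0-3 are the log-transformed packet
# features (duration, bytes_down, bytes_up, inter-flow time gap), followed by
# the per-flow 'urlFeature' vector when present, and finally 2*30 Cisco
# host/server-IP features when hostDict/sipDict are supplied.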
def getDomainChunksByUser(data,useUserName,blockSize):
outData = []
outLabel = []
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
curDomains = useData['domain']
curLabel = np.max(useData['label'])
outData.append(curDomains)
outLabel.append(curLabel)
return (outData,outLabel)
def getChunksByUser(data,useUserName,blockSize,parameter=dict(),\
                    hostDict=dict(),sipDict=dict(),vtDF=dict(),flagOnlyOneUser=False,
                    flagReturnDomains=False):
outData = []
outLabel = []
outHits = []
outDomains = []
if flagOnlyOneUser:
useDataAll = data
else:
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
if flagReturnDomains:
(curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
else:
(curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
outData.append(curTrainData)
outLabel.append(curLabel)
outHits.append(curMaxHits)
if flagReturnDomains:
outDomains.append(curDomains)
if flagReturnDomains:
return (outData,outLabel,outHits,outDomains)
else:
return (outData,outLabel,outHits)
def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
nb_classes = 2
model = Sequential()
model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
model.add(Dense(denseSize, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam', metrics=['accuracy'])
# number of params:
# params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
return model
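# Worked example of the parameter-count comment above for the default sizes
# (input_dim=103, lstmUnits=10):
#   params = 4 * ((103 + 1) * 10 + 10**2) = 4 * 1140 = 4560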
def getCiscoURLFeatureForRow(row):
sortKeys = list(row.keys())
sortKeys.sort()
featureVec = np.zeros([len(sortKeys)-1,])
counter = 0
    for keyName in sortKeys:
        # skip the 'key' column (substring match also skips any header containing 'key')
        if 'key' in keyName:
            continue
try:
featureVec[counter] = float(row[keyName])
except:
featureVec[counter] = 0.0
counter += 1
featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
return featureVec
def getCiscoFeatureDictForHost(headerPath,dataPath):
# get header
header = []
    for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
curHost = extractHost(curHost)
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' host features collected')
return hostDict
def getCiscoFeatureDictForSIP(headerPath,dataPath):
# get header
header = []
    for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sip features collected')
return hostDict
def getCiscoFeatureDictForSLD(headerPath,dataPath):
# get header
header = []
    for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sld features collected')
return hostDict
def extractHost(domain):
    curHostSplit = domain.split('.')
    try:
        return curHostSplit[-2] + '.' + curHostSplit[-1]
    except IndexError:
        # single-label domain -- return it unchanged
        return domain
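# Examples: extractHost('a.b.example.com') -> 'example.com';
# extractHost('localhost') -> 'localhost'.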
def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
for dirID in range(len(trainDirs)):
curDir = trainDirs[dirID]
curFiles = os.listdir(curDir)
dayJoblibCounter = 0
for curFile in curFiles:
curFile = curDir + curFile
if curFile.endswith('.joblib'):
curData = joblib.load(curFile)
if dayJoblibCounter == 0:
dayData = curData
else:
dayData = dayData.append(curData,ignore_index=True)
dayJoblibCounter += 1
print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) +')')
# use flows with min minFlowsPerUser flows
if minFlowsPerUser != -1:
grouped = dayData.groupby('user_hash')
useUsers = set()
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
useUsers.add(grouping[0])
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if numTimesPos != -1:
grouped = dayData.groupby('user_hash')
curUserLabel = []
curUserNames = []
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
curUserLabel.append(userLabel)
curUserNames.append(grouping[1]['user_hash'].values[0])
posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
maxNegLabel = len(posIDs) * numTimesPos
if len(negIDs) > maxNegLabel:
np.random.seed(1)
np.random.shuffle(negIDs)
negIDs = negIDs[0:maxNegLabel]
useIDs = np.concatenate([posIDs,negIDs])
useUsers = np.array(curUserNames)[useIDs]
useUsers = set(useUsers)
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if dirID == 0:
allData = dayData
else:
allData = allData.append(dayData,ignore_index=True)
return allData
def tokenizeDomain(domain,n=3):
    domain = domain.replace('https://','')
    domain = domain.replace('www.','')
    domain = domain.replace('/','')
    # reverse domain
    domain = domain[::-1]
    # overlapping n-grams (the trailing grams are shorter than n)
    return [domain[i:i+n] for i in range(0, len(domain), 1)]
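# Example: tokenizeDomain('www.example.com', n=3) strips 'www.', reverses to
# 'moc.elpmaxe' and yields
# ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma', 'max', 'axe', 'xe', 'e'].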
def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            # subsample from the shuffled array (the original sliced the
            # unshuffled negIDs, which made the shuffle a no-op)
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser,negUser])
counter = 0
trainDomains = []
trainBlockLabel = []
trainNames = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainDomains += curUserData
trainBlockLabel += curUserLabel
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
return (trainDomains,trainBlockLabel,trainNames)
def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),\
hostDict=dict(),sipDict = dict(),vtLabelPath=''):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
    if numNeg != -1:
        if len(negUser) > numNeg:
            np.random.shuffle(negUser)
            # same fix as in getDomainsInWindowData: slice the shuffled array
            negUser = negUser[0:numNeg]
    useUser = np.concatenate([posUser,negUser])
counter = 0
trainBlocks = []
trainBlockLabel = []
trainNames = []
trainBlockHits = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF = vtDF)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainBlocks += curUserData
trainBlockLabel += curUserLabel
trainBlockHits += curHits
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
#paddedData = paddedData[:,:,featureTypeDict[useFeatureType]]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
hostDict=dict(),sipDict=dict(),
vtLabelPath='',maxFlowsPerUser = 50000):
trainBlockLabel = []
trainNames = []
trainBlockHits = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
startTime = time.time()
for uID in range(len(joblibPaths)):
curSavePath = joblibPaths[uID]
curData = joblib.load(curSavePath)['dataFrame']
if len(curData) < minFlowsPerUser:
continue
#curUserName = np.unique(curData['user_hash'])[0]
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser = True)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
for i in range(len(curPaddedData)):
trainNames.append(curSavePath)
trainBlockLabel += curUserLabel
trainBlockHits += curHits
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
if uID % 1000 == 0:
elapsedTime = time.time() - startTime
startTime = time.time()
print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
paddedData = paddedData[0:overallCounter]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
blockSize = 10,hostDict=dict(),sipDict=dict(),
seed =1,flagSkipNoLabelUser=False,
vtLabelPath='',maxFlowsPerUser = 50000,
flagReturnDomains=False):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
trainBlockLabel = []
trainNames = []
trainBlockHits = []
trainBlockDomains = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
for curDirID in range(len(trainDirs)):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
useIDs = np.concatenate([posIDs,negIDs])
counter = 0
negCounter = 0
startTime = time.time()
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
if flagSkipNoLabelUser:
curUserLabel = np.max(curData['label'])
if curUserLabel == -1.0:
continue
if len(curData) < minFlowsPerUser:
continue
if numNegPerDay != -1:
if negCounter > numNegPerDay:
break
if flagReturnDomains:
(curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
else:
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
                if flagReturnDomains:
                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
            for i in range(len(curPaddedData)):
                trainNames.append(curUserName)
            trainBlockLabel += curUserLabel
            trainBlockHits += curHits
            if flagReturnDomains:
                # curDomains is only defined when domains were requested above
                trainBlockDomains += curDomains
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
#print('num of instances: ' + str(numCurInstances))
if (counter+1) % 1000 == 0:
elapsedTime = time.time() - startTime
print('processed ' + str(counter+1) + ' users of ' +\
str(len(useIDs)) + ' with ' + str(len(curData['label'])) +\
' flows [dir ' + str(curDirID+1) + ' of ' +\
str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
startTime = time.time()
if np.max(np.array(curUserLabel)) == 0.0:
negCounter += 1
counter+= 1
paddedData = paddedData[0:overallCounter]
if flagReturnDomains:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
else:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000, seed = 1):
dataFrameList = []
overallCounter = 0
for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
if len(negIDs) >= numNegPerDay:
negIDs = negIDs[0:numNegPerDay]
useIDs = np.concatenate([posIDs,negIDs])
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
dataFrameList.append(curData)
overallCounter += 1
return dataFrameList
def checkDomainForSecondLevelDomain(inDomain,sldDomainDict):
    if not isinstance(inDomain, str):
        return False
    splitDomain = inDomain.split('.')
    if len(splitDomain) <= 2:
        return False
    sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
    return sldDomain in sldDomainDict
'''
out = False
for sldDomain in sldDomainDict:
if inDomain.endswith(sldDomain):
out = True
break
return out
'''
def save_model(model,jsonPath,h5Path):
    # saving model architecture as JSON
    with open(jsonPath, 'w') as fobj:
        fobj.write(model.to_json())
    # saving weights
    model.save_weights(h5Path, overwrite=True)
def load_model(jsonPath,h5Path):
    # loading model architecture and weights
    with open(jsonPath) as fobj:
        model = model_from_json(fobj.read())
    model.load_weights(h5Path)
    return model
def getResultsFromSavedJoblibFile(joblibFiles,threshold=3):
testUserScores = []
testUserLabel = []
testLabel = []
testScores = []
testNames = []
for joblibPath in joblibFiles:
print('process: ' + joblibPath)
tmpJoblib = joblib.load(joblibPath)
if 'testBlockScores' in tmpJoblib.keys():
curTestBlockScores = tmpJoblib['testBlockScores']
for i in range(len(curTestBlockScores)):
if i == 0:
curTestScores = curTestBlockScores[i]
else:
curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]])
curTestHits = tmpJoblib['blockHits']
curTestHits = np.array(curTestHits)
curTestScores = np.array(curTestScores)
curTestLabel = np.ones([len(curTestScores),]) * -1.0
curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
curTestNames = tmpJoblib['testNames']
else:
curTestHits = tmpJoblib['testHits']
curTestScores = tmpJoblib['testScores']
curTestLabel = tmpJoblib['testLabel']
curTestNames = tmpJoblib['testNames']
useIDs = np.where(curTestHits >= threshold)[0]
useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]])
# old code
#useIDs = np.where(tmpJoblib['testLabel'] == 1.0)[0]
#useIDs = np.concatenate([useIDs,np.where(tmpJoblib['testLabel'] == 0.0)[0]])
curTestScoresT = curTestScores[useIDs]
curTestLabelT = curTestLabel[useIDs]
if len(testScores) == 0:
testScores = curTestScoresT
testLabel = curTestLabelT
else:
testScores = np.concatenate([testScores,curTestScoresT])
testLabel = np.concatenate([testLabel,curTestLabelT])
if 'testBlockScores' in tmpJoblib.keys():
tmpScores = np.array(tmpJoblib['testScores'])
tmpHits = np.array(tmpJoblib['testHits'])
tmpLabel = np.ones([len(tmpHits),])*-1
tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
useIDs = np.where(tmpLabel == 1.0)[0]
useIDs = np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]])
testUserLabel += list(np.array(tmpLabel)[useIDs])
testUserScores += list(np.array(tmpScores)[useIDs])
else:
# get user label
uniqueTestNames = list(np.unique(curTestNames))
for testName in uniqueTestNames:
curIDs = np.where(curTestNames == testName)[0]
curMaxHits = np.max(curTestHits[curIDs])
if curMaxHits > 0 and curMaxHits < threshold:
continue
if curMaxHits >= threshold:
testUserLabel.append(1.0)
else:
testUserLabel.append(0.0)
curScore = np.max(curTestScores[curIDs])
testUserScores.append(curScore)
testNames.append(testName)
testUserScores = np.array(testUserScores)
testUserLabel = np.array(testUserLabel)
testNames = np.array(testNames)
return (testUserScores,testUserLabel,testLabel,testScores,testNames)
def checkIfIP(host):
    # matches dotted-quad IPv4 addresses with each octet in 0-255
    ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    return re.search(ipMask, host) is not None
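# Examples: checkIfIP('192.168.0.1') -> True; checkIfIP('256.1.1.1') -> False
# (octet out of range); checkIfIP('example.com') -> False.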
# GLOBAL VALUES
numCiscoFeatures = 30
featureTypeDict = {'neural':np.arange(4,104,1),\
'packet':np.array([0,1,2,3]),\
'neural+packet':np.arange(0,104,1),\
'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\
'cisco':np.arange(104,104+(2*numCiscoFeatures),1)}
globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
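# column layout encoded above: 0-3 packet features, 4-103 neural URL features,
# 104-163 the 2*30 Cisco host/server-IP features (globalNumFeatures = 164)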

(second file; name not shown in this view) (new file, 143 lines)

@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
from tqdm import tqdm
import tensorflow as tf
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
from pymongo import MongoClient
import joblib
import pickle
import numpy as np
import ciscoProcessing as ciscoProcessing
import stackedNeuralModels as stackedNeuralModels
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
from keras.layers import Convolution1D
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
if __name__ == "__main__":
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures=30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
maxLengthInSeconds = -1
timesNeg = -1
trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
if 'characterDict' not in locals():
characterDictPath = 'trainData/characterIDDict.joblib'
characterDict = joblib.load(characterDictPath)['characterIDDict']
# load train and test data from joblib
# created with createTrainDataMultipleTaskLearning.py
if 'trainDFs' not in locals():
tmpLoad = joblib.load(trainDataPath)
trainDFs = tmpLoad['data']
    if 'testDFs' not in locals():
        tmpLoad = joblib.load(testDataPath)
        testDFs = tmpLoad['data']
sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(trainDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
        if i == 100:
            # debugging shortcut: only use the first 100 users
            break
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=False,urlSIPDIct=dict(),
windowSize=windowSize)
useIDs = np.where(testLabel == 1.0)[0]
useIDs = np.concatenate([useIDs,np.where(testLabel == 0.0)[0]])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
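    # at this point testData is a list of 2*windowSize arrays: for each window
    # position, a [numWindows, maxLen] character-ID matrix followed by a
    # [numWindows, flowFeatures] flow-feature matrix (see createTrainData in
    # stackedNeuralModels.py)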
inputList = []
encodedList = []
numFeatures = flowFeatures
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2*i)+1])
    # concatenate the per-window domain encodings and flow features:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
# add second cnn
cnn = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
# we use max pooling:
maxPool = GlobalMaxPooling1D()(cnn)
    cnnDropoutLayer = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims,activation='relu')(cnnDropoutLayer)
cnnOutput = Dense(2,activation='softmax')(cnnDense)
    # define a trainable model linking the window inputs to the predictions
model = Model(inputs=inputList, outputs=cnnOutput)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
    epochNumber = 0
    trainLabel = np_utils.to_categorical(testLabel, 2)
    # note: despite the 'test' naming, these chunks were built from trainDFs
    # above and are used here to train for a single epoch
    model.fit(x=testData, y=trainLabel,
              epochs=epochNumber + 1,shuffle=True,initial_epoch=epochNumber)
    #         validation_data=(testData,testLabel))

stackedNeuralModels.py (new file, 412 lines)

@@ -0,0 +1,412 @@
@ -0,0 +1,412 @@
# -*- coding: utf-8 -*-
from keras.models import Sequential
from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
from keras.layers import Convolution1D
import ciscoProcessing as ciscoProcessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib
import csv
import keras
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, roc_curve
from tqdm import tqdm
import os
def getCiscoFeatures(curDataLine,urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
        return ciscoFeatures.ravel()
    except KeyError:
        # unknown domain/server-ip pair -- fall back to a zero vector
        return np.zeros([numCiscoFeatures,]).ravel()
def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
hidden_dims,drop_out):
model = Sequential()
model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
input_length=input_length))
model.add(Conv1D(filters,
kernel_size,
activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(drop_out))
model.add(Activation('relu'))
return model
def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size,
hidden_dims,drop_out):
a = Input(shape=(input_length,))
embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a)
conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding)
glob = GlobalMaxPooling1D()(conv1)
dense = Dense(hidden_dims)(glob)
drop = Dropout(drop_out)(dense)
model = Activation('relu')(drop)
model = Model(a, model)
return model
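# Note: the misspelling 'Witout' in the name above is kept because call sites
# in this commit use it verbatim, e.g. (with the parameters used below):
#   sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,
#                       embeddingSize, maxLen, domainFeatures, kernel_size,
#                       domainFeatures, 0.5)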
def getFlowFeatureLayer(numFeatures):
model = Sequential()
#slpModel.add(Dense(1, input_shape=(1,)))
model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
return model
def createCNNDataSet(domains,label,characterDict,maxLen=40):
# process domains in reverse order
outFeature = np.zeros([len(domains),maxLen])
outLabel = np.zeros([len(domains),])
for i in range(len(domains)):
domain = domains[i]
curLabel = label[i]
curFeature = np.zeros([maxLen,])
        for j in range(np.min([len(domain),maxLen])):
            # take characters from the end of the domain; -(j+1) so that j=0
            # is the last character (domain[-0] would alias domain[0])
            curCharacter = domain[-(j+1)]
            if curCharacter in characterDict:
                curFeature[j] = characterDict[curCharacter]
outFeature[i] = curFeature
outLabel[i] = curLabel
return (outFeature,outLabel)
def getFeatureVecForDomain(domain,characterDict,maxLen=40):
curFeature = np.zeros([maxLen,])
    for j in range(np.min([len(domain),maxLen])):
        # same reverse-indexing fix as in createCNNDataSet
        curCharacter = domain[-(j+1)]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
return curFeature
def getFlowFeatures(curDataLine):
useKeys = ['duration','bytes_down','bytes_up']
curFeature = np.zeros([len(useKeys),])
for i in range(len(useKeys)):
curKey = useKeys[i]
try:
curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32')
except:
pass
return curFeature
def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
maxLengthInSeconds=300):
#print('maxLength: ' + str(maxLengthInSeconds))
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
    if not overlapping:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)]
#print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
                # np.where returns a tuple; take [0] so the length check works
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID+windowSize]
#print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
                # np.where returns a tuple; take [0] so the length check works
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
return (outDomainLists,outDFFrames)
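# Example: with windowSize=10 and overlapping=False, a user data frame with 25
# flows yields 3 chunks (10, 10 and 5 flows); with overlapping=True the same
# frame yields 25+1-10 = 16 chunks of 10 consecutive flows each.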
def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
                    flagUseCiscoFeatures=False,urlSIPDIct=dict(),
                    windowSize=10):
    if 'hits' in dfLists[0].keys():
        hitName = 'hits'
    elif 'virusTotalHits' in dfLists[0].keys():
        hitName = 'virusTotalHits'
    else:
        raise ValueError("data frames need a 'hits' or 'virusTotalHits' column")
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
outputFeatures = []
label = []
hits = []
trainNames = []
for i in range(windowSize):
outputFeatures.append(np.zeros([len(domainLists),maxLen]))
outputFeatures.append(np.zeros([len(domainLists),numFeatures]))
for i in tqdm(np.arange(len(domainLists)), miniters=10):
curCounter = 0
#print('len domainList: ' + str(len(domainLists[i])))
#print('len df: ' + str(len(dfLists[i])))
for j in range(np.min([windowSize,len(domainLists[i])])):
outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen)
curCounter += 1
if flagUseCiscoFeatures:
outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct)
else:
outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j])
curCounter += 1
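        # label semantics: 1.0 = malicious (hits >= threshold), 0.0 = benign
        # (zero hits), -1.0 = unknown (hits == -1), -2.0 = weak evidence
        # (0 < hits < threshold)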
curLabel = 0.0
if np.max(dfLists[i][hitName]) >= threshold:
curLabel = 1.0
elif np.max(dfLists[i][hitName]) == -1:
curLabel = -1.0
elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
curLabel = -2.0
label.append(curLabel)
hits.append(np.max(dfLists[i][hitName]))
trainNames.append(np.unique(dfLists[i]['user_hash']))
return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
listString = listString.replace('[','').replace(']','')
return np.array(listString.split(','),dtype='float32')
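# Example: transformStringListToNumpyArray('[0.5,1.0,2.0]') returns
# np.array([0.5, 1.0, 2.0], dtype='float32').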
def getCiscoFeatureDict(csvPathList):
    outDict = dict()
    for path in tqdm(csvPathList, miniters=1):
        with open(path,'r') as fobj:
            csvReader = csv.DictReader(fobj,delimiter=',')
            for row in csvReader:
                urlSIPString = row['Domain'] + row['ServerIP']
                outDict[urlSIPString] = transformStringListToNumpyArray(row['CiscoFeature'])
    return outDict
if __name__ == "__main__":
# get data
trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
'trainData/joblib2016-07-annomalous-stg-new/09/',
'trainData/joblib2016-07-annomalous-stg-new/08/',
'trainData/joblib2016-07-annomalous-stg-new/07/',
'trainData/joblib2016-07-annomalous-stg-new/06/']
testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\
'trainData/joblib2016-09-annomalous-stg-new/08/',\
'trainData/joblib2016-09-annomalous-stg-new/09/',\
'trainData/joblib2016-09-annomalous-stg-new/10/',\
'trainData/joblib2016-09-annomalous-stg-new/11/',\
'trainData/joblib2016-09-annomalous-stg-new/12/',\
'trainData/joblib2016-09-annomalous-stg-new/13/',\
'trainData/joblib2016-09-annomalous-stg-new/14/']
trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
# parameter
numNegPerDay = 5000
numEpochs = 10
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures= 30
windowSize = 10
maxLen = 40
lstmUnits = 32
lstmDenseSize = 128
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
flagUseCiscoFeatures = True
threshold = 3
resultStoreDir = 'results/201705/'
if flagUseCiscoFeatures:
resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
else:
resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
flagRedo = True
if flagUseCiscoFeatures:
if 'trainCiscoFeatureDict' not in locals():
trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
if 'testCiscoFeatureDict' not in locals():
testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
else:
trainCiscoFeatureDict = dict()
testCiscoFeatureDict = dict()
if flagRedo or not os.path.exists(resultStorePath):
if 'characterDict' not in locals():
characterDictPath = 'trainData/characterIDDict.joblib'
characterDict = joblib.load(characterDictPath)['characterIDDict']
print('create train data')
if 'dataFrameList' not in locals():
(dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
trainDirsUserLevel,numNegPerDay = numNegPerDay)
maxHits = []
for i in range(len(dataFrameList)):
maxHits.append(np.max(dataFrameList[i]['hits']))
print('create test data')
# validation error
if 'testDataFrameList' not in locals():
(testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
[testDirsUserLevel[0]],numNegPerDay = numNegPerDay)
maxHits = []
for i in range(len(testDataFrameList)):
maxHits.append(np.max(testDataFrameList[i]['hits']))
sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
inputList = []
encodedList = []
numFeatures = flowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2*i)+1])
        # concatenate the per-window domain encodings and flow features:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
dense = Dense(lstmDenseSize, activation='relu')(lstm)
dropout = Dropout(0.5)(dense)
# And add a logistic regression on top
predictions = Dense(2, activation='softmax')(dropout)
        # define a trainable model linking the window inputs to the predictions
model = Model(inputs=inputList, outputs=predictions)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
# get train data
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
(domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False)
domainLists += domainListsTmp
dfLists += dfListsTmp
(trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict,
maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict)
useIDs = np.where(trainHits == 0)[0]
useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]])
for i in range(len(trainData)):
trainData[i] = np.array(trainData[i])[useIDs]
trainLabel = trainLabel[useIDs]
trainHits = trainHits[useIDs]
trainNames = trainNames[useIDs]
# get test data
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
(domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = createTrainData(domainLists,dfLists,characterDict,
maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict)
useIDs = np.where(testHits == 0)[0]
useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]])
for i in range(len(testData)):
testData[i] = np.array(testData[i])[useIDs]
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
numPos = len(np.where(trainLabel == 1.0)[0])
numNeg = len(np.where(trainLabel == 0.0)[0])
        print('majority class fraction: ' + str(float(numNeg) / float(numNeg + numPos)))
lstmLabel = np_utils.to_categorical(trainLabel, 2)
lstmTestLabel = np_utils.to_categorical(testLabel, 2)
trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel))
# save lstm model
ciscoProcessing.save_model(model,resultModelPath+'.json',
resultModelPath + '.h5')
# classify train and test
trainScores = model.predict(trainData)[:,1]
testScores = model.predict(testData)[:,1]
joblib.dump({'testLabel':testLabel,
'testHits':testHits,
'testNames':testNames,
'testScores':testScores,
'trainLabel':trainLabel,
'trainScores':trainScores},resultStorePath,compress=3)