removed old files from paul

René Knaebel 2017-06-30 17:21:03 +02:00
parent d19036a611
commit deac7f9e58
2 changed files with 0 additions and 1061 deletions

View File

@@ -1,846 +0,0 @@
# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
import os
import numpy as np
import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import csv
import pandas as pd
import random
from keras.models import model_from_json
import time
import re
# required by getSubSample and getSubSampleAllPositiveUsers below
import mongoDBConnector as mongoDBConnector
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm
def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
# check server-ip
if curSIP in sipSet:
return 1.0
# check second level domain
splitDomain = curDomain.split('.')
if len(splitDomain) >= 2:
curSLD = splitDomain[-2] + '.' + splitDomain[-1]
else:
curSLD = curDomain
if curSLD in sldSet:
return 1.0
# check domain or its second-level domain
if curDomain in hostSet or curSLD in hostSet:
return 1.0
return 0.0
def getSubSample(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numUser=numUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
if timesNeg != -1:
posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
if len(negIDs) > len(posIDs) * timesNeg:
negIDs = np.random.permutation(negIDs)
negIDs = negIDs[0:len(posIDs) * timesNeg]
negIDs = useIDs[negIDs]
posIDs = useIDs[posIDs]
useIDs = np.concatenate([negIDs,posIDs])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def getSubSampleAllPositiveUsers(useDir,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
numNegUser=10000,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
useDir=useDir,collectionName=collectionName,
metaCollectionName=metaCollectionName,
numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(curDFs)), miniters=10):
(domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
maxLen=maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
windowSize=windowSize)
useIDs = np.where(np.array(testLabel) == 1.0)[0]
useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
for i in range(len(testData)):
testData[i] = testData[i][useIDs]
return (testData,testLabel,testHits,testNames)
def sequenceGenerator(useDir,numUser,threshold=3,
windowSize=10,minFlowsPerUser=10,
maxLen=40,flagUseCiscoFeatures=False,
urlSIPDIct=dict(),characterDict=dict(),
maxLengthInSeconds=-1,
timesNeg=-1,
mongoHost='',mongoPort=0,dbName='',
collectionName='',metaCollectionName=''):
while 1:
(testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
urlSIPDIct=urlSIPDIct,characterDict=characterDict,
maxLengthInSeconds=maxLengthInSeconds,
timesNeg=timesNeg,
mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
collectionName=collectionName,metaCollectionName=metaCollectionName)
testLabel = np_utils.to_categorical(testLabel, 2)
#print(testData.shape)
yield (testData, testLabel)
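# Hypothetical usage sketch (not from this repository; argument values are
# placeholders): sequenceGenerator is an endless generator, so it can feed
# Keras' fit_generator directly, e.g.
#   gen = sequenceGenerator('/some/dir/', numUser=100, characterDict=charDict,
#                           mongoHost='localhost', mongoPort=27017, dbName='flows',
#                           collectionName='flows', metaCollectionName='meta')
#   model.fit_generator(gen, steps_per_epoch=10, epochs=5)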
def sequenceGeneratorTest(data,label):
while 1:
yield (data, label)
# three different modes
# if mode == 'correct' -> dont permute or touch the ordering
# if mode == 'permutate' -> permute the ordering
# if mode == 'sort' -> sort the flows by sent bytes
def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
return True
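# dataGenerator above was left as a stub; the following is a minimal sketch
# (an assumption, not the original implementation) of what the three modes
# described in the comment could look like. trainData is assumed to be a 3D
# array [numBlocks, windowSize, numFeatures] with bytes_up in column 2, the
# layout produced by getTrainMatrixLabelFromDataFrame below.
def dataGeneratorSketch(trainData, trainLabel, mode='correct'):
    while 1:
        data = np.array(trainData, copy=True)
        if mode == 'permutate':
            # permute the flow ordering inside every window
            for i in range(data.shape[0]):
                data[i] = data[i][np.random.permutation(data.shape[1])]
        elif mode == 'sort':
            # sort the flows of every window by sent bytes (column 2)
            for i in range(data.shape[0]):
                data[i] = data[i][np.argsort(data[i][:, 2])]
        yield (data, np_utils.to_categorical(trainLabel, 2))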
def getMalwareClassDict(path):
outDict = dict()
# file() was removed in Python 3; use open()
for line in open(path):
lineSplit = line.strip().split('\t')
if len(lineSplit) == 3:
outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
return outDict
def applyLower(inStr):
try:
return inStr.lower()
except:
return inStr
def logTransformData(inputMatrix):
# delete timestamps
try:
return np.log1p(np.array(inputMatrix,dtype='float64'))
except:
return inputMatrix
def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),\
hostDict=dict(),sipDict=dict(),vtDF = dict(),
flagReturnDomains=False):
if len(dataFrame) == 0:
return ([],-1)
if 'flowFeatures' in parameter:
flowFeatures = parameter['flowFeatures']
else:
flowFeatures = ['duration','bytes_down','bytes_up']
# extract flow features
data = dataFrame[flowFeatures].values
# get time-gap feature
timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
timeStampsPre = np.zeros([len(timeStamps),])
timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
diffTimeStamps = timeStamps - timeStampsPre
diffTimeStamps[0] = 0.0
negIDs = np.where(diffTimeStamps < 0.0)[0]
diffTimeStamps[negIDs] = 0.0
diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
data = np.hstack([data,diffTimeStamps])
# log transform
data = logTransformData(data)
if 'urlFeature' in dataFrame:
urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
for i in range(len(dataFrame)):
urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
data = np.hstack([data,urlFeatures])
# cisco feature
numCiscoFeature = 30
ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
if len(hostDict) > 0:
for i in range(len(dataFrame)):
row = dataFrame.iloc[i]
curHost = extractHost(row['domain'])
if curHost in hostDict:
# index by row i; the original wrote to a never-incremented counter, so every hit landed in row 0
ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
if len(sipDict) > 0:
for i in range(len(dataFrame)):
row = dataFrame.iloc[i]
curSIP = row['server_ip']
if curSIP in sipDict:
ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
data = np.hstack([data,ciscoFeatures])
if len(vtDF) != 0:
vtHashSet = set(vtDF['hash'])
hitNums = []
hashes = dataFrame['anyConnect_hash']
for curHash in hashes:
#print(vtDF.keys())
try:
if curHash.lower() in vtHashSet:
curID = np.where(vtDF['hash'] == curHash.lower())[0]
if len(curID) >= 1:
curID = curID[0]
hitNums.append(float(vtDF.iloc[curID]['hits']))
else:
hitNums.append(-1.0)
else:
hitNums.append(-1.0)
except:
hitNums.append(-1.0)
maxHits = np.max(hitNums)
else:
if 'hits' in dataFrame:
maxHits = np.max(dataFrame['hits'])
else:
maxHits = -1
label = np.max(dataFrame['label'])
if flagReturnDomains:
return (data,label,maxHits,dataFrame['domain'])
else:
return (data,label,maxHits)
def getDomainChunksByUser(data,useUserName,blockSize):
outData = []
outLabel = []
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
curDomains = useData['domain']
curLabel = np.max(useData['label'])
outData.append(curDomains)
outLabel.append(curLabel)
return (outData,outLabel)
def getChunksByUser(data,useUserName,blockSize,parameter=dict(),\
hostDict=dict(),sipDict=dict(), vtDF=dict(), flagOnlyOneUser=False,
flagReturnDomains=False):
outData = []
outLabel = []
outHits = []
outDomains = []
if flagOnlyOneUser:
useDataAll = data
else:
useDataAll = data[data['user_hash'] == useUserName]
userIDs = np.arange(len(useDataAll))
#print('number of found flows for user: ' + str(len(userIDs)))
numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
for blockID in range(numBlocks):
curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
#print(curIDs)
useData = useDataAll.iloc[curIDs]
if flagReturnDomains:
(curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
else:
(curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,\
parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
outData.append(curTrainData)
outLabel.append(curLabel)
outHits.append(curMaxHits)
if flagReturnDomains:
outDomains.append(curDomains)
if flagReturnDomains:
return (outData,outLabel,outHits,outDomains)
else:
return (outData,outLabel,outHits)
def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
nb_classes = 2
model = Sequential()
# Keras 2 replaced LSTM's input_dim/input_length arguments with input_shape
model.add(LSTM(lstmUnits, input_shape=(blockSize, input_dim)))
model.add(Dense(denseSize, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))
model.compile(loss='binary_crossentropy',
optimizer='adam', metrics=['accuracy'])
# number of params:
# params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
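# e.g. for input_dim=103 and lstmUnits=10: 4 * ((103 + 1) * 10 + 10**2) = 4560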
return model
def getCiscoURLFeatureForRow(row):
sortKeys = list(row.keys())
sortKeys.sort()
featureVec = np.zeros([len(sortKeys)-1,])
counter = 0
for keyName in sortKeys:
if 'key' in keyName:
continue
try:
featureVec[counter] = float(row[keyName])
except:
featureVec[counter] = 0.0
counter += 1
featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
return featureVec
def getCiscoFeatureDictForHost(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
curHost = extractHost(curHost)
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' host features collected')
return hostDict
def getCiscoFeatureDictForSIP(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sip features collected')
return hostDict
def getCiscoFeatureDictForSLD(headerPath,dataPath):
# get header
header = []
for line in open(headerPath):
header.append(line.strip())
header = ['key'] + header
fobj = open(dataPath,'r')
csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
hostDict = dict()
counter = 0
for row in csvReader:
featureVec = getCiscoURLFeatureForRow(row)
curHost = row['key']
hostDict[curHost] = featureVec
counter += 1
if counter % 10000 == 0:
print(str(counter) + ' sld features collected')
return hostDict
def extractHost(domain):
curHostSplit = domain.split('.')
try:
curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
return curHost
except:
return domain
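# e.g. extractHost('a.b.example.com') -> 'example.com'; a bare host with no
# dot triggers the IndexError branch and is returned unchanged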
def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
for dirID in range(len(trainDirs)):
curDir = trainDirs[dirID]
curFiles = os.listdir(curDir)
dayJoblibCounter = 0
for curFile in curFiles:
curFile = curDir + curFile
if curFile.endswith('.joblib'):
curData = joblib.load(curFile)
if dayJoblibCounter == 0:
dayData = curData
else:
dayData = dayData.append(curData,ignore_index=True)
dayJoblibCounter += 1
print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) +')')
# use flows with min minFlowsPerUser flows
if minFlowsPerUser != -1:
grouped = dayData.groupby('user_hash')
useUsers = set()
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
useUsers.add(grouping[0])
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if numTimesPos != -1:
grouped = dayData.groupby('user_hash')
curUserLabel = []
curUserNames = []
for grouping in grouped:
numFlowsCurUser = len(grouping[1])
userLabel = np.max(grouping[1]['label'])
curUserLabel.append(userLabel)
curUserNames.append(grouping[1]['user_hash'].values[0])
posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
maxNegLabel = len(posIDs) * numTimesPos
if len(negIDs) > maxNegLabel:
np.random.seed(1)
np.random.shuffle(negIDs)
negIDs = negIDs[0:maxNegLabel]
useIDs = np.concatenate([posIDs,negIDs])
useUsers = np.array(curUserNames)[useIDs]
useUsers = set(useUsers)
# get ids
userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
dayData = dayData.iloc[userIDs]
dayData = dayData.reset_index(drop=True)
if dirID == 0:
allData = dayData
else:
allData = allData.append(dayData,ignore_index=True)
return allData
def tokenizeDomain(domain,n=3):
domain = domain.replace('https://','')
domain = domain.replace('www.','')
domain = domain.replace('/','')
# reverse domain
domain = domain[::-1]
# overlapping n-grams over the reversed domain
outList = [domain[i:i + n] for i in range(len(domain))]
return outList
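# e.g. tokenizeDomain('www.example.com') strips the prefix, reverses to
# 'moc.elpmaxe' and yields ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma',
# 'max', 'axe', 'xe', 'e']; note the trailing grams are shorter than n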
def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
if numNeg != -1:
if len(negUser) > numNeg:
np.random.shuffle(negUser)
# subsample the shuffled array; the original sliced the unshuffled negIDs, silently undoing the shuffle
negUser = negUser[0:numNeg]
useUser = np.concatenate([posUser,negUser])
counter = 0
trainDomains = []
trainBlockLabel = []
trainNames = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainDomains += curUserData
trainBlockLabel += curUserLabel
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
return (trainDomains,trainBlockLabel,trainNames)
def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),\
hostDict=dict(),sipDict = dict(),vtLabelPath=''):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
uniqueTrainUser = np.unique(allData['user_hash'])
userLabel = []
for curTrainUser in uniqueTrainUser:
userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
curLabel = np.max(allData.iloc[userIDs]['label'])
userLabel.append(curLabel)
negIDs = np.where(np.array(userLabel) == 0.0)[0]
userLabel = np.array(userLabel)
posUser = np.where(userLabel == 1.0)[0]
negUser = np.where(userLabel == 0.0)[0]
if numNeg != -1:
if len(negUser) > numNeg:
np.random.shuffle(negUser)
# subsample the shuffled array; the original sliced the unshuffled negIDs, silently undoing the shuffle
negUser = negUser[0:numNeg]
useUser = np.concatenate([posUser,negUser])
counter = 0
trainBlocks = []
trainBlockLabel = []
trainNames = []
trainBlockHits = []
for uID in range(len(useUser)):
curTrainUser = uniqueTrainUser[useUser[uID]]
(curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF = vtDF)
for i in range(len(curUserLabel)):
trainNames.append(curTrainUser)
trainBlocks += curUserData
trainBlockLabel += curUserLabel
trainBlockHits += curHits
print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
counter+= 1
paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
#paddedData = paddedData[:,:,featureTypeDict[useFeatureType]]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
hostDict=dict(),sipDict=dict(),
vtLabelPath='',maxFlowsPerUser = 50000):
trainBlockLabel = []
trainNames = []
trainBlockHits = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
startTime = time.time()
for uID in range(len(joblibPaths)):
curSavePath = joblibPaths[uID]
curData = joblib.load(curSavePath)['dataFrame']
if len(curData) < minFlowsPerUser:
continue
#curUserName = np.unique(curData['user_hash'])[0]
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser = True)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
for i in range(len(curPaddedData)):
trainNames.append(curSavePath)
trainBlockLabel += curUserLabel
trainBlockHits += curHits
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
if uID % 1000 == 0:
elapsedTime = time.time() - startTime
startTime = time.time()
print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
paddedData = paddedData[0:overallCounter]
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
blockSize = 10,hostDict=dict(),sipDict=dict(),
seed =1,flagSkipNoLabelUser=False,
vtLabelPath='',maxFlowsPerUser = 50000,
flagReturnDomains=False):
if vtLabelPath != '':
vtDF = pd.read_csv(vtLabelPath,sep='\t')
else:
vtDF = dict()
trainBlockLabel = []
trainNames = []
trainBlockHits = []
trainBlockDomains = []
parameterDict = dict()
numBlocksToInitialize = 10000
paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
overallCounter = 0
for curDirID in range(len(trainDirs)):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
useIDs = np.concatenate([posIDs,negIDs])
counter = 0
negCounter = 0
startTime = time.time()
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
if flagSkipNoLabelUser:
curUserLabel = np.max(curData['label'])
if curUserLabel == -1.0:
continue
if len(curData) < minFlowsPerUser:
continue
if numNegPerDay != -1:
if negCounter > numNegPerDay:
break
if flagReturnDomains:
(curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
else:
(curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,\
parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
flagReturnDomains=flagReturnDomains)
curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
if (curPaddedData.shape[0] > maxFlowsPerUser):
curPaddedData = curPaddedData[0:maxFlowsPerUser]
curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
curHits = list(np.array(curHits)[0:maxFlowsPerUser])
if flagReturnDomains:
curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
for i in range(len(curPaddedData)):
trainNames.append(curUserName)
trainBlockLabel += curUserLabel
trainBlockHits += curHits
if flagReturnDomains:
# curDomains only exists when flagReturnDomains is set; the original appended unconditionally and raised a NameError otherwise
trainBlockDomains += curDomains
#curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
numCurInstances = curPaddedData.shape[0]
while overallCounter+numCurInstances > paddedData.shape[0]:
paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
overallCounter += numCurInstances
#print('num of instances: ' + str(numCurInstances))
if (counter+1) % 1000 == 0:
elapsedTime = time.time() - startTime
print('processed ' + str(counter+1) + ' users of ' +\
str(len(useIDs)) + ' with ' + str(len(curData['label'])) +\
' flows [dir ' + str(curDirID+1) + ' of ' +\
str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
startTime = time.time()
if np.max(np.array(curUserLabel)) == 0.0:
negCounter += 1
counter+= 1
paddedData = paddedData[0:overallCounter]
if flagReturnDomains:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
else:
return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000, seed = 1):
dataFrameList = []
overallCounter = 0
for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
curDir = trainDirs[curDirID]
curLabelFile = curDir + 'data_label.joblib'
labelData = joblib.load(curLabelFile)
posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
random.seed(seed)
random.shuffle(negIDs)
if len(negIDs) >= numNegPerDay:
negIDs = negIDs[0:numNegPerDay]
useIDs = np.concatenate([posIDs,negIDs])
for uID in range(len(useIDs)):
curID = useIDs[uID]
curUserName = labelData['usernames'][curID]
curSavePath = curDir + str(curUserName) + '.joblib'
curData = joblib.load(curSavePath)['dataFrame']
dataFrameList.append(curData)
overallCounter += 1
return dataFrameList
def checkDomainForSecondLevelDomain(inDomain,sldDomainDict):
if not 'str' in str(type(inDomain)):
return False
splitDomain = inDomain.split('.')
if len(splitDomain) <= 2:
return False
sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
if sldDomain in sldDomainDict:
return True
else:
return False
'''
out = False
for sldDomain in sldDomainDict:
if inDomain.endswith(sldDomain):
out = True
break
return out
'''
def save_model(model,jsonPath,h5Path):
# saving model
json_model = model.to_json()
with open(jsonPath, 'w') as fobj:
fobj.write(json_model)
# saving weights
model.save_weights(h5Path, overwrite=True)
def load_model(jsonPath,h5Path):
# loading model
model = model_from_json(open(jsonPath).read())
model.load_weights(h5Path)
return model
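# Hypothetical round trip (paths are placeholders, not from this repository):
#   save_model(model, 'model.json', 'model.h5')
#   model = load_model('model.json', 'model.h5')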
def getResultsFromSavedJoblibFile(joblibFiles,threshold=3):
testUserScores = []
testUserLabel = []
testLabel = []
testScores = []
testNames = []
for joblibPath in joblibFiles:
print('process: ' + joblibPath)
tmpJoblib = joblib.load(joblibPath)
if 'testBlockScores' in tmpJoblib.keys():
curTestBlockScores = tmpJoblib['testBlockScores']
for i in range(len(curTestBlockScores)):
if i == 0:
curTestScores = curTestBlockScores[i]
else:
curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]])
curTestHits = tmpJoblib['blockHits']
curTestHits = np.array(curTestHits)
curTestScores = np.array(curTestScores)
curTestLabel = np.ones([len(curTestScores),]) * -1.0
curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
curTestNames = tmpJoblib['testNames']
else:
curTestHits = tmpJoblib['testHits']
curTestScores = tmpJoblib['testScores']
curTestLabel = tmpJoblib['testLabel']
curTestNames = tmpJoblib['testNames']
useIDs = np.where(curTestHits >= threshold)[0]
useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]])
# old code
#useIDs = np.where(tmpJoblib['testLabel'] == 1.0)[0]
#useIDs = np.concatenate([useIDs,np.where(tmpJoblib['testLabel'] == 0.0)[0]])
curTestScoresT = curTestScores[useIDs]
curTestLabelT = curTestLabel[useIDs]
if len(testScores) == 0:
testScores = curTestScoresT
testLabel = curTestLabelT
else:
testScores = np.concatenate([testScores,curTestScoresT])
testLabel = np.concatenate([testLabel,curTestLabelT])
if 'testBlockScores' in tmpJoblib.keys():
tmpScores = np.array(tmpJoblib['testScores'])
tmpHits = np.array(tmpJoblib['testHits'])
tmpLabel = np.ones([len(tmpHits),])*-1
tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
useIDs = np.where(tmpLabel == 1.0)[0]
useIDs = np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]])
testUserLabel += list(np.array(tmpLabel)[useIDs])
testUserScores += list(np.array(tmpScores)[useIDs])
else:
# get user label
uniqueTestNames = list(np.unique(curTestNames))
for testName in uniqueTestNames:
curIDs = np.where(curTestNames == testName)[0]
curMaxHits = np.max(curTestHits[curIDs])
if curMaxHits > 0 and curMaxHits < threshold:
continue
if curMaxHits >= threshold:
testUserLabel.append(1.0)
else:
testUserLabel.append(0.0)
curScore = np.max(curTestScores[curIDs])
testUserScores.append(curScore)
testNames.append(testName)
testUserScores = np.array(testUserScores)
testUserLabel = np.array(testUserLabel)
testNames = np.array(testNames)
return (testUserScores,testUserLabel,testLabel,testScores,testNames)
def checkIfIP(host):
ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
if re.search(ipMask, host) is not None:
return True
else:
return False
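# e.g. checkIfIP('192.168.0.1') -> True, checkIfIP('example.com') -> False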
# GLOBAL VALUES
numCiscoFeatures = 30
featureTypeDict = {'neural':np.arange(4,104,1),\
'packet':np.array([0,1,2,3]),\
'neural+packet':np.arange(0,104,1),\
'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\
'cisco':np.arange(104,104+(2*numCiscoFeatures),1)}
globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
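# Assumed column layout behind these index ranges (matching
# getTrainMatrixLabelFromDataFrame above): columns 0-3 are the packet-level
# features (duration, bytes_down, bytes_up, inter-flow time gap), columns
# 4-103 the 100-dimensional neural URL feature, and columns 104-163 the
# per-host and per-server-IP Cisco features (2 * 30)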

View File

@@ -1,215 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import numpy as np
from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from tqdm import tqdm
def getCiscoFeatures(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# print('cisco features: ' + str(ciscoFeatures))
# log transform
ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
# print('log transformed: ' + str(ciscoFeatures))
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
model = Sequential()
model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
input_length=input_length))
model.add(Conv1D(filters,
kernel_size,
activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(drop_out))
model.add(Activation('relu'))
return model
def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
a = Input(shape=(input_length,))
embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a)
conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding)
glob = GlobalMaxPooling1D()(conv1)
dense = Dense(hidden_dims)(glob)
drop = Dropout(drop_out)(dense)
model = Activation('relu')(drop)
model = Model(a, model)
return model
def getFlowFeatureLayer(numFeatures):
model = Sequential()
# slpModel.add(Dense(1, input_shape=(1,)))
model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
return model
def createCNNDataSet(domains, label, characterDict, maxLen=40):
# process domains in reverse order
outFeature = np.zeros([len(domains), maxLen])
outLabel = np.zeros([len(domains), ])
for i in range(len(domains)):
domain = domains[i]
curLabel = label[i]
curFeature = np.zeros([maxLen, ])
# print(domain + ' ' + str(len(domain)))
for j in range(np.min([len(domain), maxLen])):
# take the (j+1)-th character from the end; the original domain[-j] read the *first* character when j == 0
curCharacter = domain[-(j + 1)]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
outFeature[i] = curFeature
outLabel[i] = curLabel
return (outFeature, outLabel)
def getFeatureVecForDomain(domain, characterDict, maxLen=40):
curFeature = np.zeros([maxLen, ])
for j in range(np.min([len(domain), maxLen])):
# take the (j+1)-th character from the end; the original domain[-j] read the *first* character when j == 0
curCharacter = domain[-(j + 1)]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
return curFeature
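# e.g. with characterDict = {'a': 1, 'b': 2, 'c': 3} and maxLen=5, the domain
# 'abc' is encoded back-to-front as [3., 2., 1., 0., 0.]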
def getFlowFeatures(curDataLine):
useKeys = ['duration', 'bytes_down', 'bytes_up']
curFeature = np.zeros([len(useKeys), ])
for i in range(len(useKeys)):
curKey = useKeys[i]
try:
curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
except:
pass
return curFeature
def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
# print('maxLength: ' + str(maxLengthInSeconds))
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if not overlapping:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
# np.where returns a tuple; take the index array (the original compared len() of the tuple, which is always 1)
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID + windowSize]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
# np.where returns a tuple; take the index array (the original compared len() of the tuple, which is always 1)
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
return (outDomainLists, outDFFrames)
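# e.g. 25 flows with windowSize=10 yield 3 non-overlapping chunks (10, 10 and
# 5 flows) or, with overlapping=True, 25 + 1 - 10 = 16 sliding chunks (before
# the maxLengthInSeconds cut)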
def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
flagUseCiscoFeatures=False, urlSIPDIct=dict(),
windowSize=10):
if 'hits' in dfLists[0].keys():
hitName = 'hits'
elif 'virusTotalHits' in dfLists[0].keys():
hitName = 'virusTotalHits'
else:
# fail early instead of hitting a NameError further down
raise ValueError("expected a 'hits' or 'virusTotalHits' column in the data frames")
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
outputFeatures = []
label = []
hits = []
trainNames = []
for i in range(windowSize):
outputFeatures.append(np.zeros([len(domainLists), maxLen]))
outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
for i in tqdm(np.arange(len(domainLists)), miniters=10):
curCounter = 0
# print('len domainList: ' + str(len(domainLists[i])))
# print('len df: ' + str(len(dfLists[i])))
for j in range(np.min([windowSize, len(domainLists[i])])):
outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
curCounter += 1
if flagUseCiscoFeatures:
outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
else:
outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
curCounter += 1
curLabel = 0.0
if np.max(dfLists[i][hitName]) >= threshold:
curLabel = 1.0
elif np.max(dfLists[i][hitName]) == -1:
curLabel = -1.0
elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
curLabel = -2.0
label.append(curLabel)
hits.append(np.max(dfLists[i][hitName]))
trainNames.append(np.unique(dfLists[i]['user_hash']))
return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
listString = listString.replace('[', '').replace(']', '')
return np.array(listString.split(','), dtype='float32')
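# e.g. transformStringListToNumpyArray('[1.0,2.5,3]') -> array([1. , 2.5, 3. ], dtype=float32)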
def getCiscoFeatureDict(csvPathList):
outDict = dict()
for path in tqdm(csvPathList, miniters=1):
fobj = open(path, 'r')
csvReader = csv.DictReader(fobj, delimiter=',')
for row in csvReader:
urlSIPString = row['Domain'] + row['ServerIP']
ciscoFeatures = row['CiscoFeature']
outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
# if len(outDict) % 10000 == 0:
# print('numbers in dict: ' + str(len(outDict)))
return outDict
if __name__ == "__main__":
pass