diff --git a/ciscoProcessing.py b/ciscoProcessing.py
new file mode 100644
index 0000000..edbc542
--- /dev/null
+++ b/ciscoProcessing.py
@@ -0,0 +1,846 @@
+# -*- coding: utf-8 -*-
+import sys
+sys.path.append('..')
+sys.path.append('/mnt/projekte/pmlcluster/home/prasse/projects/ciscoSVN/cisco/trunk/code/')
+import os
+import numpy as np
+import joblib
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import LSTM
+from keras.layers import Dropout
+import csv
+import pandas as pd
+import random
+from keras.models import model_from_json
+import time
+import re
+import mongoDBConnector as mongoDBConnector
+import stackedNeuralModels as stackedNeuralModels
+from tqdm import tqdm
+
+
+def getCiscoDomainLabel(curDomain,curSIP,hostSet,sipSet,sldSet):
+    # check server-ip
+    if curSIP in sipSet:
+        return 1.0
+    # check second-level domain
+    splitDomain = curDomain.split('.')
+    if len(splitDomain) >= 2:
+        curSLD = splitDomain[-2] + '.' + splitDomain[-1]
+    else:
+        curSLD = curDomain
+    if curSLD in sldSet:
+        return 1.0
+    # check full domain and second-level domain against the host set
+    if curDomain in hostSet or curSLD in hostSet:
+        return 1.0
+    return 0.0
+
+def getSubSample(useDir,numUser,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 timesNeg=-1,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+    curDFs = mongoDBConnector.sampleDataFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                 useDir=useDir,collectionName=collectionName,
+                 metaCollectionName=metaCollectionName,
+                 numUser=numUser,minFlowsPerUser=minFlowsPerUser)
+
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(curDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
+            windowSize=windowSize)
+
+    useIDs = np.where(np.array(testLabel) == 1.0)[0]
+    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
+
+    if timesNeg != -1:
+        # cap the negatives at timesNeg times the number of positives
+        posIDs = np.where(np.array(testLabel)[useIDs] == 1.0)[0]
+        negIDs = np.where(np.array(testLabel)[useIDs] == 0.0)[0]
+        if len(negIDs) > len(posIDs) * timesNeg:
+            negIDs = np.random.permutation(negIDs)
+            negIDs = negIDs[0:len(posIDs) * timesNeg]
+            negIDs = useIDs[negIDs]
+            posIDs = useIDs[posIDs]
+            useIDs = np.concatenate([negIDs,posIDs])
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+    return (testData,testLabel,testHits,testNames)
+
+
+def getSubSampleAllPositiveUsers(useDir,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 numNegUser=10000,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+
+    curDFs = mongoDBConnector.sampleAllPositiveUserFromDir(mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                 useDir=useDir,collectionName=collectionName,
+                 metaCollectionName=metaCollectionName,
+                 numNegUser=numNegUser,minFlowsPerUser=minFlowsPerUser)
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(curDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(curDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=urlSIPDIct,
+            windowSize=windowSize)
+
+    useIDs = np.where(np.array(testLabel) == 1.0)[0]
+    useIDs = np.concatenate([useIDs, np.where(np.array(testLabel) == 0.0)[0]])
+
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+    return (testData,testLabel,testHits,testNames)
+
+def sequenceGenerator(useDir,numUser,threshold=3,
+                 windowSize=10,minFlowsPerUser=10,
+                 maxLen=40,flagUseCiscoFeatures=False,
+                 urlSIPDIct=dict(),characterDict=dict(),
+                 maxLengthInSeconds=-1,
+                 timesNeg=-1,
+                 mongoHost='',mongoPort=0,dbName='',
+                 collectionName='',metaCollectionName=''):
+    while 1:
+        (testData,testLabel,testHits,testNames) = getSubSample(useDir,numUser,threshold=threshold,
+                windowSize=windowSize,minFlowsPerUser=minFlowsPerUser,
+                maxLen=maxLen,flagUseCiscoFeatures=flagUseCiscoFeatures,
+                urlSIPDIct=urlSIPDIct,characterDict=characterDict,
+                maxLengthInSeconds=maxLengthInSeconds,
+                timesNeg=timesNeg,
+                mongoHost=mongoHost,mongoPort=mongoPort,dbName=dbName,
+                collectionName=collectionName,metaCollectionName=metaCollectionName)
+        testLabel = np_utils.to_categorical(testLabel, 2)
+        yield (testData, testLabel)
+
+
+def sequenceGeneratorTest(data,label):
+    while 1:
+        yield (data, label)
+
+# three different modes:
+#   mode == 'correct'   -> do not permute the ordering
+#   mode == 'permutate' -> permute the ordering
+#   mode == 'sort'      -> sort the flows by sent bytes
+# NOTE: placeholder only, the modes are not implemented yet
+def dataGenerator(trainData,trainLabel,numTimesPos,mode='correct'):
+    return True
+
+def getMalwareClassDict(path):
+    outDict = dict()
+    for line in open(path):
+        lineSplit = line.strip().split('\t')
+        if len(lineSplit) == 3:
+            outDict[lineSplit[0]] = (lineSplit[1],lineSplit[2])
+    return outDict
+
+def applyLower(inStr):
+    try:
+        return inStr.lower()
+    except:
+        return inStr
+
+def logTransformData(inputMatrix):
+    # element-wise log(1 + x); returns the input unchanged if it is not numeric
+    try:
+        return np.log1p(np.array(inputMatrix,dtype='float64'))
+    except:
+        return inputMatrix
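`logTransformData` compresses the heavy-tailed flow features (durations, byte counts) with log(1 + x). A quick sanity check of the transform, with made-up byte counts:

```python
import numpy as np

# hypothetical flow rows: [duration, bytes_down, bytes_up]
flows = np.array([[0.0, 1500.0, 200.0],
                  [12.0, 1048576.0, 4096.0]])
print(np.log1p(flows))
# [[ 0.         7.3138867  5.3033049]
#  [ 2.5649494 13.8629446  8.3180103]]
```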
+
+def getTrainMatrixLabelFromDataFrame(dataFrame,parameter=dict(),\
+            hostDict=dict(),sipDict=dict(),vtDF = dict(),
+            flagReturnDomains=False):
+    if len(dataFrame) == 0:
+        return ([],-1)
+    if 'flowFeatures' in parameter:
+        flowFeatures = parameter['flowFeatures']
+    else:
+        flowFeatures = ['duration','bytes_down','bytes_up']
+    # extract flow features
+    data = dataFrame[flowFeatures].values
+    # get time-gap feature (difference to the previous flow, clipped at zero)
+    timeStamps = np.array(dataFrame['timeStamp'].values,dtype='float32')
+    timeStampsPre = np.zeros([len(timeStamps),])
+    timeStampsPre[1:] = timeStamps[0:len(timeStamps)-1]
+    diffTimeStamps = timeStamps - timeStampsPre
+    diffTimeStamps[0] = 0.0
+    negIDs = np.where(diffTimeStamps < 0.0)[0]
+    diffTimeStamps[negIDs] = 0.0
+    diffTimeStamps = np.reshape(diffTimeStamps,[len(diffTimeStamps),1])
+    data = np.hstack([data,diffTimeStamps])
+    # log transform
+    data = logTransformData(data)
+    if 'urlFeature' in dataFrame:
+        urlFeatures = np.zeros([len(dataFrame),len(dataFrame.iloc[0]['urlFeature'])])
+        for i in range(len(dataFrame)):
+            urlFeatures[i,:] = dataFrame.iloc[i]['urlFeature']
+        data = np.hstack([data,urlFeatures])
+    # cisco features, indexed per row (host block first, then server-ip block)
+    numCiscoFeature = 30
+    ciscoFeatures = np.zeros([data.shape[0],2*numCiscoFeature])
+    if len(hostDict) > 0:
+        for i in range(len(dataFrame)):
+            row = dataFrame.iloc[i]
+            curHost = extractHost(row['domain'])
+            if curHost in hostDict:
+                ciscoFeatures[i,0:numCiscoFeature] = hostDict[curHost]
+    if len(sipDict) > 0:
+        for i in range(len(dataFrame)):
+            row = dataFrame.iloc[i]
+            curSIP = row['server_ip']
+            if curSIP in sipDict:
+                ciscoFeatures[i,numCiscoFeature:] = sipDict[curSIP]
+    data = np.hstack([data,ciscoFeatures])
+    if len(vtDF) != 0:
+        vtHashSet = set(vtDF['hash'])
+        hitNums = []
+        hashes = dataFrame['anyConnect_hash']
+        for curHash in hashes:
+            try:
+                if curHash.lower() in vtHashSet:
+                    curID = np.where(vtDF['hash'] == curHash.lower())[0]
+                    if len(curID) >= 1:
+                        curID = curID[0]
+                        hitNums.append(float(vtDF.iloc[curID]['hits']))
+                    else:
+                        hitNums.append(-1.0)
+                else:
+                    hitNums.append(-1.0)
+            except:
+                hitNums.append(-1.0)
+        maxHits = np.max(hitNums)
+    else:
+        if 'hits' in dataFrame:
+            maxHits = np.max(dataFrame['hits'])
+        else:
+            maxHits = -1
+    label = np.max(dataFrame['label'])
+    if flagReturnDomains:
+        return (data,label,maxHits,dataFrame['domain'])
+    else:
+        return (data,label,maxHits)
+
+
+def getDomainChunksByUser(data,useUserName,blockSize):
+    outData = []
+    outLabel = []
+    useDataAll = data[data['user_hash'] == useUserName]
+    userIDs = np.arange(len(useDataAll))
+    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
+    for blockID in range(numBlocks):
+        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
+        useData = useDataAll.iloc[curIDs]
+        curDomains = useData['domain']
+        curLabel = np.max(useData['label'])
+        outData.append(curDomains)
+        outLabel.append(curLabel)
+    return (outData,outLabel)
+
+
+def getChunksByUser(data,useUserName,blockSize,parameter=dict(),\
+        hostDict=dict(),sipDict=dict(),vtDF=dict(),flagOnlyOneUser = False,
+        flagReturnDomains=False):
+    outData = []
+    outLabel = []
+    outHits = []
+    outDomains = []
+    if flagOnlyOneUser:
+        useDataAll = data
+    else:
+        useDataAll = data[data['user_hash'] == useUserName]
+    userIDs = np.arange(len(useDataAll))
+    numBlocks = int(np.ceil(float(len(userIDs)) / float(blockSize)))
+    for blockID in range(numBlocks):
+        curIDs = userIDs[(blockID * blockSize):((blockID+1)*blockSize)]
+        useData = useDataAll.iloc[curIDs]
+        if flagReturnDomains:
+            (curTrainData,curLabel,curMaxHits,curDomains) = getTrainMatrixLabelFromDataFrame(useData,\
+                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
+        else:
+            (curTrainData,curLabel,curMaxHits) = getTrainMatrixLabelFromDataFrame(useData,\
+                parameter,hostDict,sipDict,vtDF=vtDF,flagReturnDomains=flagReturnDomains)
+        outData.append(curTrainData)
+        outLabel.append(curLabel)
+        outHits.append(curMaxHits)
+        if flagReturnDomains:
+            outDomains.append(curDomains)
+    if flagReturnDomains:
+        return (outData,outLabel,outHits,outDomains)
+    else:
+        return (outData,outLabel,outHits)
+
+
+def getLSTMModel(blockSize=10,input_dim=103,lstmUnits=10,denseSize=128):
+    nb_classes = 2
+    model = Sequential()
+    model.add(LSTM(lstmUnits, input_dim=input_dim, input_length=blockSize))
+    model.add(Dense(denseSize, activation='relu'))
+    model.add(Dropout(0.5))
+    model.add(Dense(nb_classes, activation='softmax'))
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam', metrics=['accuracy'])
+    # number of params:
+    # params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
+    return model
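The parameter formula quoted in `getLSTMModel` can be checked against `model.summary()`; an LSTM has four gates, each with input weights, recurrent weights, and a bias. A quick check with the default sizes above:

```python
# params = 4 * ((size_of_input + 1) * size_of_output + size_of_output^2)
input_dim, lstmUnits = 103, 10
params = 4 * ((input_dim + 1) * lstmUnits + lstmUnits ** 2)
print(params)  # 4560, the count reported for the LSTM layer by model.summary()
```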
+
+
+def getCiscoURLFeatureForRow(row):
+    sortKeys = list(row.keys())
+    sortKeys.sort()
+    featureVec = np.zeros([len(sortKeys)-1,])
+    counter = 0
+    for keyName in sortKeys:
+        if 'key' in keyName:
+            continue
+        try:
+            featureVec[counter] = float(row[keyName])
+        except:
+            featureVec[counter] = 0.0
+        counter += 1
+    featureVec[np.where(np.isnan(featureVec))[0]] = 0.0
+    return featureVec
+
+
+def getCiscoFeatureDictForHost(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        curHost = extractHost(curHost)
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' host features collected')
+    return hostDict
+
+def getCiscoFeatureDictForSIP(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' sip features collected')
+    return hostDict
+
+def getCiscoFeatureDictForSLD(headerPath,dataPath):
+    # get header
+    header = []
+    for line in open(headerPath):
+        header.append(line.strip())
+    header = ['key'] + header
+
+    fobj = open(dataPath,'r')
+    csvReader = csv.DictReader(fobj,fieldnames = header,delimiter='\t')
+    hostDict = dict()
+    counter = 0
+    for row in csvReader:
+        featureVec = getCiscoURLFeatureForRow(row)
+        curHost = row['key']
+        hostDict[curHost] = featureVec
+        counter += 1
+        if counter % 10000 == 0:
+            print(str(counter) + ' sld features collected')
+    return hostDict
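The three loaders above differ only in the log message and in whether the key is normalized with `extractHost`. A possible consolidation (a sketch, not part of the patch; `keyFun` and `logName` are names introduced here):

```python
def getCiscoFeatureDictGeneric(headerPath, dataPath, keyFun=lambda k: k, logName='features'):
    # shared implementation of the host/SIP/SLD variants above
    header = ['key'] + [line.strip() for line in open(headerPath)]
    outDict = dict()
    with open(dataPath, 'r') as fobj:
        for counter, row in enumerate(csv.DictReader(fobj, fieldnames=header, delimiter='\t'), 1):
            outDict[keyFun(row['key'])] = getCiscoURLFeatureForRow(row)
            if counter % 10000 == 0:
                print(str(counter) + ' ' + logName + ' collected')
    return outDict

# e.g.: hostDict = getCiscoFeatureDictGeneric(hdrPath, dataPath, keyFun=extractHost, logName='host features')
```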
+
+
+def extractHost(domain):
+    # reduce a domain to its second-level part, e.g. 'a.b.example.com' -> 'example.com'
+    curHostSplit = domain.split('.')
+    try:
+        curHost = curHostSplit[-2] + '.' + curHostSplit[-1]
+        return curHost
+    except:
+        return domain
+
+
+def loadDataSetFromJoblib(trainDirs,minFlowsPerUser = 10,numTimesPos = 20):
+    for dirID in range(len(trainDirs)):
+        curDir = trainDirs[dirID]
+        curFiles = os.listdir(curDir)
+        dayJoblibCounter = 0
+        for curFile in curFiles:
+            curFile = curDir + curFile
+            if curFile.endswith('.joblib'):
+                curData = joblib.load(curFile)
+                if dayJoblibCounter == 0:
+                    dayData = curData
+                else:
+                    dayData = dayData.append(curData,ignore_index=True)
+                dayJoblibCounter += 1
+                print('processed file number: ' + str(dayJoblibCounter) + ' (dir ' + str(curDir) +')')
+        # keep only users with at least minFlowsPerUser flows
+        if minFlowsPerUser != -1:
+            grouped = dayData.groupby('user_hash')
+            useUsers = set()
+            for grouping in grouped:
+                numFlowsCurUser = len(grouping[1])
+                userLabel = np.max(grouping[1]['label'])
+                if numFlowsCurUser >= minFlowsPerUser and userLabel != -1.0:
+                    useUsers.add(grouping[0])
+            # get ids
+            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
+            dayData = dayData.iloc[userIDs]
+            dayData = dayData.reset_index(drop=True)
+        if numTimesPos != -1:
+            # cap the negative users at numTimesPos times the number of positive users
+            grouped = dayData.groupby('user_hash')
+            curUserLabel = []
+            curUserNames = []
+            for grouping in grouped:
+                numFlowsCurUser = len(grouping[1])
+                userLabel = np.max(grouping[1]['label'])
+                curUserLabel.append(userLabel)
+                curUserNames.append(grouping[1]['user_hash'].values[0])
+            posIDs = np.where(np.array(curUserLabel) == 1.0)[0]
+            negIDs = np.where(np.array(curUserLabel) == 0.0)[0]
+            maxNegLabel = len(posIDs) * numTimesPos
+            if len(negIDs) > maxNegLabel:
+                np.random.seed(1)
+                np.random.shuffle(negIDs)
+                negIDs = negIDs[0:maxNegLabel]
+            useIDs = np.concatenate([posIDs,negIDs])
+            useUsers = np.array(curUserNames)[useIDs]
+            useUsers = set(useUsers)
+            # get ids
+            userIDs = dayData.loc[dayData['user_hash'].isin(useUsers)].index.values
+            dayData = dayData.iloc[userIDs]
+            dayData = dayData.reset_index(drop=True)
+        if dirID == 0:
+            allData = dayData
+        else:
+            allData = allData.append(dayData,ignore_index=True)
+    return allData
+
+def tokenizeDomain(domain,n=3):
+    domain = domain.replace('https://','')
+    domain = domain.replace('www.','')
+    domain = domain.replace('/','')
+    # reverse the domain so the n-grams are anchored at the TLD
+    domain = domain[::-1]
+    # overlapping n-grams (step size 1)
+    outList = [domain[i:i+n] for i in range(0, len(domain), 1)]
+    return outList
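`tokenizeDomain` reverses the domain before slicing, so the n-grams are anchored at the TLD end; the step size of 1 makes them overlap, and the last slices come out shorter than n:

```python
print(tokenizeDomain('www.example.com'))
# ['moc', 'oc.', 'c.e', '.el', 'elp', 'lpm', 'pma', 'max', 'axe', 'xe', 'e']
```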
+
+
+def getDomainsInWindowData(allData,numNeg=-1,blockSize=10):
+    uniqueTrainUser = np.unique(allData['user_hash'])
+    userLabel = []
+    for curTrainUser in uniqueTrainUser:
+        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
+        curLabel = np.max(allData.iloc[userIDs]['label'])
+        userLabel.append(curLabel)
+    userLabel = np.array(userLabel)
+    posUser = np.where(userLabel == 1.0)[0]
+    negUser = np.where(userLabel == 0.0)[0]
+
+    if numNeg != -1:
+        if len(negUser) > numNeg:
+            np.random.shuffle(negUser)
+            negUser = negUser[0:numNeg]
+
+    useUser = np.concatenate([posUser,negUser])
+    counter = 0
+    trainDomains = []
+    trainBlockLabel = []
+    trainNames = []
+    for uID in range(len(useUser)):
+        curTrainUser = uniqueTrainUser[useUser[uID]]
+        (curUserData,curUserLabel) = getDomainChunksByUser(allData,curTrainUser,blockSize)
+        for i in range(len(curUserLabel)):
+            trainNames.append(curTrainUser)
+        trainDomains += curUserData
+        trainBlockLabel += curUserLabel
+        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
+        counter += 1
+    return (trainDomains,trainBlockLabel,trainNames)
+
+def getPaddedData(allData,numNeg=-1,blockSize=10,parameterDict=dict(),\
+        hostDict=dict(),sipDict = dict(),vtLabelPath=''):
+    if vtLabelPath != '':
+        vtDF = pd.read_csv(vtLabelPath,sep='\t')
+    else:
+        vtDF = dict()
+    uniqueTrainUser = np.unique(allData['user_hash'])
+    userLabel = []
+    for curTrainUser in uniqueTrainUser:
+        userIDs = allData.loc[allData['user_hash'] == curTrainUser].index.values
+        curLabel = np.max(allData.iloc[userIDs]['label'])
+        userLabel.append(curLabel)
+    userLabel = np.array(userLabel)
+    posUser = np.where(userLabel == 1.0)[0]
+    negUser = np.where(userLabel == 0.0)[0]
+
+    if numNeg != -1:
+        if len(negUser) > numNeg:
+            np.random.shuffle(negUser)
+            negUser = negUser[0:numNeg]
+
+    useUser = np.concatenate([posUser,negUser])
+    counter = 0
+    trainBlocks = []
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    for uID in range(len(useUser)):
+        curTrainUser = uniqueTrainUser[useUser[uID]]
+        (curUserData,curUserLabel,curHits) = getChunksByUser(allData,curTrainUser,blockSize,\
+            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF = vtDF)
+        for i in range(len(curUserLabel)):
+            trainNames.append(curTrainUser)
+        trainBlocks += curUserData
+        trainBlockLabel += curUserLabel
+        trainBlockHits += curHits
+        print('processed ' + str(counter) + ' users of ' + str(len(useUser)))
+        counter += 1
+    paddedData = pad_sequences(trainBlocks, maxlen=blockSize,dtype='float32')
+    #paddedData = paddedData[:,:,featureTypeDict[useFeatureType]]
+    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
+
+
+def createTrainDataFromJoblibsPerUser(joblibPaths,minFlowsPerUser = 10,blockSize=10,
+                hostDict=dict(),sipDict=dict(),
+                vtLabelPath='',maxFlowsPerUser = 50000):
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    parameterDict = dict()
+    numBlocksToInitialize = 10000
+    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
+    overallCounter = 0
+    startTime = time.time()
+    for uID in range(len(joblibPaths)):
+        curSavePath = joblibPaths[uID]
+        curData = joblib.load(curSavePath)['dataFrame']
+        if len(curData) < minFlowsPerUser:
+            continue
+        (curUserData,curUserLabel,curHits) = getChunksByUser(curData,'',blockSize,\
+            parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=dict(),flagOnlyOneUser = True)
+        curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
+        if (curPaddedData.shape[0] > maxFlowsPerUser):
+            curPaddedData = curPaddedData[0:maxFlowsPerUser]
+            curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
+            curHits = list(np.array(curHits)[0:maxFlowsPerUser])
+        for i in range(len(curPaddedData)):
+            trainNames.append(curSavePath)
+        trainBlockLabel += curUserLabel
+        trainBlockHits += curHits
+        #curPaddedData = curPaddedData[:,:,featureTypeDict[useFeatureType]]
+        # grow the preallocated array on demand
+        numCurInstances = curPaddedData.shape[0]
+        while overallCounter+numCurInstances > paddedData.shape[0]:
+            paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
+        paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
+        overallCounter += numCurInstances
+        if uID % 1000 == 0:
+            elapsedTime = time.time() - startTime
+            startTime = time.time()
+            print(str(uID+1) + ' user processed [' + str(elapsedTime) + ']')
+    paddedData = paddedData[0:overallCounter]
+    return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
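Both functions above rely on `pad_sequences` to left-pad every block to exactly `blockSize` flows, so short blocks get all-zero rows at the front. A minimal illustration with two made-up blocks:

```python
import numpy as np
from keras.preprocessing.sequence import pad_sequences

blocks = [np.ones([3, 4]), np.ones([10, 4])]        # 3 and 10 flows, 4 features each
padded = pad_sequences(blocks, maxlen=10, dtype='float32')
print(padded.shape)          # (2, 10, 4)
print(padded[0, :7].sum())   # 0.0 -- the short block is zero-padded at the front
```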
+
+def loadDataSetFromJoblibPerUser(trainDirs,minFlowsPerUser = 10,numNegPerDay = 50000,
+                blockSize = 10,hostDict=dict(),sipDict=dict(),
+                seed =1,flagSkipNoLabelUser=False,
+                vtLabelPath='',maxFlowsPerUser = 50000,
+                flagReturnDomains=False):
+    if vtLabelPath != '':
+        vtDF = pd.read_csv(vtLabelPath,sep='\t')
+    else:
+        vtDF = dict()
+    trainBlockLabel = []
+    trainNames = []
+    trainBlockHits = []
+    trainBlockDomains = []
+    parameterDict = dict()
+    numBlocksToInitialize = 10000
+    paddedData = np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])
+    overallCounter = 0
+    for curDirID in range(len(trainDirs)):
+        curDir = trainDirs[curDirID]
+        curLabelFile = curDir + 'data_label.joblib'
+        labelData = joblib.load(curLabelFile)
+        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
+        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
+        random.seed(seed)
+        random.shuffle(negIDs)
+        useIDs = np.concatenate([posIDs,negIDs])
+        counter = 0
+        negCounter = 0
+        startTime = time.time()
+        for uID in range(len(useIDs)):
+            curID = useIDs[uID]
+            curUserName = labelData['usernames'][curID]
+            curSavePath = curDir + str(curUserName) + '.joblib'
+            curData = joblib.load(curSavePath)['dataFrame']
+            if flagSkipNoLabelUser:
+                curUserLabel = np.max(curData['label'])
+                if curUserLabel == -1.0:
+                    continue
+            if len(curData) < minFlowsPerUser:
+                continue
+            if numNegPerDay != -1:
+                if negCounter > numNegPerDay:
+                    break
+            if flagReturnDomains:
+                (curUserData,curUserLabel,curHits,curDomains) = getChunksByUser(curData,curUserName,blockSize,\
+                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
+                    flagReturnDomains=flagReturnDomains)
+            else:
+                (curUserData,curUserLabel,curHits) = getChunksByUser(curData,curUserName,blockSize,\
+                    parameter=parameterDict,hostDict=hostDict,sipDict=sipDict,vtDF=vtDF,\
+                    flagReturnDomains=flagReturnDomains)
+                curDomains = []
+            curPaddedData = pad_sequences(curUserData, maxlen=blockSize,dtype='float32')
+            if (curPaddedData.shape[0] > maxFlowsPerUser):
+                curPaddedData = curPaddedData[0:maxFlowsPerUser]
+                curUserLabel = list(np.array(curUserLabel)[0:maxFlowsPerUser])
+                curHits = list(np.array(curHits)[0:maxFlowsPerUser])
+                if flagReturnDomains:
+                    curDomains = list(np.array(curDomains)[0:maxFlowsPerUser])
+            for i in range(len(curPaddedData)):
+                trainNames.append(curUserName)
+            trainBlockLabel += curUserLabel
+            trainBlockHits += curHits
+            trainBlockDomains += curDomains
+            # grow the preallocated array on demand
+            numCurInstances = curPaddedData.shape[0]
+            while overallCounter+numCurInstances > paddedData.shape[0]:
+                paddedData = np.vstack([paddedData,np.zeros([numBlocksToInitialize,blockSize,globalNumFeatures])])
+            paddedData[overallCounter:overallCounter+numCurInstances,:] = curPaddedData
+            overallCounter += numCurInstances
+            if (counter+1) % 1000 == 0:
+                elapsedTime = time.time() - startTime
+                print('processed ' + str(counter+1) + ' users of ' +\
+                      str(len(useIDs)) + ' with ' + str(len(curData['label'])) +\
+                      ' flows [dir ' + str(curDirID+1) + ' of ' +\
+                      str(len(trainDirs)) + '] in ' + str(elapsedTime) + ' sec')
+                startTime = time.time()
+            if np.max(np.array(curUserLabel)) == 0.0:
+                negCounter += 1
+            counter += 1
+    paddedData = paddedData[0:overallCounter]
+    if flagReturnDomains:
+        return (paddedData,trainBlockLabel,trainNames,trainBlockHits,trainBlockDomains)
+    else:
+        return (paddedData,trainBlockLabel,trainNames,trainBlockHits)
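For training without materializing a full epoch in memory, the `sequenceGenerator` defined earlier can be wired into Keras directly; a sketch, assuming `model` is one of the window models from this repository and `mongoParams` holds your MongoDB connection settings (both names are placeholders):

```python
gen = sequenceGenerator(useDir='2016-07/10/', numUser=64,
                        characterDict=characterDict, **mongoParams)
model.fit_generator(gen, steps_per_epoch=50, epochs=10)
```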
+
+def loadRawDataSetFromJoblibPerUser(trainDirs,numNegPerDay = 2000,
+                seed = 1):
+    dataFrameList = []
+    for curDirID in tqdm(np.arange(len(trainDirs)), miniters=1):
+        curDir = trainDirs[curDirID]
+        curLabelFile = curDir + 'data_label.joblib'
+        labelData = joblib.load(curLabelFile)
+        posIDs = np.where(np.array(labelData['label']) == 1.0)[0]
+        negIDs = np.where(np.array(labelData['label']) == 0.0)[0]
+        random.seed(seed)
+        random.shuffle(negIDs)
+        if len(negIDs) >= numNegPerDay:
+            negIDs = negIDs[0:numNegPerDay]
+        useIDs = np.concatenate([posIDs,negIDs])
+        for uID in range(len(useIDs)):
+            curID = useIDs[uID]
+            curUserName = labelData['usernames'][curID]
+            curSavePath = curDir + str(curUserName) + '.joblib'
+            curData = joblib.load(curSavePath)['dataFrame']
+            dataFrameList.append(curData)
+    return dataFrameList
+
+
+def checkDomainForSecondLevelDomain(inDomain,sldDomainDict):
+    if not isinstance(inDomain, str):
+        return False
+    splitDomain = inDomain.split('.')
+    if len(splitDomain) <= 2:
+        return False
+    sldDomain = splitDomain[-2] + '.' + splitDomain[-1]
+    return sldDomain in sldDomainDict
+
+def save_model(model,jsonPath,h5Path):
+    # saving model architecture
+    json_model = model.to_json()
+    open(jsonPath, 'w').write(json_model)
+    # saving weights
+    model.save_weights(h5Path, overwrite=True)
+
+def load_model(jsonPath,h5Path):
+    # loading model architecture and weights
+    model = model_from_json(open(jsonPath).read())
+    model.load_weights(h5Path)
+    return model
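A round-trip sketch for the two helpers above (paths are made up); the reloaded model can score immediately, but would need to be compiled again before further training:

```python
save_model(model, 'results/lstm.json', 'results/lstm.h5')
restored = load_model('results/lstm.json', 'results/lstm.h5')
scores = restored.predict(testData)[:, 1]   # column 1 = P(malicious)
```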
+
+
+def getResultsFromSavedJoblibFile(joblibFiles,threshold=3):
+    testUserScores = []
+    testUserLabel = []
+    testLabel = []
+    testScores = []
+    testNames = []
+    for joblibPath in joblibFiles:
+        print('process: ' + joblibPath)
+        tmpJoblib = joblib.load(joblibPath)
+        if 'testBlockScores' in tmpJoblib.keys():
+            curTestBlockScores = tmpJoblib['testBlockScores']
+            for i in range(len(curTestBlockScores)):
+                if i == 0:
+                    curTestScores = curTestBlockScores[i]
+                else:
+                    curTestScores = np.concatenate([curTestScores,curTestBlockScores[i]])
+            curTestHits = np.array(tmpJoblib['blockHits'])
+            curTestScores = np.array(curTestScores)
+            curTestLabel = np.ones([len(curTestScores),]) * -1.0
+            curTestLabel[np.where(curTestHits == 0)[0]] = 0.0
+            curTestLabel[np.where(curTestHits >= threshold)[0]] = 1.0
+            curTestNames = tmpJoblib['testNames']
+        else:
+            curTestHits = tmpJoblib['testHits']
+            curTestScores = tmpJoblib['testScores']
+            curTestLabel = tmpJoblib['testLabel']
+            curTestNames = tmpJoblib['testNames']
+
+        # keep only clear negatives (0 hits) and clear positives (>= threshold hits)
+        useIDs = np.where(curTestHits >= threshold)[0]
+        useIDs = np.concatenate([useIDs,np.where(curTestHits == 0.0)[0]])
+        curTestScoresT = curTestScores[useIDs]
+        curTestLabelT = curTestLabel[useIDs]
+        if len(testScores) == 0:
+            testScores = curTestScoresT
+            testLabel = curTestLabelT
+        else:
+            testScores = np.concatenate([testScores,curTestScoresT])
+            testLabel = np.concatenate([testLabel,curTestLabelT])
+
+        if 'testBlockScores' in tmpJoblib.keys():
+            tmpScores = np.array(tmpJoblib['testScores'])
+            tmpHits = np.array(tmpJoblib['testHits'])
+            tmpLabel = np.ones([len(tmpHits),])*-1
+            tmpLabel[np.where(tmpHits == 0.0)[0]] = 0.0
+            tmpLabel[np.where(tmpHits >= threshold)[0]] = 1.0
+            useIDs = np.where(tmpLabel == 1.0)[0]
+            useIDs = np.concatenate([useIDs,np.where(tmpLabel == 0.0)[0]])
+            testUserLabel += list(np.array(tmpLabel)[useIDs])
+            testUserScores += list(np.array(tmpScores)[useIDs])
+        else:
+            # aggregate block scores to user level via the maximum
+            uniqueTestNames = list(np.unique(curTestNames))
+            for testName in uniqueTestNames:
+                curIDs = np.where(curTestNames == testName)[0]
+                curMaxHits = np.max(curTestHits[curIDs])
+                if curMaxHits > 0 and curMaxHits < threshold:
+                    continue
+                if curMaxHits >= threshold:
+                    testUserLabel.append(1.0)
+                else:
+                    testUserLabel.append(0.0)
+                curScore = np.max(curTestScores[curIDs])
+                testUserScores.append(curScore)
+                testNames.append(testName)
+    testUserScores = np.array(testUserScores)
+    testUserLabel = np.array(testUserLabel)
+    testNames = np.array(testNames)
+    return (testUserScores,testUserLabel,testLabel,testScores,testNames)
+
+def checkIfIP(host):
+    ipMask = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
+    return re.search(ipMask, host) is not None
+
+# GLOBAL VALUES
+numCiscoFeatures = 30
+featureTypeDict = {'neural':np.arange(4,104,1),\
+                   'packet':np.array([0,1,2,3]),\
+                   'neural+packet':np.arange(0,104,1),\
+                   'neural+packet+cisco':np.arange(0,104+(2*numCiscoFeatures),1),\
+                   'cisco':np.arange(104,104+(2*numCiscoFeatures),1)}
+
+globalNumFeatures = len(featureTypeDict['neural+packet+cisco'])
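`featureTypeDict` maps feature-group names to column indices of the padded tensors, which is what the commented-out `paddedData[:,:,featureTypeDict[useFeatureType]]` lines above would select. A sketch, assuming `paddedData` comes from `loadDataSetFromJoblibPerUser`:

```python
neuralOnly = paddedData[:, :, featureTypeDict['neural']]   # 100 domain-CNN columns
packetOnly = paddedData[:, :, featureTypeDict['packet']]   # duration/bytes/time-gap
print(neuralOnly.shape, packetOnly.shape)                  # (n, blockSize, 100) (n, blockSize, 4)
```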
diff --git a/cnnOnCnnParameterSelection.py b/cnnOnCnnParameterSelection.py
new file mode 100644
index 0000000..21a5f9d
--- /dev/null
+++ b/cnnOnCnnParameterSelection.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+from tqdm import tqdm
+
+import tensorflow as tf
+config = tf.ConfigProto(log_device_placement=True)
+config.gpu_options.per_process_gpu_memory_fraction = 0.5
+config.gpu_options.allow_growth = True
+session = tf.Session(config=config)
+
+from pymongo import MongoClient
+import joblib
+import pickle
+import numpy as np
+
+import ciscoProcessing as ciscoProcessing
+import stackedNeuralModels as stackedNeuralModels
+
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import auc, roc_curve
+import matplotlib.pyplot as plt
+
+import keras
+from keras.models import Sequential
+from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
+from keras.layers import Convolution1D
+from keras.layers import Input
+from keras.models import Model
+from keras.utils import np_utils
+
+
+if __name__ == "__main__":
+    # parameter
+    innerCNNFilters = 512
+    innerCNNKernelSize = 2
+    cnnDropout = 0.5
+    cnnHiddenDims = 1024
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    threshold = 3
+    minFlowsPerUser = 10
+    numEpochs = 100
+    maxLengthInSeconds = -1
+    timesNeg = -1
+
+    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
+    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
+
+    if 'characterDict' not in locals():
+        characterDictPath = 'trainData/characterIDDict.joblib'
+        characterDict = joblib.load(characterDictPath)['characterIDDict']
+
+    # load train and test data from joblib
+    # created with createTrainDataMultipleTaskLearning.py
+    if 'trainDFs' not in locals():
+        tmpLoad = joblib.load(trainDataPath)
+        trainDFs = tmpLoad['data']
+
+    if 'testDFs' not in locals():
+        tmpLoad = joblib.load(testDataPath)
+        testDFs = tmpLoad['data']
+
+    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
+
+    domainLists = []
+    dfLists = []
+    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
+        (domainListsTmp,dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
+                windowSize=windowSize,overlapping=False,maxLengthInSeconds=maxLengthInSeconds)
+        domainLists += domainListsTmp
+        dfLists += dfListsTmp
+        if i == 100:
+            # debug cap: only use the first 100 users
+            break
+
+    (testData,testLabel,testHits,testNames) = stackedNeuralModels.createTrainData(
+            domainLists=domainLists,dfLists=dfLists,charachterDict=characterDict,
+            maxLen=maxLen,threshold = threshold,
+            flagUseCiscoFeatures=False,urlSIPDIct=dict(),
+            windowSize=windowSize)
+
+    # keep only clear positives and clear negatives
+    useIDs = np.where(testLabel == 1.0)[0]
+    useIDs = np.concatenate([useIDs,np.where(testLabel == 0.0)[0]])
+
+    testLabel = testLabel[useIDs]
+    testHits = testHits[useIDs]
+    testNames = testNames[useIDs]
+    for i in range(len(testData)):
+        testData[i] = testData[i][useIDs]
+
+    # build the model: one shared domain CNN per window slot plus flow features
+    inputList = []
+    encodedList = []
+    numFeatures = flowFeatures
+    for i in range(windowSize):
+        inputList.append(Input(shape=(maxLen,)))
+        encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
+        inputList.append(Input(shape=(numFeatures,)))
+
+    merge_layer_input = []
+    for i in range(windowSize):
+        merge_layer_input.append(encodedList[i])
+        merge_layer_input.append(inputList[(2*i)+1])
+
+    # concatenate the domain and flow vectors of all window slots:
+    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+    reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
+    # add the second cnn
+    cnn = Conv1D(filters,
+                 kernel_size,
+                 activation='relu',
+                 input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
+    # we use max pooling:
+    maxPool = GlobalMaxPooling1D()(cnn)
+    cnnDropoutLayer = Dropout(cnnDropout)(maxPool)
+    cnnDense = Dense(cnnHiddenDims,activation='relu')(cnnDropoutLayer)
+    cnnOutput = Dense(2,activation='softmax')(cnnDense)
+
+    # define a trainable model linking the window inputs to the predictions
+    model = Model(inputs=inputList, outputs=cnnOutput)
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+
+    epochNumber = 0
+    trainLabel = np_utils.to_categorical(testLabel, 2)
+    model.fit(x=testData, y=trainLabel,
+              epochs=epochNumber + 1,shuffle=True,initial_epoch=epochNumber)#,
+              #validation_data=(testData,testLabel))
\ No newline at end of file
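The model above takes 2*windowSize separate inputs, alternating one character tensor and one numeric tensor per window slot; `createTrainData` returns `testData` as a list in exactly that order, which is why it can be passed to `fit` directly. For the parameters used here:

```python
# testData layout for windowSize=10, maxLen=40, flowFeatures=3:
#   testData[0]  -> characters of domain 1,   shape (numSamples, 40)
#   testData[1]  -> flow features of flow 1,  shape (numSamples, 3)
#   ...
#   testData[18] -> characters of domain 10
#   testData[19] -> flow features of flow 10
for i, x in enumerate(testData):
    print(i, x.shape)
```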
diff --git a/stackedNeuralModels.py b/stackedNeuralModels.py
new file mode 100644
index 0000000..23045a8
--- /dev/null
+++ b/stackedNeuralModels.py
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+from keras.models import Sequential
+from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
+from keras.layers import Convolution1D
+import ciscoProcessing as ciscoProcessing
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import joblib
+import csv
+
+import keras
+from keras.layers import Input
+from keras.models import Model
+from keras.utils import np_utils
+
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import auc, roc_curve
+from tqdm import tqdm
+import os
+
+
+def getCiscoFeatures(curDataLine,urlSIPDict):
+    numCiscoFeatures = 30
+    try:
+        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
+        # log transform
+        ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
+        return ciscoFeatures.ravel()
+    except:
+        # unknown domain/server-ip pair -> all-zero feature vector
+        return np.zeros([numCiscoFeatures,]).ravel()
+
+
+def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
+                           hidden_dims,drop_out):
+    model = Sequential()
+    model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
+                        input_length=input_length))
+    model.add(Conv1D(filters,
+                     kernel_size,
+                     activation='relu'))
+    # we use max pooling:
+    model.add(GlobalMaxPooling1D())
+    # we add a vanilla hidden layer:
+    model.add(Dense(hidden_dims))
+    model.add(Dropout(drop_out))
+    model.add(Activation('relu'))
+    return model
+
+def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size,
+                                    hidden_dims,drop_out):
+    a = Input(shape=(input_length,))
+    embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a)
+    conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding)
+    glob = GlobalMaxPooling1D()(conv1)
+    dense = Dense(hidden_dims)(glob)
+    drop = Dropout(drop_out)(dense)
+    model = Activation('relu')(drop)
+    model = Model(a, model)
+    return model
+
+def getFlowFeatureLayer(numFeatures):
+    # identity layer that just passes the numeric flow features through
+    model = Sequential()
+    model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
+    return model
+
+
+def createCNNDataSet(domains,label,characterDict,maxLen=40):
+    # encode domains in reverse order, i.e. position 0 holds the last character
+    outFeature = np.zeros([len(domains),maxLen])
+    outLabel = np.zeros([len(domains),])
+    for i in range(len(domains)):
+        domain = domains[i]
+        curLabel = label[i]
+        curFeature = np.zeros([maxLen,])
+        for j in range(np.min([len(domain),maxLen])):
+            curCharacter = domain[-(j + 1)]
+            if curCharacter in characterDict:
+                curFeature[j] = characterDict[curCharacter]
+        outFeature[i] = curFeature
+        outLabel[i] = curLabel
+    return (outFeature,outLabel)
+
+def getFeatureVecForDomain(domain,characterDict,maxLen=40):
+    # encode a single domain in reverse order (see createCNNDataSet)
+    curFeature = np.zeros([maxLen,])
+    for j in range(np.min([len(domain),maxLen])):
+        curCharacter = domain[-(j + 1)]
+        if curCharacter in characterDict:
+            curFeature[j] = characterDict[curCharacter]
+    return curFeature
+
+def getFlowFeatures(curDataLine):
+    useKeys = ['duration','bytes_down','bytes_up']
+    curFeature = np.zeros([len(useKeys),])
+    for i in range(len(useKeys)):
+        curKey = useKeys[i]
+        try:
+            curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32')
+        except:
+            pass
+    return curFeature
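`getFeatureVecForDomain` encodes a domain right-to-left into character IDs (after the index fix above, position 0 holds the final character). With a toy character dictionary:

```python
toyDict = {'a': 1, 'b': 2, 'c': 3, '.': 4}   # hypothetical characterDict
print(getFeatureVecForDomain('ab.ca', toyDict, maxLen=8))
# [1. 3. 4. 2. 1. 0. 0. 0.]  -- 'a','c','.','b','a' read from the end, zero-padded
```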
+
+
+def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
+                               maxLengthInSeconds=300):
+    maxMilliSeconds = maxLengthInSeconds * 1000
+    outDomainLists = []
+    outDFFrames = []
+    if overlapping == False:
+        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)]
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                # drop flows arriving more than maxLengthInSeconds after the first flow of the window
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    else:
+        numBlocks = len(dataFrame) + 1 - windowSize
+        userIDs = np.arange(len(dataFrame))
+        for blockID in np.arange(numBlocks):
+            curIDs = userIDs[blockID:blockID+windowSize]
+            useData = dataFrame.iloc[curIDs]
+            curDomains = useData['domain']
+            if maxLengthInSeconds != -1:
+                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
+                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
+                if len(underTimeOutIDs) != len(curIDs):
+                    curIDs = curIDs[underTimeOutIDs]
+                    useData = dataFrame.iloc[curIDs]
+                    curDomains = useData['domain']
+            outDomainLists.append(list(curDomains))
+            outDFFrames.append(useData)
+    return (outDomainLists,outDFFrames)
+
+
+def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
+                    flagUseCiscoFeatures=False,urlSIPDIct=dict(),
+                    windowSize=10):
+    if 'hits' in dfLists[0].keys():
+        hitName = 'hits'
+    elif 'virusTotalHits' in dfLists[0].keys():
+        hitName = 'virusTotalHits'
+    numFlowFeatures = 3
+    numCiscoFeatures = 30
+    numFeatures = numFlowFeatures
+    if flagUseCiscoFeatures:
+        numFeatures += numCiscoFeatures
+    outputFeatures = []
+    label = []
+    hits = []
+    trainNames = []
+    # one character tensor and one numeric tensor per window slot
+    for i in range(windowSize):
+        outputFeatures.append(np.zeros([len(domainLists),maxLen]))
+        outputFeatures.append(np.zeros([len(domainLists),numFeatures]))
+
+    for i in tqdm(np.arange(len(domainLists)), miniters=10):
+        curCounter = 0
+        for j in range(np.min([windowSize,len(domainLists[i])])):
+            outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen)
+            curCounter += 1
+            if flagUseCiscoFeatures:
+                outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
+                outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct)
+            else:
+                outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j])
+            curCounter += 1
+        # block label: 1.0 for >= threshold hits, 0.0 for zero hits,
+        # -1.0 for unlabeled, -2.0 for hits between 1 and threshold-1
+        curLabel = 0.0
+        if np.max(dfLists[i][hitName]) >= threshold:
+            curLabel = 1.0
+        elif np.max(dfLists[i][hitName]) == -1:
+            curLabel = -1.0
+        elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
+            curLabel = -2.0
+        label.append(curLabel)
+        hits.append(np.max(dfLists[i][hitName]))
+        trainNames.append(np.unique(dfLists[i]['user_hash']))
+    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
+
+
+def transformStringListToNumpyArray(listString):
+    listString = listString.replace('[','').replace(']','')
+    return np.array(listString.split(','),dtype='float32')
+
+def getCiscoFeatureDict(csvPathList):
+    outDict = dict()
+    for path in tqdm(csvPathList, miniters=1):
+        fobj = open(path,'r')
+        csvReader = csv.DictReader(fobj,delimiter=',')
+        for row in csvReader:
+            urlSIPString = row['Domain'] + row['ServerIP']
+            ciscoFeatures = row['CiscoFeature']
+            outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
+    return outDict
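`getCiscoFeatureDict` keys each row by the concatenation of domain and server IP, with the feature vector serialized as a bracketed string. A sketch of the expected row format (values made up):

```python
# CSV columns: Domain,ServerIP,CiscoFeature
row = {'Domain': 'example.com', 'ServerIP': '10.0.0.1', 'CiscoFeature': '[0.1,0.0,2.5]'}
key = row['Domain'] + row['ServerIP']                      # 'example.com10.0.0.1'
vec = transformStringListToNumpyArray(row['CiscoFeature'])
print(key, vec)                                            # example.com10.0.0.1 [0.1 0.  2.5]
```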
+
+
+if __name__ == "__main__":
+
+    # get data
+    trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
+                          'trainData/joblib2016-07-annomalous-stg-new/09/',
+                          'trainData/joblib2016-07-annomalous-stg-new/08/',
+                          'trainData/joblib2016-07-annomalous-stg-new/07/',
+                          'trainData/joblib2016-07-annomalous-stg-new/06/']
+
+    testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/08/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/09/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/10/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/11/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/12/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/13/',\
+                         'trainData/joblib2016-09-annomalous-stg-new/14/']
+
+    trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
+                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
+
+    testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
+                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
+
+    # parameter
+    numNegPerDay = 5000
+    numEpochs = 10
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+
+    lstmUnits = 32
+    lstmDenseSize = 128
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    flagUseCiscoFeatures = True
+    threshold = 3
+    resultStoreDir = 'results/201705/'
+    if flagUseCiscoFeatures:
+        resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
+        resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
+    else:
+        resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
+        resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
+    flagRedo = True
+
+    if flagUseCiscoFeatures:
+        if 'trainCiscoFeatureDict' not in locals():
+            trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
+        if 'testCiscoFeatureDict' not in locals():
+            testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
+    else:
+        trainCiscoFeatureDict = dict()
+        testCiscoFeatureDict = dict()
+
+    if flagRedo or not os.path.exists(resultStorePath):
+        if 'characterDict' not in locals():
+            characterDictPath = 'trainData/characterIDDict.joblib'
+            characterDict = joblib.load(characterDictPath)['characterIDDict']
+
+        print('create train data')
+        if 'dataFrameList' not in locals():
+            (dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
+                trainDirsUserLevel,numNegPerDay = numNegPerDay)
+            maxHits = []
+            for i in range(len(dataFrameList)):
+                maxHits.append(np.max(dataFrameList[i]['hits']))
+
+        print('create test data')
+        # validation error
+        if 'testDataFrameList' not in locals():
+            (testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
+                [testDirsUserLevel[0]],numNegPerDay = numNegPerDay)
+            maxHits = []
+            for i in range(len(testDataFrameList)):
+                maxHits.append(np.max(testDataFrameList[i]['hits']))
+
+        sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
+
+        inputList = []
+        encodedList = []
+        numFeatures = flowFeatures
+        if flagUseCiscoFeatures:
+            numFeatures += numCiscoFeatures
+        for i in range(windowSize):
+            inputList.append(Input(shape=(maxLen,)))
+            encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
+            inputList.append(Input(shape=(numFeatures,)))
+
+        merge_layer_input = []
+        for i in range(windowSize):
+            merge_layer_input.append(encodedList[i])
+            merge_layer_input.append(inputList[(2*i)+1])
+
+        # concatenate the domain and flow vectors of all window slots:
+        merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+        reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
+        lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
+        dense = Dense(lstmDenseSize, activation='relu')(lstm)
+        dropout = Dropout(0.5)(dense)
+        # and add a logistic regression on top
+        predictions = Dense(2, activation='softmax')(dropout)
+
+        # define a trainable model linking the window inputs to the predictions
+        model = Model(inputs=inputList, outputs=predictions)
+        model.compile(optimizer='adam',
+                      loss='binary_crossentropy',
+                      metrics=['accuracy'])
+
+        # get train data
+        domainLists = []
+        dfLists = []
+        for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
+            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False)
+            domainLists += domainListsTmp
+            dfLists += dfListsTmp
+
+        (trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict,
+                maxLen,threshold = threshold,
+                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict)
+        # keep only clear negatives (0 hits) and clear positives (>= threshold hits)
+        useIDs = np.where(trainHits == 0)[0]
+        useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]])
+        for i in range(len(trainData)):
+            trainData[i] = np.array(trainData[i])[useIDs]
+        trainLabel = trainLabel[useIDs]
+        trainHits = trainHits[useIDs]
+        trainNames = trainNames[useIDs]
+
+        # get test data
+        domainLists = []
+        dfLists = []
+        for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
+            (domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False)
+            domainLists += domainListsTmp
+            dfLists += dfListsTmp
+
+        (testData,testLabel,testHits,testNames) = createTrainData(domainLists,dfLists,characterDict,
+                maxLen,threshold = threshold,
+                flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict)
+        useIDs = np.where(testHits == 0)[0]
+        useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]])
+        for i in range(len(testData)):
+            testData[i] = np.array(testData[i])[useIDs]
+        testLabel = testLabel[useIDs]
+        testHits = testHits[useIDs]
+        testNames = testNames[useIDs]
+
+        numPos = len(np.where(trainLabel == 1.0)[0])
+        numNeg = len(np.where(trainLabel == 0.0)[0])
+        print('major class: ' + str(float(numNeg) / float(numNeg + numPos)))
+        lstmLabel = np_utils.to_categorical(trainLabel, 2)
+        lstmTestLabel = np_utils.to_categorical(testLabel, 2)
+        trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel))
+
+        # save lstm model
+        ciscoProcessing.save_model(model,resultModelPath+'.json',
+                                   resultModelPath + '.h5')
+
+        # classify train and test data
+        trainScores = model.predict(trainData)[:,1]
+        testScores = model.predict(testData)[:,1]
+
+        joblib.dump({'testLabel':testLabel,
+                     'testHits':testHits,
+                     'testNames':testNames,
+                     'testScores':testScores,
+                     'trainLabel':trainLabel,
+                     'trainScores':trainScores},resultStorePath,compress=3)
\ No newline at end of file
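The dumped joblib file can be evaluated offline with the sklearn metrics already imported at the top of the file; a sketch, assuming `resultStorePath` points at a finished run:

```python
import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, auc

stored = joblib.load(resultStorePath)
fpr, tpr, _ = roc_curve(stored['testLabel'], stored['testScores'])
precision, recall, _ = precision_recall_curve(stored['testLabel'], stored['testScores'])
print('ROC AUC: ' + str(auc(fpr, tpr)))
```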