# -*- coding: utf-8 -*-
import csv
import os

import joblib
import numpy as np
from tqdm import tqdm
from sklearn.metrics import auc, precision_recall_curve, roc_curve

import keras
from keras.layers import (Dense, Activation, LSTM, Embedding, Dropout, Conv1D,
                          GlobalMaxPooling1D, Reshape, Lambda, Input)
from keras.models import Model, Sequential
from keras.utils import np_utils

import ciscoProcessing


def getCiscoFeatures(curDataLine, urlSIPDict):
    """Look up the Cisco feature vector for a (domain, server_ip) pair; zeros if absent."""
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
        return ciscoFeatures.ravel()
    except KeyError:
        return np.zeros([numCiscoFeatures, ], dtype='float32')


def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters,
                           kernel_size, hidden_dims, drop_out):
    """Character CNN without the final classification layer (Sequential API)."""
    model = Sequential()
    model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
                        input_length=input_length))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    # max-over-time pooling
    model.add(GlobalMaxPooling1D())
    # vanilla hidden layer
    model.add(Dense(hidden_dims))
    model.add(Dropout(drop_out))
    model.add(Activation('relu'))
    return model


def getCNNWithoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters,
                                     kernel_size, hidden_dims, drop_out):
    """Same CNN as above, built with the functional API so it can be shared."""
    a = Input(shape=(input_length,))
    embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a)
    conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding)
    glob = GlobalMaxPooling1D()(conv1)
    dense = Dense(hidden_dims)(glob)
    drop = Dropout(drop_out)(dense)
    out = Activation('relu')(drop)
    return Model(a, out)


def getFlowFeatureLayer(numFeatures):
    """Identity layer that passes raw flow features through unchanged."""
    model = Sequential()
    model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
    return model


def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    """Encode a domain right-to-left as character IDs, zero-padded to maxLen."""
    curFeature = np.zeros([maxLen, ])
    for j in range(min(len(domain), maxLen)):
        # j = 0 is the last character, j = 1 the second-to-last, and so on
        # (the original domain[-j] wrongly picked the *first* character at j = 0)
        curCharacter = domain[-(j + 1)]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature


def createCNNDataSet(domains, label, characterDict, maxLen=40):
    """Encode a list of domains (characters in reverse order) plus their labels."""
    outFeature = np.zeros([len(domains), maxLen])
    outLabel = np.zeros([len(domains), ])
    for i in range(len(domains)):
        outFeature[i] = getFeatureVecForDomain(domains[i], characterDict, maxLen)
        outLabel[i] = label[i]
    return (outFeature, outLabel)


def getFlowFeatures(curDataLine):
    """log1p-transformed duration / bytes_down / bytes_up of a single flow."""
    useKeys = ['duration', 'bytes_down', 'bytes_up']
    curFeature = np.zeros([len(useKeys), ])
    for i, curKey in enumerate(useKeys):
        try:
            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
        except (KeyError, TypeError, ValueError):
            pass  # leave 0.0 if the field is missing or not numeric
    return curFeature


def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
                               maxLengthInSeconds=300):
    """Split one user's flows into windows of windowSize rows.

    Unless maxLengthInSeconds is -1, each window is trimmed to the flows whose
    timeStamp lies within maxLengthInSeconds of the window's first flow.
    """
    maxMilliSeconds = maxLengthInSeconds * 1000
    outDomainLists = []
    outDFFrames = []
    userIDs = np.arange(len(dataFrame))
    if not overlapping:
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
        blockStarts = [blockID * windowSize for blockID in range(numBlocks)]
    else:
        # stride 1: every window of windowSize consecutive flows
        numBlocks = len(dataFrame) + 1 - windowSize
        blockStarts = list(range(numBlocks))
    for blockStart in blockStarts:
        curIDs = userIDs[blockStart:blockStart + windowSize]
        useData = dataFrame.iloc[curIDs]
        if maxLengthInSeconds != -1:
            curMaxMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
            # np.where returns a tuple; take the index array itself
            # (the original compared len() of the tuple, so trimming never fired)
            underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMaxMilliSeconds)[0]
            if len(underTimeOutIDs) != len(curIDs):
                curIDs = curIDs[underTimeOutIDs]
                useData = dataFrame.iloc[curIDs]
        outDomainLists.append(list(useData['domain']))
        outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)
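# Example (illustrative): with windowSize=3 and overlapping=False, a user with
# 7 flows yields the row windows [0,1,2], [3,4,5], [6]; with overlapping=True
# it yields [0,1,2], [1,2,3], ..., [4,5,6]. Each window is then trimmed to the
# flows that start within maxLengthInSeconds of its first flow.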
def createTrainData(domainLists, dfLists, characterDict, maxLen, threshold=3,
                    flagUseCiscoFeatures=False, urlSIPDict=None, windowSize=10):
    """Build the network inputs and labels from windowed user data.

    Returns 2*windowSize feature arrays, alternating a (numSamples, maxLen)
    domain matrix with a (numSamples, numFeatures) flow/Cisco matrix for each
    position in the window. Labels: 1.0 if the window's maximum hit count is
    >= threshold, 0.0 if it is 0, -1.0 for unknown (-1), and -2.0 for counts
    strictly between 0 and threshold.
    """
    if urlSIPDict is None:
        urlSIPDict = dict()
    if 'hits' in dfLists[0].keys():
        hitName = 'hits'
    elif 'virusTotalHits' in dfLists[0].keys():
        hitName = 'virusTotalHits'
    numFlowFeatures = 3
    numCiscoFeatures = 30
    numFeatures = numFlowFeatures
    if flagUseCiscoFeatures:
        numFeatures += numCiscoFeatures
    outputFeatures = []
    label = []
    hits = []
    trainNames = []
    for i in range(windowSize):
        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
    for i in tqdm(np.arange(len(domainLists)), miniters=10):
        curCounter = 0
        for j in range(min(windowSize, len(domainLists[i]))):
            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(
                domainLists[i][j], characterDict, maxLen)
            curCounter += 1
            if flagUseCiscoFeatures:
                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDict)
            else:
                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
            curCounter += 1
        curHits = np.max(dfLists[i][hitName])
        if curHits >= threshold:
            curLabel = 1.0
        elif curHits == -1:
            curLabel = -1.0
        elif 0 < curHits < threshold:
            curLabel = -2.0
        else:
            curLabel = 0.0
        label.append(curLabel)
        hits.append(curHits)
        trainNames.append(np.unique(dfLists[i]['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))


def transformStringListToNumpyArray(listString):
    """Parse a string like '[0.1,0.2,0.3]' into a float32 numpy array."""
    listString = listString.replace('[', '').replace(']', '')
    return np.array(listString.split(','), dtype='float32')


def getCiscoFeatureDict(csvPathList):
    """Read the Cisco feature CSVs into a dict keyed by domain + server IP."""
    outDict = dict()
    for path in tqdm(csvPathList, miniters=1):
        with open(path, 'r') as fobj:
            csvReader = csv.DictReader(fobj, delimiter=',')
            for row in csvReader:
                urlSIPString = row['Domain'] + row['ServerIP']
                outDict[urlSIPString] = transformStringListToNumpyArray(row['CiscoFeature'])
    return outDict
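# Expected Cisco feature CSV layout (illustrative row, not real data):
#
#   Domain,ServerIP,CiscoFeature
#   example.com,10.0.0.1,"[0.0,1.5,0.3, ...30 values... ]"
#
# getCiscoFeatureDict() keys each parsed vector by Domain + ServerIP, the same
# key getCiscoFeatures() builds from a flow's 'domain' and 'server_ip' fields.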
if __name__ == "__main__":
    # get data
    trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
                          'trainData/joblib2016-07-annomalous-stg-new/09/',
                          'trainData/joblib2016-07-annomalous-stg-new/08/',
                          'trainData/joblib2016-07-annomalous-stg-new/07/',
                          'trainData/joblib2016-07-annomalous-stg-new/06/']

    testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',
                         'trainData/joblib2016-09-annomalous-stg-new/08/',
                         'trainData/joblib2016-09-annomalous-stg-new/09/',
                         'trainData/joblib2016-09-annomalous-stg-new/10/',
                         'trainData/joblib2016-09-annomalous-stg-new/11/',
                         'trainData/joblib2016-09-annomalous-stg-new/12/',
                         'trainData/joblib2016-09-annomalous-stg-new/13/',
                         'trainData/joblib2016-09-annomalous-stg-new/14/']

    trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']

    testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']

    # parameters
    numNegPerDay = 5000
    numEpochs = 10
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    lstmUnits = 32
    lstmDenseSize = 128
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    flagUseCiscoFeatures = True
    threshold = 3
    resultStoreDir = 'results/201705/'
    if flagUseCiscoFeatures:
        resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
        resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
    else:
        resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
        resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
    flagRedo = True

    # the 'not in locals()' checks skip expensive reloads when rerunning
    # interactively (e.g. in an IPython session)
    if flagUseCiscoFeatures:
        if 'trainCiscoFeatureDict' not in locals():
            trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
        if 'testCiscoFeatureDict' not in locals():
            testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
    else:
        trainCiscoFeatureDict = dict()
        testCiscoFeatureDict = dict()

    if flagRedo or not os.path.exists(resultStorePath):
        if 'characterDict' not in locals():
            characterDictPath = 'trainData/characterIDDict.joblib'
            characterDict = joblib.load(characterDictPath)['characterIDDict']

        print('create train data')
        if 'dataFrameList' not in locals():
            dataFrameList = ciscoProcessing.loadRawDataSetFromJoblibPerUser(
                trainDirsUserLevel, numNegPerDay=numNegPerDay)
        maxHits = []
        for i in range(len(dataFrameList)):
            maxHits.append(np.max(dataFrameList[i]['hits']))

        print('create test data')
        # only the first test day is loaded here; it doubles as validation data
        if 'testDataFrameList' not in locals():
            testDataFrameList = ciscoProcessing.loadRawDataSetFromJoblibPerUser(
                [testDirsUserLevel[0]], numNegPerDay=numNegPerDay)
        maxHits = []
        for i in range(len(testDataFrameList)):
            maxHits.append(np.max(testDataFrameList[i]['hits']))
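        # Architecture: windowSize copies of the shared character CNN (one per
        # domain in the window), each concatenated with that flow's numeric
        # features, reshaped into a (windowSize, domainFeatures + numFeatures)
        # sequence, and fed through an LSTM with a 2-class softmax head.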
        sharedCNNFun = getCNNWithoutLastLayerFunctional(
            len(characterDict) + 1, embeddingSize, maxLen, domainFeatures,
            kernel_size, domainFeatures, 0.5)

        numFeatures = flowFeatures
        if flagUseCiscoFeatures:
            numFeatures += numCiscoFeatures

        inputList = []
        encodedList = []
        for i in range(windowSize):
            # shared domain CNN applied to the i-th domain of the window ...
            inputList.append(Input(shape=(maxLen,)))
            encodedList.append(sharedCNNFun(inputList[-1]))
            # ... plus an input for the i-th flow's numeric features
            inputList.append(Input(shape=(numFeatures,)))

        # interleave each CNN encoding with its matching flow feature input
        merge_layer_input = []
        for i in range(windowSize):
            merge_layer_input.append(encodedList[i])
            merge_layer_input.append(inputList[(2 * i) + 1])

        # concatenate everything, then restore the window as a time axis
        merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
        reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
        lstm = LSTM(lstmUnits)(reshape)
        dense = Dense(lstmDenseSize, activation='relu')(lstm)
        dropout = Dropout(0.5)(dense)
        # logistic-regression-style head on top
        predictions = Dense(2, activation='softmax')(dropout)

        # trainable model linking the window inputs to the prediction
        model = Model(inputs=inputList, outputs=predictions)
        # note: with a 2-unit softmax, categorical_crossentropy would be the
        # conventional pairing; binary_crossentropy also works in Keras here
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # get train data
        domainLists = []
        dfLists = []
        for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
            (domainListsTmp, dfListsTmp) = getChunksFromUserDataFrame(
                dataFrameList[i], windowSize=windowSize, overlapping=False)
            domainLists += domainListsTmp
            dfLists += dfListsTmp
        (trainData, trainLabel, trainHits, trainNames) = createTrainData(
            domainLists, dfLists, characterDict, maxLen, threshold=threshold,
            flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDict=trainCiscoFeatureDict)

        # keep only clear negatives (0 hits) and clear positives (>= threshold)
        useIDs = np.where(trainHits == 0)[0]
        useIDs = np.concatenate([useIDs, np.where(trainHits >= threshold)[0]])
        for i in range(len(trainData)):
            trainData[i] = np.array(trainData[i])[useIDs]
        trainLabel = trainLabel[useIDs]
        trainHits = trainHits[useIDs]
        trainNames = trainNames[useIDs]

        # get test data
        domainLists = []
        dfLists = []
        for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
            (domainListsTmp, dfListsTmp) = getChunksFromUserDataFrame(
                testDataFrameList[i], windowSize=windowSize, overlapping=False)
            domainLists += domainListsTmp
            dfLists += dfListsTmp
        (testData, testLabel, testHits, testNames) = createTrainData(
            domainLists, dfLists, characterDict, maxLen, threshold=threshold,
            flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDict=testCiscoFeatureDict)

        useIDs = np.where(testHits == 0)[0]
        useIDs = np.concatenate([useIDs, np.where(testHits >= threshold)[0]])
        for i in range(len(testData)):
            testData[i] = np.array(testData[i])[useIDs]
        testLabel = testLabel[useIDs]
        testHits = testHits[useIDs]
        testNames = testNames[useIDs]

        numPos = len(np.where(trainLabel == 1.0)[0])
        numNeg = len(np.where(trainLabel == 0.0)[0])
        print('majority (negative) class fraction: ' + str(float(numNeg) / float(numNeg + numPos)))

        lstmLabel = np_utils.to_categorical(trainLabel, 2)
        lstmTestLabel = np_utils.to_categorical(testLabel, 2)
        trainHist = model.fit(trainData, lstmLabel, epochs=numEpochs, batch_size=128,
                              validation_data=(testData, lstmTestLabel))

        # save the trained model: architecture as JSON, weights as HDF5
        ciscoProcessing.save_model(model, resultModelPath + '.json',
                                   resultModelPath + '.h5')
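        # Reloading the saved model later: a minimal sketch, assuming
        # ciscoProcessing.save_model writes a standard Keras architecture JSON
        # plus an HDF5 weight file (which the .json/.h5 arguments suggest):
        #
        #   from keras.models import model_from_json
        #   with open(resultModelPath + '.json') as f:
        #       reloaded = model_from_json(f.read())
        #   reloaded.load_weights(resultModelPath + '.h5')
        #   reloaded.compile(optimizer='adam', loss='binary_crossentropy',
        #                    metrics=['accuracy'])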
        # score train and test windows: probability of the positive class
        trainScores = model.predict(trainData)[:, 1]
        testScores = model.predict(testData)[:, 1]

        joblib.dump({'testLabel': testLabel,
                     'testHits': testHits,
                     'testNames': testNames,
                     'testScores': testScores,
                     'trainLabel': trainLabel,
                     'trainScores': trainScores},
                    resultStorePath, compress=3)
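        # Added sanity check (not part of the stored results): threshold-free
        # metrics on the held-out day, using the sklearn imports above.
        (fpr, tpr, _) = roc_curve(testLabel, testScores)
        print('test ROC AUC: ' + str(auc(fpr, tpr)))
        (precision, recall, _) = precision_recall_curve(testLabel, testScores)
        print('test PR AUC: ' + str(auc(recall, precision)))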