# -*- coding: utf-8 -*-
"""Feature extraction helpers and Keras building blocks for domain/flow data.

The functions here turn per-flow records (domain string, server IP, duration,
byte counts) into numeric numpy arrays and build the Keras sub-models that
consume those arrays.
"""
import csv

import numpy as np
from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from tqdm import tqdm

# Keys read from a data line by getFlowFeatures, in output order.
_FLOW_FEATURE_KEYS = ('duration', 'bytes_down', 'bytes_up')
_NUM_FLOW_FEATURES = len(_FLOW_FEATURE_KEYS)
# Length of the zero vector returned when no cisco features are available.
_NUM_CISCO_FEATURES = 30
# Errors that mean "value missing or not numeric" during best-effort feature
# extraction.  Deliberately narrower than a bare ``except`` so that
# KeyboardInterrupt / SystemExit still propagate.
_FEATURE_LOOKUP_ERRORS = (KeyError, IndexError, TypeError, ValueError, AttributeError)


def getCiscoFeatures(curDataLine, urlSIPDict):
    """Return the log1p-transformed cisco feature vector for one data line.

    The lookup key is ``str(domain) + str(server_ip)`` of ``curDataLine``.

    :param curDataLine: mapping-like record providing 'domain' and 'server_ip'
    :param urlSIPDict: mapping from that key to a numeric feature array
    :return: flattened float32 array, or a zero vector of length 30 when the
        key is unknown or the stored value cannot be transformed
    """
    try:
        lookupKey = str(curDataLine['domain']) + str(curDataLine['server_ip'])
        rawFeatures = urlSIPDict[lookupKey]
        return np.log1p(rawFeatures, dtype='float32').ravel()
    except _FEATURE_LOOKUP_ERRORS:
        return np.zeros([_NUM_CISCO_FEATURES, ]).ravel()


def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size, hidden_dims, drop_out):
    """Build a Sequential character CNN that ends at the hidden ReLU layer.

    Layer order: Embedding -> Conv1D(relu) -> GlobalMaxPooling1D ->
    Dense(hidden_dims) -> Dropout(drop_out) -> ReLU.  No output layer is added.

    :return: the (uncompiled) keras Sequential model
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize, input_length=input_length))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    # Collapse the sequence axis by taking the maximum over it.
    model.add(GlobalMaxPooling1D())
    # Plain fully connected hidden layer; the activation is applied after dropout.
    model.add(Dense(hidden_dims))
    model.add(Dropout(drop_out))
    model.add(Activation('relu'))
    return model


def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size, hidden_dims, drop_out):
    """Functional-API variant of getCNNWithoutLastLayer.

    Uses the same layer order (Embedding -> Conv1D(relu) -> GlobalMaxPooling1D
    -> Dense -> Dropout -> ReLU) on an Input of shape ``(input_length,)``.

    :return: the (uncompiled) keras Model
    """
    inputs = Input(shape=(input_length,))
    hidden = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(inputs)
    hidden = Conv1D(filters, kernel_size, activation='relu')(hidden)
    hidden = GlobalMaxPooling1D()(hidden)
    hidden = Dense(hidden_dims)(hidden)
    hidden = Dropout(drop_out)(hidden)
    outputs = Activation('relu')(hidden)
    return Model(inputs, outputs)


def getFlowFeatureLayer(numFeatures):
    """Build a Sequential model that passes ``numFeatures`` inputs through.

    The single Lambda layer adds 0.0 to its input, leaving values unchanged.
    """
    model = Sequential()
    model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
    return model


def createCNNDataSet(domains, label, characterDict, maxLen=40):
    """Encode every domain as a fixed-length vector and collect the labels.

    Each domain is encoded with getFeatureVecForDomain (characters in reverse
    order, zero padded / truncated to ``maxLen``).

    :param domains: indexable collection of domain strings
    :param label: indexable collection of labels, parallel to ``domains``
    :param characterDict: mapping from character to numeric code
    :param maxLen: length of each encoded vector
    :return: tuple ``(features, labels)`` with shapes
        ``(len(domains), maxLen)`` and ``(len(domains),)``
    """
    numDomains = len(domains)
    outFeature = np.zeros([numDomains, maxLen])
    outLabel = np.zeros([numDomains, ])
    for i in range(numDomains):
        outFeature[i] = getFeatureVecForDomain(domains[i], characterDict, maxLen)
        outLabel[i] = label[i]
    return (outFeature, outLabel)


def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    """Encode a domain's characters, last character first, into a vector.

    Position ``j`` of the result holds the code of the ``(j + 1)``-th
    character counted from the end of ``domain``.  Characters missing from
    ``characterDict`` and unused positions stay 0.

    NOTE: this previously indexed ``domain[-j]``, which for ``j == 0`` reads
    the *first* character (``-0 == 0``) and therefore never reached a true
    reverse order.  Models trained on that encoding expect the old layout.

    :param domain: domain string
    :param characterDict: mapping from character to numeric code
    :param maxLen: length of the returned vector
    :return: float array of length ``maxLen``
    """
    curFeature = np.zeros([maxLen, ])
    for j in range(min(len(domain), maxLen)):
        curCharacter = domain[-(j + 1)]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature


def getFlowFeatures(curDataLine):
    """Return log1p of 'duration', 'bytes_down' and 'bytes_up' of a data line.

    A value that is missing or cannot be transformed is left at 0 so a single
    bad field does not discard the whole record.

    :param curDataLine: mapping-like record
    :return: float array of length 3
    """
    curFeature = np.zeros([_NUM_FLOW_FEATURES, ])
    for i, curKey in enumerate(_FLOW_FEATURE_KEYS):
        try:
            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
        except _FEATURE_LOOKUP_ERRORS:
            # Best effort: keep the zero default for this feature.
            continue
    return curFeature


def _dropRowsAfterTimeout(dataFrame, curIDs, maxMilliSeconds):
    """Keep the positions in curIDs whose 'timeStamp' is within the timeout.

    The timeout is measured from the smallest 'timeStamp' of the window.
    """
    timeStamps = dataFrame.iloc[curIDs]['timeStamp']
    latestAllowed = np.min(timeStamps) + maxMilliSeconds
    withinTimeout = np.array(timeStamps) <= latestAllowed
    return curIDs[withinTimeout]


def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False, maxLengthInSeconds=300):
    """Split a data frame into windows of at most ``windowSize`` rows.

    :param dataFrame: pandas DataFrame with a 'domain' column and, unless
        ``maxLengthInSeconds`` is -1, a 'timeStamp' column (compared against
        ``maxLengthInSeconds * 1000``)
    :param windowSize: number of rows per window
    :param overlapping: False -> consecutive disjoint blocks (the last one may
        be shorter); True -> sliding windows with stride 1, none at all when
        the frame has fewer than ``windowSize`` rows
    :param maxLengthInSeconds: rows later than this after the window's earliest
        timestamp are dropped from the window; -1 disables the filter
    :return: tuple ``(domainLists, frames)``: per window the list of domains
        and the corresponding slice of ``dataFrame``
    """
    maxMilliSeconds = maxLengthInSeconds * 1000
    numRows = len(dataFrame)
    if not overlapping:
        numBlocks = int(np.ceil(float(numRows) / float(windowSize)))
        windowStarts = np.arange(numBlocks) * windowSize
    else:
        # A non-positive count yields an empty range, i.e. no windows.
        windowStarts = np.arange(numRows + 1 - windowSize)

    rowIDs = np.arange(numRows)
    outDomainLists = []
    outDFFrames = []
    for start in windowStarts:
        curIDs = rowIDs[start:start + windowSize]
        if maxLengthInSeconds != -1:
            curIDs = _dropRowsAfterTimeout(dataFrame, curIDs, maxMilliSeconds)
        useData = dataFrame.iloc[curIDs]
        outDomainLists.append(list(useData['domain']))
        outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)


def _findHitColumn(dataFrame):
    """Return the name of the hit-count column present in dataFrame."""
    for candidate in ('hits', 'virusTotalHits'):
        if candidate in dataFrame.keys():
            return candidate
    raise KeyError("data frame has neither a 'hits' nor a 'virusTotalHits' column")


def _labelFromHits(maxHits, threshold):
    """Map the largest hit count of a window to its label value."""
    if maxHits >= threshold:
        return 1.0
    if maxHits == -1:
        return -1.0
    if 0 < maxHits < threshold:
        return -2.0
    return 0.0


def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
                    flagUseCiscoFeatures=False, urlSIPDIct=dict, windowSize=10):
    """Build per-position input arrays and labels for a list of windows.

    For every window position ``j < windowSize`` two arrays are produced, in
    this order: the encoded domain (``len(domainLists) x maxLen``) and the
    flow features (``len(domainLists) x numFeatures``), where ``numFeatures``
    is 3, or 33 when ``flagUseCiscoFeatures`` is set.  Positions beyond a
    window's length stay zero.

    Label per window, from the maximum of its hit column:
    ``>= threshold`` -> 1.0, ``== -1`` -> -1.0,
    ``0 < max < threshold`` -> -2.0, otherwise 0.0.

    :param domainLists: per window, the list of domain strings
    :param dfLists: per window, a DataFrame with 'user_hash' and a 'hits' or
        'virusTotalHits' column (decided from the first frame)
    :param charachterDict: mapping from character to numeric code
    :param maxLen: length of each encoded domain vector
    :param threshold: hit count from which a window is labelled 1.0
    :param flagUseCiscoFeatures: append cisco features to the flow features
    :param urlSIPDIct: mapping used by getCiscoFeatures
    :param windowSize: number of positions per window
    :return: tuple ``(outputFeatures, labels, hits, trainNames)``
    :raises KeyError: if the first frame has no hit column
    """
    # The historical default is the ``dict`` type itself; treat it as empty.
    if urlSIPDIct is dict:
        urlSIPDIct = {}
    # Nothing to inspect for empty input; the loop below will not run either.
    hitName = _findHitColumn(dfLists[0]) if len(dfLists) > 0 else None

    numFeatures = _NUM_FLOW_FEATURES
    if flagUseCiscoFeatures:
        numFeatures += _NUM_CISCO_FEATURES

    numWindows = len(domainLists)
    outputFeatures = []
    for _ in range(windowSize):
        outputFeatures.append(np.zeros([numWindows, maxLen]))
        outputFeatures.append(np.zeros([numWindows, numFeatures]))

    label = []
    hits = []
    trainNames = []
    for i in tqdm(np.arange(numWindows), miniters=10):
        curDomains = domainLists[i]
        curFrame = dfLists[i]
        for j in range(min(windowSize, len(curDomains))):
            curRow = curFrame.iloc[j]
            # Arrays are interleaved: 2*j is the domain, 2*j + 1 the flow data.
            outputFeatures[2 * j][i, :] = getFeatureVecForDomain(curDomains[j], charachterDict, maxLen)
            flowArray = outputFeatures[2 * j + 1]
            flowArray[i, 0:_NUM_FLOW_FEATURES] = getFlowFeatures(curRow)
            if flagUseCiscoFeatures:
                flowArray[i, _NUM_FLOW_FEATURES:] = getCiscoFeatures(curRow, urlSIPDIct)
        maxHits = np.max(curFrame[hitName])
        label.append(_labelFromHits(maxHits, threshold))
        hits.append(maxHits)
        trainNames.append(np.unique(curFrame['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))


def transformStringListToNumpyArray(listString):
    """Parse a string such as ``'[1, 2.5, 3]'`` into a float32 array.

    All square brackets are removed and the rest is split on commas.
    """
    withoutBrackets = listString.replace('[', '').replace(']', '')
    return np.array(withoutBrackets.split(','), dtype='float32')


def getCiscoFeatureDict(csvPathList):
    """Load cisco features from CSV files into one dictionary.

    Each file needs the columns 'Domain', 'ServerIP' and 'CiscoFeature'.  The
    key is ``Domain + ServerIP``; on duplicate keys the last row read wins.

    :param csvPathList: iterable of CSV file paths
    :return: dict mapping key string to float32 feature array
    """
    outDict = dict()
    for path in tqdm(csvPathList, miniters=1):
        # ``with`` closes the file even if a row fails to parse.
        with open(path, 'r') as fobj:
            for row in csv.DictReader(fobj, delimiter=','):
                urlSIPString = row['Domain'] + row['ServerIP']
                outDict[urlSIPString] = transformStringListToNumpyArray(row['CiscoFeature'])
    return outDict


if __name__ == "__main__":
    pass