# -*- coding: utf-8 -*-
import string

import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm

# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)


def get_character_dict():
    # map every lowercase letter and punctuation character to an integer index
    return dict((char, idx) for (idx, char) in
                enumerate(string.ascii_lowercase + string.punctuation))


def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
            hidden_dims, drop_out):
    # character-level CNN shared across all domains of a window
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(x, y)


def get_user_chunks(dataFrame, windowSize=10, overlapping=False, maxLengthInSeconds=300):
    # split one user's flows into windows of windowSize rows, optionally overlapping,
    # and optionally cut each window off after maxLengthInSeconds
    maxMilliSeconds = maxLengthInSeconds * 1000
    outDomainLists = []
    outDFFrames = []
    if not overlapping:
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
        blockStarts = np.arange(numBlocks) * windowSize
    else:
        numBlocks = len(dataFrame) + 1 - windowSize
        blockStarts = np.arange(numBlocks)
    userIDs = np.arange(len(dataFrame))
    for blockStart in blockStarts:
        curIDs = userIDs[blockStart:blockStart + windowSize]
        useData = dataFrame.iloc[curIDs]
        curDomains = useData['domain']
        if maxLengthInSeconds != -1:
            curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
            underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
            if len(underTimeOutIDs) != len(curIDs):
                curIDs = curIDs[underTimeOutIDs]
                useData = dataFrame.iloc[curIDs]
                curDomains = useData['domain']
        outDomainLists.append(list(curDomains))
        outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)


def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    # encode the domain right-to-left as character indices, zero-padded to maxLen
    curFeature = np.zeros([maxLen, ])
    for j in range(min(len(domain), maxLen)):
        curCharacter = domain[-(j + 1)]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature


def getFlowFeatures(curDataLine):
    # log-transformed numerical features of a single flow
    useKeys = ['duration', 'bytes_down', 'bytes_up']
    curFeature = np.zeros([len(useKeys), ])
    for i, curKey in enumerate(useKeys):
        try:
            curFeature[i] = np.log1p(curDataLine[curKey]).astype(float)
        except Exception:
            pass
    return curFeature


def getCiscoFeatures(curDataLine, urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
        return ciscoFeatures.ravel()
    except Exception:
        # fall back to zeros if the (domain, server_ip) pair is unknown
        return np.zeros([numCiscoFeatures, ]).ravel()
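
# Example (sketch, using a hypothetical domain string): how a single domain
# would be encoded with the character dictionary defined above.
#
#   char_dict = get_character_dict()
#   vec = getFeatureVecForDomain("example.com", char_dict, maxLen=40)
#   # vec has shape (40,); positions beyond the domain length stay 0
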
def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3,
                              windowSize=10, maxLengthInSeconds=-1):
    domainLists = []
    dfLists = []
    print("get chunks from user data frames")
    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
        (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
                                                       overlapping=False,
                                                       maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
        if i >= 10:
            # only use the first few users for now
            break
    print("create training dataset")
    return create_dataset_from_lists(
        domainLists=domainLists, dfLists=dfLists,
        characterDict=char_dict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=False,
        urlSIPDict=dict(),
        windowSize=windowSize)


def create_dataset_from_lists(domainLists, dfLists, characterDict, maxLen, threshold=3,
                              flagUseCiscoFeatures=False, urlSIPDict=dict(),
                              windowSize=10):
    # column holding the VirusTotal hit count
    if 'hits' in dfLists[0].keys():
        hitName = 'hits'
    elif 'virusTotalHits' in dfLists[0].keys():
        hitName = 'virusTotalHits'
    else:
        raise ValueError("no hit count column found in the data frames")
    numFlowFeatures = 3
    numCiscoFeatures = 30
    numFeatures = numFlowFeatures
    if flagUseCiscoFeatures:
        numFeatures += numCiscoFeatures
    outputFeatures = []
    label = []
    hits = []
    trainNames = []
    # one domain matrix and one flow-feature matrix per position in the window
    for i in range(windowSize):
        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
    for i in tqdm(np.arange(len(domainLists)), miniters=10):
        curCounter = 0
        for j in range(np.min([windowSize, len(domainLists[i])])):
            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], characterDict, maxLen)
            curCounter += 1
            if flagUseCiscoFeatures:
                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDict)
            else:
                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
            curCounter += 1
        # label: 1.0 = hits above threshold, 0.0 = no hits,
        # -1.0 = unknown (-1), -2.0 = some hits but below threshold
        curLabel = 0.0
        if np.max(dfLists[i][hitName]) >= threshold:
            curLabel = 1.0
        elif np.max(dfLists[i][hitName]) == -1:
            curLabel = -1.0
        elif 0 < np.max(dfLists[i][hitName]) < threshold:
            curLabel = -2.0
        label.append(curLabel)
        hits.append(np.max(dfLists[i][hitName]))
        trainNames.append(np.unique(dfLists[i]['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))


def get_user_flow_data():
    # load the per-flow training data
    # created with createTrainDataMultipleTaskLearning.py
    # rk: changed to csv file
    trainDFs = pd.read_csv("data/rk_data.csv.gz")
    trainDFs.drop("Unnamed: 0", axis=1, inplace=True)
    trainDFs.set_index(keys=['user_hash'], drop=False, inplace=True)
    return trainDFs


def get_flow_per_user(df):
    # yield one data frame per user
    users = df['user_hash'].unique().tolist()
    for user in users:
        yield df.loc[df.user_hash == user]


if __name__ == "__main__":
    # parameters
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100
    maxLengthInSeconds = -1
    timesNeg = -1

    char_dict = get_character_dict()
    user_flow_df = get_user_flow_data()

    print("create training dataset")
    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
        user_flow_df, char_dict,
        maxLen=maxLen, threshold=threshold, windowSize=windowSize,
        maxLengthInSeconds=maxLengthInSeconds)
    # keep only windows with a clear label: malicious (1.0) or benign (0.0)
    pos_idx = np.where(y_tr == 1.0)[0]
    neg_idx = np.where(y_tr == 0.0)[0]
    use_idx = np.concatenate((pos_idx, neg_idx))

    y_tr = y_tr[use_idx]
    # hits_tr = hits_tr[use_idx]
    # names_tr = names_tr[use_idx]
    for i in range(len(X_tr)):
        X_tr[i] = X_tr[i][use_idx]
    # TODO: this filtering step is unclear and should be reviewed

    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
                           domainFeatures, kernel_size, domainFeatures, 0.5)

    inputList = []
    encodedList = []
    numFeatures = flowFeatures
    for i in range(windowSize):
        # one domain input (encoded by the shared domain model) and one
        # flow-feature input per position in the window
        inputList.append(Input(shape=(maxLen,)))
        encodedList.append(sharedCNNFun(inputList[-1]))
        inputList.append(Input(shape=(numFeatures,)))

    merge_layer_input = []
    for i in range(windowSize):
        merge_layer_input.append(encodedList[i])
        merge_layer_input.append(inputList[(2 * i) + 1])

    # concatenate domain encodings and flow features of all window positions
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)

    # second CNN over the sequence of window positions
    cnn = Conv1D(filters, kernel_size, activation='relu',
                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
    # we use max pooling:
    maxPool = GlobalMaxPooling1D()(cnn)
    cnnDropout = Dropout(cnnDropout)(maxPool)
    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
    cnnOutput = Dense(2, activation='softmax')(cnnDense)

    # define a trainable model linking the window inputs to the predictions
    model = Model(inputs=inputList, outputs=cnnOutput)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    epochNumber = 0
    trainLabel = np_utils.to_categorical(y_tr, 2)
    model.fit(x=X_tr, y=trainLabel, batch_size=128,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)
    # validation_data=(testData, testLabel)
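    # Possible next step (sketch, not part of the original script): obtaining
    # predictions from the trained model. The argmax over the two softmax
    # outputs recovers the binary label; X_tr merely stands in for a held-out
    # set here, which this script does not build.
    #
    #   probs = model.predict(X_tr, batch_size=128)
    #   preds = probs.argmax(axis=-1)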