# ma_cisco_malware/stackedNeuralModels.py (216 lines, 8.2 KiB, Python)
# -*- coding: utf-8 -*-
import csv
import numpy as np
from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from tqdm import tqdm
def getCiscoFeatures(curDataLine, urlSIPDict, numCiscoFeatures=30):
    """Return the log-transformed Cisco feature vector for one flow record.

    The features are looked up in ``urlSIPDict`` under the key built by
    concatenating ``str(curDataLine['domain'])`` and
    ``str(curDataLine['server_ip'])``.

    Args:
        curDataLine: Mapping-like record providing ``'domain'`` and
            ``'server_ip'`` entries.
        urlSIPDict: Mapping from the concatenated domain/server-IP string to
            a numeric feature vector.
        numCiscoFeatures: Length of the all-zero fallback vector that is
            returned when no usable features are found (default 30).

    Returns:
        A flattened array: ``log1p`` of the stored features computed as
        float32, or ``numCiscoFeatures`` zeros when the lookup or the
        transform fails.
    """
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # log transform
        return np.log1p(ciscoFeatures, dtype='float32').ravel()
    except (KeyError, TypeError, ValueError):
        # Best-effort lookup: a missing key or a non-numeric entry yields the
        # zero vector. Only these error types are swallowed so that
        # KeyboardInterrupt / SystemExit are no longer hidden.
        return np.zeros([numCiscoFeatures, ]).ravel()
def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
                           hidden_dims, drop_out):
    """Build a Sequential CNN feature extractor that has no output layer.

    The stack is: Embedding -> Conv1D (ReLU) -> global max pooling ->
    Dense(hidden_dims) -> Dropout(drop_out) -> ReLU activation.

    Args:
        vocabSize: ``input_dim`` of the embedding layer.
        embeddingSize: ``output_dim`` of the embedding layer.
        input_length: ``input_length`` passed to the embedding layer.
        filters: Number of convolution filters.
        kernel_size: Width of the 1-D convolution kernel.
        hidden_dims: Number of units of the dense hidden layer.
        drop_out: Rate handed to the Dropout layer.

    Returns:
        The assembled ``Sequential`` model (not compiled here).
    """
    layerStack = [
        Embedding(input_dim=vocabSize, output_dim=embeddingSize,
                  input_length=input_length),
        Conv1D(filters, kernel_size, activation='relu'),
        # collapse the sequence axis by taking the maximum per filter
        GlobalMaxPooling1D(),
        # plain hidden layer; its ReLU is applied after the dropout
        Dense(hidden_dims),
        Dropout(drop_out),
        Activation('relu'),
    ]
    cnn = Sequential()
    for layer in layerStack:
        cnn.add(layer)
    return cnn
def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
                                    hidden_dims, drop_out):
    """Build the CNN feature extractor with the Keras functional API.

    The graph is: Input(shape=(input_length,)) -> Embedding -> Conv1D (ReLU)
    -> global max pooling -> Dense(hidden_dims) -> Dropout(drop_out) -> ReLU.
    No output/classification layer is attached.

    Args:
        vocabSize: ``input_dim`` of the embedding layer.
        embeddingSize: ``output_dim`` of the embedding layer.
        input_length: Length of the input vector.
        filters: Number of convolution filters.
        kernel_size: Width of the 1-D convolution kernel.
        hidden_dims: Number of units of the dense hidden layer.
        drop_out: Rate handed to the Dropout layer.

    Returns:
        A ``Model`` mapping the input tensor to the final ReLU activation.
    """
    charInput = Input(shape=(input_length,))
    hidden = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(charInput)
    hidden = Conv1D(filters, kernel_size, activation='relu')(hidden)
    hidden = GlobalMaxPooling1D()(hidden)
    hidden = Dense(hidden_dims)(hidden)
    hidden = Dropout(drop_out)(hidden)
    hidden = Activation('relu')(hidden)
    return Model(charInput, hidden)
def getFlowFeatureLayer(numFeatures):
    """Build a Sequential model that passes ``numFeatures`` inputs through.

    The only layer is a ``Lambda`` that adds ``0.0`` to its input, so the
    values are forwarded unchanged.

    Args:
        numFeatures: Size of the input vector (``input_shape=(numFeatures,)``).

    Returns:
        The single-layer ``Sequential`` model.
    """
    passThrough = Lambda(lambda x: x + 0.0, input_shape=(numFeatures,))
    flowModel = Sequential()
    flowModel.add(passThrough)
    return flowModel
def createCNNDataSet(domains, label, characterDict, maxLen=40):
    """Encode domains as fixed-length character-id rows, last character first.

    Each domain is read from its last character towards its first; at most
    ``maxLen`` characters are encoded. Characters that are not in
    ``characterDict`` and positions beyond the domain length stay ``0``.

    NOTE: the previous implementation indexed ``domain[-j]``, which reads
    ``domain[0]`` for ``j == 0`` and therefore did not produce the reversed
    order it announced. Models trained on that encoding need to be retrained.

    Args:
        domains: Sequence of domain strings.
        label: Sequence of labels, indexed in parallel with ``domains``.
        characterDict: Mapping from character to numeric id.
        maxLen: Number of columns of the encoded matrix.

    Returns:
        Tuple ``(outFeature, outLabel)``: an array of shape
        ``(len(domains), maxLen)`` and an array of shape ``(len(domains),)``.
    """
    outFeature = np.zeros([len(domains), maxLen])
    outLabel = np.zeros([len(domains), ])
    for i, domain in enumerate(domains):
        # zip() stops at whichever is shorter: maxLen or the domain itself.
        for j, curCharacter in zip(range(maxLen), reversed(domain)):
            if curCharacter in characterDict:
                outFeature[i, j] = characterDict[curCharacter]
        outLabel[i] = label[i]
    return (outFeature, outLabel)
def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    """Encode one domain as a fixed-length character-id vector, last character first.

    Position ``j`` of the result holds the id of the ``j``-th character
    counted from the end of ``domain``. At most ``maxLen`` characters are
    encoded; characters missing from ``characterDict`` and unused positions
    stay ``0``.

    NOTE: the previous implementation indexed ``domain[-j]``, which reads
    ``domain[0]`` for ``j == 0`` instead of the last character. Models
    trained on that encoding need to be retrained.

    Args:
        domain: Domain string to encode.
        characterDict: Mapping from character to numeric id.
        maxLen: Length of the returned vector.

    Returns:
        Array of shape ``(maxLen,)``.
    """
    curFeature = np.zeros([maxLen, ])
    # zip() stops at whichever is shorter: maxLen or the domain itself.
    for j, curCharacter in zip(range(maxLen), reversed(domain)):
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature
def getFlowFeatures(curDataLine):
    """Return the log-transformed flow features of one record.

    The features are ``log1p`` (computed as float32) of the ``'duration'``,
    ``'bytes_down'`` and ``'bytes_up'`` entries, in that order.

    Args:
        curDataLine: Mapping-like record holding the three entries.

    Returns:
        Array of shape ``(3,)``. An entry that is missing or cannot be
        transformed is left at ``0.0``.
    """
    useKeys = ('duration', 'bytes_down', 'bytes_up')
    curFeature = np.zeros([len(useKeys), ])
    for i, curKey in enumerate(useKeys):
        try:
            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
        except (KeyError, TypeError, ValueError):
            # Best-effort: keep the 0.0 default for a missing or non-numeric
            # field. Only these error types are swallowed so that
            # KeyboardInterrupt / SystemExit are no longer hidden.
            continue
    return curFeature
def _restrictWindowToTimeout(dataFrame, curIDs, maxMilliSeconds):
    """Return the rows at positions ``curIDs`` whose ``timeStamp`` is at most
    ``maxMilliSeconds`` above the smallest ``timeStamp`` of that window."""
    useData = dataFrame.iloc[curIDs]
    timeStamps = useData['timeStamp']
    withinTimeOut = np.array(timeStamps) <= np.min(timeStamps) + maxMilliSeconds
    if not withinTimeOut.all():
        useData = dataFrame.iloc[curIDs[withinTimeOut]]
    return useData


def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
                               maxLengthInSeconds=300):
    """Split a data frame into windows of consecutive rows.

    Without overlap the windows start every ``windowSize`` rows (the last one
    may be shorter). With overlap a window starts at every row that still has
    ``windowSize`` rows available, so a frame shorter than ``windowSize``
    yields no window at all.

    Unless ``maxLengthInSeconds`` is ``-1``, rows whose ``'timeStamp'`` lies
    more than ``maxLengthInSeconds * 1000`` above the smallest ``'timeStamp'``
    of their window are dropped from that window (this presumes the column is
    in milliseconds -- confirm against the data source).

    The earlier version decided whether to trim by comparing
    ``len(np.where(...))`` with the window length; ``np.where`` returns a
    1-tuple, so that comparison never reflected the number of matching rows.
    The check is now done on the boolean mask itself.

    Args:
        dataFrame: pandas data frame with ``'domain'`` and (when trimming is
            enabled) ``'timeStamp'`` columns.
        windowSize: Number of rows per window.
        overlapping: Use a stride of one row instead of ``windowSize``.
        maxLengthInSeconds: Maximum time span of a window, or ``-1`` to
            disable trimming.

    Returns:
        Tuple ``(outDomainLists, outDFFrames)``: per window the list of
        domains and the corresponding data-frame slice.
    """
    maxMilliSeconds = maxLengthInSeconds * 1000
    numRows = len(dataFrame)
    userIDs = np.arange(numRows)
    if overlapping:
        numBlocks = numRows + 1 - windowSize
        stride = 1
    else:
        numBlocks = int(np.ceil(float(numRows) / float(windowSize)))
        stride = windowSize

    outDomainLists = []
    outDFFrames = []
    # range() of a non-positive count is empty, so short frames yield nothing.
    for blockID in range(numBlocks):
        start = blockID * stride
        curIDs = userIDs[start:start + windowSize]
        if maxLengthInSeconds != -1:
            useData = _restrictWindowToTimeout(dataFrame, curIDs, maxMilliSeconds)
        else:
            useData = dataFrame.iloc[curIDs]
        outDomainLists.append(list(useData['domain']))
        outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)
def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
                    flagUseCiscoFeatures=False, urlSIPDIct=None,
                    windowSize=10):
    """Assemble per-window model inputs, labels, hit counts and user names.

    For every window position ``j < windowSize`` two arrays are allocated:
    ``outputFeatures[2 * j]`` with the encoded domain (``maxLen`` columns) and
    ``outputFeatures[2 * j + 1]`` with the flow features (3 columns, plus 30
    Cisco feature columns when ``flagUseCiscoFeatures`` is set). Row ``i`` of
    each array belongs to chunk ``i``; positions a chunk does not fill stay
    zero.

    The label of a chunk is derived from the maximum of its hit column
    (``'hits'``, otherwise ``'virusTotalHits'``, looked up on the first data
    frame):
        * ``1.0``  when the maximum is at least ``threshold``,
        * ``-1.0`` when the maximum equals ``-1``,
        * ``-2.0`` when the maximum is above 0 but below ``threshold``,
        * ``0.0``  otherwise.

    Args:
        domainLists: Per chunk, the list of domain strings.
        dfLists: Per chunk, the data frame holding the matching rows.
        charachterDict: Mapping from character to numeric id.
        maxLen: Length of an encoded domain vector.
        threshold: Hit count from which a chunk is labelled ``1.0``.
        flagUseCiscoFeatures: Append the Cisco features to the flow features.
        urlSIPDIct: Mapping used for the Cisco feature lookup. ``None`` (and
            the former default, the ``dict`` type itself) means "no entries".
        windowSize: Maximum number of rows used per chunk.

    Returns:
        Tuple ``(outputFeatures, label, hits, trainNames)`` where
        ``outputFeatures`` is the list of ``2 * windowSize`` arrays described
        above and the remaining items are arrays with one entry per chunk
        (``trainNames`` holds the unique ``'user_hash'`` values of the chunk).

    Raises:
        KeyError: If chunks are given but the first data frame has neither a
            ``'hits'`` nor a ``'virusTotalHits'`` column.
    """
    # The original default was the ``dict`` type rather than an instance;
    # treat both spellings as an empty lookup table.
    if urlSIPDIct is None or urlSIPDIct is dict:
        urlSIPDIct = {}

    hitName = None
    if len(dfLists) > 0:
        columns = dfLists[0].keys()
        if 'hits' in columns:
            hitName = 'hits'
        elif 'virusTotalHits' in columns:
            hitName = 'virusTotalHits'
    if hitName is None and len(domainLists) > 0:
        raise KeyError("data frames need a 'hits' or 'virusTotalHits' column")

    numFlowFeatures = 3
    numCiscoFeatures = 30
    numFeatures = numFlowFeatures
    if flagUseCiscoFeatures:
        numFeatures += numCiscoFeatures

    outputFeatures = []
    for _ in range(windowSize):
        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))

    label = []
    hits = []
    trainNames = []
    for i in tqdm(np.arange(len(domainLists)), miniters=10):
        curDomains = domainLists[i]
        curFrame = dfLists[i]
        for j in range(min(windowSize, len(curDomains))):
            curRow = curFrame.iloc[j]
            # Slots alternate: even index = domain encoding, odd = flow features.
            outputFeatures[2 * j][i, :] = getFeatureVecForDomain(curDomains[j], charachterDict, maxLen)
            flowSlot = outputFeatures[2 * j + 1]
            flowSlot[i, 0:numFlowFeatures] = getFlowFeatures(curRow)
            if flagUseCiscoFeatures:
                flowSlot[i, numFlowFeatures:] = getCiscoFeatures(curRow, urlSIPDIct)

        maxHits = np.max(curFrame[hitName])
        if maxHits >= threshold:
            curLabel = 1.0
        elif maxHits == -1:
            curLabel = -1.0
        elif 0 < maxHits < threshold:
            curLabel = -2.0
        else:
            curLabel = 0.0
        label.append(curLabel)
        hits.append(maxHits)
        trainNames.append(np.unique(curFrame['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
    """Parse a string such as ``'[1.0,2.5,3]'`` into a float32 array.

    All ``[`` and ``]`` characters are removed and the remainder is split at
    commas.

    Args:
        listString: Comma-separated numbers, optionally wrapped in brackets.

    Returns:
        1-D float32 array. A string without any content (``'[]'`` or ``''``)
        gives an empty array instead of failing on the empty token.

    Raises:
        ValueError: If a token cannot be converted to a float.
    """
    stripped = listString.replace('[', '').replace(']', '').strip()
    if not stripped:
        return np.array([], dtype='float32')
    return np.array(stripped.split(','), dtype='float32')
def getCiscoFeatureDict(csvPathList):
    """Load the Cisco feature vectors from a list of CSV files.

    Every file needs the columns ``'Domain'``, ``'ServerIP'`` and
    ``'CiscoFeature'``. The dictionary key is the domain concatenated with
    the server IP; the value is the parsed feature array. When a key occurs
    more than once, the row read last wins.

    Args:
        csvPathList: Iterable of paths to comma-separated files.

    Returns:
        Dict mapping ``Domain + ServerIP`` to the parsed feature array.
    """
    outDict = dict()
    for path in tqdm(csvPathList, miniters=1):
        # ``with`` closes each file (the handle used to be leaked);
        # ``newline=''`` is what the csv module asks for when opening files.
        with open(path, 'r', newline='') as fobj:
            for row in csv.DictReader(fobj, delimiter=','):
                urlSIPString = row['Domain'] + row['ServerIP']
                outDict[urlSIPString] = transformStringListToNumpyArray(row['CiscoFeature'])
    return outDict
# Script entry point: currently a no-op, running this file directly does nothing.
if __name__ == "__main__":
    pass