Add feature to use both hit fields (virusTotalHits and trustedHits) from the dataset

This commit is contained in:
René Knaebel 2017-07-06 16:27:47 +02:00
parent b2f5c56019
commit 933f6bf1d7
3 changed files with 27 additions and 19 deletions

View File

@ -28,7 +28,7 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
maxMilliSeconds = maxLengthInSeconds * 1000 maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = [] outDomainLists = []
outDFFrames = [] outDFFrames = []
if overlapping == False: if not overlapping:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize))) numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame)) userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks): for blockID in np.arange(numBlocks):
@ -70,9 +70,9 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
def get_domain_features(domain, vocab, max_length=40):
    """Encode the trailing characters of ``domain`` as vocabulary indices.

    The domain is read from its last character backwards, so position 0 of
    the result corresponds to the final character of ``domain``, position 1
    to the one before it, and so on.

    Args:
        domain: Sequence of characters (typically a ``str``) to encode.
        vocab: Mapping from character to its numeric index.
        max_length: Length of the returned vector; at most this many
            characters (counted from the end of ``domain``) are encoded.

    Returns:
        A 1-D float numpy array of length ``max_length``. Positions beyond
        the end of ``domain`` and characters missing from ``vocab`` stay 0.
    """
    encoding = np.zeros((max_length,))
    # The previous implementation indexed with ``domain[-j]``; for j == 0
    # that is ``domain[0]`` (the FIRST character), which put the wrong
    # character in slot 0 and shifted every following slot by one.
    # Walking ``reversed(domain)`` yields the last character first, and
    # ``zip`` with ``range(max_length)`` stops at the shorter of the two.
    for j, char in zip(range(max_length), reversed(domain)):
        if char in vocab:
            encoding[j] = vocab[char]
    return encoding
@ -90,6 +90,7 @@ def get_flow_features(flow):
return features return features
# NOT USED ATM
def get_cisco_features(curDataLine, urlSIPDict): def get_cisco_features(curDataLine, urlSIPDict):
numCiscoFeatures = 30 numCiscoFeatures = 30
try: try:
@ -124,19 +125,19 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
window_size=window_size) window_size=window_size)
# make client labels discrete with 4 different values # make client labels discrete with 4 different values
# TODO: use trusted_hits_tr for client classification too hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
client_labels = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
# select only 1.0 and 0.0 from training data # select only 1.0 and 0.0 from training data
pos_idx = np.where(client_labels == 1.0)[0] pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0]
neg_idx = np.where(client_labels == 0.0)[0] neg_idx = np.where(hits_tr == 0.0)[0]
idx = np.concatenate((pos_idx, neg_idx)) idx = np.concatenate((pos_idx, neg_idx))
# choose selected sample to train on # choose selected sample to train on
domain_tr = domain_tr[idx] domain_tr = domain_tr[idx]
flow_tr = flow_tr[idx] flow_tr = flow_tr[idx]
client_labels = client_labels[idx] client_tr = np.zeros_like(idx, float)
server_labels = server_tr[idx] client_tr[:pos_idx.shape[-1]] = 1.0
server_tr = server_tr[idx]
return domain_tr, flow_tr, client_labels, server_labels return domain_tr, flow_tr, client_tr, server_tr
def create_dataset_from_lists(domains, features, vocab, max_len, def create_dataset_from_lists(domains, features, vocab, max_len,
@ -202,7 +203,8 @@ def discretize_label(values, threshold):
def get_user_flow_data(csv_file): def get_user_flow_data(csv_file):
df = pd.read_csv(csv_file) df = pd.read_csv(csv_file)
keys = ["duration", "bytes_down", "bytes_up", "domain", "timeStamp", "server_ip", "user_hash", "virusTotalHits", keys = ["duration", "bytes_down", "bytes_up", "domain",
"timeStamp", "server_ip", "user_hash", "virusTotalHits",
"serverLabel", "trustedHits"] "serverLabel", "trustedHits"]
df = df[keys] df = df[keys]
df.set_index(keys=['user_hash'], drop=False, inplace=True) df.set_index(keys=['user_hash'], drop=False, inplace=True)

11
main.py
View File

@ -87,6 +87,7 @@ def main():
kernel_size = 3 kernel_size = 3
drop_out = 0.5 drop_out = 0.5
filters = 128 filters = 128
network = models.pauls_networks
char_dict = dataset.get_character_dict() char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.train_data) user_flow_df = dataset.get_user_flow_data(args.train_data)
@ -96,11 +97,11 @@ def main():
user_flow_df, char_dict, user_flow_df, char_dict,
max_len=args.domain_length, window_size=args.window) max_len=args.domain_length, window_size=args.window)
shared_cnn = models.renes_networks.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length, shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5) args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
shared_cnn.summary() shared_cnn.summary()
model = models.renes_networks.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding, model = network.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
args.window, args.domain_length, filters, kernel_size, args.window, args.domain_length, filters, kernel_size,
cnnHiddenDims, shared_cnn) cnnHiddenDims, shared_cnn)
model.summary() model.summary()
@ -119,7 +120,11 @@ def main():
validation_split=0.2) validation_split=0.2)
def test(): def main_train():
pass
def main_test():
char_dict = dataset.get_character_dict() char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.test_data) user_flow_df = dataset.get_user_flow_data(args.test_data)
domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows( domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows(

View File

@ -25,11 +25,12 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows") ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1) merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# CNN processing a small slides of flow windows # CNN processing a small slides of flow windows
# TODO: add more layers?
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu', y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
input_shape=(window_size, domain_features + flow_features))(merged) input_shape=(window_size, domain_features + flow_features))(merged)
y = MaxPool1D(pool_size=3, strides=1)(y) y = MaxPool1D(pool_size=3, strides=1)(y)
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y) y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
y = MaxPool1D(pool_size=3, strides=1)(y)
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
# remove temporal dimension by global max pooling # remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y) y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y) y = Dropout(cnnDropout)(y)