add h5py example
parent 4a9f94a029
commit fdc03c9922
dataset.py | 91
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import string
 
+import h5py
 import numpy as np
 import pandas as pd
 from keras.utils import np_utils
@@ -91,39 +92,24 @@ def get_flow_features(flow):
     return features
 
 
-# NOT USED ATM
-def get_cisco_features(curDataLine, urlSIPDict):
-    numCiscoFeatures = 30
-    try:
-        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        # log transform
-        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
-        return ciscoFeatures.ravel()
-    except:
-        return np.zeros([numCiscoFeatures, ]).ravel()
-
-
 def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
     domains = []
     features = []
     print("get chunks from user data frames")
-    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
+    for i, user_flow in tqdm(list(enumerate(get_flow_per_user(user_flow_df)))):
         (domain_windows, feature_windows) = get_user_chunks(user_flow,
                                                             windowSize=window_size,
                                                             overlapping=False,
                                                             maxLengthInSeconds=-1)
         domains += domain_windows
         features += feature_windows
-        # TODO: remove later
-        if i >= 50:
-            break
 
     print("create training dataset")
-    domain_tr, flow_tr, hits_tr, names_tr, server_tr, trusted_hits_tr = create_dataset_from_lists(
-        domains=domains, features=features, vocab=char_dict,
-        max_len=max_len,
-        use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
-        window_size=window_size)
+    domain_tr, flow_tr, hits_tr, names_tr, server_tr, trusted_hits_tr = create_dataset_from_lists(domains=domains,
+                                                                                                  flows=features,
+                                                                                                  vocab=char_dict,
+                                                                                                  max_len=max_len,
+                                                                                                  window_size=window_size)
 
     # make client labels discrete with 4 different values
     hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
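Note on the tqdm change above: tqdm only shows a total and an ETA when the iterable reports a length, which the generator returned by get_flow_per_user does not, so the enumerate generator is materialized with list(...) first. A minimal sketch (the stub generator is hypothetical):

    from tqdm import tqdm

    def flows():                        # hypothetical stand-in for get_flow_per_user(...)
        yield from ["u1", "u2", "u3"]

    for i, user_flow in tqdm(list(enumerate(flows()))):
        pass                            # the bar can show "3/3" because the list has a known length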
@@ -144,32 +130,29 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
     return domain_tr, flow_tr, client_tr, server_tr
 
 
-def create_dataset_from_lists(domains, features, vocab, max_len,
-                              use_cisco_features=False, urlSIPDIct=dict(),
-                              window_size=10):
+def store_h5dataset(domain_tr, flow_tr, client_tr, server_tr):
+    f = h5py.File("data/full_dataset.h5", "w")
+    domain_tr = domain_tr.astype(np.int8)
+    f.create_dataset("domain", data=domain_tr)
+    f.create_dataset("flow", data=flow_tr)
+    server_tr = server_tr.astype(np.bool)
+    client_tr = client_tr.astype(np.bool)
+    f.create_dataset("client", data=client_tr)
+    f.create_dataset("server", data=server_tr)
+    f.close()
+
+
+def create_dataset_from_lists(domains, flows, vocab, max_len, window_size=10):
     """
     combines domain and feature windows to sequential training data
     :param domains: list of domain windows
-    :param features: list of feature windows
+    :param flows: list of flow feature windows
     :param vocab:
     :param max_len:
-    :param use_cisco_features: idk
-    :param urlSIPDIct: idk
     :param window_size: size of the flow window
     :return:
    """
-    # TODO: check for hits vs vth consistency
-    # if 'hits' in dfs[0].keys():
-    #   hits_col = 'hits'
-    # elif 'virusTotalHits' in dfs[0].keys():
-    #   hits_col = 'virusTotalHits'
-    hits_col = "virusTotalHits"
-
-    numFlowFeatures = 3
-    numCiscoFeatures = 30
-    numFeatures = numFlowFeatures
-    if use_cisco_features:
-        numFeatures += numCiscoFeatures
+    numFeatures = 3
     sample_size = len(domains)
     hits = []
     names = []
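For orientation, a minimal read-back sketch (not part of the commit) for the file written by store_h5dataset; the dataset keys follow the create_dataset calls above:

    import h5py

    with h5py.File("data/full_dataset.h5", "r") as f:
        domain_tr = f["domain"][()]    # read the whole dataset into a numpy array
        flow_tr = f["flow"][()]
        client_tr = f["client"][()]
        server_tr = f["server"][()]
    print(domain_tr.shape, flow_tr.shape)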
@@ -181,14 +164,13 @@ def create_dataset_from_lists(domains, features, vocab, max_len,
 
     for i in tqdm(np.arange(sample_size), miniters=10):
         for j in range(window_size):
-            domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
-            flow_features[i, j] = get_flow_features(features[i].iloc[j])
-            # TODO: cisco features?
+            domain_features[i, j, :] = get_domain_features(domains[i][j], vocab, max_len)
+            flow_features[i, j, :] = get_flow_features(flows[i].iloc[j])
 
-        hits.append(np.max(features[i][hits_col]))
-        names.append(np.unique(features[i]['user_hash']))
-        servers.append(np.max(features[i]['serverLabel']))
-        trusted_hits.append(np.max(features[i]['trustedHits']))
+        hits.append(np.max(flows[i]['virusTotalHits']))
+        names.append(np.unique(flows[i]['user_hash']))
+        servers.append(np.max(flows[i]['serverLabel']))
+        trusted_hits.append(np.max(flows[i]['trustedHits']))
     return (domain_features, flow_features,
             np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits))
 
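Aside on the indexing change: for a 3-D array, a[i, j] = v and a[i, j, :] = v assign the same slice; the explicit colon only spells out the last axis. A quick check:

    import numpy as np

    a = np.zeros((2, 3, 4))
    v = np.arange(4)
    a[0, 1] = v         # implicit: fills the whole last axis
    a[0, 2, :] = v      # explicit: same assignment
    assert np.array_equal(a[0, 1], a[0, 2, :])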
@@ -206,11 +188,20 @@ def discretize_label(values, threshold):
 
 
 def get_user_flow_data(csv_file):
+    types = {
+        "duration": int,
+        "bytes_down": int,
+        "bytes_up": int,
+        "domain": object,
+        "timeStamp": float,
+        "server_ip": object,
+        "user_hash": float,
+        "virusTotalHits": int,
+        "serverLabel": int,
+        "trustedHits": int
+    }
     df = pd.read_csv(csv_file)
-    keys = ["duration", "bytes_down", "bytes_up", "domain",
-            "timeStamp", "server_ip", "user_hash", "virusTotalHits",
-            "serverLabel", "trustedHits"]
-    df = df[keys]
+    df = df[list(types.keys())]
     df.set_index(keys=['user_hash'], drop=False, inplace=True)
     return df
 
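Since the types dict already maps every kept column to a dtype, it could also be handed to pandas while parsing. A sketch, assuming the integer columns contain no missing values (the file name is a placeholder):

    import pandas as pd

    types = {"duration": int, "bytes_down": int, "bytes_up": int, "domain": object,
             "timeStamp": float, "server_ip": object, "user_hash": float,
             "virusTotalHits": int, "serverLabel": int, "trustedHits": int}
    # select and type the columns in one step instead of slicing afterwards
    df = pd.read_csv("user_flows.csv", usecols=list(types.keys()), dtype=types)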
main.py | 39
@@ -1,5 +1,7 @@
 import argparse
 
+import h5py
+from keras.models import load_model
 from keras.utils import np_utils
 
 import dataset
@@ -8,7 +10,8 @@ import models
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument("--modes", action="store", dest="modes", nargs="+")
+parser.add_argument("--modes", action="store", dest="modes", nargs="+",
+                    default=[])
 
 parser.add_argument("--train", action="store", dest="train_data",
                     default="data/full_dataset.csv.tar.bz2")
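Behaviour of the changed --modes option, for reference: nargs="+" collects one or more space-separated values into a list, and default=[] keeps args.modes an (empty) list when the flag is omitted. The example values are hypothetical:

    args = parser.parse_args(["--modes", "train", "test"])   # args.modes == ["train", "test"]
    args = parser.parse_args([])                              # args.modes == []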
@@ -193,7 +196,39 @@ def main_train():
     model.save(args.clf_model)
 
 
-from keras.models import load_model
+def main_train_h5():
+    # parameter
+    dropout_main = 0.5
+    dense_main = 512
+    kernel_main = 3
+    filter_main = 128
+    network = models.pauls_networks if args.model_type == "paul" else models.renes_networks
+
+    char_dict = dataset.get_character_dict()
+    data = h5py.File("data/full_dataset.h5", "r")
+
+    embedding = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
+                                      args.hidden_char_dims, kernel_main, args.domain_embedding, 0.5)
+    embedding.summary()
+
+    model = network.get_model(dropout_main, data["flow"].shape[-1], args.domain_embedding,
+                              args.window, args.domain_length, filter_main, kernel_main,
+                              dense_main, embedding)
+    model.summary()
+
+    model.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit([data["domain"], data["flow"]],
+              [data["client"], data["server"]],
+              batch_size=args.batch_size,
+              epochs=args.epochs,
+              shuffle=True,
+              validation_split=0.2)
+
+    embedding.save(args.embedding_model)
+    model.save(args.clf_model)
 
 
 def main_test():
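If training directly on the open h5py datasets turns out to be a problem (they are lazy, disk-backed objects), one alternative is to materialize them as numpy arrays before calling fit. A sketch, assuming the arrays fit into memory; names follow main_train_h5 above:

    data = h5py.File("data/full_dataset.h5", "r")
    domain = data["domain"][()]        # copy each dataset from disk into memory
    flow = data["flow"][()]
    client = data["client"][()]
    server = data["server"][()]
    data.close()

    model.fit([domain, flow], [client, server],
              batch_size=args.batch_size, epochs=args.epochs,
              shuffle=True, validation_split=0.2)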
@@ -1,9 +1,18 @@
 #!/usr/bin/python2
 
 import joblib
+import numpy as np
 import pandas as pd
 
 df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
 df = pd.concat(df["data"])
 df.reset_index(inplace=True)
+df.dropna(axis=0, how="any", inplace=True)
+df[["duration", "bytes_down", "bytes_up"]] = df[["duration", "bytes_down", "bytes_up"]].astype(np.int)
+df[["domain", "server_ip"]] = df[["domain", "server_ip"]].astype(str)
+df[["server_label"]] = df[["server_label"]].astype(np.bool)
+df.serverLabel = df.serverLabel.astype(np.bool)
+df.virusTotalHits = df.virusTotalHits.astype(np.int)
+df.trustedHits = df.trustedHits.astype(np.int)
+
 df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
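Read-back sketch for the exported file (not part of the commit): pandas infers the gzip compression from the .gz suffix, and index_col=0 restores the index that to_csv wrote out.

    import pandas as pd

    df = pd.read_csv("/tmp/rk/full_future_dataset.csv.gz", index_col=0)
    print(df.dtypes)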