ma_cisco_malware/main.py

369 lines
14 KiB
Python

import json
import logging
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
from keras.models import load_model
from keras.utils import np_utils
from sklearn.utils import class_weight
import arguments
import dataset
import hyperband
import models
# create logger
import visualize
from dataset import load_or_generate_h5data
from utils import exists_or_make_path
logger = logging.getLogger('logger')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
ch = logging.FileHandler("info.log")
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
args = arguments.parse()
if args.gpu:
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
def main_paul_best():
char_dict = dataset.get_character_dict()
pauls_best_params = models.pauls_networks.best_config
pauls_best_params["vocab_size"] = len(char_dict) + 1
main_train(pauls_best_params)
def main_hyperband():
char_dict = dataset.get_character_dict()
params = {
# static params
"type": ["paul"],
"batch_size": [args.batch_size],
"vocab_size": [len(char_dict) + 1],
"window_size": [10],
"domain_length": [40],
"flow_features": [3],
"input_length": [40],
# model params
"embedding_size": [8, 16, 32, 64, 128, 256],
"filter_embedding": [8, 16, 32, 64, 128, 256],
"kernel_embedding": [1, 3, 5, 7, 9],
"hidden_embedding": [8, 16, 32, 64, 128, 256],
"dropout": [0.5],
"domain_features": [8, 16, 32, 64, 128, 256],
"filter_main": [8, 16, 32, 64, 128, 256],
"kernels_main": [1, 3, 5, 7, 9],
"dense_main": [8, 16, 32, 64, 128, 256],
}
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
args.domain_length, args.window)
hp = hyperband.Hyperband(params,
[domain_tr, flow_tr],
[client_tr, server_tr])
results = hp.run()
json.dump(results, open("hyperband.json"))
def get_custom_class_weights(client_tr, server_tr):
client = client_tr.value.argmax(1) if type(client_tr) != np.ndarray else client_tr.argmax(1)
server = server_tr.value.argmax(1) if type(server_tr) != np.ndarray else server_tr.argmax(1)
client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
return {
"client": client_class_weight,
"server": server_class_weight
}
def main_train(param=None):
exists_or_make_path(args.model_path)
char_dict = dataset.get_character_dict()
domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
args.domain_length, args.window)
# parameter
p = {
"type": args.model_type,
"batch_size": 64,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
"vocab_size": len(char_dict) + 1,
#
'dropout': 0.5,
'domain_features': args.domain_embedding,
'embedding_size': args.embedding,
'filter_main': 64,
'flow_features': 3,
# 'dense_main': 512,
'dense_main': 64,
'filter_embedding': args.hidden_char_dims,
'hidden_embedding': args.domain_embedding,
'kernel_embedding': 3,
'kernels_main': 3,
'input_length': 40
}
if not param:
param = p
embedding, model, _ = models.get_models_by_params(param)
embedding.summary()
model.summary()
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
monitor='val_loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
logger.info("compile model")
custom_metrics = models.get_metric_functions()
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'] + custom_metrics)
server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
if args.class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client_tr, server_tr)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
logger.info("start training")
model.fit([domain_tr, flow_tr],
[client_tr, server_tr],
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
shuffle=True,
validation_split=0.2,
class_weight=custom_class_weights)
logger.info("save embedding")
embedding.save(args.embedding_model)
def main_test():
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
args.domain_length, args.window)
clf = load_model(args.clf_model, custom_objects=models.get_metrics())
# stats = clf.evaluate([domain_val, flow_val],
# [client_val, server_val],
# batch_size=args.batch_size)
y_pred = clf.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
np.save(args.future_prediction, y_pred)
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.test_data)
domains = user_flow_df.domain.unique()[:-1]
def get_domain_features_reduced(d):
return dataset.get_domain_features(d[0], char_dict, args.domain_length)
domain_features = []
for ds in domains:
domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
model = load_model(args.embedding_model)
domain_features = np.stack(domain_features).reshape((-1, 40))
pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1)
np.save("/tmp/rk/domains.npy", domains)
np.save("/tmp/rk/domain_features.npy", domain_features)
np.save("/tmp/rk/domain_embd.npy", pred)
def main_new_model():
exists_or_make_path(args.model_path)
char_dict = dataset.get_character_dict()
domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
args.domain_length, args.window)
# parameter
p = {
"type": args.model_type,
"batch_size": 64,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
"vocab_size": len(char_dict) + 1,
#
'dropout': 0.5,
'domain_features': args.domain_embedding,
'embedding_size': args.embedding,
'filter_main': 64,
'flow_features': 3,
# 'dense_main': 512,
'dense_main': 64,
'filter_embedding': args.hidden_char_dims,
'hidden_embedding': args.domain_embedding,
'kernel_embedding': 3,
'kernels_main': 3,
'input_length': 40
}
embedding, _, model = models.get_models_by_params(p)
embedding.summary()
model.summary()
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
monitor='val_loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
logger.info("compile model")
custom_metrics = models.get_metric_functions()
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'] + custom_metrics)
server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
if args.class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client_tr, server_tr)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
logger.info("start training")
server_tr = np.stack(np_utils.to_categorical(s, 2) for s in server_windows_tr)
model.fit([domain_tr, flow_tr],
[client_tr, server_tr],
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
shuffle=True,
validation_split=0.2,
class_weight=custom_class_weights)
logger.info("save embedding")
embedding.save(args.embedding_model)
def main_embedding():
model = load_model(args.embedding_model)
domain_encs, labels = dataset.load_or_generate_domains(args.train_data, args.domain_length)
domain_embedding = model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
visualize.plot_embedding(domain_embedding, labels, path="results/pp3/embd.png")
def main_visualization():
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
args.domain_length, args.window)
logger.info("plot model")
model = load_model(args.clf_model, custom_objects=models.get_metrics())
visualize.plot_model(model, os.path.join(args.model_path, "model.png"))
try:
logger.info("plot training curve")
logs = pd.read_csv(args.train_log)
visualize.plot_training_curve(logs, "client", "{}/client_train.png".format(args.model_path))
visualize.plot_training_curve(logs, "server", "{}/server_train.png".format(args.model_path))
except Exception as e:
logger.warning(f"could not generate training curves: {e}")
client_pred, server_pred = np.load(args.future_prediction)
logger.info("plot pr curve")
visualize.plot_precision_recall(client_val.value, client_pred, "{}/client_prc.png".format(args.model_path))
visualize.plot_precision_recall(server_val.value, server_pred, "{}/server_prc.png".format(args.model_path))
visualize.plot_precision_recall_curves(client_val.value, client_pred, "{}/client_prc2.png".format(args.model_path))
visualize.plot_precision_recall_curves(server_val.value, server_pred, "{}/server_prc2.png".format(args.model_path))
logger.info("plot roc curve")
visualize.plot_roc_curve(client_val.value, client_pred, "{}/client_roc.png".format(args.model_path))
visualize.plot_roc_curve(server_val.value, server_pred, "{}/server_roc.png".format(args.model_path))
visualize.plot_confusion_matrix(client_val.value.argmax(1), client_pred.argmax(1),
"{}/client_cov.png".format(args.model_path),
normalize=False, title="Client Confusion Matrix")
visualize.plot_confusion_matrix(server_val.value.argmax(1), server_pred.argmax(1),
"{}/server_cov.png".format(args.model_path),
normalize=False, title="Server Confusion Matrix")
def main_score():
# mask = dataset.load_mask_eval(args.data, args.test_image)
# pred = np.load(args.pred)
# visualize.score_model(mask, pred)
pass
def main_data():
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.train_data)
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr, _ = dataset.create_dataset_from_flows(user_flow_df, char_dict,
max_len=args.domain_length,
window_size=args.window)
print(f"domain shape {domain_tr.shape}")
print(f"flow shape {flow_tr.shape}")
print(f"client shape {client_tr.shape}")
print(f"server shape {server_tr.shape}")
def main():
if "train" in args.modes:
main_train()
if "hyperband" in args.modes:
main_hyperband()
if "test" in args.modes:
main_test()
if "fancy" in args.modes:
main_visualization()
if "score" in args.modes:
main_score()
if "paul" in args.modes:
main_paul_best()
if "data" in args.modes:
main_data()
if "train_new" in args.modes:
main_new_model()
if __name__ == "__main__":
main()