Add feature to use both hits columns from the dataset

parent b2f5c56019
commit 933f6bf1d7

Changed files: dataset.py (26 lines), main.py (17 lines)
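The core change: create_dataset_from_flows() now selects positive training samples from both label columns, virusTotalHits (hits_tr) and trustedHits (trusted_hits_tr), instead of virusTotalHits alone, and rebuilds the client labels positionally. Alongside it, main.py gains a network module switch and split train/test entry points, and the shared CNN model gets one more pooling/convolution pair.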
dataset.py

@@ -28,7 +28,7 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
     maxMilliSeconds = maxLengthInSeconds * 1000
     outDomainLists = []
     outDFFrames = []
-    if overlapping == False:
+    if not overlapping:
         numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
         userIDs = np.arange(len(dataFrame))
         for blockID in np.arange(numBlocks):
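The unchanged context around this hunk shows the chunking rule: a user's flows are split into ceil(len(dataFrame) / windowSize) blocks. A minimal sketch of that arithmetic, with invented numbers:

import numpy as np

# 25 flows with windowSize=10 -> 3 blocks (two full windows plus a remainder).
windowSize = 10
numFlows = 25
numBlocks = int(np.ceil(float(numFlows) / float(windowSize)))
assert numBlocks == 3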
@@ -70,9 +70,9 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
 def get_domain_features(domain, vocab, max_length=40):
     encoding = np.zeros((max_length,))
     for j in range(np.min([len(domain), max_length])):
-        curCharacter = domain[-j]
-        if curCharacter in vocab:
-            encoding[j] = vocab[curCharacter]
+        char = domain[-j]
+        if char in vocab:
+            encoding[j] = vocab[char]
     return encoding


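The rename from curCharacter to char does not change behavior, so the encoder can be exercised as-is. A self-contained sketch, assuming vocab maps characters to non-zero integer ids (an assumption; the real mapping comes from get_character_dict()):

import numpy as np

vocab = {c: i for i, c in enumerate("abcdefghijklmnopqrstuvwxyz.-", start=1)}

def get_domain_features(domain, vocab, max_length=40):
    encoding = np.zeros((max_length,))
    for j in range(np.min([len(domain), max_length])):
        # domain[-0] == domain[0], so j=0 reads the first character;
        # every j >= 1 then counts backwards from the end of the domain.
        char = domain[-j]
        if char in vocab:
            encoding[j] = vocab[char]
    return encoding

print(get_domain_features("example.com", vocab)[:12])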
@@ -90,6 +90,7 @@ def get_flow_features(flow):
     return features


+# NOT USED ATM
 def get_cisco_features(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
@@ -124,19 +125,19 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
         window_size=window_size)

     # make client labels discrete with 4 different values
-    # TODO: use trusted_hits_tr for client classification too
-    client_labels = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
+    hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
     # select only 1.0 and 0.0 from training data
-    pos_idx = np.where(client_labels == 1.0)[0]
-    neg_idx = np.where(client_labels == 0.0)[0]
+    pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0]
+    neg_idx = np.where(hits_tr == 0.0)[0]
     idx = np.concatenate((pos_idx, neg_idx))
     # choose selected sample to train on
     domain_tr = domain_tr[idx]
     flow_tr = flow_tr[idx]
-    client_labels = client_labels[idx]
-    server_labels = server_tr[idx]
+    client_tr = np.zeros_like(idx, float)
+    client_tr[:pos_idx.shape[-1]] = 1.0
+    server_tr = server_tr[idx]

-    return domain_tr, flow_tr, client_labels, server_labels
+    return domain_tr, flow_tr, client_tr, server_tr


 def create_dataset_from_lists(domains, features, vocab, max_len,
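This hunk is the commit's main feature: a window now counts as positive when either its discretized virusTotalHits equals 1.0 or its trustedHits reach 1.0, and the client labels are rebuilt from position in the concatenated index rather than sliced from the old client_labels array. A toy run of the new logic, with invented post-discretization values:

import numpy as np

# Invented values; only the selection logic is taken from the commit.
hits_tr = np.array([1.0, -1.0, 0.0, 1.0, -1.0])
trusted_hits_tr = np.array([0.0, 3.0, 0.0, 0.0, 1.0])

pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0]  # [0 1 3 4]
neg_idx = np.where(hits_tr == 0.0)[0]                                         # [2]
idx = np.concatenate((pos_idx, neg_idx))

# Positives occupy the front of idx, so labels can be rebuilt positionally.
client_tr = np.zeros_like(idx, float)
client_tr[:pos_idx.shape[-1]] = 1.0
print(idx, client_tr)  # [0 1 3 4 2] [1. 1. 1. 1. 0.]

Building client_tr positionally works because pos_idx is placed in front of neg_idx inside idx: after reindexing, the first pos_idx.shape[-1] rows of every array are exactly the positives.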
@@ -202,7 +203,8 @@ def discretize_label(values, threshold):

 def get_user_flow_data(csv_file):
     df = pd.read_csv(csv_file)
-    keys = ["duration", "bytes_down", "bytes_up", "domain", "timeStamp", "server_ip", "user_hash", "virusTotalHits",
+    keys = ["duration", "bytes_down", "bytes_up", "domain",
+            "timeStamp", "server_ip", "user_hash", "virusTotalHits",
             "serverLabel", "trustedHits"]
     df = df[keys]
     df.set_index(keys=['user_hash'], drop=False, inplace=True)
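The reflowed keys list is cosmetic; the loader still expects the same ten columns. A minimal round trip on an in-memory CSV (values invented):

import io
import pandas as pd

csv = io.StringIO(
    "duration,bytes_down,bytes_up,domain,timeStamp,server_ip,user_hash,"
    "virusTotalHits,serverLabel,trustedHits\n"
    "12,3400,120,example.com,1496741000,10.0.0.1,u1,1.0,0,0\n"
)
df = pd.read_csv(csv)
df.set_index(keys=['user_hash'], drop=False, inplace=True)
print(df.loc['u1', 'virusTotalHits'])  # 1.0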
main.py

@@ -87,6 +87,7 @@ def main():
     kernel_size = 3
     drop_out = 0.5
     filters = 128
+    network = models.pauls_networks

     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data(args.train_data)
@@ -96,13 +97,13 @@ def main():
         user_flow_df, char_dict,
         max_len=args.domain_length, window_size=args.window)

-    shared_cnn = models.renes_networks.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
+    shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
                                        args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
     shared_cnn.summary()

-    model = models.renes_networks.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
+    model = network.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
                               args.window, args.domain_length, filters, kernel_size,
                               cnnHiddenDims, shared_cnn)
     model.summary()

     model.compile(optimizer='adam',
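The new network alias turns the module itself into a swappable strategy: every later call site (get_embedding, get_model) goes through network, so switching between models.pauls_networks and models.renes_networks is a one-line change. A tiny demo of the pattern, using SimpleNamespace stand-ins rather than the real modules:

from types import SimpleNamespace

# Hypothetical stand-ins; both real modules are assumed to expose the same API.
pauls_networks = SimpleNamespace(get_model=lambda: "pauls model")
renes_networks = SimpleNamespace(get_model=lambda: "renes model")

network = pauls_networks  # one assignment switches every later call site
print(network.get_model())  # pauls model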
@@ -119,7 +120,11 @@ def main():
                         validation_split=0.2)


-def test():
+def main_train():
+    pass
+
+
+def main_test():
     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data(args.test_data)
     domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows(
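test() is split into a main_train() stub and main_test(), which inherits the old evaluation body. The diff does not show how the two are dispatched; a hypothetical dispatch, purely for illustration:

import sys

def main_train():
    pass  # training path, still a stub in this commit

def main_test():
    pass  # evaluation path (body elided in this sketch)

if __name__ == "__main__":
    mode = sys.argv[1] if len(sys.argv) > 1 else "train"
    {"train": main_train, "test": main_test}[mode]()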
@@ -25,11 +25,12 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
     ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
     merged = keras.layers.concatenate([encoded, ipt_flows], -1)
     # CNN processing a small slides of flow windows
-    # TODO: add more layers?
     y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
                input_shape=(window_size, domain_features + flow_features))(merged)
     y = MaxPool1D(pool_size=3, strides=1)(y)
     y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
     # remove temporal dimension by global max pooling
     y = GlobalMaxPooling1D()(y)
     y = Dropout(cnnDropout)(y)
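This final hunk (its file header is not shown above) deepens get_model() to three Conv1D and two MaxPool1D layers before GlobalMaxPooling1D. With 'valid' padding, the Keras default for both layer types, each Conv1D with kernel 3 and each MaxPool1D with pool 3 and stride 1 trims 2 time steps, so the window must now be at least 11 flows long, up from 7 before this change. A quick arithmetic check:

def temporal_len(window_size, kernel_size=3, pool_size=3):
    n = window_size
    for _ in range(3):        # three Conv1D layers after this change
        n -= kernel_size - 1
    for _ in range(2):        # two MaxPool1D layers, stride 1
        n -= pool_size - 1
    return n

assert temporal_len(11) == 1  # smallest window that reaches GlobalMaxPooling1D
assert temporal_len(10) <= 0  # a 10-flow window would no longer fit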