From 8334e9a84f328b59d09ab9e82af0af8849901cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Fri, 30 Jun 2017 17:42:18 +0200 Subject: [PATCH] removed ys from training data generation --- dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dataset.py b/dataset.py index 9c8364f..f9163be 100644 --- a/dataset.py +++ b/dataset.py @@ -110,6 +110,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind def create_dataset_from_lists(domains, dfs, vocab, maxLen, threshold=3, use_cisco_features=False, urlSIPDIct=dict(), window_size=10): + # TODO: check for hits vs vth consistency if 'hits' in dfs[0].keys(): hitName = 'hits' elif 'virusTotalHits' in dfs[0].keys(): @@ -120,7 +121,6 @@ def create_dataset_from_lists(domains, dfs, vocab, maxLen, threshold=3, if use_cisco_features: numFeatures += numCiscoFeatures Xs = [] - ys = [] hits = [] names = [] servers = [] @@ -141,12 +141,11 @@ def create_dataset_from_lists(domains, dfs, vocab, maxLen, threshold=3, Xs[ctr][i, :] = get_flow_features(dfs[i].iloc[j]) ctr += 1 - ys.append(discretize_label(dfs[i][hitName], threshold)) hits.append(np.max(dfs[i][hitName])) names.append(np.unique(dfs[i]['user_hash'])) servers.append(np.max(dfs[i]['serverLabel'])) trusted_hits.append(np.max(dfs[i]['trustedHits'])) - return Xs, np.array(ys), np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits) + return Xs, np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits) def discretize_label(values, threshold):