diff --git a/dataset.py b/dataset.py
index b1af606..f4a40e6 100644
--- a/dataset.py
+++ b/dataset.py
@@ -92,11 +92,29 @@ def get_flow_features(flow):
     return features
 
 
+def get_all_flow_features(features):
+    """Stack the numeric flow columns of every window and log-scale them.
+
+    Each element of ``features`` is indexed with the column list
+    ``["duration", "bytes_up", "bytes_down"]``; the selections are stacked
+    along a new leading axis with ``np.stack`` and passed through
+    ``np.log1p``.
+
+    NOTE(review): presumably each element is a pandas DataFrame holding one
+    window of flows, all with the same number of rows -- confirm with caller.
+    ``np.stack`` raises ``ValueError`` when ``features`` is empty or the
+    selections differ in shape.
+    """
+    columns = ["duration", "bytes_up", "bytes_down"]
+    flows = np.stack([f[columns] for f in features])
+    return np.log1p(flows)
+
+
 def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
     domains = []
     features = []
     print("get chunks from user data frames")
-    for i, user_flow in tqdm(list(enumerate(get_flow_per_user(user_flow_df)))[:50]):
+    for i, user_flow in tqdm(list(enumerate(get_flow_per_user(user_flow_df)))):
         (domain_windows, feature_windows) = get_user_chunks(user_flow,
                                                             windowSize=window_size,
                                                             overlapping=False,