ma_cisco_malware/scripts/make_csv_dataset.py

27 lines
814 B
Python
Raw Normal View History

#!/usr/bin/python2
import sys
import joblib
2017-07-08 17:46:07 +02:00
import numpy as np
2017-06-30 10:49:49 +02:00
import pandas as pd
# Location of the pickled input and of the CSV written for it; "{}" is
# replaced by the dataset name given on the command line.
INPUT_TEMPLATE = "/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/{}.joblib"
OUTPUT_TEMPLATE = "/tmp/rk/data/{}.csv"

# Columns that are parsed with pd.to_numeric(errors="coerce").
NUMERIC_COLUMNS = ("serverLabel", "duration", "bytes_down", "bytes_up")


def prepare_frame(df):
    """Return a cleaned copy of *df* ready to be written as CSV.

    Steps, in order:
      1. reset the index (the previous index becomes a regular column),
      2. drop every row that contains at least one missing value,
      3. parse the columns in NUMERIC_COLUMNS as numbers,
      4. store ``http_method`` as a categorical,
      5. cast ``serverLabel`` to bool and the two hit counters to int8.

    The frame passed in is not modified.
    """
    # reset_index() returns a new frame, so the in-place dropna below
    # never touches the caller's object.
    df = df.reset_index()
    df.dropna(axis=0, how="any", inplace=True)

    # NOTE(review): errors="coerce" turns unparseable values into NaN
    # *after* the dropna above, so such rows still reach the CSV, and a
    # float NaN in serverLabel becomes True in the bool cast below.
    # Confirm whether those rows should be dropped as well.
    for column in NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    df["http_method"] = df["http_method"].astype("category")

    # Builtin bool instead of np.bool: np.bool was only an alias for the
    # builtin and was removed in NumPy 1.24, where it raises
    # AttributeError. The resulting dtype is the same.
    df["serverLabel"] = df["serverLabel"].astype(bool)
    df["virusTotalHits"] = df["virusTotalHits"].astype(np.int8)
    df["trustedHits"] = df["trustedHits"].astype(np.int8)
    return df


def main():
    """Convert the joblib dataset named by the first CLI argument to CSV."""
    if len(sys.argv) < 2:
        sys.exit("usage: {} DATASET_NAME".format(sys.argv[0]))
    fn = sys.argv[1]

    # The loaded object is indexed with "data" and the result is handed to
    # pd.concat, i.e. it is expected to hold a collection of frames.
    loaded = joblib.load(INPUT_TEMPLATE.format(fn))
    df = prepare_frame(pd.concat(loaded["data"]))
    df.to_csv(OUTPUT_TEMPLATE.format(fn), encoding="utf-8")


if __name__ == "__main__":
    main()