2017-06-30 10:12:20 +02:00
|
|
|
#!/usr/bin/python2
|
|
|
|
|
2017-08-03 07:51:58 +02:00
|
|
|
import sys
|
|
|
|
|
2017-06-30 10:12:20 +02:00
|
|
|
import joblib
|
2017-07-08 17:46:07 +02:00
|
|
|
import numpy as np
|
2017-06-30 10:49:49 +02:00
|
|
|
import pandas as pd
|
2017-06-30 10:12:20 +02:00
|
|
|
|
2017-08-03 07:51:58 +02:00
|
|
|
# The dataset name (without extension) is taken from the first command-line
# argument; it selects the input archive here and is reused for the output
# file name later in the script.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: {} DATASET_NAME".format(sys.argv[0]))
fn = sys.argv[1]

# NOTE(review): joblib archives are pickle-based, so this must only ever be
# pointed at trusted files.
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/{}.joblib".format(fn))

# The loaded object's "data" entry is concatenated into a single frame
# (presumably a collection of per-chunk DataFrames -- confirm with producer).
df = pd.concat(df["data"])

# Replace the concatenated index with a fresh 0..n-1 index; the old index is
# kept as ordinary column(s).
df.reset_index(inplace=True)

# Discard every row that contains at least one missing value.
df.dropna(axis=0, how="any", inplace=True)
|
2017-08-03 07:51:58 +02:00
|
|
|
|
|
|
|
# Normalise column dtypes so the exported table is compact and consistently
# typed.

# Parse the numeric columns; entries that cannot be parsed become NaN.
for column in ("serverLabel", "duration", "bytes_down", "bytes_up"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

df["http_method"] = df["http_method"].astype("category")

# Use the builtin ``bool``: ``np.bool`` was only an alias for it and has been
# removed from recent NumPy releases, where it raises AttributeError.
# NOTE(review): NaN produced by the coercion above casts to True here, because
# rows with missing values are dropped *before* the coercion -- confirm that
# unparseable labels should really end up as True.
df["serverLabel"] = df["serverLabel"].astype(bool)

# NOTE(review): int8 holds -128..127; this assumes hit counts never exceed
# that range -- TODO confirm.
df["virusTotalHits"] = df["virusTotalHits"].astype(np.int8)
df["trustedHits"] = df["trustedHits"].astype(np.int8)
|
2017-07-08 17:46:07 +02:00
|
|
|
|
2017-09-16 15:25:34 +02:00
|
|
|
# Write the frame as UTF-8 CSV, named after the dataset given on the
# command line.
csv_path = "/tmp/rk/data/{}.csv".format(fn)
df.to_csv(csv_path, encoding="utf-8")
|