#!/usr/bin/python2
"""Export the multi-task-learning training data as a gzip-compressed CSV.

Steps, in order:

1. Load the joblib dump at ``INPUT_PATH``.
2. Concatenate the frames stored under its ``"data"`` key.
3. Reset the index and drop every row that contains a missing value.
4. Cast selected columns to integer, string and boolean dtypes.
5. Write the result to ``OUTPUT_PATH``.
"""
import joblib
import numpy as np
import pandas as pd

INPUT_PATH = "/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib"
OUTPUT_PATH = "/tmp/rk/full_future_dataset.csv.gz"

# NOTE: joblib.load unpickles arbitrary objects; only point INPUT_PATH at
# files from a trusted source.
df = joblib.load(INPUT_PATH)
df = pd.concat(df["data"])
df.reset_index(inplace=True)

# Rows with any missing value are dropped before casting, since NaN cannot
# be converted to an integer dtype.
df.dropna(axis=0, how="any", inplace=True)

# np.int_ / np.bool_ replace the np.int / np.bool aliases of the builtins,
# which were deprecated in NumPy 1.20 and removed in NumPy 1.24.
df[["duration", "bytes_down", "bytes_up"]] = df[["duration", "bytes_down", "bytes_up"]].astype(np.int_)
df[["domain", "server_ip"]] = df[["domain", "server_ip"]].astype(str)
df[["server_label"]] = df[["server_label"]].astype(np.bool_)

# NOTE(review): both "server_label" (above) and "serverLabel" (below) are
# cast. Presumably the data carries both columns -- confirm this is not a
# naming slip, since the attribute read fails if "serverLabel" is absent.
df.serverLabel = df.serverLabel.astype(np.bool_)
df.virusTotalHits = df.virusTotalHits.astype(np.int_)
df.trustedHits = df.trustedHits.astype(np.int_)

df.to_csv(OUTPUT_PATH, compression="gzip")