Separate logical sections into dataset, models, and main.

Continued initial refactoring.
René Knaebel 2017-06-30 10:12:20 +02:00
parent be273d9247
commit bbd63fd1da
5 changed files with 132 additions and 107 deletions

.gitignore

@@ -96,4 +96,7 @@ ENV/
 .DS_Store
 # data
 *.tif
+*.joblib
+*.csv
+*.csv.gz

dataset.py

@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
 import string
-import keras
 import numpy as np
 import pandas as pd
-from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
-from keras.models import Model
-from keras.utils import np_utils
 from tqdm import tqdm
@@ -21,18 +17,6 @@ def get_character_dict():
                 enumerate(string.ascii_lowercase + string.punctuation))
-
-
-def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
-            hidden_dims, drop_out):
-    x = y = Input(shape=(input_length,))
-    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
-    y = Conv1D(filters, kernel_size, activation='relu')(y)
-    y = GlobalMaxPooling1D()(y)
-    y = Dense(hidden_dims)(y)
-    y = Dropout(drop_out)(y)
-    y = Activation('relu')(y)
-    return Model(x, y)
 
 
 def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                     maxLengthInSeconds=300):
     # print('maxLength: ' + str(maxLengthInSeconds))
@@ -102,10 +86,8 @@ def getCiscoFeatures(curDataLine, urlSIPDict):
     numCiscoFeatures = 30
     try:
         ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        # print('cisco features: ' + str(ciscoFeatures))
         # log transform
         ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
-        # print('log transformed: ' + str(ciscoFeatures))
         return ciscoFeatures.ravel()
     except:
         return np.zeros([numCiscoFeatures, ]).ravel()
@@ -117,7 +99,7 @@ def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, wind
     print("get chunks from user data frames")
     for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
         (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
-                                                       overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
+                                                       overlapping=False, maxLengthInSeconds=-1)
         domainLists += domainListsTmp
         dfLists += dfListsTmp
         if i >= 10:
@@ -193,90 +175,3 @@ def get_flow_per_user(df):
     users = df['user_hash'].unique().tolist()
     for user in users:
         yield df.loc[df.user_hash == user]
-
-
-if __name__ == "__main__":
-    # parameter
-    innerCNNFilters = 512
-    innerCNNKernelSize = 2
-    cnnDropout = 0.5
-    cnnHiddenDims = 1024
-    domainFeatures = 512
-    flowFeatures = 3
-    numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    embeddingSize = 100
-    kernel_size = 2
-    drop_out = 0.5
-    filters = 2
-    hidden_dims = 100
-    vocabSize = 40
-    threshold = 3
-    minFlowsPerUser = 10
-    numEpochs = 100
-    maxLengthInSeconds = -1
-    timesNeg = -1
-
-    char_dict = get_character_dict()
-    user_flow_df = get_user_flow_data()
-
-    print("create training dataset")
-    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
-        user_flow_df, char_dict,
-        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
-
-    pos_idx = np.where(y_tr == 1.0)[0]
-    neg_idx = np.where(y_tr == 0.0)[0]
-
-    use_idx = np.concatenate((pos_idx, neg_idx))
-
-    y_tr = y_tr[use_idx]
-    # hits_tr = hits_tr[use_idx]
-    # names_tr = names_tr[use_idx]
-    for i in range(len(X_tr)):
-        X_tr[i] = X_tr[i][use_idx]
-
-    # TODO: WTF? I don't get it...
-    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                           domainFeatures, kernel_size, domainFeatures, 0.5)
-
-    inputList = []
-    encodedList = []
-    numFeatures = flowFeatures
-    for i in range(windowSize):
-        inputList.append(Input(shape=(maxLen,)))
-        encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
-        inputList.append(Input(shape=(numFeatures,)))
-
-    merge_layer_input = []
-    for i in range(windowSize):
-        merge_layer_input.append(encodedList[i])
-        merge_layer_input.append(inputList[(2 * i) + 1])
-
-    # We can then concatenate the two vectors:
-    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
-    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
-
-    # add second cnn
-    cnn = Conv1D(filters,
-                 kernel_size,
-                 activation='relu',
-                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
-    # we use max pooling:
-    maxPool = GlobalMaxPooling1D()(cnn)
-    cnnDropout = Dropout(cnnDropout)(maxPool)
-    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
-    cnnOutput = Dense(2, activation='softmax')(cnnDense)
-
-    # We define a trainable model linking the
-    # tweet inputs to the predictions
-    model = Model(inputs=inputList, outputs=cnnOutput)
-    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
-                  metrics=['accuracy'])
-
-    epochNumber = 0
-    trainLabel = np_utils.to_categorical(y_tr, 2)
-    model.fit(x=X_tr, y=trainLabel, batch_size=128,
-              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
-    # validation_data=(testData,testLabel))
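
Note: the @@ -117,7 +99,7 @@ hunk above hard-codes maxLengthInSeconds=-1 inside create_dataset_from_flows rather than forwarding a caller-supplied value, so the time-based cutoff in get_user_chunks stays disabled but is no longer configurable; accordingly, the maxLengthInSeconds variable from the removed __main__ block does not reappear in main.py below.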

main.py

@@ -0,0 +1,68 @@
+import numpy as np
+from keras.utils import np_utils
+
+import dataset
+import models
+
+
+def main():
+    # parameter
+    innerCNNFilters = 512
+    innerCNNKernelSize = 2
+    cnnDropout = 0.5
+    cnnHiddenDims = 1024
+    domainFeatures = 512
+    flowFeatures = 3
+    numCiscoFeatures = 30
+    windowSize = 10
+    maxLen = 40
+    embeddingSize = 100
+    kernel_size = 2
+    drop_out = 0.5
+    filters = 2
+    hidden_dims = 100
+    vocabSize = 40
+    threshold = 3
+    minFlowsPerUser = 10
+    numEpochs = 100
+    timesNeg = -1
+
+    char_dict = dataset.get_character_dict()
+    user_flow_df = dataset.get_user_flow_data()
+
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = dataset.create_dataset_from_flows(
+        user_flow_df, char_dict,
+        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
+
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
+
+    use_idx = np.concatenate((pos_idx, neg_idx))
+
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
+
+    # TODO: WTF? I don't get it...
+    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+
+    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
+                               cnnHiddenDims, cnnDropout)
+
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+
+    epochNumber = 0
+    y_tr = np_utils.to_categorical(y_tr, 2)
+    model.fit(x=X_tr, y=y_tr, batch_size=128,
+              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
+    # validation_data=(testData,testLabel))
+
+
+if __name__ == "__main__":
+    main()
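
Note on the label handling in main(): np_utils.to_categorical (part of Keras) expands the 0/1 label vector into one-hot rows, which is the layout the two-unit softmax head built by models.get_top_cnn expects. A minimal sketch:

import numpy as np
from keras.utils import np_utils

y = np.array([0., 1., 1., 0.])
print(np_utils.to_categorical(y, 2))
# -> [[1. 0.], [0. 1.], [0. 1.], [1. 0.]]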

models.py

@@ -0,0 +1,53 @@
+import keras
+from keras.engine import Input, Model
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Reshape
+
+
+def get_shared_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
+                   hidden_dims, drop_out):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
+    y = Conv1D(filters, kernel_size, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
+def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
+                   filters, h1, h2, dropout, dense):
+    pass
+
+
+def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
+    inputList = []
+    encodedList = []
+    # TODO: ???
+    for i in range(windowSize):
+        inputList.append(Input(shape=(maxLen,)))
+        encodedList.append(cnn(inputList[-1]))  # add shared domain model
+        inputList.append(Input(shape=(numFeatures,)))
+    # TODO: ???
+    merge_layer_input = []
+    for i in range(windowSize):
+        merge_layer_input.append(encodedList[i])
+        merge_layer_input.append(inputList[(2 * i) + 1])
+    # We can then concatenate the two vectors:
+    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
+    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
+    # add second cnn
+    cnn = Conv1D(filters,
+                 kernel_size,
+                 activation='relu',
+                 input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
+    # we use max pooling:
+    maxPool = GlobalMaxPooling1D()(cnn)
+    cnnDropout = Dropout(cnnDropout)(maxPool)
+    cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
+    cnnOutput = Dense(2, activation='softmax')(cnnDense)
+    # We define a trainable model linking the
+    # tweet inputs to the predictions
+    model = Model(inputs=inputList, outputs=cnnOutput)
+    return model
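
Note on the TODOs in get_top_cnn: the model takes 2 * windowSize inputs, alternating one (maxLen,) encoded-domain window with one (numFeatures,) flow-feature vector, so any X handed to fit or predict must be a list ordered the same way. A minimal sketch with toy data (batch size 8 and vocabulary size 41 are illustrative assumptions; the remaining values mirror main.py):

import numpy as np

import models

windowSize, maxLen, flowFeatures = 10, 40, 3
shared = models.get_shared_cnn(41, 100, maxLen, 512, 2, 512, 0.5)
top = models.get_top_cnn(shared, flowFeatures, maxLen, windowSize,
                         512, 2, 2, 1024, 0.5)

# inputs alternate: domain window 0, flow features 0, domain window 1, ...
batch = []
for _ in range(windowSize):
    batch.append(np.random.randint(0, 41, size=(8, maxLen)))  # encoded domain names
    batch.append(np.random.rand(8, flowFeatures))             # numeric flow features

print(top.predict(batch).shape)  # -> (8, 2)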


@@ -0,0 +1,6 @@
+#!/usr/bin/python2
+
+import joblib
+
+datafile = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
+user_flows = datafile["data"]
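
Note: this scratch script assumes currentData.joblib was written by joblib.dump as a dict holding the flow table under the "data" key. A hedged round-trip sketch (the producer side is an assumption not shown in this commit; the column names come from dataset.py and the path is shortened):

import joblib
import pandas as pd

# assumed producer side, elsewhere in the pipeline:
frame = pd.DataFrame(columns=["user_hash", "domain", "server_ip"])
joblib.dump({"data": frame}, "currentData.joblib")

# consumer side, as in the script above:
user_flows = joblib.load("currentData.joblib")["data"]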