added new networks for domain embedding and classification task

.gitignore (vendored, 3 changes)
@@ -99,4 +99,5 @@ ENV/
 *.tif
 *.joblib
 *.csv
 *.csv.gz
+*.csv.tar.*

main.py (43 changes)
@@ -37,9 +37,21 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 # parser.add_argument("--samples_val", action="store", dest="samples_val",
 #                     default=10000, type=int)
 #
 # parser.add_argument("--area", action="store", dest="area_size",
 #                     default=25, type=int)
 #
+parser.add_argument("--embd", action="store", dest="embedding",
+                    default=128, type=int)
+
+parser.add_argument("--hidden_char_dims", action="store", dest="hidden_char_dims",
+                    default=256, type=int)
+
+parser.add_argument("--window", action="store", dest="window",
+                    default=10, type=int)
+
+parser.add_argument("--domain_length", action="store", dest="domain_length",
+                    default=40, type=int)
+
+parser.add_argument("--domain_embd", action="store", dest="domain_embedding",
+                    default=512, type=int)
+
 # parser.add_argument("--queue", action="store", dest="queue_size",
 #                     default=50, type=int)
 #
@@ -59,6 +71,7 @@ parser.add_argument("--epochs", action="store", dest="epochs",
 args = parser.parse_args()


 # config = tf.ConfigProto(log_device_placement=True)
 # config.gpu_options.per_process_gpu_memory_fraction = 0.5
 # config.gpu_options.allow_growth = True
@@ -67,24 +80,17 @@ args = parser.parse_args()
 def main():
     # parameter
     innerCNNFilters = 512
     innerCNNKernelSize = 2
     cnnDropout = 0.5
     cnnHiddenDims = 1024
     domainFeatures = 512
     flowFeatures = 3
     numCiscoFeatures = 30
     windowSize = 10
     maxLen = 40
     embeddingSize = 100
-    kernel_size = 2
+    kernel_size = 3
     drop_out = 0.5
-    filters = 2
+    filters = 128
     hidden_dims = 100
     vocabSize = 40
     threshold = 3
     minFlowsPerUser = 10
     numEpochs = 100

     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data()
@@ -92,7 +98,7 @@ def main():
     print("create training dataset")
     (X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
         user_flow_df, char_dict,
-        max_len=maxLen, window_size=windowSize)
+        max_len=args.domain_length, window_size=args.window)
     # make client labels discrete with 4 different values
     # TODO: use trusted_hits_tr for client classification too
     client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
@@ -104,11 +110,14 @@ def main():
     client_labels = client_labels[idx]
     server_labels = server_tr[idx]

-    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
-                                       domainFeatures, kernel_size, domainFeatures, 0.5)
+    shared_cnn = models.get_embedding_network_rene(len(char_dict) + 1, args.embedding, args.domain_length,
+                                                   args.hidden_char_dims, args.domain_embedding, 0.5)
     shared_cnn.summary()

-    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
-                               cnnHiddenDims, cnnDropout)
+    model = models.get_top_cnn_rene(cnnDropout, flowFeatures, args.domain_embedding,
+                                    args.window, args.domain_length, filters, kernel_size,
+                                    cnnHiddenDims, shared_cnn)
     model.summary()

     model.compile(optimizer='adam',
                   loss='binary_crossentropy',
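With these flags, the network dimensions that were previously hard-coded in main() can be set per run. A hypothetical invocation spelling out the new defaults (flag names and default values taken from the first hunk above):

    python main.py --embd 128 --hidden_char_dims 256 --window 10 --domain_length 40 --domain_embd 512

Any flag left out falls back to the default given in its add_argument call.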
							
								
								
									
models.py (64 changes)
@@ -1,10 +1,11 @@
 import keras
 from keras.engine import Input, Model
-from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
+from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed, MaxPool1D


-def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
-                   hidden_dims, drop_out):
+# designed by paul
+def get_embedding_network_paul(vocab_size, embedding_size, input_length, filters, kernel_size,
+                               hidden_dims, drop_out=0.5):
     x = y = Input(shape=(input_length,))
     y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
     y = Conv1D(filters, kernel_size, activation='relu')(y)
@@ -15,26 +16,65 @@ def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_siz
     return Model(x, y)


+def get_embedding_network_rene(vocab_size, embedding_size, input_length,
+                               hidden_char_dims, hidden_dims, drop_out=0.5):
+    x = y = Input(shape=(input_length,))
+    y = Embedding(input_dim=vocab_size, output_dim=embedding_size, mask_zero=True)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=5, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = MaxPool1D(pool_size=3, strides=1)(y)
+    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
+    y = GlobalMaxPooling1D()(y)
+    y = Dense(hidden_dims)(y)
+    y = Dropout(drop_out)(y)
+    y = Activation('relu')(y)
+    return Model(x, y)
+
+
 def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                    filters, h1, h2, dropout, dense):
     pass


-def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
-    ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
+# designed by paul
+def get_top_cnn(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                dense_dim,
+                cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
     encoded = TimeDistributed(cnn)(ipt_domains)
-    ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
     merged = keras.layers.concatenate([encoded, ipt_flows], -1)
-    # add second cnn
-    y = Conv1D(filters,
+    # CNN processing small slices of flow windows
+    # TODO: add more layers?
+    y = Conv1D(cnn_dims,
                kernel_size,
                activation='relu',
-               input_shape=(windowSize, domainFeatures + numFeatures))(merged)
-    # TODO: why global pooling? -> 3D to 2D
-    # we use max pooling:
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
     y = GlobalMaxPooling1D()(y)
     y = Dropout(cnnDropout)(y)
-    y = Dense(cnnHiddenDims, activation='relu')(y)
+    y = Dense(dense_dim, activation='relu')(y)
     y1 = Dense(2, activation='softmax', name="client")(y)
     y2 = Dense(2, activation='softmax', name="server")(y)

     return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))


+def get_top_cnn_rene(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                     dense_dim, cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    encoded = TimeDistributed(cnn)(ipt_domains)
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
+    # CNN processing small slices of flow windows
+    # TODO: add more layers?
+    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(cnnDropout)(y)
+    y = Dense(dense_dim, activation='relu')(y)
+    y1 = Dense(2, activation='softmax', name="client")(y)
+    y2 = Dense(2, activation='softmax', name="server")(y)
+
+    return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
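Two observations on get_embedding_network_rene, neither part of the commit. First, every Conv1D and MaxPool1D above uses Keras' default 'valid' padding and stride 1, so with the default --domain_length of 40 the character axis shrinks from 40 to 28 before GlobalMaxPooling1D collapses it. A minimal sketch of that arithmetic:

    # sketch only: temporal sizes through the rene embedding stack,
    # assuming padding='valid' and stride 1 (the Keras defaults used above)
    def conv_out(n, kernel_size):
        return n - kernel_size + 1             # Conv1D, 'valid' padding

    def pool_out(n, pool_size, strides):
        return (n - pool_size) // strides + 1  # MaxPool1D, 'valid' padding

    n = 40                 # default --domain_length
    n = conv_out(n, 5)     # Conv1D(kernel_size=5)             -> 36
    n = pool_out(n, 3, 1)  # MaxPool1D(pool_size=3, strides=1) -> 34
    n = conv_out(n, 3)     # Conv1D(kernel_size=3)             -> 32
    n = pool_out(n, 3, 1)  # MaxPool1D(pool_size=3, strides=1) -> 30
    n = conv_out(n, 3)     # Conv1D(kernel_size=3)             -> 28
    print(n)               # GlobalMaxPooling1D then removes this axis entirely

Second, mask_zero=True on the Embedding propagates a mask into the first Conv1D; convolution layers do not support masking in stock Keras and typically raise an error when the model is built, so dropping the flag may turn out to be necessary.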
 

(fourth file, name not captured in this view)
@@ -4,7 +4,6 @@ import joblib
 import pandas as pd

 df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
-df = df["data"]
-df = pd.concat(df)
+df = pd.concat(df["data"])
 df.reset_index(inplace=True)
-df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
+df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
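Since compression='infer' is the pandas default, the gzip export above can be read straight back. A minimal sketch, assuming the path from the diff and the integer index that to_csv writes after reset_index:

    import pandas as pd

    # compression is inferred from the .gz suffix; index_col=0 re-absorbs
    # the integer index column that to_csv wrote out
    df = pd.read_csv("/tmp/rk/full_future_dataset.csv.gz", index_col=0)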
 