Add feature to use both hit counts (virusTotalHits and trustedHits) from the dataset
This commit is contained in:
		
							
								
								
									
										26
									
								
								dataset.py
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								dataset.py
									
									
									
									
									
								
							@@ -28,7 +28,7 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
 | 
			
		||||
    maxMilliSeconds = maxLengthInSeconds * 1000
 | 
			
		||||
    outDomainLists = []
 | 
			
		||||
    outDFFrames = []
 | 
			
		||||
    if overlapping == False:
 | 
			
		||||
    if not overlapping:
 | 
			
		||||
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
 | 
			
		||||
        userIDs = np.arange(len(dataFrame))
 | 
			
		||||
        for blockID in np.arange(numBlocks):
 | 
			
		||||
@@ -70,9 +70,9 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
 | 
			
		||||
def get_domain_features(domain, vocab, max_length=40):
 | 
			
		||||
    encoding = np.zeros((max_length,))
 | 
			
		||||
    for j in range(np.min([len(domain), max_length])):
 | 
			
		||||
        curCharacter = domain[-j]
 | 
			
		||||
        if curCharacter in vocab:
 | 
			
		||||
            encoding[j] = vocab[curCharacter]
 | 
			
		||||
        char = domain[-j]
 | 
			
		||||
        if char in vocab:
 | 
			
		||||
            encoding[j] = vocab[char]
 | 
			
		||||
    return encoding
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -90,6 +90,7 @@ def get_flow_features(flow):
 | 
			
		||||
    return features
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# NOTE: currently not used
 | 
			
		||||
def get_cisco_features(curDataLine, urlSIPDict):
 | 
			
		||||
    numCiscoFeatures = 30
 | 
			
		||||
    try:
 | 
			
		||||
@@ -124,19 +125,19 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
 | 
			
		||||
        window_size=window_size)
 | 
			
		||||
 | 
			
		||||
    # make client labels discrete with 4 different values
 | 
			
		||||
    # TODO: use trusted_hits_tr for client classification too
 | 
			
		||||
    client_labels = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
 | 
			
		||||
    hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
 | 
			
		||||
    # select only 1.0 and 0.0 from training data
 | 
			
		||||
    pos_idx = np.where(client_labels == 1.0)[0]
 | 
			
		||||
    neg_idx = np.where(client_labels == 0.0)[0]
 | 
			
		||||
    pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0]
 | 
			
		||||
    neg_idx = np.where(hits_tr == 0.0)[0]
 | 
			
		||||
    idx = np.concatenate((pos_idx, neg_idx))
 | 
			
		||||
    # choose selected sample to train on
 | 
			
		||||
    domain_tr = domain_tr[idx]
 | 
			
		||||
    flow_tr = flow_tr[idx]
 | 
			
		||||
    client_labels = client_labels[idx]
 | 
			
		||||
    server_labels = server_tr[idx]
 | 
			
		||||
    client_tr = np.zeros_like(idx, float)
 | 
			
		||||
    client_tr[:pos_idx.shape[-1]] = 1.0
 | 
			
		||||
    server_tr = server_tr[idx]
 | 
			
		||||
 | 
			
		||||
    return domain_tr, flow_tr, client_labels, server_labels
 | 
			
		||||
    return domain_tr, flow_tr, client_tr, server_tr
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def create_dataset_from_lists(domains, features, vocab, max_len,
 | 
			
		||||
@@ -202,7 +203,8 @@ def discretize_label(values, threshold):
 | 
			
		||||
 | 
			
		||||
def get_user_flow_data(csv_file):
 | 
			
		||||
    df = pd.read_csv(csv_file)
 | 
			
		||||
    keys = ["duration", "bytes_down", "bytes_up", "domain", "timeStamp", "server_ip", "user_hash", "virusTotalHits",
 | 
			
		||||
    keys = ["duration", "bytes_down", "bytes_up", "domain",
 | 
			
		||||
            "timeStamp", "server_ip", "user_hash", "virusTotalHits",
 | 
			
		||||
            "serverLabel", "trustedHits"]
 | 
			
		||||
    df = df[keys]
 | 
			
		||||
    df.set_index(keys=['user_hash'], drop=False, inplace=True)
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										11
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								main.py
									
									
									
									
									
								
							@@ -87,6 +87,7 @@ def main():
 | 
			
		||||
    kernel_size = 3
 | 
			
		||||
    drop_out = 0.5
 | 
			
		||||
    filters = 128
 | 
			
		||||
    network = models.pauls_networks
 | 
			
		||||
 | 
			
		||||
    char_dict = dataset.get_character_dict()
 | 
			
		||||
    user_flow_df = dataset.get_user_flow_data(args.train_data)
 | 
			
		||||
@@ -96,11 +97,11 @@ def main():
 | 
			
		||||
        user_flow_df, char_dict,
 | 
			
		||||
        max_len=args.domain_length, window_size=args.window)
 | 
			
		||||
 | 
			
		||||
    shared_cnn = models.renes_networks.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
 | 
			
		||||
    shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
 | 
			
		||||
                                       args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
 | 
			
		||||
    shared_cnn.summary()
 | 
			
		||||
 | 
			
		||||
    model = models.renes_networks.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
 | 
			
		||||
    model = network.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
 | 
			
		||||
                              args.window, args.domain_length, filters, kernel_size,
 | 
			
		||||
                              cnnHiddenDims, shared_cnn)
 | 
			
		||||
    model.summary()
 | 
			
		||||
@@ -119,7 +120,11 @@ def main():
 | 
			
		||||
              validation_split=0.2)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test():
 | 
			
		||||
def main_train():
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main_test():
 | 
			
		||||
    char_dict = dataset.get_character_dict()
 | 
			
		||||
    user_flow_df = dataset.get_user_flow_data(args.test_data)
 | 
			
		||||
    domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows(
 | 
			
		||||
 
 | 
			
		||||
@@ -25,11 +25,12 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
 | 
			
		||||
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
 | 
			
		||||
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
 | 
			
		||||
    # CNN processing small slices of flow windows
 | 
			
		||||
    # TODO: add more layers?
 | 
			
		||||
    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
 | 
			
		||||
               input_shape=(window_size, domain_features + flow_features))(merged)
 | 
			
		||||
    y = MaxPool1D(pool_size=3, strides=1)(y)
 | 
			
		||||
    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
 | 
			
		||||
    y = MaxPool1D(pool_size=3, strides=1)(y)
 | 
			
		||||
    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
 | 
			
		||||
    # remove temporal dimension by global max pooling
 | 
			
		||||
    y = GlobalMaxPooling1D()(y)
 | 
			
		||||
    y = Dropout(cnnDropout)(y)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user