My master's thesis project on malware detection using neural networks and multi-task learning
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

dataset.py 6.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. # -*- coding: utf-8 -*-
  2. import string
  3. import numpy as np
  4. import pandas as pd
  5. from tqdm import tqdm
  6. chars = dict((char, idx + 1) for (idx, char) in
  7. enumerate(string.ascii_lowercase + string.punctuation + string.digits))
  8. def get_character_dict():
  9. return chars
  10. def encode_char(c):
  11. if c in chars:
  12. return chars[c]
  13. else:
  14. return 0
  15. encode_char = np.vectorize(encode_char)
  16. def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
  17. maxLengthInSeconds=300):
  18. maxMilliSeconds = maxLengthInSeconds * 1000
  19. outDomainLists = []
  20. outDFFrames = []
  21. if overlapping == False:
  22. numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
  23. userIDs = np.arange(len(dataFrame))
  24. for blockID in np.arange(numBlocks):
  25. curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
  26. # print(curIDs)
  27. useData = dataFrame.iloc[curIDs]
  28. curDomains = useData['domain']
  29. if maxLengthInSeconds != -1:
  30. curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
  31. underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
  32. if len(underTimeOutIDs) != len(curIDs):
  33. curIDs = curIDs[underTimeOutIDs]
  34. useData = dataFrame.iloc[curIDs]
  35. curDomains = useData['domain']
  36. outDomainLists.append(list(curDomains))
  37. outDFFrames.append(useData)
  38. else:
  39. numBlocks = len(dataFrame) + 1 - windowSize
  40. userIDs = np.arange(len(dataFrame))
  41. for blockID in np.arange(numBlocks):
  42. curIDs = userIDs[blockID:blockID + windowSize]
  43. useData = dataFrame.iloc[curIDs]
  44. curDomains = useData['domain']
  45. if maxLengthInSeconds != -1:
  46. curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
  47. underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
  48. if len(underTimeOutIDs) != len(curIDs):
  49. curIDs = curIDs[underTimeOutIDs]
  50. useData = dataFrame.iloc[curIDs]
  51. curDomains = useData['domain']
  52. outDomainLists.append(list(curDomains))
  53. outDFFrames.append(useData)
  54. if len(outDomainLists[-1]) != windowSize:
  55. outDomainLists.pop(-1)
  56. outDFFrames.pop(-1)
  57. return (outDomainLists, outDFFrames)
  58. def get_domain_features(domain, vocab, max_length=40):
  59. encoding = np.zeros((max_length,))
  60. for j in range(np.min([len(domain), max_length])):
  61. curCharacter = domain[-j]
  62. if curCharacter in vocab:
  63. encoding[j] = vocab[curCharacter]
  64. return encoding
  65. def get_flow_features(flow):
  66. keys = ['duration', 'bytes_down', 'bytes_up']
  67. features = np.zeros([len(keys), ])
  68. for i, key in enumerate(keys):
  69. # TODO: does it still works after exceptions occur -- default: zero!
  70. # i wonder whether something brokes
  71. # if there are exceptions regarding to inconsistent feature length
  72. try:
  73. features[i] = np.log1p(flow[key]).astype(float)
  74. except:
  75. pass
  76. return features
  77. def get_cisco_features(curDataLine, urlSIPDict):
  78. numCiscoFeatures = 30
  79. try:
  80. ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
  81. # log transform
  82. ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
  83. return ciscoFeatures.ravel()
  84. except:
  85. return np.zeros([numCiscoFeatures, ]).ravel()
  86. def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
  87. domains = []
  88. features = []
  89. print("get chunks from user data frames")
  90. for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
  91. (domain_windows, feature_windows) = get_user_chunks(user_flow,
  92. windowSize=window_size,
  93. overlapping=True,
  94. maxLengthInSeconds=-1)
  95. domains += domain_windows
  96. features += feature_windows
  97. # TODO: remove later
  98. if i >= 10:
  99. break
  100. print("create training dataset")
  101. return create_dataset_from_lists(
  102. domains=domains, features=features, vocab=char_dict,
  103. max_len=max_len,
  104. use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
  105. window_size=window_size)
  106. def create_dataset_from_lists(domains, features, vocab, max_len,
  107. use_cisco_features=False, urlSIPDIct=dict(),
  108. window_size=10):
  109. """
  110. combines domain and feature windows to sequential training data
  111. :param domains: list of domain windows
  112. :param features: list of feature windows
  113. :param vocab:
  114. :param max_len:
  115. :param use_cisco_features: idk
  116. :param urlSIPDIct: idk
  117. :param window_size: size of the flow window
  118. :return:
  119. """
  120. # TODO: check for hits vs vth consistency
  121. # if 'hits' in dfs[0].keys():
  122. # hits_col = 'hits'
  123. # elif 'virusTotalHits' in dfs[0].keys():
  124. # hits_col = 'virusTotalHits'
  125. hits_col = "virusTotalHits"
  126. numFlowFeatures = 3
  127. numCiscoFeatures = 30
  128. numFeatures = numFlowFeatures
  129. if use_cisco_features:
  130. numFeatures += numCiscoFeatures
  131. sample_size = len(domains)
  132. hits = []
  133. names = []
  134. servers = []
  135. trusted_hits = []
  136. domain_features = np.zeros((sample_size, window_size, max_len))
  137. flow_features = np.zeros((sample_size, window_size, numFeatures))
  138. for i in tqdm(np.arange(sample_size), miniters=10):
  139. for j in range(window_size):
  140. domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
  141. flow_features[i, j] = get_flow_features(features[i].iloc[j])
  142. # TODO: cisco features?
  143. hits.append(np.max(features[i][hits_col]))
  144. names.append(np.unique(features[i]['user_hash']))
  145. servers.append(np.max(features[i]['serverLabel']))
  146. trusted_hits.append(np.max(features[i]['trustedHits']))
  147. X = [domain_features, flow_features]
  148. return X, np.array(hits), np.array(names), np.array(servers), np.array(trusted_hits)
  149. def discretize_label(values, threshold):
  150. maxVal = np.max(values)
  151. if maxVal >= threshold:
  152. return 1.0
  153. elif maxVal == -1:
  154. return -1.0
  155. elif 0 < maxVal < threshold:
  156. return -2.0
  157. else:
  158. return 0.0
  159. def get_user_flow_data():
  160. df = pd.read_csv("data/rk_data.csv.gz")
  161. df.drop("Unnamed: 0", 1, inplace=True)
  162. df.set_index(keys=['user_hash'], drop=False, inplace=True)
  163. return df
  164. def get_flow_per_user(df):
  165. users = df['user_hash'].unique().tolist()
  166. for user in users:
  167. yield df.loc[df.user_hash == user]