My master's thesis project on malware detection using neural networks and multi-task learning.

main.py (4.2 KB)

import argparse

import numpy as np
from keras.utils import np_utils

import dataset
import models

parser = argparse.ArgumentParser()

parser.add_argument("--modes", action="store", dest="modes", nargs="+")

# parser.add_argument("--data", action="store", dest="data",
#                     default="data/")
#
# parser.add_argument("--h5data", action="store", dest="h5data",
#                     default="")
#
# parser.add_argument("--model", action="store", dest="model",
#                     default="model_x")
#
# parser.add_argument("--pred", action="store", dest="pred",
#                     default="")
#
# parser.add_argument("--type", action="store", dest="model_type",
#                     default="simple_conv")
#
parser.add_argument("--batch", action="store", dest="batch_size",
                    default=64, type=int)

parser.add_argument("--epochs", action="store", dest="epochs",
                    default=10, type=int)

# parser.add_argument("--samples", action="store", dest="samples",
#                     default=100000, type=int)
#
# parser.add_argument("--samples_val", action="store", dest="samples_val",
#                     default=10000, type=int)
#
# parser.add_argument("--area", action="store", dest="area_size",
#                     default=25, type=int)
#
# parser.add_argument("--queue", action="store", dest="queue_size",
#                     default=50, type=int)
#
# parser.add_argument("--p", action="store", dest="p_train",
#                     default=0.5, type=float)
#
# parser.add_argument("--p_val", action="store", dest="p_val",
#                     default=0.01, type=float)
#
# parser.add_argument("--gpu", action="store", dest="gpu",
#                     default=0, type=int)
#
# parser.add_argument("--tmp", action="store_true", dest="tmp")
#
# parser.add_argument("--test", action="store", dest="test_image",
#                     default=6, choices=range(7), type=int)

args = parser.parse_args()

# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)


def main():
    # hyperparameters (several of these are currently unused in this script)
    innerCNNFilters = 512
    innerCNNKernelSize = 2
    cnnDropout = 0.5
    cnnHiddenDims = 1024
    domainFeatures = 512
    flowFeatures = 3
    numCiscoFeatures = 30
    windowSize = 10
    maxLen = 40
    embeddingSize = 100
    kernel_size = 2
    drop_out = 0.5
    filters = 2
    hidden_dims = 100
    vocabSize = 40
    threshold = 3
    minFlowsPerUser = 10
    numEpochs = 100

    char_dict = dataset.get_character_dict()
    user_flow_df = dataset.get_user_flow_data()

    print("create training dataset")
    (X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
        user_flow_df, char_dict,
        max_len=maxLen, window_size=windowSize)

    # make client labels discrete with 4 different values
    # TODO: use trusted_hits_tr for client classification too
    client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))

    # select only 1.0 and 0.0 from training data
    pos_idx = np.where(client_labels == 1.0)[0]
    neg_idx = np.where(client_labels == 0.0)[0]
    idx = np.concatenate((pos_idx, neg_idx))

    # select labels for prediction
    # NOTE: only the labels are restricted to idx here; X_tr is passed to
    # model.fit unfiltered, so inputs and labels can end up with different lengths
    client_labels = client_labels[idx]
    server_labels = server_tr[idx]

    # shared character-level CNN over single domain names, reused by the top model
    shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
                                       domainFeatures, kernel_size, domainFeatures, 0.5)

    # multi-task model with two outputs: client label and server label
    model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
                               cnnHiddenDims, cnnDropout)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # one-hot encode both label sets (two classes each)
    client_labels = np_utils.to_categorical(client_labels, 2)
    server_labels = np_utils.to_categorical(server_labels, 2)

    model.fit(X_tr,
              [client_labels, server_labels],
              batch_size=args.batch_size,
              epochs=args.epochs,
              shuffle=True)
    # TODO: for validation we use future data -> validation_data=(testData, testLabel)


if __name__ == "__main__":
    main()
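
The dataset and models modules imported above live in separate files that are not shown here. As a rough illustration of the multi-task setup main.py expects, the following is a minimal sketch of what models.get_shared_cnn and models.get_top_cnn could look like. Only the function names and layer sizes mirror the arguments passed in main.py; the concrete layer choices, the Reshape/Conv1D head, and the output names are assumptions for illustration, not the thesis implementation.

# sketch_models.py -- hypothetical stand-in for the thesis models.py
from keras.models import Model
from keras.layers import (Input, Embedding, Conv1D, GlobalMaxPooling1D,
                          Dense, Dropout, Reshape, concatenate)


def get_shared_cnn(vocab_size, embedding_size, input_length,
                   filters, kernel_size, hidden_dims, drop_out):
    # character-level CNN applied to one domain name, given as a sequence
    # of character indices of length input_length
    x = Input(shape=(input_length,))
    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(x)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims, activation='relu')(y)
    y = Dropout(drop_out)(y)
    return Model(inputs=x, outputs=y)


def get_top_cnn(shared_cnn, flow_features, max_len, window_size,
                domain_features, filters, kernel_size, hidden_dims, drop_out):
    # one domain input per position in the flow window, plus one tensor of
    # numeric flow features for the whole window
    domain_inputs = [Input(shape=(max_len,)) for _ in range(window_size)]
    flow_input = Input(shape=(window_size, flow_features))

    # encode every domain with the *same* shared CNN (weight sharing), then
    # stack the encodings; assumes the shared CNN output size equals
    # domain_features, as in main.py
    encoded = [Reshape((1, domain_features))(shared_cnn(d)) for d in domain_inputs]
    domains = concatenate(encoded, axis=1)                # (window_size, domain_features)
    merged = concatenate([domains, flow_input], axis=-1)  # append flow features per position

    # second CNN over the window of encoded flows
    y = Conv1D(filters, kernel_size, activation='relu')(merged)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims, activation='relu')(y)
    y = Dropout(drop_out)(y)

    # multi-task head: one softmax output per task (client label, server label)
    client_out = Dense(2, activation='softmax', name='client')(y)
    server_out = Dense(2, activation='softmax', name='server')(y)
    return Model(inputs=domain_inputs + [flow_input],
                 outputs=[client_out, server_out])

Under these assumptions, X_tr would be a list of window_size domain arrays plus one flow-feature array, and the single model.fit call with [client_labels, server_labels] trains both output heads against a shared encoder at once, which is the multi-task learning aspect referred to in the project description.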