ma_cisco_malware/hyperband.py

# -*- coding: utf-8 -*-
# implementation of hyperband:
# https://arxiv.org/pdf/1603.06560.pdf
import logging
import random
from math import ceil, log
from random import random as rng
from time import ctime, time

import joblib
import numpy as np
from keras.callbacks import EarlyStopping

import models
from main import create_model

logger = logging.getLogger('logger')


def sample_params(param_distribution: dict):
    """Draw one random configuration from a grid of candidate values.

    For every key of ``param_distribution`` a single element is picked from
    the associated sequence with ``random.choice``. Keys are visited in the
    mapping's own iteration order.

    :param param_distribution: mapping of parameter name -> non-empty
        sequence of candidate values.
    :return: new dict with the same keys, each mapped to one chosen value.
    """
    return {name: random.choice(candidates)
            for name, candidates in param_distribution.items()}


class Hyperband:
    """Hyperband hyper-parameter search (https://arxiv.org/pdf/1603.06560.pdf).

    Random configurations are drawn from ``param_distribution`` and trained
    in "brackets" of successive halving: every round the surviving
    configurations are trained for ``eta`` times more epochs and only the
    best ``1 / eta`` of them (by validation loss) are kept.

    Attributes set by ``__init__``:
        max_iter: maximum number of iterations (epochs) per configuration.
        eta: configuration down-sampling rate (fixed to 3).
        s_max: index of the most exploratory bracket, ``floor(log_eta(max_iter))``.
        B: budget per bracket, ``(s_max + 1) * max_iter``.
        results: list of result dicts, one per trained configuration.
        counter / best_loss / best_counter: running statistics, used for logging.
    """

    def __init__(self, param_distribution, X, y, max_iter=81, savefile=None):
        """Store the data and derive the bracket layout from ``max_iter``.

        :param param_distribution: mapping of parameter name -> sequence of
            candidate values; handed to ``sample_params`` on every draw.
        :param X: training inputs, passed unchanged to ``model.fit``.
        :param y: training targets, passed unchanged to ``model.fit``.
        :param max_iter: maximum iterations per configuration (>= 1).
        :param savefile: optional path; when set, ``run`` dumps ``results``
            there with ``joblib`` after every bracket.
        :raises ValueError: if ``max_iter`` is smaller than 1.
        """
        if max_iter < 1:
            raise ValueError(
                "max_iter must be at least 1, got {!r}".format(max_iter))

        self.get_params = lambda: sample_params(param_distribution)

        self.max_iter = max_iter  # maximum iterations per configuration
        self.eta = 3  # defines configuration downsampling rate (default = 3)

        # Kept for callers that use it; not used for s_max any more (see below).
        self.logeta = lambda x: log(x) / log(self.eta)

        # Largest s with eta ** s <= max_iter. Computed with exact integer
        # powers: int(log(x) / log(eta)) truncates floating point error and
        # is off by one for exact powers (log(243) / log(3) evaluates to
        # 4.999999999999999, which would give 4 instead of 5).
        s_max = 0
        while self.eta ** (s_max + 1) <= self.max_iter:
            s_max += 1
        self.s_max = s_max
        self.B = (self.s_max + 1) * self.max_iter

        self.results = []  # list of dicts
        self.counter = 0
        self.best_loss = np.inf
        self.best_counter = -1

        self.savefile = savefile

        self.X = X
        self.y = y

    def try_params(self, n_iterations, params):
        """Train one configuration and report its validation loss.

        :param n_iterations: number of epochs to train for; rounded to the
            nearest integer.
        :param params: configuration dict. This method reads the keys
            ``"model_output"``, ``"type"`` and ``"batch_size"``; the whole
            dict is also given to ``models.get_models_by_params``.
        :return: dict with ``"loss"`` (lowest ``val_loss`` over all epochs)
            and ``"early_stop"`` (True when fewer epochs than requested were
            recorded, i.e. the EarlyStopping callback ended the training).
        """
        n_iterations = int(round(n_iterations))
        # The first element of the returned triple is not needed here.
        _, model, new_model = models.get_models_by_params(params)

        # NOTE(review): both variants are built although only one is trained;
        # kept as-is because create_model is defined outside this file's view.
        model = create_model(model, params["model_output"])
        new_model = create_model(new_model, params["model_output"])

        if params["type"] in ("inter", "staggered"):
            model = new_model

        callbacks = [EarlyStopping(monitor='val_loss',
                                   patience=5,
                                   verbose=False)]

        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        history = model.fit(self.X,
                            self.y,
                            batch_size=params["batch_size"],
                            epochs=n_iterations,
                            callbacks=callbacks,
                            shuffle=True,
                            validation_split=0.4)

        return {"loss": np.min(history.history['val_loss']),
                "early_stop": len(history.history["loss"]) < n_iterations}

    def run(self, skip_last=0, dry_run=False):
        """Run every bracket once and return the accumulated results.

        Can be called multiple times; each call appends to ``self.results``
        and keeps counting from the previous ``self.counter``.

        :param skip_last: number of final successive-halving rounds to skip
            in every bracket.
        :param dry_run: when true nothing is trained; random numbers are
            used as losses instead.
        :return: ``self.results``; every entry is the dict returned by
            ``try_params`` extended with ``counter``, ``seconds``,
            ``params`` and ``iterations``.
        """
        for s in reversed(range(self.s_max + 1)):

            # initial number of configurations
            # (multiply before dividing so that exact integer ratios stay
            # exact and ceil() is not pushed up by rounding noise)
            n = int(ceil(self.B / self.max_iter * self.eta ** s / (s + 1)))

            # n random configurations
            random_configs = [self.get_params() for _ in range(n)]

            for i in range((s + 1) - int(skip_last)):  # changed from s + 1

                if not random_configs:
                    # every configuration was dropped (early stops);
                    # nothing is left to train in this bracket
                    break

                # Run each of the remaining configs for <iterations>
                # and keep the best (n / eta ** (i + 1)) configurations.
                # max_iter / eta ** (s - i) is the same value as
                # (max_iter * eta ** -s) * eta ** i without negative powers.
                n_iterations = self.max_iter / self.eta ** (s - i)

                # log the number of configurations that are really trained
                logger.info("\n*** {} configurations x {:.1f} iterations each".format(
                        len(random_configs), n_iterations))

                val_losses = []
                early_stops = []

                for t in random_configs:

                    self.counter += 1
                    logger.info("\n{} | {} | lowest loss so far: {:.4f} (run {})\n".format(
                            self.counter, ctime(), self.best_loss, self.best_counter))

                    start_time = time()

                    if dry_run:
                        result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
                    else:
                        result = self.try_params(n_iterations, t)  # <---

                    assert isinstance(result, dict)
                    assert 'loss' in result

                    seconds = int(round(time() - start_time))
                    logger.info("\n{} seconds.".format(seconds))

                    loss = result['loss']
                    val_losses.append(loss)
                    early_stops.append(result.get('early_stop', False))

                    # keeping track of the best result so far (for display only)
                    # could do it by checking results each time, but hey
                    if loss < self.best_loss:
                        self.best_loss = loss
                        self.best_counter = self.counter

                    result['counter'] = self.counter
                    result['seconds'] = seconds
                    result['params'] = t
                    result['iterations'] = n_iterations

                    self.results.append(result)

                # select a number of best configurations for the next loop
                # filter out early stops, if any
                n_keep = int(n // self.eta ** (i + 1))
                ranking = np.argsort(val_losses)
                survivors = [random_configs[j] for j in ranking
                             if not early_stops[j]]
                random_configs = survivors[:n_keep]

            if self.savefile:
                joblib.dump(self.results, self.savefile)

        return self.results
refactor main functions - separate things into different functions 2017-07-07 08:43:16 +02:00			`# -- coding: utf-8 --`
			`# implementation of hyperband:`
			`# https://arxiv.org/pdf/1603.06560.pdf`
refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00			`import logging`
added params 2017-07-07 16:48:10 +02:00			`import random`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`from math import ceil, log`
added params 2017-07-07 16:48:10 +02:00			`from random import random as rng`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`from time import ctime, time`
added params 2017-07-07 16:48:10 +02:00
add hyperband savefile config, minor change of parameter name 2017-10-03 18:58:54 +02:00			`import joblib`
refactor main functions - separate things into different functions 2017-07-07 08:43:16 +02:00			`import numpy as np`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`from keras.callbacks import EarlyStopping`
refactor main functions - separate things into different functions 2017-07-07 08:43:16 +02:00
added params 2017-07-07 16:48:10 +02:00			`import models`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`from main import create_model`
added params 2017-07-07 16:48:10 +02:00
refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00			`logger = logging.getLogger('logger')`

added params 2017-07-07 16:48:10 +02:00
			`def sample_params(param_distribution: dict):`
			`p = {}`
			`for key, val in param_distribution.items():`
			`p[key] = random.choice(val)`
			`return p`


			`class Hyperband:`
add hyperband savefile config, minor change of parameter name 2017-10-03 18:58:54 +02:00			`def __init__(self, param_distribution, X, y, max_iter=81, savefile=None):`
added params 2017-07-07 16:48:10 +02:00			`self.get_params = lambda: sample_params(param_distribution)`

refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`self.max_iter = max_iter # maximum iterations per configuration`
added params 2017-07-07 16:48:10 +02:00			`self.eta = 3 # defines configuration downsampling rate (default = 3)`

			`self.logeta = lambda x: log(x) / log(self.eta)`
			`self.s_max = int(self.logeta(self.max_iter))`
			`self.B = (self.s_max + 1) * self.max_iter`

			`self.results = [] # list of dicts`
			`self.counter = 0`
			`self.best_loss = np.inf`
			`self.best_counter = -1`

add hyperband savefile config, minor change of parameter name 2017-10-03 18:58:54 +02:00			`self.savefile = savefile`

added params 2017-07-07 16:48:10 +02:00			`self.X = X`
			`self.y = y`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`def try_params(self, n_iterations, params):`
			`n_iterations = int(round(n_iterations))`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`embedding, model, new_model = models.get_models_by_params(params)`

fix hyperband wrong variable names 2017-09-29 23:34:39 +02:00			`model = create_model(model, params["model_output"])`
			`new_model = create_model(new_model, params["model_output"])`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
			`if params["type"] in ("inter", "staggered"):`
			`model = new_model`

			`callbacks = [EarlyStopping(monitor='val_loss',`
			`patience=5,`
			`verbose=False)]`

added params 2017-07-07 16:48:10 +02:00			`model.compile(optimizer='adam',`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`loss='binary_crossentropy',`
added params 2017-07-07 16:48:10 +02:00			`metrics=['accuracy'])`

			`history = model.fit(self.X,`
			`self.y,`
			`batch_size=params["batch_size"],`
			`epochs=n_iterations,`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`callbacks=callbacks,`
added params 2017-07-07 16:48:10 +02:00			`shuffle=True,`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`validation_split=0.4)`
added params 2017-07-07 16:48:10 +02:00
change hyperband to count minimal val_loss over all losses 2017-10-05 12:55:46 +02:00			`return {"loss": np.min(history.history['val_loss']),`
fix hyperband wrong variable names 2017-09-29 23:34:39 +02:00			`"early_stop": len(history.history["loss"]) < n_iterations}`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# can be called multiple times`
			`def run(self, skip_last=0, dry_run=False):`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`for s in reversed(range(self.s_max + 1)):`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# initial number of configurations`
			`n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# initial number of iterations per config`
			`r = self.max_iter * self.eta ** (-s)`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# n random configurations`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`random_configs = [self.get_params() for _ in range(n)]`

added params 2017-07-07 16:48:10 +02:00			`for i in range((s + 1) - int(skip_last)): # changed from s + 1`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# Run each of the n configs for <iterations>`
			`# and keep best (n_configs / eta) configurations`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`n_configs = n * self.eta ** (-i)`
			`n_iterations = r * self.eta ** (i)`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00			`logger.info("\n*** {} configurations x {:.1f} iterations each".format(`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`n_configs, n_iterations))`

added params 2017-07-07 16:48:10 +02:00			`val_losses = []`
			`early_stops = []`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
			`for t in random_configs:`

added params 2017-07-07 16:48:10 +02:00			`self.counter += 1`
refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00			`logger.info("\n{} \| {} \| lowest loss so far: {:.4f} (run {})\n".format(`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`self.counter, ctime(), self.best_loss, self.best_counter))`

added params 2017-07-07 16:48:10 +02:00			`start_time = time()`

			`if dry_run:`
			`result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}`
			`else:`
			`result = self.try_params(n_iterations, t) # <---`

			`assert (type(result) == dict)`
			`assert ('loss' in result)`

			`seconds = int(round(time() - start_time))`
refactor argparser into separate file, add logger 2017-07-12 10:25:55 +02:00			`logger.info("\n{} seconds.".format(seconds))`
added params 2017-07-07 16:48:10 +02:00
			`loss = result['loss']`
			`val_losses.append(loss)`

			`early_stop = result.get('early_stop', False)`
			`early_stops.append(early_stop)`

			`# keeping track of the best result so far (for display only)`
			`# could do it be checking results each time, but hey`
			`if loss < self.best_loss:`
			`self.best_loss = loss`
			`self.best_counter = self.counter`

			`result['counter'] = self.counter`
			`result['seconds'] = seconds`
			`result['params'] = t`
			`result['iterations'] = n_iterations`

			`self.results.append(result)`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00
added params 2017-07-07 16:48:10 +02:00			`# select a number of best configurations for the next loop`
			`# filter out early stops, if any`
			`indices = np.argsort(val_losses)`
refactor hyperband implementation 2017-09-29 22:59:57 +02:00			`random_configs = [random_configs[i] for i in indices if not early_stops[i]]`
			`random_configs = random_configs[0:int(n_configs / self.eta)]`

add hyperband savefile config, minor change of parameter name 2017-10-03 18:58:54 +02:00			`if self.savefile:`
			`joblib.dump(self.results, self.savefile)`

added params 2017-07-07 16:48:10 +02:00			`return self.results`