# toolkit.py

#!/usr/bin/env python

__all__ = ["ClassificationProject", "ClassificationProjectDataFrame", "ClassificationProjectRNN"]

from sys import version_info

# Python 2/3 compatibility: make the Python 2 names ``raw_input`` and ``izip``
# available under both major versions.
if version_info[0] > 2:
    # Python 3 renamed ``raw_input`` to ``input`` and its builtin ``zip`` is
    # already lazy, so alias the old names to the new builtins.
    raw_input = input
    izip = zip
else:
    from itertools import izip

import os
import json
import pickle
import importlib
import csv
import math
import glob
import shutil
import gc

import logging
logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())

from root_numpy import tree2array, rec2array, array2root
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, concatenate
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
import keras.optimizers
import matplotlib.pyplot as plt

from .utils import WeightedRobustScaler, weighted_quantile

# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
# import tensorflow as tf
# from keras import backend as K
# num_cores = 1
# config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
#                         inter_op_parallelism_threads=num_cores,
#                         allow_soft_placement=True,
#                         device_count = {'CPU': num_cores})
# session = tf.Session(config=config)
# K.set_session(session)

import ROOT


def byteify(input):
    """Recursively encode all unicode strings in ``input`` as UTF-8 (Python 2).

    Adapted from stackoverflow https://stackoverflow.com/a/13105359

    Dictionaries (keys and values) and lists are traversed recursively and
    every ``unicode`` object found is replaced by its UTF-8 encoded byte
    string. Any other object is returned as it is.

    On Python 3 no conversion is done and ``input`` itself is returned.

    :param input: object to convert, e.g. the result of ``json.load``. The
                  name shadows the builtin, but is kept so that existing
                  keyword calls continue to work.

    :returns: on Python 2 a converted copy of the dict/list structure (or the
              encoded string), on Python 3 the unmodified ``input`` object
    """
    if version_info[0] > 2:
        # Python 3 strings are unicode already - nothing to convert. This
        # guard also keeps the Python 2 only names below (``iteritems``,
        # ``unicode``) from ever being evaluated on Python 3.
        return input
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    if isinstance(input, list):
        return [byteify(element) for element in input]
    if isinstance(input, unicode):
        return input.encode('utf-8')
    return input


class ClassificationProject(object):

    """Simple framework to load data from ROOT TTrees and train Keras
    neural networks for classification according to some global settings.

    See the `Keras documentation <https://keras.io>`_ for further information

    All needed data that is created is stored in a project dir and can
    be used again later without the need to be recreated.

    :param name: Name of the project - this will also be the name of
                 the project directory in the output dir. If no further arguments
                 are given, this argument is interpreted as a directory name, from
                 which a previously created project should be initialised

    :param signal_trees: list of tuples (filename, treename) for the data that should be used as signal

    :param bkg_trees: list of tuples (filename, treename) for the data that should be used as background

    :param branches: list of branch names or expressions to be used as input values for training

    :param rename_branches: dictionary that maps branch expressions to names for better readability

    :param weight_expr: expression to weight the events in the loss function

    :param data_dir: if given, load the data from a previous project with the given name
                     instead of creating it from trees. If the data is on the same
                     disk (and the filesystem supports it), hard links will be used,
                     otherwise symlinks.

    :param identifiers: list of branches or expressions that uniquely
                        identify events. This is used to store the list of training
                        events, such that they can be marked later on, for example when
                        creating friend trees with output score

    :param selection: selection expression that events have to fulfill to be considered for training

    :param layers: number of layers in the neural network

    :param nodes: list of the number of nodes in each layer. If only a single number is given, use this number for every layer

    :param dropout: dropout fraction after each hidden layer. You can also pass a list for dropout fractions for each layer. Set to None for no Dropout.

    :param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.

    :param batch_size: size of the training batches

    :param validation_split: split off this fraction of training events for loss evaluation

    :param activation_function: activation function in the hidden layers

    :param activation_function_output: activation function in the output layer

    :param out_dir: base directory in which the project directories should be stored

    :param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")

    :param step_signal: step size when selecting signal training events (e.g. 2 means take every second event)

    :param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)

    :param stop_train: stop after this number of events for reading in training events

    :param stop_test: stop after this number of events for reading in test events

    :param optimizer: name of optimizer class in keras.optimizers

    :param optimizer_opts: dictionary of options for the optimizer

    :param use_earlystopping: set true to use the keras EarlyStopping callback

    :param earlystopping_opts: options for the keras EarlyStopping callback

    :param use_modelcheckpoint: save the model weights after each epoch, skipping epochs in which the validation loss did not improve (unless the options are set otherwise).

    :param modelcheckpoint_opts: options for the Keras ModelCheckpoint
                                 callback. After training, the newest saved weight will be used. If
                                 you change the format of the saved model weights it has to be of
                                 the form "weights*.h5"

    :param use_tensorboard: if True, use the tensorboard callback to write logs for tensorboard

    :param tensorboard_opts: options for the TensorBoard callback

    :param balance_dataset: if True, balance the dataset instead of
                            applying class weights. Only a fraction of the overrepresented
                            class will be used in each epoch, but different subsets of the
                            overrepresented class will be used in each epoch.

    :param random_seed: use this seed value when initialising the model to produce consistent results. Note:
                        random data is also used for shuffling the training data, so results may still vary. To
                        produce consistent results, set the numpy random seed before training.

    :param loss: loss function name

    """


    # Datasets that are stored to (and dynamically loaded from) hdf5
    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test"]

    # Datasets that are retrieved from ROOT trees the first time
    dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]

    def __init__(self, name, *args, **kwargs):
        if len(args) < 1 and len(kwargs) < 1:
            # if no further arguments given, interpret as directory name
            self._init_from_dir(name)
        else:
            # otherwise initialise new project
            self._init_from_args(name, *args, **kwargs)
            with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
                pickle.dump(dict(args=args, kwargs=kwargs), of)


    def _init_from_dir(self, dirname):
        if not os.path.exists(os.path.join(dirname, "options.pickle")):
            # for backward compatibility
            with open(os.path.join(dirname, "options.json")) as f:
                options = byteify(json.load(f))
        else:
            with open(os.path.join(dirname, "options.pickle"), "rb") as f:
                options = pickle.load(f)
        options["kwargs"]["project_dir"] = dirname
        self._init_from_args(os.path.basename(dirname), *options["args"], **options["kwargs"])