toolkit.py

#!/usr/bin/env python

import os
import json

import logging
logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())

from root_numpy import tree2array, rec2array
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json

# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf
from keras import backend as K
num_cores = 1
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                        inter_op_parallelism_threads=num_cores,
                        allow_soft_placement=True,
                        device_count = {'CPU': num_cores})
session = tf.Session(config=config)
K.set_session(session)

import ROOT

class KerasROOTClassification:


    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]


    def __init__(self, name,
                 signal_trees, bkg_trees, branches, weight_expr, identifiers,
                 layers=3, nodes=64, batch_size=128, activation_function='relu', out_dir="./outputs"):
        self.name = name
        self.signal_trees = signal_trees
        self.bkg_trees = bkg_trees
        self.branches = branches
        self.weight_expr = weight_expr
        self.identifiers = identifiers
        self.layers = layers
        self.nodes = nodes
        self.batch_size = batch_size
        self.activation_function = activation_function
        self.out_dir = out_dir

        self.project_dir = os.path.join(self.out_dir, name)

        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)

        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

        self.s_train = None
        self.b_train = None
        self.s_test = None
        self.b_test = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

        self.s_eventlist_train = None
        self.b_eventlist_train = None

        self._scaler = None
        self._class_weight = None
        self._model = None

        # track the number of epochs this model has been trained
        self.total_epochs = 0

        self.data_loaded = False


    def _load_data(self):

        try:

            self._load_from_hdf5()

        except KeyError:

            logger.info("Couldn't load all datasets - reading from ROOT trees")

            # Read signal and background trees into structured numpy arrays
            signal_chain = ROOT.TChain()
            bkg_chain = ROOT.TChain()
            for filename, treename in self.signal_trees:
                signal_chain.AddFile(filename, -1, treename)
            for filename, treename in self.bkg_trees:
                bkg_chain.AddFile(filename, -1, treename)
            self.s_train = tree2array(signal_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
            self.b_train = tree2array(bkg_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
            self.s_test = tree2array(signal_chain, branches=self.branches+[self.weight_expr], start=1, step=2)
            self.b_test = tree2array(bkg_chain, branches=self.branches+[self.weight_expr], start=1, step=2)

            self._dump_training_list()
            self.s_eventlist_train = self.s_train[self.identifiers]
            self.b_eventlist_train = self.b_train[self.identifiers]

            # now we don't need the identifiers anymore
            self.s_train = self.s_train[self.branches+[self.weight_expr]]
            self.b_train = self.b_train[self.branches+[self.weight_expr]]

            # create x (input), y (target) and w (weights) arrays
            # the first block will be signals, the second block backgrounds
            self.x_train = rec2array(self.s_train[self.branches])
            self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.branches])))
            self.x_test = rec2array(self.s_test[self.branches])
            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.branches])))
            self.w_train = self.s_train[self.weight_expr]
            self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
            self.w_test = self.s_test[self.weight_expr]
            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))

            self.y_train = np.empty(len(self.x_train))
            self.y_train[:len(self.s_train)] = 1
            self.y_train[len(self.s_train):] = 0
            self.y_test = np.empty(len(self.x_test))
            self.y_test[:len(self.s_test)] = 1
            self.y_test[len(self.s_test):] = 0

            logger.info("Writing to hdf5")
            self._dump_to_hdf5()

        self.data_loaded = True


    def _dump_training_list(self):
        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])

        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
        s_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))


    def _dump_to_hdf5(self):
        for dataset_name in self.dataset_names:
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "w") as hf:
                hf.create_dataset(dataset_name, data=getattr(self, dataset_name))


    def _load_from_hdf5(self):
        for dataset_name in self.dataset_names:
            filename = os.path.join(self.project_dir, dataset_name+".h5")
            logger.info("Trying to load {} from {}".format(dataset_name, filename))
            with h5py.File(filename) as hf:
                setattr(self, dataset_name,  hf[dataset_name][:])
        logger.info("Data loaded")


    @property
    def scaler(self):
        # create the scaler (and fit to training data) if not existent
        if self._scaler is None:
            filename = os.path.join(self.project_dir, "scaler.pkl")
            try:
                self._scaler = joblib.load(filename)
                logger.info("Loaded existing StandardScaler from {}".format(filename))
            except IOError:
                logger.info("Creating new StandardScaler")
                self._scaler = StandardScaler()
                logger.info("Fitting StandardScaler to training data")
                self._scaler.fit(self.x_train)
                joblib.dump(self._scaler, filename)
        return self._scaler


    def _read_info(self, key, default):
        filename = os.path.join(self.project_dir, "info.json")
        if not os.path.exists(filename):
            with open(filename, "w") as of:
                json.dump({}, of)
        with open(filename) as f:
            info = json.load(f)
        return info.get(key, default)


    def _write_info(self, key, value):
        filename = os.path.join(self.project_dir, "info.json")
        with open(filename) as f:
            info = json.load(f)
        info[key] = value
        with open(filename, "w") as of:
            json.dump(info, of)


    @property
    def model(self):
        "Simple MLP"

        if self._model is None:

            self._model = Sequential()

            # first hidden layer
            self._model.add(Dense(self.nodes, input_dim=len(self.branches), activation=self.activation_function))
            # the other hidden layers
            for layer_number in range(self.layers-1):
                self._model.add(Dense(self.nodes, activation=self.activation_function))
            # last layer is one neuron (binary classification)
            self._model.add(Dense(1, activation='sigmoid'))

            self._model.compile(optimizer='SGD',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

            # dump to json for documentation
            with open(os.path.join(self.project_dir, "model.json"), "w") as of:
                of.write(self._model.to_json())

        return self._model

    @property
    def class_weight(self):
        if self._class_weight is None:
            sumw_bkg = np.sum(self.w_train[self.y_train == 0])
            sumw_sig = np.sum(self.w_train[self.y_train == 1])
            self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
        return self._class_weight

    def train(self, epochs=10):

        if not self.data_loaded:
            self._load_data()

        try:
            self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
            logger.info("Weights found and loaded")
            logger.info("Continue training")
        except IOError:
            logger.info("No weights found, starting completely new training")

        self.total_epochs = self._read_info("epochs", 0)

        self.model.fit(self.x_train, self.y_train,
                       epochs=epochs,
                       class_weight=self.class_weight,
                       shuffle=True,
                       batch_size=self.batch_size)

        self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))

        self.total_epochs += epochs
        self._write_info("epochs", self.total_epochs)

    def evaluate(self):
        pass

    def writeFriendTree(self):
        pass

    def plotROC(self):
        pass

    def plotScore(self):
        pass


if __name__ == "__main__":

    logging.basicConfig()
    logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)

    filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"

    c = KerasROOTClassification("test",
                                signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
                                bkg_trees = [(filename, "ttbar_NoSys"),
                                             (filename, "wjets_Sherpa221_NoSys")
                                ],
                                branches = ["met", "mt"],
                                weight_expr = "eventWeight*genWeight",
                                identifiers = ["DatasetNumber", "EventNumber"])

    c.train(epochs=1)