Skip to content
Snippets Groups Projects
toolkit.py 4.68 KiB
Newer Older
Nikolai.Hartmann's avatar
Nikolai.Hartmann committed
#!/usr/bin/env python

import os

from root_numpy import tree2array, rec2array
import numpy as np
import pandas as pd
import h5py

import ROOT

class KerasROOTClassification:
    """Prepare signal and background samples from ROOT trees for a classifier.

    The constructor stores the configuration and creates the project
    directory ``<out_dir>/<name>``. ``load_data`` reads the configured trees,
    splits them into training and test halves, builds the input and weight
    arrays and writes them (plus the list of training events) into the
    project directory.

    Parameters
    ----------
    name : project name; also the name of the sub-directory of ``out_dir``.
    signal_trees, bkg_trees : iterables of ``(filename, treename)`` pairs.
    branches : list of branch names used as input variables.
    weight_expr : expression/branch name used as event weight.
    identifiers : list of branch names that identify an event; they are only
        read for the training sample and written to the event list CSV files.
    layers, nodes : stored on the instance; not used by the code in this class
        yet (``train`` is still a stub).
    out_dir : base directory for all projects.
    """

    # Array attributes written by _dump_to_hdf5 and restored by
    # _load_from_hdf5; each one goes to "<project_dir>/<attribute>.h5".
    _hdf5_datasets = ("x_train", "x_test")

    def __init__(self, name,
                 signal_trees, bkg_trees, branches, weight_expr, identifiers,
                 layers=3, nodes=64, out_dir="./outputs"):
        self.name = name
        self.signal_trees = signal_trees
        self.bkg_trees = bkg_trees
        self.branches = branches
        self.weight_expr = weight_expr
        self.identifiers = identifiers
        self.layers = layers
        self.nodes = nodes
        self.out_dir = out_dir

        self.project_dir = os.path.join(self.out_dir, name)

        # makedirs creates out_dir (and any missing parents) together with
        # the project directory, so a nested out_dir works as well.
        if not os.path.isdir(self.project_dir):
            os.makedirs(self.project_dir)

        # All data attributes are filled by load_data().
        self.s_train = None
        self.b_train = None
        self.s_test = None
        self.b_test = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.w_train = None
        self.w_test = None

        self.s_eventlist_train = None
        self.b_eventlist_train = None

    def load_data(self):
        """Read the trees and build the training/test input and weight arrays.

        Entries are split alternately: every second entry starting at 0 goes
        to the training sample, every second entry starting at 1 to the test
        sample. The training event lists are written to CSV and ``x_train`` /
        ``x_test`` are written to HDF5 files in the project directory.
        """

        # Read signal and background trees into structured numpy arrays
        signal_chain = ROOT.TChain()
        bkg_chain = ROOT.TChain()
        for filename, treename in self.signal_trees:
            signal_chain.AddFile(filename, -1, treename)
        for filename, treename in self.bkg_trees:
            bkg_chain.AddFile(filename, -1, treename)

        value_fields = self.branches + [self.weight_expr]
        # identifiers are only needed for the training sample
        train_fields = value_fields + self.identifiers

        self.s_train = tree2array(signal_chain, branches=train_fields, start=0, step=2)
        self.b_train = tree2array(bkg_chain, branches=train_fields, start=0, step=2)
        self.s_test = tree2array(signal_chain, branches=value_fields, start=1, step=2)
        self.b_test = tree2array(bkg_chain, branches=value_fields, start=1, step=2)

        # must run while s_train/b_train still carry the identifier fields
        self._dump_training_list()
        self.s_eventlist_train = self.s_train[self.identifiers]
        self.b_eventlist_train = self.b_train[self.identifiers]

        # now we don't need the identifiers anymore
        self.s_train = self.s_train[value_fields]
        self.b_train = self.b_train[value_fields]

        # create x (input) and w (weights) arrays
        # the first block will be signals, the second block backgrounds
        # NOTE: the target arrays y_train/y_test are not filled here yet.
        self.x_train = np.concatenate((rec2array(self.s_train[self.branches]),
                                       rec2array(self.b_train[self.branches])))
        self.x_test = np.concatenate((rec2array(self.s_test[self.branches]),
                                      rec2array(self.b_test[self.branches])))
        self.w_train = np.concatenate((self.s_train[self.weight_expr],
                                       self.b_train[self.weight_expr]))
        self.w_test = np.concatenate((self.s_test[self.weight_expr],
                                      self.b_test[self.weight_expr]))

        self._dump_to_hdf5()


    def _dump_training_list(self):
        """Write the identifiers of the training events to CSV files.

        Creates ``s_eventlist_train.csv`` (signal) and
        ``b_eventlist_train.csv`` (background) in the project directory.
        Requires ``s_train`` and ``b_train`` to contain the identifier fields.
        """
        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])

        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
        # the background file must contain the background events
        b_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))


    def _dump_to_hdf5(self):
        """Write each array in ``_hdf5_datasets`` to ``<project_dir>/<name>.h5``."""
        for dataset_name in self._hdf5_datasets:
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "w") as hf:
                hf.create_dataset(dataset_name, data=getattr(self, dataset_name))


    def _load_from_hdf5(self):
        """Restore the arrays written by ``_dump_to_hdf5`` from the project directory."""
        for dataset_name in self._hdf5_datasets:
            # open read-only so a missing file raises instead of being created
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "r") as hf:
                setattr(self, dataset_name, hf[dataset_name][:])


    def train(self):
        """Not implemented yet; currently does nothing."""
        pass

    def evaluate(self):
        """Not implemented yet; currently does nothing."""
        pass

    def writeFriendTree(self):
        """Not implemented yet; currently does nothing."""
        pass

    def plotROC(self):
        """Not implemented yet; currently does nothing."""
        pass

    def plotScore(self):
        """Not implemented yet; currently does nothing."""
        pass



if __name__ == "__main__":

    # Example run: one signal tree and two background trees from the same
    # ROOT file, using "met" and "mt" as input variables.
    input_file = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"

    signal_samples = [(input_file, "GG_oneStep_1705_1105_505_NoSys")]
    background_samples = [
        (input_file, "ttbar_NoSys"),
        (input_file, "wjets_Sherpa221_NoSys"),
    ]

    classifier = KerasROOTClassification(
        "test",
        signal_trees=signal_samples,
        bkg_trees=background_samples,
        branches=["met", "mt"],
        weight_expr="eventWeight*genWeight",
        identifiers=["DatasetNumber", "EventNumber"],
    )

    # Read the trees, then show the training inputs and how many there are.
    classifier.load_data()
    print(classifier.x_train)
    print(len(classifier.x_train))