import os
import logging

logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())

from root_numpy import tree2array, rec2array

import numpy as np
import pandas as pd
import h5py

import ROOT
class KerasROOTClassification:
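    """Classify signal versus background ROOT trees with a Keras neural network.

    The converted datasets listed in ``dataset_names`` are cached as HDF5
    files in the project directory, so subsequent runs can skip the
    conversion from ROOT.
    """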
    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]

    def __init__(self, name,
                 signal_trees, bkg_trees, branches, weight_expr, identifiers,
                 layers=3, nodes=64, out_dir="./outputs"):
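        """
        :param name: name of the project - a subdirectory of ``out_dir`` with this name is created
        :param signal_trees: list of (filename, treename) pairs for the signal samples
        :param bkg_trees: list of (filename, treename) pairs for the background samples
        :param branches: list of branch expressions used as input features
        :param weight_expr: branch expression used as event weight
        :param identifiers: list of branches that uniquely identify an event
                            (only stored for the training events)
        :param layers: number of hidden layers (currently unused)
        :param nodes: number of nodes per hidden layer (currently unused)
        :param out_dir: directory in which the project directory is created
        """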
        self.name = name
        self.signal_trees = signal_trees
        self.bkg_trees = bkg_trees
        self.branches = branches
        self.weight_expr = weight_expr
        self.identifiers = identifiers
        self.layers = layers
        self.nodes = nodes
        self.out_dir = out_dir

        self.project_dir = os.path.join(self.out_dir, name)

        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)
        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

        self.s_train = None
        self.b_train = None
        self.s_test = None
        self.b_test = None

        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

        self.s_eventlist_train = None
        self.b_eventlist_train = None

    def load_data(self):
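        """Load the training and test datasets.

        First try to load the cached HDF5 files from the project directory.
        If that fails, read the signal and background trees with root_numpy,
        using the even entries for training and the odd entries for testing,
        and write the resulting arrays to HDF5.
        """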
        try:
            self._load_from_hdf5()
        except (KeyError, IOError):
            logger.info("Couldn't load all datasets - reading from ROOT trees")

            # Read signal and background trees into structured numpy arrays
            signal_chain = ROOT.TChain()
            bkg_chain = ROOT.TChain()
            for filename, treename in self.signal_trees:
                signal_chain.AddFile(filename, -1, treename)
            for filename, treename in self.bkg_trees:
                bkg_chain.AddFile(filename, -1, treename)
            self.s_train = tree2array(signal_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
            self.b_train = tree2array(bkg_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
            self.s_test = tree2array(signal_chain, branches=self.branches+[self.weight_expr], start=1, step=2)
            self.b_test = tree2array(bkg_chain, branches=self.branches+[self.weight_expr], start=1, step=2)

            self._dump_training_list()
            self.s_eventlist_train = self.s_train[self.identifiers]
            self.b_eventlist_train = self.b_train[self.identifiers]

            # now we don't need the identifiers anymore
            self.s_train = self.s_train[self.branches+[self.weight_expr]]
            self.b_train = self.b_train[self.branches+[self.weight_expr]]

            # create x (input), y (target) and w (weights) arrays
            # the first block will be signals, the second block backgrounds
            self.x_train = rec2array(self.s_train[self.branches])
            self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.branches])))
            self.x_test = rec2array(self.s_test[self.branches])
            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.branches])))
            self.w_train = self.s_train[self.weight_expr]
            self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
            self.w_test = self.s_test[self.weight_expr]
            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))

            self.y_train = np.empty(len(self.x_train))
            self.y_train[:len(self.s_train)] = 1
            self.y_train[len(self.s_train):] = 0
            self.y_test = np.empty(len(self.x_test))
            self.y_test[:len(self.s_test)] = 1
            self.y_test[len(self.s_test):] = 0

            logger.info("Writing to hdf5")
            self._dump_to_hdf5()

    def _dump_training_list(self):
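        """Write the identifier columns of the training events to CSV files."""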
        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])

        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
        b_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))

    def _dump_to_hdf5(self):
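        """Write each dataset in ``dataset_names`` to its own HDF5 file in the project directory."""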
        for dataset_name in self.dataset_names:
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "w") as hf:
                hf.create_dataset(dataset_name, data=getattr(self, dataset_name))

    def _load_from_hdf5(self):
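        """Load all datasets in ``dataset_names`` from HDF5 files in the project directory.

        Raises KeyError or IOError if a dataset or file is missing, which is
        caught in ``load_data`` to fall back to reading the ROOT trees.
        """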
        for dataset_name in self.dataset_names:
            filename = os.path.join(self.project_dir, dataset_name+".h5")
            logger.info("Trying to load {} from {}".format(dataset_name, filename))
            with h5py.File(filename, "r") as hf:
                setattr(self, dataset_name, hf[dataset_name][:])
        logger.info("Data loaded")

    def train(self):
        pass

    def evaluate(self):
        pass

    def writeFriendTree(self):
        pass

    def plotROC(self):
        pass

    def plotScore(self):
        pass

if __name__ == "__main__":

    logging.basicConfig()
    logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)

    filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"

    c = KerasROOTClassification("test",
                                signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
                                bkg_trees = [(filename, "ttbar_NoSys"),
                                             (filename, "wjets_Sherpa221_NoSys")],
                                branches = ["met", "mt"],
                                weight_expr = "eventWeight*genWeight",
                                identifiers = ["DatasetNumber", "EventNumber"])

    c.load_data()

    print(c.x_train)
    print(len(c.x_train))