diff --git a/.gitignore b/.gitignore
index 38538a2b0da7279887cac5064e2603591d966a36..a3c950c737b9c1d698a9bbe55f5cca35ec3b9d61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ run.py
 *.swp
 *.pyc
 *.pdf
+*.root
diff --git a/addFriend.py b/addFriend.py
new file mode 100755
index 0000000000000000000000000000000000000000..ee13395b99e045f19ae59914195ff3482fb03cd6
--- /dev/null
+++ b/addFriend.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+"""Add a tree from one ROOT file as a friend to a tree in another file.
+
+The friend tree is copied into the output file and the tree it is attached
+to is written back with the new friend registered.
+"""
+
+import argparse
+
+import ROOT
+
+
+def open_root_file(filename, mode="READ"):
+    """Open a ROOT file, raising IOError if that fails."""
+    rootfile = ROOT.TFile.Open(filename, mode)
+    # a failed open yields a null pointer or a zombie file object
+    if not rootfile or rootfile.IsZombie():
+        raise IOError("Could not open file {} (mode {})".format(filename, mode))
+    return rootfile
+
+
+def add_friend(infile_name, intree_name, outfile_name, outtree_name):
+    """Copy intree into the output file and add it as friend of outtree.
+
+    Raises ValueError if a key or friend named like the friend tree already
+    exists in the output, KeyError if one of the trees is not found and
+    IOError if a file cannot be opened.
+    """
+    outfile = open_root_file(outfile_name, "UPDATE")
+    try:
+        if any(k.GetName() == intree_name for k in outfile.GetListOfKeys()):
+            raise ValueError("Tree with name {} already exists in outputfile".format(intree_name))
+
+        outtree = outfile.Get(outtree_name)
+        if not outtree:
+            raise KeyError("Tree {} not found in file {}".format(outtree_name, outfile_name))
+
+        friends = outtree.GetListOfFriends()
+        if friends and any(k.GetName() == intree_name for k in friends):
+            raise ValueError("Tree with name {} is already friend of {}".format(intree_name, outtree_name))
+
+        infile = open_root_file(infile_name)
+        try:
+            intree = infile.Get(intree_name)
+            if not intree:
+                raise KeyError("Tree {} not found in file {}".format(intree_name, infile_name))
+
+            # Add friend and write friend tree and original tree to outfile
+            # NOTE(review): the friend is registered from the input-file tree -
+            # confirm that the stored friend resolves to the copy in outfile
+            outfile.cd()
+            outtree.AddFriend(intree)
+            outtree.Write(outtree.GetName())
+            # make the output file the current directory before cloning
+            outfile.cd()
+            clonetree = intree.CloneTree(-1, "fast")
+            clonetree.Write(intree.GetName())
+        finally:
+            infile.Close()
+    finally:
+        outfile.Close()
+
+
+def main():
+    """Parse the command line and add the friend tree."""
+    parser = argparse.ArgumentParser(description='add a friend tree to a tree in another file')
+    parser.add_argument("infile", help="input file that contains the friend tree")
+    parser.add_argument("intree", help="name of the friend tree")
+    parser.add_argument("outfile", help="output file where the friend tree should be added")
+    parser.add_argument("outtree", help="name of the tree (in output file) to which the friend should be added")
+    args = parser.parse_args()
+    add_friend(args.infile, args.intree, args.outfile, args.outtree)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/toolkit.py
b/toolkit.py
index 25231056fb769f6253fe6c825eac65e85451aaf9..8424fd5480ef4db6c826dfb138715931cef4503e 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -9,7 +9,7 @@ import logging
 logger = logging.getLogger("KerasROOTClassification")
 logger.addHandler(logging.NullHandler())
 
-from root_numpy import tree2array, rec2array
+from root_numpy import tree2array, rec2array, array2root
 import numpy as np
 import pandas as pd
 import h5py
@@ -117,8 +117,8 @@ class KerasROOTClassification(object):
         self._scores_train = None
         self._scores_test = None
 
-        self.s_eventlist_train = None
-        self.b_eventlist_train = None
+        self._s_eventlist_train = None
+        self._b_eventlist_train = None
 
         self._scaler = None
         self._class_weight = None
@@ -170,9 +170,9 @@ class KerasROOTClassification(object):
                                       selection=self.selection,
                                       start=1, step=self.step_bkg)
 
-            self._dump_training_list()
             self.s_eventlist_train = self.s_train[self.identifiers]
             self.b_eventlist_train = self.b_train[self.identifiers]
+            self._dump_training_list()
 
             # now we don't need the identifiers anymore
             self.s_train = self.s_train[self.branches+[self.weight_expr]]
@@ -202,11 +202,47 @@ class KerasROOTClassification(object):
 
 
     def _dump_training_list(self):
-        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
-        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])
+        s_eventlist_df = pd.DataFrame(self.s_eventlist_train)
+        b_eventlist_df = pd.DataFrame(self.b_eventlist_train)
+
+        s_eventlist_df.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
+        b_eventlist_df.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
 
-        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
-        s_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
+
+    @property
+    def s_eventlist_train(self):
+        """Identifiers of the signal training events.
+
+        Read back from s_eventlist_train.csv in the project directory as
+        long as no value has been set (the cached value is None).
+        """
+        if self._s_eventlist_train is None:
+            df = pd.read_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
+            self._s_eventlist_train = df.to_records()[self.identifiers]
+        return self._s_eventlist_train
+
+
+    @s_eventlist_train.setter
+    def s_eventlist_train(self, value):
+        self._s_eventlist_train = value
+
+
+    @property
+    def b_eventlist_train(self):
+        """Identifiers of the background training events.
+
+        Read back from b_eventlist_train.csv in the project directory as
+        long as no value has been set (the cached value is None).
+        """
+        if self._b_eventlist_train is None:
+            df = pd.read_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
+            self._b_eventlist_train = df.to_records()[self.identifiers]
+        return self._b_eventlist_train
+
+
+    @b_eventlist_train.setter
+    def b_eventlist_train(self, value):
+        self._b_eventlist_train = value
 
 
     def _dump_to_hdf5(self, *dataset_names):
@@ -435,10 +471,78 @@ class KerasROOTClassification(object):
 
 
 
-    def evaluate(self):
-        pass
-
-    def write_friend_tree(self):
+    def evaluate(self, x_eval):
+        """Return the model prediction for the unscaled inputs x_eval."""
+        # lazy %-style arguments: the arrays are only rendered if DEBUG is on
+        logger.debug("Evaluate score for %s", x_eval)
+        x_eval = self.scaler.transform(x_eval)
+        logger.debug("Evaluate for transformed array: %s", x_eval)
+        return self.model.predict(x_eval)
+
+
+    def write_friend_tree(self, score_name,
+                          source_filename, source_treename,
+                          target_filename, target_treename,
+                          batch_size=100000):
+        """Write the classifier score of a tree to a new friend tree file.
+
+        The source tree is processed in batches of batch_size entries. The
+        output tree gets the branch score_name, filled from evaluate(), and
+        a boolean branch is_train that marks entries whose identifiers are
+        part of the signal or background training event list.
+
+        Raises IOError if target_filename already exists or the source file
+        cannot be opened, KeyError if the source tree is not found.
+        """
+        # refuse to overwrite before the source file is opened
+        if os.path.exists(target_filename):
+            raise IOError("{} already exists, if you want to recreate it, delete it first".format(target_filename))
+        f = ROOT.TFile.Open(source_filename)
+        if not f or f.IsZombie():
+            raise IOError("Could not open file {}".format(source_filename))
+        try:
+            tree = f.Get(source_treename)
+            if not tree:
+                raise KeyError("Tree {} not found in file {}".format(source_treename, source_filename))
+            entries = tree.GetEntries()
+
+            # The training event list does not depend on the batch, so build
+            # it once. Duplicates are dropped to make sure the left merge
+            # below returns exactly one row per evaluated entry.
+            train_df = pd.DataFrame(np.concatenate((self.s_eventlist_train,
+                                                    self.b_eventlist_train)))
+            train_df = train_df.drop_duplicates()
+
+            for start in range(0, entries, batch_size):
+                logger.info("Evaluating score for entry {}/{}".format(start, entries))
+                logger.debug("Loading next batch")
+                x_from_tree = tree2array(tree,
+                                         branches=self.branches+self.identifiers,
+                                         start=start, stop=start+batch_size)
+                x_eval = rec2array(x_from_tree[self.branches])
+
+                # flag the events that were used for training
+                df_identifiers = pd.DataFrame(x_from_tree[self.identifiers])
+                merged = df_identifiers.merge(train_df, on=list(self.identifiers),
+                                              indicator=True, how="left")
+                is_train = np.array(merged["_merge"] == "both")
+
+                # join scores and is_train array
+                scores = self.evaluate(x_eval).reshape(-1)
+                friend_df = pd.DataFrame({score_name: scores.astype(np.float64),
+                                          "is_train": is_train},
+                                         columns=[score_name, "is_train"])
+                # index=False keeps the pandas index out of the output tree
+                friend_tree = friend_df.to_records(index=False)
+                mode = "recreate" if start == 0 else "update"
+                logger.debug("Write to root file")
+                array2root(friend_tree, target_filename, treename=target_treename, mode=mode)
+                logger.debug("Done")
+        finally:
+            # always release the source file, also when a batch fails
+            f.Close()
+
+    def write_all_friend_trees(self):
         pass
 
 
@@ -615,4 +719,13 @@ if __name__ == "__main__":
     c.plot_ROC()
     c.plot_loss()
     c.plot_accuracy()
-    c.plot_weights()
+
+    c.write_friend_tree("test4_score",
+                        source_filename=filename, source_treename="GG_oneStep_1705_1105_505_NoSys",
+                        target_filename="friend.root", target_treename="test4_score")
+
+    np.random.seed(1234)
+
+    c.write_friend_tree("test4_score",
+                        source_filename=filename, source_treename="ttbar_NoSys",
+                        target_filename="friend_ttbar_NoSys.root", target_treename="test4_score")