From 365ba76ddf2970b98f4e0db9b93b091ee6550d55 Mon Sep 17 00:00:00 2001 From: Nikolai Hartmann <Nikolai.Hartmann@physik.uni-muenchen.de> Date: Fri, 27 Apr 2018 15:44:29 +0200 Subject: [PATCH] Selection argument, RobustScaler and workaround for numpy bug --- toolkit.py | 74 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/toolkit.py b/toolkit.py index 05dfb0e..f4451f3 100755 --- a/toolkit.py +++ b/toolkit.py @@ -11,7 +11,7 @@ from root_numpy import tree2array, rec2array import numpy as np import pandas as pd import h5py -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, RobustScaler from sklearn.externals import joblib from sklearn.metrics import roc_curve @@ -44,12 +44,20 @@ class KerasROOTClassification: def __init__(self, name, signal_trees, bkg_trees, branches, weight_expr, identifiers, - layers=3, nodes=64, batch_size=128, validation_split=0.33, activation_function='relu', out_dir="./outputs"): + selection=None, + layers=3, + nodes=64, + batch_size=128, + validation_split=0.33, + activation_function='relu', + out_dir="./outputs", + scaler_type="RobustScaler"): self.name = name self.signal_trees = signal_trees self.bkg_trees = bkg_trees self.branches = branches self.weight_expr = weight_expr + self.selection = selection self.identifiers = identifiers self.layers = layers self.nodes = nodes @@ -57,6 +65,7 @@ class KerasROOTClassification: self.validation_split = validation_split self.activation_function = activation_function self.out_dir = out_dir + self.scaler_type = scaler_type self.project_dir = os.path.join(self.out_dir, name) @@ -112,10 +121,22 @@ class KerasROOTClassification: signal_chain.AddFile(filename, -1, treename) for filename, treename in self.bkg_trees: bkg_chain.AddFile(filename, -1, treename) - self.s_train = tree2array(signal_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2) - self.b_train = tree2array(bkg_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2) - self.s_test = tree2array(signal_chain, branches=self.branches+[self.weight_expr], start=1, step=2) - self.b_test = tree2array(bkg_chain, branches=self.branches+[self.weight_expr], start=1, step=2) + self.s_train = tree2array(signal_chain, + branches=self.branches+[self.weight_expr]+self.identifiers, + selection=self.selection, + start=0, step=2) + self.b_train = tree2array(bkg_chain, + branches=self.branches+[self.weight_expr]+self.identifiers, + selection=self.selection, + start=0, step=2) + self.s_test = tree2array(signal_chain, + branches=self.branches+[self.weight_expr], + selection=self.selection, + start=1, step=2) + self.b_test = tree2array(bkg_chain, + branches=self.branches+[self.weight_expr], + selection=self.selection, + start=1, step=2) self._dump_training_list() self.s_eventlist_train = self.s_train[self.identifiers] @@ -179,11 +200,16 @@ class KerasROOTClassification: filename = os.path.join(self.project_dir, "scaler.pkl") try: self._scaler = joblib.load(filename) - logger.info("Loaded existing StandardScaler from {}".format(filename)) + logger.info("Loaded existing scaler from {}".format(filename)) except IOError: - logger.info("Creating new StandardScaler") - self._scaler = StandardScaler() - logger.info("Fitting StandardScaler to training data") + logger.info("Creating new {}".format(self.scaler_type)) + if self.scaler_type == "StandardScaler": + self._scaler = StandardScaler() + elif self.scaler_type == "RobustScaler": + self._scaler = RobustScaler() + else: + raise ValueError("Scaler type {} unknown".format(self.scaler_type)) + logger.info("Fitting {} to training data".format(self.scaler_type)) self._scaler.fit(self.x_train) # i think this would refit to test data (and overwrite the parameters) # probably we either want to fit only training data or training and test data together @@ -239,7 +265,7 @@ class KerasROOTClassification: self._model.add(Dense(self.nodes, activation=self.activation_function)) # last layer is one neuron (binary classification) self._model.add(Dense(1, activation='sigmoid')) - + logger.info("Compile model") self._model.compile(optimizer='SGD', loss='binary_crossentropy', @@ -321,6 +347,7 @@ class KerasROOTClassification: self._bkg_weights = np.empty(sum(self.y_train == 0)) self._bkg_weights.fill(self.class_weight[0]) self._bkg_weights *= self.w_train[self.y_train == 0] + logger.debug("Background weights: {}".format(self._bkg_weights)) return self._bkg_weights @@ -335,6 +362,7 @@ class KerasROOTClassification: self._sig_weights = np.empty(sum(self.y_train == 1)) self._sig_weights.fill(self.class_weight[1]) self._sig_weights *= self.w_train[self.y_train == 1] + logger.debug("Signal weights: {}".format(self._sig_weights)) return self._sig_weights @@ -344,10 +372,27 @@ class KerasROOTClassification: fig, ax = plt.subplots() bkg = self.x_train[:,var_index][self.y_train == 0] sig = self.x_train[:,var_index][self.y_train == 1] + logger.debug("Plotting bkg (min={}, max={}) from {}".format(np.min(bkg), np.max(bkg), bkg)) logger.debug("Plotting sig (min={}, max={}) from {}".format(np.min(sig), np.max(sig), sig)) - ax.hist(bkg, color="b", alpha=0.5, bins=50, weights=self.bkg_weights) - ax.hist(sig, color="r", alpha=0.5, bins=50, weights=self.sig_weights) + + # calculate percentiles to get a heuristic for the range to be plotted + # (should in principle also be done with weights, but for now do it unweighted) + range_sig = np.percentile(sig, [1, 99]) + range_bkg = np.percentile(sig, [1, 99]) + plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1])) + + logger.debug("Calculated range based on percentiles: {}".format(plot_range)) + + try: + ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights) + ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights) + except ValueError: + # weird, probably not always working workaround for a numpy bug + plot_range = (float("{:.2f}".format(plot_range[0])), float("{:.2f}".format(plot_range[1]))) + logger.warn("Got a value error during plotting, maybe this is due to a numpy bug - changing range to {}".format(plot_range)) + ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights) + ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights) ax.set_xlabel(branch+" (transformed)") plot_dir = os.path.join(self.project_dir, "plots") if not os.path.exists(plot_dir): @@ -411,11 +456,12 @@ if __name__ == "__main__": filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root" - c = KerasROOTClassification("test", + c = KerasROOTClassification("test2", signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")], bkg_trees = [(filename, "ttbar_NoSys"), (filename, "wjets_Sherpa221_NoSys") ], + selection="lep1Pt<5000", # cut out a few very weird outliers branches = ["met", "mt"], weight_expr = "eventWeight*genWeight", identifiers = ["DatasetNumber", "EventNumber"]) -- GitLab