From 365ba76ddf2970b98f4e0db9b93b091ee6550d55 Mon Sep 17 00:00:00 2001
From: Nikolai Hartmann <Nikolai.Hartmann@physik.uni-muenchen.de>
Date: Fri, 27 Apr 2018 15:44:29 +0200
Subject: [PATCH] Selection argument, RobustScaler and workaround for numpy bug

---
 toolkit.py | 74 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 14 deletions(-)

diff --git a/toolkit.py b/toolkit.py
index 05dfb0e..f4451f3 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -11,7 +11,7 @@ from root_numpy import tree2array, rec2array
 import numpy as np
 import pandas as pd
 import h5py
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, RobustScaler
 from sklearn.externals import joblib
 from sklearn.metrics import roc_curve
 
@@ -44,12 +44,20 @@ class KerasROOTClassification:
 
     def __init__(self, name,
                  signal_trees, bkg_trees, branches, weight_expr, identifiers,
-                 layers=3, nodes=64, batch_size=128, validation_split=0.33, activation_function='relu', out_dir="./outputs"):
+                 selection=None,
+                 layers=3,
+                 nodes=64,
+                 batch_size=128,
+                 validation_split=0.33,
+                 activation_function='relu',
+                 out_dir="./outputs",
+                 scaler_type="RobustScaler"):
         self.name = name
         self.signal_trees = signal_trees
         self.bkg_trees = bkg_trees
         self.branches = branches
         self.weight_expr = weight_expr
+        self.selection = selection
         self.identifiers = identifiers
         self.layers = layers
         self.nodes = nodes
@@ -57,6 +65,7 @@ class KerasROOTClassification:
         self.validation_split = validation_split
         self.activation_function = activation_function
         self.out_dir = out_dir
+        self.scaler_type = scaler_type
 
         self.project_dir = os.path.join(self.out_dir, name)
 
@@ -112,10 +121,22 @@ class KerasROOTClassification:
                 signal_chain.AddFile(filename, -1, treename)
             for filename, treename in self.bkg_trees:
                 bkg_chain.AddFile(filename, -1, treename)
-            self.s_train = tree2array(signal_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
-            self.b_train = tree2array(bkg_chain, branches=self.branches+[self.weight_expr]+self.identifiers, start=0, step=2)
-            self.s_test = tree2array(signal_chain, branches=self.branches+[self.weight_expr], start=1, step=2)
-            self.b_test = tree2array(bkg_chain, branches=self.branches+[self.weight_expr], start=1, step=2)
+            self.s_train = tree2array(signal_chain,
+                                      branches=self.branches+[self.weight_expr]+self.identifiers,
+                                      selection=self.selection,
+                                      start=0, step=2)
+            self.b_train = tree2array(bkg_chain,
+                                      branches=self.branches+[self.weight_expr]+self.identifiers,
+                                      selection=self.selection,
+                                      start=0, step=2)
+            self.s_test = tree2array(signal_chain,
+                                     branches=self.branches+[self.weight_expr],
+                                     selection=self.selection,
+                                     start=1, step=2)
+            self.b_test = tree2array(bkg_chain,
+                                     branches=self.branches+[self.weight_expr],
+                                     selection=self.selection,
+                                     start=1, step=2)
 
             self._dump_training_list()
             self.s_eventlist_train = self.s_train[self.identifiers]
@@ -179,11 +200,16 @@ class KerasROOTClassification:
             filename = os.path.join(self.project_dir, "scaler.pkl")
             try:
                 self._scaler = joblib.load(filename)
-                logger.info("Loaded existing StandardScaler from {}".format(filename))
+                logger.info("Loaded existing scaler from {}".format(filename))
             except IOError:
-                logger.info("Creating new StandardScaler")
-                self._scaler = StandardScaler()
-                logger.info("Fitting StandardScaler to training data")
+                logger.info("Creating new {}".format(self.scaler_type))
+                if self.scaler_type == "StandardScaler":
+                    self._scaler = StandardScaler()
+                elif self.scaler_type == "RobustScaler":
+                    self._scaler = RobustScaler()
+                else:
+                    raise ValueError("Scaler type {} unknown".format(self.scaler_type))
+                logger.info("Fitting {} to training data".format(self.scaler_type))
                 self._scaler.fit(self.x_train)
                 # i think this would refit to test data (and overwrite the parameters)
                 # probably we either want to fit only training data or training and test data together
@@ -239,7 +265,7 @@ class KerasROOTClassification:
                 self._model.add(Dense(self.nodes, activation=self.activation_function))
             # last layer is one neuron (binary classification)
             self._model.add(Dense(1, activation='sigmoid'))
-            
+
             logger.info("Compile model")
             self._model.compile(optimizer='SGD',
                   loss='binary_crossentropy',
@@ -321,6 +347,7 @@ class KerasROOTClassification:
             self._bkg_weights = np.empty(sum(self.y_train == 0))
             self._bkg_weights.fill(self.class_weight[0])
             self._bkg_weights *= self.w_train[self.y_train == 0]
+            logger.debug("Background weights: {}".format(self._bkg_weights))
         return self._bkg_weights
 
 
@@ -335,6 +362,7 @@ class KerasROOTClassification:
             self._sig_weights = np.empty(sum(self.y_train == 1))
             self._sig_weights.fill(self.class_weight[1])
             self._sig_weights *= self.w_train[self.y_train == 1]
+            logger.debug("Signal weights: {}".format(self._sig_weights))
         return self._sig_weights
 
 
@@ -344,10 +372,27 @@ class KerasROOTClassification:
         fig, ax = plt.subplots()
         bkg = self.x_train[:,var_index][self.y_train == 0]
         sig = self.x_train[:,var_index][self.y_train == 1]
+
         logger.debug("Plotting bkg (min={}, max={}) from {}".format(np.min(bkg), np.max(bkg), bkg))
         logger.debug("Plotting sig (min={}, max={}) from {}".format(np.min(sig), np.max(sig), sig))
-        ax.hist(bkg, color="b", alpha=0.5, bins=50, weights=self.bkg_weights)
-        ax.hist(sig, color="r", alpha=0.5, bins=50, weights=self.sig_weights)
+
+        # calculate percentiles to get a heuristic for the range to be plotted
+        # (should in principle also be done with weights, but for now do it unweighted)
+        range_sig = np.percentile(sig, [1, 99])
+        range_bkg = np.percentile(sig, [1, 99])
+        plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
+
+        logger.debug("Calculated range based on percentiles: {}".format(plot_range))
+
+        try:
+            ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights)
+            ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights)
+        except ValueError:
+            # weird, probably not always working workaround for a numpy bug
+            plot_range = (float("{:.2f}".format(plot_range[0])), float("{:.2f}".format(plot_range[1])))
+            logger.warn("Got a value error during plotting, maybe this is due to a numpy bug - changing range to {}".format(plot_range))
+            ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights)
+            ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights)
         ax.set_xlabel(branch+" (transformed)")
         plot_dir = os.path.join(self.project_dir, "plots")
         if not os.path.exists(plot_dir):
@@ -411,11 +456,12 @@ if __name__ == "__main__":
 
     filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
 
-    c = KerasROOTClassification("test",
+    c = KerasROOTClassification("test2",
                                 signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
                                 bkg_trees = [(filename, "ttbar_NoSys"),
                                              (filename, "wjets_Sherpa221_NoSys")
                                 ],
+                                selection="lep1Pt<5000", # cut out a few very weird outliers
                                 branches = ["met", "mt"],
                                 weight_expr = "eventWeight*genWeight",
                                 identifiers = ["DatasetNumber", "EventNumber"])
-- 
GitLab