From 013cb17990099aaf7e31dddf21c11f9da43ace4f Mon Sep 17 00:00:00 2001
From: Nikolai Hartmann <Nikolai.Hartmann@physik.uni-muenchen.de>
Date: Mon, 3 Sep 2018 14:51:30 +0200
Subject: [PATCH] planing in 1D implemented

---
 toolkit.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/toolkit.py b/toolkit.py
index fa1c893..467536f 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -187,10 +187,10 @@ class ClassificationProject(object):
 
 
     # Datasets that are stored to (and dynamically loaded from) hdf5
-    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test", "planing_array"]
+    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test"]
 
     # Datasets that are retrieved from ROOT trees the first time
-    dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "planing_array"]
+    dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]
 
     def __init__(self, name, *args, **kwargs):
         if len(args) < 1 and len(kwargs) < 1:
@@ -329,6 +329,10 @@ class ClassificationProject(object):
 
         self.planing_var, self.planing_bins, self.planing_range = planing_vars
 
+        if self.planing_var is not None:
+            self.dataset_names_tree.append("planing_array")
+            self.dataset_names.append("planing_array")
+
         self.s_train = None
         self.b_train = None
         self.s_test = None
@@ -413,11 +417,12 @@ class ClassificationProject(object):
             for filename, treename in self.bkg_trees:
                 bkg_chain.AddFile(filename, -1, treename)
 
-            branches = self.branches
+            branches = list(self.branches)
             if self.planing_var is not None:
                 branches.append(self.planing_var)
 
-            print(branches)
+            # remove duplicates
+            branches = list(set(branches))
 
             self.s_train = tree2array(signal_chain,
                                       branches=branches+[self.weight_expr]+self.identifiers,
@@ -851,7 +856,7 @@ class ClassificationProject(object):
         it becomes flat for both signal and background (information
         effectively removed)
         """
-        if self._w_train_plane is None and self.planing_array is not None:
+        if self._w_train_plane is None and self.planing_var is not None:
             self._w_train_plane = np.empty(len(self.x_train), dtype=float)
             for class_label in [0, 1]:
                 ar = self.planing_array[self.y_train==class_label]
@@ -864,9 +869,9 @@ class ClassificationProject(object):
                 sfs = 1./hist
                 sfs[np.isinf(sfs)] = 0
                 sfs = np.concatenate([sfs, [0]]) # overflow is reweighted to 0
-                bin_idx = np.digitize(ar, bins)
-                bin_inds -= 1 # different convention for digitize and histogram?
-                self._w_train_plane[self.y_train==class_label] = sfs[bin_inds]
+                bin_idx = np.digitize(ar, edges)
+                bin_idx -= 1 # different convention for digitize and histogram?
+                self._w_train_plane[self.y_train==class_label] = sfs[bin_idx]
         return self._w_train_plane
 
 
@@ -882,8 +887,6 @@ class ClassificationProject(object):
             w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)]
         else:
             w_train_tot = np.array(self.w_train)
-        if self.normalize_weights:
-            w_train_tot /= np.mean(w_train_tot)
         return w_train_tot
 
 
@@ -900,6 +903,8 @@ class ClassificationProject(object):
             self._w_train_tot = self.get_total_weight()
             if self.w_train_plane is not None:
                 self._w_train_tot *= self._w_train_plane
+            if self.normalize_weights:
+                self._w_train_tot /= np.mean(self._w_train_tot)
         return self._w_train_tot
 
 
-- 
GitLab