diff --git a/toolkit.py b/toolkit.py index dd7651a03b9428fe790f71dc9fad1952ba489a12..cb32d90c675ebd333999c09a1295c123a9480bde 100755 --- a/toolkit.py +++ b/toolkit.py @@ -441,19 +441,19 @@ class ClassificationProject(object): for filename, treename in self.bkg_trees: bkg_chain.AddFile(filename, -1, treename) self.s_train = tree2array(signal_chain, - branches=self.branches+self.regression_branches+[self.weight_expr]+self.identifiers, + branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers), selection=self.selection, start=0, step=self.step_signal, stop=self.stop_train) self.b_train = tree2array(bkg_chain, - branches=self.branches+self.regression_branches+[self.weight_expr]+self.identifiers, + branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers), selection=self.selection, start=0, step=self.step_bkg, stop=self.stop_train) self.s_test = tree2array(signal_chain, - branches=self.branches+self.regression_branches+[self.weight_expr], + branches=set(self.branches+self.regression_branches+[self.weight_expr]), selection=self.selection, start=1, step=self.step_signal, stop=self.stop_test) self.b_test = tree2array(bkg_chain, - branches=self.branches+self.regression_branches+[self.weight_expr], + branches=set(self.branches+self.regression_branches+[self.weight_expr]), selection=self.selection, start=1, step=self.step_bkg, stop=self.stop_test) @@ -466,10 +466,6 @@ class ClassificationProject(object): self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers]) self._dump_training_list() - # now we don't need the identifiers anymore - self.s_train = self.s_train[self.fields+[self.weight_expr]] - self.b_train = self.b_train[self.fields+[self.weight_expr]] - # create x (input), y (target) and w (weights) arrays # the first block will be signals, the second block backgrounds self.x_train = rec2array(self.s_train[self.fields]) @@ -486,8 +482,8 @@ class ClassificationProject(object): y = np.empty((len(x), 1+len(self.target_fields)), dtype=np.float) y[:len(s),0] = 1 y[len(s):,0] = 0 - y[:len(s),1:] = s[self.target_fields] - y[len(s):,1:] = b[self.target_fields] + y[:len(s),1:] = rec2array(s[self.target_fields]) + y[len(s):,1:] = rec2array(b[self.target_fields]) return y self.y_train = fill_target(self.x_train, self.s_train, self.b_train) @@ -784,8 +780,8 @@ class ClassificationProject(object): @property def class_weight(self): if self._class_weight is None: - sumw_bkg = np.sum(self.w_train[self.y_train == 0]) - sumw_sig = np.sum(self.w_train[self.y_train == 1]) + sumw_bkg = np.sum(self.w_train[self.l_train == 0]) + sumw_sig = np.sum(self.w_train[self.l_train == 1]) self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)] logger.debug("Calculated class_weight: {}".format(self._class_weight)) return self._class_weight @@ -800,11 +796,11 @@ class ClassificationProject(object): event with class weights """ if self._balanced_class_weight is None: - sumw_bkg = np.sum(self.w_train[self.y_train == 0]) - sumw_sig = np.sum(self.w_train[self.y_train == 1]) + sumw_bkg = np.sum(self.w_train[self.l_train == 0]) + sumw_sig = np.sum(self.w_train[self.l_train == 1]) # use sumw *per event* in this case - sumw_bkg /= len(self.w_train[self.y_train == 0]) - sumw_sig /= len(self.w_train[self.y_train == 1]) + sumw_bkg /= len(self.w_train[self.l_train == 0]) + sumw_sig /= len(self.w_train[self.l_train == 1]) self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)] logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight)) return self._balanced_class_weight @@ -820,6 +816,24 @@ class ClassificationProject(object): self._load_data() + @property + def l_train(self): + "labels (in case y contains regression targets)" + if not self.target_fields: + return self.y_train + else: + return self.y_train[:,0] + + + @property + def l_test(self): + "labels (in case y contains regression targets)" + if not self.target_fields: + return self.y_test + else: + return self.y_test[:,0] + + @property def w_train_tot(self): "(sample weight * class weight), divided by mean" @@ -831,7 +845,7 @@ class ClassificationProject(object): raise ValueError("Data not loaded! can't calculate total weight") if self._w_train_tot is None: if self.apply_class_weight: - self._w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)] + self._w_train_tot = self.w_train*np.array(class_weight)[self.l_train.astype(int)] else: self._w_train_tot = np.array(self.w_train) if self.normalize_weights: @@ -889,14 +903,22 @@ class ClassificationProject(object): return x + def get_output_list(self, y): + "Split target vector column wise in case of regression targets" + if not self.target_fields: + return y + else: + return np.hsplit(y, len(self.target_fields)+1) + + def yield_batch(self): "Batch generator - optionally shuffle the indices after each epoch" x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot train_idx = list(self.train_val_idx[0]) np.random.seed(self.shuffle_seed+1) logger.info("Generating training batches from {} signal and {} background events" - .format(len(np.where(self.y_train[train_idx]==1)[0]), - len(np.where(self.y_train[train_idx]==0)[0]))) + .format(len(np.where(self.l_train[train_idx]==1)[0]), + len(np.where(self.l_train[train_idx]==0)[0]))) while True: if self.shuffle: shuffled_idx = np.random.permutation(train_idx) @@ -907,7 +929,8 @@ class ClassificationProject(object): y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]] w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]] x_input = self.get_input_list(self.transform(x_batch)) - yield (x_input, y_batch, w_batch) + y_output = self.get_output_list(y_batch) + yield (x_input, y_output, w_batch) def yield_single_class_batch(self, class_label): @@ -916,7 +939,8 @@ class ClassificationProject(object): The weights are multiplied by balanced_class_weight. """ x_train, y_train, w_train = self.training_data - class_idx = np.where(y_train==class_label)[0] + l_train = y_train[:,0] if self.target_fields else y_train + class_idx = np.where(l_train==class_label)[0] while True: # shuffle the indices for this class label if self.shuffle: @@ -925,9 +949,12 @@ class ClassificationProject(object): shuffled_idx = class_idx # yield them batch wise for start in range(0, len(shuffled_idx), int(self.batch_size/2)): - yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]], - y_train[shuffled_idx[start:start+int(self.batch_size/2)]], - w_train[shuffled_idx[start:start+int(self.batch_size/2)]]) + x_batch = x_train[shuffled_idx[start:start+int(self.batch_size/2)]] + y_batch = y_train[shuffled_idx[start:start+int(self.batch_size/2)]] + w_batch = w_train[shuffled_idx[start:start+int(self.batch_size/2)]] + x_input = self.get_input_list(self.transform(x_batch)) + y_output = self.get_output_list(y_batch) + yield (x_input, y_output, w_batch) def yield_balanced_batch(self): @@ -967,7 +994,7 @@ class ClassificationProject(object): else: try: self.is_training = True - labels, label_counts = np.unique(self.y_train, return_counts=True) + labels, label_counts = np.unique(self.l_train, return_counts=True) logger.info("Training on balanced batches") # note: the batches have balanced_class_weight already applied self.model.fit_generator(self.yield_balanced_batch(), @@ -1084,6 +1111,7 @@ class ClassificationProject(object): batch_size=100000, score_mode=None, fixed_params=None): + "TODO: doesn't work for regression targets" f = ROOT.TFile.Open(source_filename) tree = f.Get(source_treename) entries = tree.GetEntries() @@ -1161,10 +1189,10 @@ class ClassificationProject(object): fig, ax = plt.subplots() else: fig = None - bkg = self.x_train[:,var_index][self.y_train == 0] - sig = self.x_train[:,var_index][self.y_train == 1] - bkg_weights = self.w_train_tot[self.y_train == 0] - sig_weights = self.w_train_tot[self.y_train == 1] + bkg = self.x_train[:,var_index][self.l_train == 0] + sig = self.x_train[:,var_index][self.l_train == 1] + bkg_weights = self.w_train_tot[self.l_train == 0] + sig_weights = self.w_train_tot[self.l_train == 1] if hasattr(self, "mask_value"): bkg_not_masked = np.where(bkg != self.mask_value)[0] @@ -1240,8 +1268,8 @@ class ClassificationProject(object): def plot_weights(self, bins=100, range=None): fig, ax = plt.subplots() - bkg = self.w_train_tot[self.y_train == 0] - sig = self.w_train_tot[self.y_train == 1] + bkg = self.w_train_tot[self.l_train == 0] + sig = self.w_train_tot[self.l_train == 1] ax.hist(bkg, bins=bins, range=range, color="b", alpha=0.5) ax.set_yscale("log") save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf")) @@ -1259,8 +1287,8 @@ class ClassificationProject(object): ax.grid(color='gray', linestyle='--', linewidth=1) for y, scores, weight, label in [ - (self.y_train, self.scores_train, self.w_train, "train"), - (self.y_test, self.scores_test, self.w_test, "test") + (self.l_train, self.scores_train, self.w_train, "train"), + (self.l_test, self.scores_test, self.w_test, "test") ]: fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight) fpr = 1.0 - fpr # background rejection @@ -1293,10 +1321,10 @@ class ClassificationProject(object): trf = lambda y : y fig, ax = plt.subplots() for scores, weights, y, class_label, fn, opts in [ - (self.scores_train, self.w_train, self.y_train, 1, ax.bar, dict(color="r", label="signal train")), - (self.scores_train, self.w_train, self.y_train, 0, ax.bar, dict(color="b", label="background train")), - (self.scores_test, self.w_test, self.y_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")), - (self.scores_test, self.w_test, self.y_test, 0, ax.errorbar, dict(fmt="bo", label="background test")), + (self.scores_train, self.w_train, self.l_train, 1, ax.bar, dict(color="r", label="signal train")), + (self.scores_train, self.w_train, self.l_train, 0, ax.bar, dict(color="b", label="background train")), + (self.scores_test, self.w_test, self.l_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")), + (self.scores_test, self.w_test, self.l_test, 0, ax.errorbar, dict(fmt="bo", label="background test")), ]: weights = weights[y==class_label] if apply_class_weight is True and (lumifactor is not None): @@ -1349,16 +1377,16 @@ class ClassificationProject(object): else: trf = lambda y : y - centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==1].reshape(-1)), weights=self.w_train[self.y_train==1], **plot_opts) - centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==0].reshape(-1)), weights=self.w_train[self.y_train==0], **plot_opts) - centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==1].reshape(-1)), weights=self.w_test[self.y_test==1], **plot_opts) - centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==0].reshape(-1)), weights=self.w_test[self.y_test==0], **plot_opts) + centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==1].reshape(-1)), weights=self.w_train[self.l_train==1], **plot_opts) + centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==0].reshape(-1)), weights=self.w_train[self.l_train==0], **plot_opts) + centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==1].reshape(-1)), weights=self.w_test[self.l_test==1], **plot_opts) + centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==0].reshape(-1)), weights=self.w_test[self.l_test==0], **plot_opts) significances_train = [] significances_test = [] - for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [ - (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train), - (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test), + for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances in [ + (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train), + (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test), ]: # factor to rescale due to using only a fraction of events (training and test samples) # normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1]) @@ -1447,8 +1475,8 @@ class ClassificationProject(object): prop_cycle = plt.rcParams['axes.prop_cycle'] colors = prop_cycle.by_key()['color'] for (scores, y, w, label), col in zip( - [(self.scores_train, self.y_train, self.w_train, "train"), - (self.scores_test, self.y_test, self.w_test, "test")], + [(self.scores_train, self.l_train, self.w_train, "train"), + (self.scores_test, self.l_test, self.w_test, "test")], colors ): scores = trf(scores) @@ -1570,7 +1598,7 @@ class ClassificationProject(object): df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields) df["weight"] = np.concatenate([self.w_train, self.w_test]) df["labels"] = pd.Categorical.from_codes( - np.concatenate([self.y_train, self.y_test]), + np.concatenate([self.l_train, self.l_test]), categories=["background", "signal"] ) for identifier in self.identifiers: