diff --git a/toolkit.py b/toolkit.py index 8970eb8620a48e7c1f735e9cca6727ef235c2a62..db11d81d5ea79819871acec39275c61f85a854ec 100755 --- a/toolkit.py +++ b/toolkit.py @@ -1047,22 +1047,32 @@ class ClassificationProject(object): logger.debug("Plotting sig (min={}, max={}) from {}".format(np.min(sig), np.max(sig), sig)) # calculate percentiles to get a heuristic for the range to be plotted + x_total = np.concatenate([bkg, sig]) + w_total = np.concatenate([bkg_weights, sig_weights]) plot_range = weighted_quantile( - np.concatenate([bkg, sig]), + x_total, [0.01, 0.99], - sample_weight=np.concatenate([bkg_weights, sig_weights]), + sample_weight=w_total, ) logger.debug("Calculated range based on percentiles: {}".format(plot_range)) + bins = 50 + + # check if we have a distribution of integer numbers (e.g. njet or something categorical) + # in that case we want to have a bin for each number + if (x_total == x_total.astype(int)).all(): + plot_range = (math.floor(plot_range[0])-0.5, math.ceil(plot_range[1])+0.5) + bins = int(plot_range[1]-plot_range[0]) + try: - centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=50, range=plot_range, weights=sig_weights) - centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=50, range=plot_range, weights=bkg_weights) + centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights) + centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights) except ValueError: # weird, probably not always working workaround for a numpy bug plot_range = (float("{:.3f}".format(plot_range[0])), float("{:.3f}".format(plot_range[1]))) logger.warn("Got a value error during plotting, maybe this is due to a numpy bug - changing range to {}".format(plot_range)) - centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=50, range=plot_range, weights=sig_weights) - centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=50, range=plot_range, weights=bkg_weights) + centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights) + centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights) width = centers_sig[1]-centers_sig[0] ax.bar(centers_bkg, hist_bkg, color="b", alpha=0.5, width=width) @@ -1139,11 +1149,7 @@ class ClassificationProject(object): lumifactor=None, apply_class_weight=True, invert_activation=False): if invert_activation: - if not self.activation_function_output == "sigmoid": - raise NotImplementedError("Inverse function of {} not supported yet - " - "currently only sigmoid" - .format(self.activation_function_output)) - trf = lambda y : np.log(y/(1-y)) + trf = self.get_inverse_act_fn() else: trf = lambda y : y fig, ax = plt.subplots() @@ -1191,7 +1197,7 @@ class ClassificationProject(object): return save_show(plt, fig, os.path.join(self.project_dir, "scores.pdf")) - def plot_significance_hist(self, lumifactor=1., significance_function=None, plot_opts=dict(bins=50, range=(0, 1))): + def plot_significance_hist(self, lumifactor=1., significance_function=None, plot_opts=dict(bins=50, range=(0, 1)), invert_activation=False): """ Plot significances based on a histogram of scores @@ -1199,10 +1205,15 @@ class ClassificationProject(object): logger.info("Plot significances") - centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(self.scores_train[self.y_train==1].reshape(-1), weights=self.w_train[self.y_train==1], **plot_opts) - centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(self.scores_train[self.y_train==0].reshape(-1), weights=self.w_train[self.y_train==0], **plot_opts) - centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(self.scores_test[self.y_test==1].reshape(-1), weights=self.w_test[self.y_test==1], **plot_opts) - centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(self.scores_test[self.y_test==0].reshape(-1), weights=self.w_test[self.y_test==0], **plot_opts) + if invert_activation: + trf = self.get_inverse_act_fn() + else: + trf = lambda y : y + + centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==1].reshape(-1)), weights=self.w_train[self.y_train==1], **plot_opts) + centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==0].reshape(-1)), weights=self.w_train[self.y_train==0], **plot_opts) + centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==1].reshape(-1)), weights=self.w_test[self.y_test==1], **plot_opts) + centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==0].reshape(-1)), weights=self.w_test[self.y_test==0], **plot_opts) significances_train = [] significances_test = [] @@ -1270,7 +1281,15 @@ class ClassificationProject(object): return s_sumw, np.sqrt(s_sumw2), b_sumw, np.sqrt(b_sumw2), scores_sorted[threshold_idxs] - def plot_significance(self, significance_function=None, maxsteps=1000, lumifactor=1., vectorized=False): + def get_inverse_act_fn(self): + if not self.activation_function_output == "sigmoid": + raise NotImplementedError("Inverse function of {} not supported yet - " + "currently only sigmoid" + .format(self.activation_function_output)) + return lambda y : np.log(y/(1-y)) + + + def plot_significance(self, significance_function=None, maxsteps=1000, lumifactor=1., vectorized=False, invert_activation=False): """ Plot the significance when cutting on all posible thresholds and plot against signal efficiency. """ @@ -1279,6 +1298,11 @@ class ClassificationProject(object): vectorized = True significance_function = poisson_asimov_significance + if invert_activation: + trf = self.get_inverse_act_fn() + else: + trf = lambda y : y + fig, ax = plt.subplots() ax2 = ax.twinx() prop_cycle = plt.rcParams['axes.prop_cycle'] @@ -1288,6 +1312,7 @@ class ClassificationProject(object): (self.scores_test, self.y_test, self.w_test, "test")], colors ): + scores = trf(scores) s_sumws, s_errs, b_sumws, b_errs, thresholds = self.calc_s_ds_b_db(scores, y, w) stepsize = int(len(s_sumws))/int(maxsteps) if stepsize == 0: