toolkit.py

        plt.close(fig)


    def plot_significance(self, lumifactor=1., significanceFunction=None, plot_opts=dict(bins=50, range=(0, 1))):
        logger.info("Plot significances")

        centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(self.scores_train[self.y_train==1].reshape(-1), weights=self.w_train[self.y_train==1], **plot_opts)
        centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(self.scores_train[self.y_train==0].reshape(-1), weights=self.w_train[self.y_train==0], **plot_opts)
        centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(self.scores_test[self.y_test==1].reshape(-1), weights=self.w_test[self.y_test==1], **plot_opts)
        centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(self.scores_test[self.y_test==0].reshape(-1), weights=self.w_test[self.y_test==0], **plot_opts)

        significances_train = []
        significances_test = []
        for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [
                (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train),
                (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test),
        ]:
            # factor to rescale due to using only a fraction of events (training and test samples)
            # normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
            # normfactor_bkg = (np.sum(self.w_train[self.y_train==0])+np.sum(self.w_test[self.y_test==0]))/np.sum(w[y==0])
            normfactor_sig = self.step_signal
            normfactor_bkg = self.step_bkg
            # first set nan values to 0 and multiply by lumi
            for arr in hist_sig, hist_bkg, rel_errors_bkg:
                arr[np.isnan(arr)] = 0
            hist_sig *= lumifactor*normfactor_sig
            hist_bkg *= lumifactor*normfactor_bkg
            for i in range(len(hist_sig)):
                s = sum(hist_sig[i:])
                b = sum(hist_bkg[i:])
                db = math.sqrt(sum((rel_errors_bkg[i:]*hist_bkg[i:])**2))
                if significanceFunction is None:
                    try:
                        z = s/math.sqrt(b+db**2)
                    except (ZeroDivisionError, ValueError) as e:
                        z = 0
                else:
                    z = significanceFunction(s, b, db)
                if z == float('inf'):
                    z = 0
                logger.debug("s, b, db, z = {}, {}, {}, {}".format(s, b, db, z))
                significances.append(z)

        fig, ax = plt.subplots()
        width = centers_sig_train[1]-centers_sig_train[0]
        ax.plot(centers_bkg_train, significances_train, label="train, Z_max={}".format(np.amax(significances_train)))
        ax.plot(centers_bkg_test, significances_test, label="test, Z_max={}".format(np.amax(significances_test)))
        ax.set_xlabel("Cut on NN score")
        ax.set_ylabel("Significance")
        ax.legend(loc='lower center', framealpha=0.5)
        fig.savefig(os.path.join(self.project_dir, "significances.pdf"))
        plt.close(fig)


    @property
    def csv_hist(self):
        with open(os.path.join(self.project_dir, "training.log")) as f:
            reader = csv.reader(f)
            history_list = list(reader)
        hist_dict = {}
        for hist_key_index, hist_key in enumerate(history_list[0]):
            hist_dict[hist_key] = [float(line[hist_key_index]) for line in history_list[1:]]
        return hist_dict

    def plot_loss(self, all_trainings=False, log=False, ylim=None, xlim=None):
        """
        Plot the value of the loss function for each epoch

        :param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
        """

        if all_trainings:
            hist_dict = self.csv_hist
        else:
            hist_dict = self.history.history

        if (not 'loss' in hist_dict) or (not 'val_loss' in hist_dict):
            logger.warning("No previous history found for plotting, try global history")
            hist_dict = self.csv_hist

        logger.info("Plot losses")
        plt.plot(hist_dict['loss'])
        plt.plot(hist_dict['val_loss'])
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['training data','validation data'], loc='upper left')
        if log:
            plt.yscale("log")
        if xlim is not None:
            plt.xlim(*xlim)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
        plt.clf()


    def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
        """
        Plot the value of the accuracy metric for each epoch

        :param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
        """

        if all_trainings:
            hist_dict = self.csv_hist
        else:
            hist_dict = self.history.history

        if (not acc_suffix in hist_dict) or (not 'val_'+acc_suffix in hist_dict):
            logger.warning("No previous history found for plotting, try global history")
            hist_dict = self.csv_hist

        logger.info("Plot accuracy")

        plt.plot(hist_dict[acc_suffix])
        plt.plot(hist_dict['val_'+acc_suffix])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['training data', 'validation data'], loc='upper left')
        if log:
            plt.yscale("log")
        plt.savefig(os.path.join(self.project_dir, "accuracy.pdf"))
        plt.clf()


    def plot_all(self):
        self.plot_ROC()
        # self.plot_accuracy()
        self.plot_loss()
        self.plot_score()
        self.plot_weights()
        # self.plot_significance()


    def to_DataFrame(self):
        df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
        df["weight"] = np.concatenate([self.w_train, self.w_test])
        df["labels"] = pd.Categorical.from_codes(
            np.concatenate([self.y_train, self.y_test]),
            categories=["background", "signal"]
        )
        return df


def create_getter(dataset_name):
    def getx(self):
        if getattr(self, "_"+dataset_name) is None:
            self._load_from_hdf5(dataset_name)
        return getattr(self, "_"+dataset_name)
    return getx

def create_setter(dataset_name):
    def setx(self, value):
        setattr(self, "_"+dataset_name, value)
    return setx

# define getters and setters for all datasets
for dataset_name in ClassificationProject.dataset_names:
    setattr(ClassificationProject, dataset_name, property(create_getter(dataset_name),
                                                          create_setter(dataset_name)))


if __name__ == "__main__":

    logging.basicConfig()
    logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)
    #logging.getLogger("KerasROOTClassification").setLevel(logging.DEBUG)

    filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"


    c = ClassificationProject("test4",
                              signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
                              bkg_trees = [(filename, "ttbar_NoSys"),
                                           (filename, "wjets_Sherpa221_NoSys")
                              ],
                              optimizer="Adam",
                              #optimizer="SGD",
                              #optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9),
                              earlystopping_opts=dict(monitor='val_loss',
                                                      min_delta=0, patience=2, verbose=0, mode='auto'),
                              selection="1",
                              branches = ["met", "mt"],
                              weight_expr = "eventWeight*genWeight",
                              identifiers = ["DatasetNumber", "EventNumber"],
                              step_bkg = 100)

    np.random.seed(42)
    c.train(epochs=20)
    #c.plot_all()

    # c.write_friend_tree("test4_score",
    #                     source_filename=filename, source_treename="GG_oneStep_1705_1105_505_NoSys",
    #                     target_filename="friend.root", target_treename="test4_score")

    # c.write_friend_tree("test4_score",
    #                     source_filename=filename, source_treename="ttbar_NoSys",
    #                     target_filename="friend_ttbar_NoSys.root", target_treename="test4_score")