toolkit.py

        fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test, sample_weight = self.w_test)
        fpr = 1.0 - fpr
        try:
            roc_auc = auc(tpr, fpr, reorder=True)
        except ValueError:
            logger.warning("Got a value error from auc - trying to rerun with reorder=True")
            roc_auc = auc(tpr, fpr, reorder=True)

        plt.grid(color='gray', linestyle='--', linewidth=1)
        plt.plot(tpr,  fpr, label=str(self.name + " (AUC = {})".format(roc_auc)))
        plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
        plt.ylabel("Background rejection")
        plt.xlabel("Signal efficiency")
        plt.title('Receiver operating characteristic')
        plt.xlim(0,1)
        plt.ylim(0,1)
        plt.xticks(np.arange(0,1,0.1))
        plt.yticks(np.arange(0,1,0.1))
        plt.legend(loc='lower left', framealpha=1.0)
        plt.savefig(os.path.join(self.project_dir, "ROC.pdf"))
        plt.clf()


    def plot_score(self, log=True, plot_opts=dict(bins=50, range=(0, 1)), ylim=None, xlim=None):
        centers_sig_train, hist_sig_train, _ = self.get_bin_centered_hist(self.scores_train[self.y_train==1].reshape(-1), density=True, weights=self.w_train[self.y_train==1], **plot_opts)
        centers_bkg_train, hist_bkg_train, _ = self.get_bin_centered_hist(self.scores_train[self.y_train==0].reshape(-1), density=True, weights=self.w_train[self.y_train==0], **plot_opts)
        centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(self.scores_test[self.y_test==1].reshape(-1), density=True, weights=self.w_test[self.y_test==1], **plot_opts)
        centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(self.scores_test[self.y_test==0].reshape(-1), density=True, weights=self.w_test[self.y_test==0], **plot_opts)
        errors_sig_test = hist_sig_test*rel_errors_sig_test
        errors_bkg_test = hist_bkg_test*rel_errors_bkg_test
        fig, ax = plt.subplots()
        width = centers_sig_train[1]-centers_sig_train[0]
        ax.bar(centers_bkg_train, hist_bkg_train, color="b", alpha=0.5, width=width, label="background train")
        ax.bar(centers_sig_train, hist_sig_train, color="r", alpha=0.5, width=width, label="signal train")
        ax.errorbar(centers_bkg_test, hist_bkg_test, fmt="bo", yerr=errors_bkg_test, label="background test")
        ax.errorbar(centers_sig_test, hist_sig_test, fmt="ro", yerr=errors_sig_test, label="signal test")
        if log:
            ax.set_yscale("log")
        if ylim is not None:
            ax.set_ylim(*ylim)
        if xlim is not None:
            ax.set_xlim(*xlim)
        ax.set_xlabel("NN output")
        fig.legend(loc='upper center', framealpha=0.5)
        fig.savefig(os.path.join(self.project_dir, "scores.pdf"))
        plt.close(fig)


    def plot_significance(self, lumifactor=1., significanceFunction=None, plot_opts=dict(bins=50, range=(0, 1))):
        logger.info("Plot significances")

        centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(self.scores_train[self.y_train==1].reshape(-1), weights=self.w_train[self.y_train==1], **plot_opts)
        centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(self.scores_train[self.y_train==0].reshape(-1), weights=self.w_train[self.y_train==0], **plot_opts)
        centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(self.scores_test[self.y_test==1].reshape(-1), weights=self.w_test[self.y_test==1], **plot_opts)
        centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(self.scores_test[self.y_test==0].reshape(-1), weights=self.w_test[self.y_test==0], **plot_opts)

        significances_train = []
        significances_test = []
        for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [
                (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train),
                (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test),
        ]:
            # factor to rescale due to using only a fraction of events (training and test samples)
            # normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
            # normfactor_bkg = (np.sum(self.w_train[self.y_train==0])+np.sum(self.w_test[self.y_test==0]))/np.sum(w[y==0])
            normfactor_sig = self.step_signal
            normfactor_bkg = self.step_bkg
            # first set nan values to 0 and multiply by lumi
            for arr in hist_sig, hist_bkg, rel_errors_bkg:
                arr[np.isnan(arr)] = 0
            hist_sig *= lumifactor*normfactor_sig
            hist_bkg *= lumifactor*normfactor_bkg
            for i in range(len(hist_sig)):
                s = sum(hist_sig[i:])
                b = sum(hist_bkg[i:])
                db = math.sqrt(sum((rel_errors_bkg[i:]*hist_bkg[i:])**2))
                if significanceFunction is None:
                    try:
                        z = s/math.sqrt(b+db**2)
                    except (ZeroDivisionError, ValueError) as e:
                        z = 0
                else:
                    z = significanceFunction(s, b, db)
                if z == float('inf'):
                    z = 0
                logger.debug("s, b, db, z = {}, {}, {}, {}".format(s, b, db, z))
                significances.append(z)

        fig, ax = plt.subplots()
        width = centers_sig_train[1]-centers_sig_train[0]
        ax.plot(centers_bkg_train, significances_train, label="train, Z_max={}".format(np.amax(significances_train)))
        ax.plot(centers_bkg_test, significances_test, label="test, Z_max={}".format(np.amax(significances_test)))
        ax.set_xlabel("Cut on NN score")
        ax.set_ylabel("Significance")
        ax.legend(loc='lower center', framealpha=0.5)
        fig.savefig(os.path.join(self.project_dir, "significances.pdf"))
        plt.close(fig)


    @property
    def csv_hist(self):
        with open(os.path.join(self.project_dir, "training.log")) as f:
            reader = csv.reader(f)
            history_list = list(reader)
        hist_dict = {}
        for hist_key_index, hist_key in enumerate(history_list[0]):
            hist_dict[hist_key] = [float(line[hist_key_index]) for line in history_list[1:]]
        return hist_dict

    def plot_loss(self, all_trainings=False, log=False, ylim=None, xlim=None):
        """
        Plot the value of the loss function for each epoch

        :param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
        """

        if all_trainings:
            hist_dict = self.csv_hist
        else:
            hist_dict = self.history.history

        if (not 'loss' in hist_dict) or (not 'val_loss' in hist_dict):
            logger.warning("No previous history found for plotting, try global history")
            hist_dict = self.csv_hist

        logger.info("Plot losses")
        plt.plot(hist_dict['loss'])
        plt.plot(hist_dict['val_loss'])
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['training data','validation data'], loc='upper left')
        if log:
            plt.yscale("log")
        if xlim is not None:
            plt.xlim(*xlim)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
        plt.clf()


    def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
        """
        Plot the value of the accuracy metric for each epoch

        :param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
        """

        if all_trainings:
            hist_dict = self.csv_hist
        else:
            hist_dict = self.history.history

        if (not acc_suffix in hist_dict) or (not 'val_'+acc_suffix in hist_dict):
            logger.warning("No previous history found for plotting, try global history")
            hist_dict = self.csv_hist

        logger.info("Plot accuracy")

        plt.plot(hist_dict[acc_suffix])
        plt.plot(hist_dict['val_'+acc_suffix])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['training data', 'validation data'], loc='upper left')
        if log:
            plt.yscale("log")
        plt.savefig(os.path.join(self.project_dir, "accuracy.pdf"))
        plt.clf()


    def plot_all(self):
        self.plot_ROC()
        # self.plot_accuracy()
        self.plot_loss()
        self.plot_score()
        self.plot_weights()
        # self.plot_significance()


    def to_DataFrame(self):
        df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
        df["weight"] = np.concatenate([self.w_train, self.w_test])
        df["labels"] = pd.Categorical.from_codes(
            np.concatenate([self.y_train, self.y_test]),
            categories=["background", "signal"]
        )
        for identifier in self.identifiers:
            try:
                df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
                                                 self.b_eventlist_train[identifier],
                                                 -1*np.ones(len(self.x_test), dtype="i8")])
            except IOError:
                logger.warning("Can't find eventlist - DataFrame won't contain identifiers")
        df["is_train"] = np.concatenate([np.ones(len(self.x_train), dtype=np.bool),
                                         np.zeros(len(self.x_test), dtype=np.bool)])
        return df


def create_getter(dataset_name):
    def getx(self):
        if getattr(self, "_"+dataset_name) is None:
            self._load_from_hdf5(dataset_name)
        return getattr(self, "_"+dataset_name)
    return getx

def create_setter(dataset_name):
    def setx(self, value):
        setattr(self, "_"+dataset_name, value)
    return setx

# define getters and setters for all datasets
for dataset_name in ClassificationProject.dataset_names:
    setattr(ClassificationProject, dataset_name, property(create_getter(dataset_name),
                                                          create_setter(dataset_name)))


class ClassificationProjectDataFrame(ClassificationProject):

    """
    A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees
    """

    def __init__(self,
                 name,
                 df,
                 input_columns,
                 weight_column="weights",
                 label_column="labels",
                 signal_label="signal",
                 background_label="background",
                 split_mode="split_column",
                 split_column="is_train",
                 **kwargs):

        self.df = df
        self.input_columns = input_columns
        self.weight_column = weight_column
        self.label_column = label_column
        self.signal_label = signal_label
        self.background_label = background_label
        if split_mode != "split_column":
            raise NotImplementedError("'split_column' is the only currently supported split mode")
        self.split_mode = split_mode
        self.split_column = split_column
        super(ClassificationProjectDataFrame, self).__init__(name,
                                                             signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
                                                             **kwargs)
        self._x_train = None
        self._x_test = None
        self._y_train = None
        self._y_test = None
        self._w_train = None
        self._w_test = None

    @property
    def x_train(self):
        if self._x_train is None:
            self._x_train = self.df[self.df[self.split_column]][self.input_columns].values
        return self._x_train

    @x_train.setter
    def x_train(self, value):
        self._x_train = value

    @property
    def x_test(self):
        if self._x_test is None:
            self._x_test = self.df[~self.df[self.split_column]][self.input_columns].values
        return self._x_test

    @x_test.setter
    def x_test(self, value):
        self._x_test = value

    @property
    def y_train(self):
        if self._y_train is None:
            self._y_train = (self.df[self.df[self.split_column]][self.label_column] == self.signal_label).values
        return self._y_train

    @y_train.setter
    def y_train(self, value):
        self._y_train = value

    @property
    def y_test(self):
        if self._y_test is None:
            self._y_test = (self.df[~self.df[self.split_column]][self.label_column] == self.signal_label).values
        return self._y_test

    @y_test.setter
    def y_test(self, value):
        self._y_test = value

    @property
    def w_train(self):
        if self._w_train is None:
            self._w_train = self.df[self.df[self.split_column]][self.weight_column].values
        return self._w_train

    @w_train.setter
    def w_train(self, value):
        self._w_train = value

    @property
    def w_test(self):
        if self._w_test is None:
            self._w_test = self.df[~self.df[self.split_column]][self.weight_column].values
        return self._w_test

    @w_test.setter
    def w_test(self, value):
        self._w_test = value

    @property
    def fields(self):
        return self.input_columns


    def load(self, reload=False):

        if reload:
            self.data_loaded = False
            self.data_transformed = False
            self._x_train = None
            self._x_test = None
            self._y_train = None
            self._y_test = None
            self._w_train = None
            self._w_test = None

        if not self.data_transformed:
            self._transform_data()


if __name__ == "__main__":

    logging.basicConfig()
    logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)
    #logging.getLogger("KerasROOTClassification").setLevel(logging.DEBUG)

    filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"


    c = ClassificationProject("test4",
                              signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
                              bkg_trees = [(filename, "ttbar_NoSys"),
                                           (filename, "wjets_Sherpa221_NoSys")
                              ],
                              optimizer="Adam",
                              #optimizer="SGD",
                              #optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9),
                              earlystopping_opts=dict(monitor='val_loss',
                                                      min_delta=0, patience=2, verbose=0, mode='auto'),
                              selection="1",
                              branches = ["met", "mt"],
                              weight_expr = "eventWeight*genWeight",
                              identifiers = ["DatasetNumber", "EventNumber"],
                              step_bkg = 100)

    np.random.seed(42)
    c.train(epochs=20)
    #c.plot_all()

    # c.write_friend_tree("test4_score",
    #                     source_filename=filename, source_treename="GG_oneStep_1705_1105_505_NoSys",
    #                     target_filename="friend.root", target_treename="test4_score")

    # c.write_friend_tree("test4_score",
    #                     source_filename=filename, source_treename="ttbar_NoSys",
    #                     target_filename="friend_ttbar_NoSys.root", target_treename="test4_score")