diff --git a/toolkit.py b/toolkit.py
index c6ff50e16b787b1795ca189e00aa9c08928e5ed6..09d7bde174aa35681b849786dd64e1fe0623cd60 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__all__ = ["ClassificationProject"]
+__all__ = ["ClassificationProject", "ClassificationProjectDataFrame"]
 
 from sys import version_info
 
@@ -18,6 +18,7 @@ import csv
 import math
 import glob
 import shutil
+import gc
 
 import logging
 logger = logging.getLogger("KerasROOTClassification")
@@ -132,6 +133,10 @@ class ClassificationProject(object):
 
     :param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)
 
+    :param stop_train: stop after this number of events for reading in training events
+
+    :param stop_test: stop after this number of events for reading in test events
+
     :param optimizer: name of optimizer class in keras.optimizers
 
     :param optimizer_opts: dictionary of options for the optimizer
@@ -212,6 +217,8 @@
                         scaler_type="WeightedRobustScaler",
                         step_signal=2,
                         step_bkg=2,
+                        stop_train=None,
+                        stop_test=None,
                         optimizer="SGD",
                         optimizer_opts=None,
                         use_earlystopping=True,
@@ -266,6 +273,8 @@
         self.scaler_type = scaler_type
         self.step_signal = step_signal
         self.step_bkg = step_bkg
+        self.stop_train = stop_train
+        self.stop_test = stop_test
         self.optimizer = optimizer
         self.use_earlystopping = use_earlystopping
         self.use_modelcheckpoint = use_modelcheckpoint
@@ -371,19 +380,19 @@
             self.s_train = tree2array(signal_chain,
                                       branches=self.branches+[self.weight_expr]+self.identifiers,
                                       selection=self.selection,
-                                      start=0, step=self.step_signal)
+                                      start=0, step=self.step_signal, stop=self.stop_train)
             self.b_train = tree2array(bkg_chain,
                                       branches=self.branches+[self.weight_expr]+self.identifiers,
                                       selection=self.selection,
-                                      start=0, step=self.step_bkg)
+                                      start=0, step=self.step_bkg, stop=self.stop_train)
             self.s_test = tree2array(signal_chain,
                                      branches=self.branches+[self.weight_expr],
                                      selection=self.selection,
-                                     start=1, step=self.step_signal)
+                                     start=1, step=self.step_signal, stop=self.stop_test)
             self.b_test = tree2array(bkg_chain,
                                      branches=self.branches+[self.weight_expr],
                                      selection=self.selection,
-                                     start=1, step=self.step_bkg)
+                                     start=1, step=self.step_bkg, stop=self.stop_test)
 
             self.rename_fields(self.s_train)
             self.rename_fields(self.b_train)
@@ -402,19 +411,23 @@
 
             # the first block will be signals, the second block backgrounds
             self.x_train = rec2array(self.s_train[self.fields])
             self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
-            self.x_test = rec2array(self.s_test[self.fields])
-            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
             self.w_train = self.s_train[self.weight_expr]
             self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
-            self.w_test = self.s_test[self.weight_expr]
-            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
-
-            self.y_train = np.empty(len(self.x_train))
+            self.y_train = np.empty(len(self.x_train), dtype=np.bool)
             self.y_train[:len(self.s_train)] = 1
             self.y_train[len(self.s_train):] = 0
-            self.y_test = np.empty(len(self.x_test))
+            self.b_train = None
+            self.s_train = None
+
+            self.x_test = rec2array(self.s_test[self.fields])
+            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
+            self.w_test = self.s_test[self.weight_expr]
+            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
+            self.y_test = np.empty(len(self.x_test), dtype=np.bool)
             self.y_test[:len(self.s_test)] = 1
             self.y_test[len(self.s_test):] = 0
+            self.b_test = None
+            self.s_test = None
 
             self._dump_to_hdf5(*self.dataset_names_tree)
@@ -523,7 +536,10 @@
                 else:
                     raise ValueError("Scaler type {} unknown".format(self.scaler_type))
                 logger.info("Fitting {} to training data".format(self.scaler_type))
+                orig_copy_setting = self.scaler.copy
+                self.scaler.copy = False
                 self._scaler.fit(self.x_train, **scaler_fit_kwargs)
+                self.scaler.copy = orig_copy_setting
                 joblib.dump(self._scaler, filename)
 
         return self._scaler
@@ -565,9 +581,12 @@
             logger.debug("training data before transformation: {}".format(self.x_train))
             logger.debug("minimum values: {}".format([np.min(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
             logger.debug("maximum values: {}".format([np.max(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
+            orig_copy_setting = self.scaler.copy
+            self.scaler.copy = False
             self.x_train = self.scaler.transform(self.x_train)
             logger.debug("training data after transformation: {}".format(self.x_train))
             self.x_test = self.scaler.transform(self.x_test)
+            self.scaler.copy = orig_copy_setting
             self.data_transformed = True
             logger.info("Training and test data transformed")
 
@@ -833,18 +852,18 @@
         self.total_epochs += epochs
         self._write_info("epochs", self.total_epochs)
 
+
+    def evaluate_train_test(self, do_train=True, do_test=True):
         logger.info("Reloading (and re-transforming) unshuffled training data")
         self.load(reload=True)
 
-        logger.info("Create/Update scores for ROC curve")
-        self.scores_test = self.model.predict(self.x_test)
-        self.scores_train = self.model.predict(self.x_train)
-
-        self._dump_to_hdf5("scores_train", "scores_test")
-
-        logger.info("Creating all validation plots")
-        self.plot_all()
-
+        logger.info("Create/Update scores for train/test sample")
+        if do_test:
+            self.scores_test = self.model.predict(self.x_test)
+            self._dump_to_hdf5("scores_test")
+        if do_train:
+            self.scores_train = self.model.predict(self.x_train)
+            self._dump_to_hdf5("scores_train")
 
 
     def evaluate(self, x_eval):
@@ -942,7 +961,10 @@
 
         # range_sig = np.percentile(sig, [1, 99])
        # range_bkg = np.percentile(sig, [1, 99])
         # plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
-        plot_range = weighted_quantile(self.x_train[:,var_index], [0.1, 0.99], sample_weight=self.w_train*np.array(self.class_weight)[self.y_train.astype(int)])
+        plot_range = weighted_quantile(
+            self.x_train[:,var_index], [0.01, 0.99],
+            sample_weight=self.w_train*np.array(self.class_weight)[self.y_train.astype(int)]
+        )
 
         logger.debug("Calculated range based on percentiles: {}".format(plot_range))
@@ -1177,9 +1199,12 @@
             categories=["background", "signal"]
         )
         for identifier in self.identifiers:
-            df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
-                                             self.b_eventlist_train[identifier],
-                                             -1*np.ones(len(self.x_test), dtype="i8")])
+            try:
+                df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
+                                                 self.b_eventlist_train[identifier],
+                                                 -1*np.ones(len(self.x_test), dtype="i8")])
+            except IOError:
+                logger.warning("Can't find eventlist - DataFrame won't contain identifiers")
         df["is_train"] = np.concatenate([np.ones(len(self.x_train), dtype=np.bool),
                                          np.zeros(len(self.x_test), dtype=np.bool)])
         return df
@@ -1210,15 +1235,116 @@
     """
 
     def __init__(self,
+                 name,
+                 df,
                  input_columns,
                  weight_column="weights",
                  label_column="labels",
                  signal_label="signal",
                  background_label="background",
                  split_mode="split_column",
-                 split_colurm="is_train",
+                 split_column="is_train",
                  **kwargs):
-        pass
+
+        self.df = df
+        self.input_columns = input_columns
+        self.weight_column = weight_column
+        self.label_column = label_column
+        self.signal_label = signal_label
+        self.background_label = background_label
+        if split_mode != "split_column":
+            raise NotImplementedError("'split_column' is the only currently supported split mode")
+        self.split_mode = split_mode
+        self.split_column = split_column
+        super(ClassificationProjectDataFrame, self).__init__(name,
+                                                             signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
+                                                             **kwargs)
+        self._x_train = None
+        self._x_test = None
+        self._y_train = None
+        self._y_test = None
+        self._w_train = None
+        self._w_test = None
+
+    @property
+    def x_train(self):
+        if self._x_train is None:
+            self._x_train = self.df[self.df[self.split_column]][self.input_columns].values
+        return self._x_train
+
+    @x_train.setter
+    def x_train(self, value):
+        self._x_train = value
+
+    @property
+    def x_test(self):
+        if self._x_test is None:
+            self._x_test = self.df[~self.df[self.split_column]][self.input_columns].values
+        return self._x_test
+
+    @x_test.setter
+    def x_test(self, value):
+        self._x_test = value
+
+    @property
+    def y_train(self):
+        if self._y_train is None:
+            self._y_train = (self.df[self.df[self.split_column]][self.label_column] == self.signal_label).values
+        return self._y_train
+
+    @y_train.setter
+    def y_train(self, value):
+        self._y_train = value
+
+    @property
+    def y_test(self):
+        if self._y_test is None:
+            self._y_test = (self.df[~self.df[self.split_column]][self.label_column] == self.signal_label).values
+        return self._y_test
+
+    @y_test.setter
+    def y_test(self, value):
+        self._y_test = value
+
+    @property
+    def w_train(self):
+        if self._w_train is None:
+            self._w_train = self.df[self.df[self.split_column]][self.weight_column].values
+        return self._w_train
+
+    @w_train.setter
+    def w_train(self, value):
+        self._w_train = value
+
+    @property
+    def w_test(self):
+        if self._w_test is None:
+            self._w_test = self.df[~self.df[self.split_column]][self.weight_column].values
+        return self._w_test
+
+    @w_test.setter
+    def w_test(self, value):
+        self._w_test = value
+
+    @property
+    def fields(self):
+        return self.input_columns
+
+
+    def load(self, reload=False):
+
+        if reload:
+            self.data_loaded = False
+            self.data_transformed = False
+            self._x_train = None
+            self._x_test = None
+            self._y_train = None
+            self._y_test = None
+            self._w_train = None
+            self._w_test = None
+
+        if not self.data_transformed:
+            self._transform_data()
 
 
 class ClassificationProjectRNN(ClassificationProject):
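
Minimal usage sketch for the new ClassificationProjectDataFrame entry point introduced above. The column names follow the defaults visible in the constructor signature ("weights", "labels", "is_train", "signal"/"background"); the toy DataFrame, the extra keyword arguments and the train(epochs=...) call are illustrative assumptions rather than part of this change. The new stop_train/stop_test options only affect the ROOT-tree reading path of ClassificationProject and are therefore not exercised here.

# usage sketch, assuming a working toolkit environment (keras etc.)
import numpy as np
import pandas as pd

from toolkit import ClassificationProjectDataFrame

rng = np.random.RandomState(42)
n = 1000

# toy DataFrame in the layout expected by the new class: feature columns,
# a "weights" column, a "labels" column with "signal"/"background" entries
# and a boolean "is_train" split column
df = pd.DataFrame({
    "var1": rng.normal(0, 1, n),
    "var2": rng.normal(0, 1, n),
    "weights": np.ones(n),
    "labels": np.where(rng.uniform(size=n) > 0.5, "signal", "background"),
    "is_train": rng.uniform(size=n) > 0.5,
})

project = ClassificationProjectDataFrame(
    "example_project",            # project name, forwarded to ClassificationProject
    df,
    input_columns=["var1", "var2"],
    optimizer="Adam",             # further options fall through via **kwargs (assumed)
)

project.train(epochs=10)                                   # assumed signature
project.evaluate_train_test(do_train=False, do_test=True)  # only score the test sample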