diff --git a/test/test_toolkit.py b/test/test_toolkit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecd3aa244cd2269ae24a3386eaf990272d0c4871
--- /dev/null
+++ b/test/test_toolkit.py
@@ -0,0 +1,88 @@
+import pytest
+import numpy as np
+import root_numpy
+import pandas as pd
+from sklearn.datasets import make_classification
+from keras.layers import GRU
+
+from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
+
+def create_dataset(path):
+
+    # create an example dataset with (low-weighted) noise added
+    X, y = make_classification(n_samples=10000, random_state=1)
+    X2 = np.random.normal(size=20*10000).reshape(-1, 20)
+    y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
+    X = np.concatenate([X, X2])
+    y = np.concatenate([y, y2])
+    w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
+
+    # shift and scale randomly (to check that the transformation works)
+    shift = np.random.rand(20)*100
+    scale = np.random.rand(20)*1000
+    X *= scale
+    X += shift
+
+    # write to root files
+    branches = ["var_{}".format(i) for i in range(len(X[0]))]
+    df = pd.DataFrame(X, columns=branches)
+    df["class"] = y
+    df["weight"] = w
+    tree_path_bkg = str(path / "bkg.root")
+    tree_path_sig = str(path / "sig.root")
+    root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
+    root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
+    return branches, tree_path_sig, tree_path_bkg
+
+
+def test_ClassificationProject(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProject(
+        str(tmp_path / "project"),
+        bkg_trees=[(tree_path_bkg, "tree")],
+        signal_trees=[(tree_path_sig, "tree")],
+        branches=branches,
+        weight_expr="weight",
+        identifiers=["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
+
+
+def test_ClassificationProjectRNN(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProjectRNN(
+        str(tmp_path / "project"),
+        bkg_trees=[(tree_path_bkg, "tree")],
+        signal_trees=[(tree_path_sig, "tree")],
+        branches=branches,
+        recurrent_field_names=[
+            [
+                ["var_1", "var_2", "var_3"],
+                ["var_4", "var_5", "var_6"]
+            ],
+            [
+                ["var_10", "var_11", "var_12"],
+                ["var_13", "var_14", "var_15"]
+            ],
+        ],
+        weight_expr="weight",
+        identifiers=["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
+    assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
diff --git a/toolkit.py b/toolkit.py
index a0be319112eca728362cb312e91d792b46fafa0f..eb9234407031b53cff85267e961a3c660169559c 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -193,6 +193,8 @@ class ClassificationProject(object):
 
     :param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
 
+    :param shuffle: shuffle the training data after each epoch (and before the first one)
+
     """
@@ -264,7 +266,9 @@ class ClassificationProject(object):
                  apply_class_weight=True,
                  normalize_weights=True,
                  ignore_neg_weights=False,
-                 kernel_initializer=None):
+                 kernel_initializer=None,
+                 shuffle=True,
+                 ):
 
         self.name = name
         self.signal_trees = signal_trees
@@ -348,6 +352,7 @@ class ClassificationProject(object):
         self.normalize_weights = normalize_weights
         self.ignore_neg_weights = ignore_neg_weights
         self.kernel_initializer = kernel_initializer
+        self.shuffle = shuffle
 
         self.s_train = None
         self.b_train = None
@@ -381,8 +386,6 @@ class ClassificationProject(object):
 
         self.total_epochs = 0
 
         self.data_loaded = False
-        self.data_transformed = False
-        self.data_shuffled = False
 
         # track if we are currently training
         self.is_training = False
@@ -488,7 +491,6 @@ class ClassificationProject(object):
             self._dump_to_hdf5(*self.dataset_names_tree)
 
         self.data_loaded = True
-        self.data_shuffled = False
 
 
     def _dump_training_list(self):
@@ -659,30 +661,6 @@ class ClassificationProject(object):
             json.dump(self.history.history, of)
 
 
-    def _transform_data(self):
-        if not self.data_transformed:
-            if self.mask_value is not None:
-                self.x_train[self.x_train == self.mask_value] = np.nan
-                self.x_test[self.x_test == self.mask_value] = np.nan
-            if logger.level <= logging.DEBUG:
-                logger.debug("training data before transformation: {}".format(self.x_train))
-                logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
-                                                          for i in range(self.x_train.shape[1])]))
-                logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
-                                                          for i in range(self.x_train.shape[1])]))
-            orig_copy_setting = self.scaler.copy
-            self.scaler.copy = False
-            self.x_train = self.scaler.transform(self.x_train)
-            logger.debug("training data after transformation: {}".format(self.x_train))
-            self.x_test = self.scaler.transform(self.x_test)
-            self.scaler.copy = orig_copy_setting
-            if self.mask_value is not None:
-                self.x_train[np.isnan(self.x_train)] = self.mask_value
-                self.x_test[np.isnan(self.x_test)] = self.mask_value
-            self.data_transformed = True
-            logger.info("Training and test data transformed")
-
-
     def _read_info(self, key, default):
         filename = os.path.join(self.project_dir, "info.json")
         if not os.path.exists(filename):
@@ -825,14 +803,10 @@ class ClassificationProject(object):
 
         if reload:
             self.data_loaded = False
-            self.data_transformed = False
 
         if not self.data_loaded:
             self._load_data()
 
-        if not self.data_transformed:
-            self._transform_data()
-
 
     @property
     def w_train_tot(self):
@@ -855,23 +829,27 @@ class ClassificationProject(object):
 
     @property
     def validation_data(self):
-        "Validation data"
+        "(Transformed) validation data for loss evaluation"
         idx = self.train_val_idx[1]
-        return self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_val_input = self.get_input_list(self.transform(x_val))
+        return x_val_input, y_val, w_val
 
 
     @property
     def training_data(self):
-        "Training data with validation data split off"
+        "(Transformed) training data with the validation data split off"
         idx = self.train_val_idx[0]
-        return self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_train_input = self.get_input_list(self.transform(x_train))
+        return x_train_input, y_train, w_train
 
 
     @property
     def train_val_idx(self):
         if self._train_val_idx is None:
             if self.kfold_splits is not None:
-                kfold = KFold(self.kfold_splits, shuffle=True, random_state=self.shuffle_seed)
+                kfold = KFold(self.kfold_splits, shuffle=self.shuffle, random_state=self.shuffle_seed)
                 for i, train_val_idx in enumerate(kfold.split(self.x_train)):
                     if i == self.kfold_index:
                         self._train_val_idx = train_val_idx
@@ -881,7 +859,10 @@ class ClassificationProject(object):
             else:
                 split_index = int((1-self.validation_split)*len(self.x_train))
                 np.random.seed(self.shuffle_seed)
-                shuffled_idx = np.random.permutation(len(self.x_train))
+                if self.shuffle:
+                    shuffled_idx = np.random.permutation(len(self.x_train))
+                else:
+                    shuffled_idx = np.arange(len(self.x_train))
                 self._train_val_idx = (shuffled_idx[:split_index], shuffled_idx[split_index:])
         return self._train_val_idx
@@ -891,17 +872,30 @@ class ClassificationProject(object):
         return int(float(len(self.train_val_idx[0]))/float(self.batch_size))
 
 
+    def get_input_list(self, x):
+        "For the standard Dense models with a single input, this does nothing"
+        return x
+
+
     def yield_batch(self):
+        "Batch generator - optionally shuffles the indices after each epoch"
         x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
         train_idx = list(self.train_val_idx[0])
         np.random.seed(self.shuffle_seed+1)
+        logger.info("Generating training batches from {} signal and {} background events"
+                    .format(len(np.where(self.y_train[train_idx]==1)[0]),
+                            len(np.where(self.y_train[train_idx]==0)[0])))
         while True:
-            shuffled_idx = np.random.permutation(train_idx)
+            if self.shuffle:
+                shuffled_idx = np.random.permutation(train_idx)
+            else:
+                shuffled_idx = train_idx
             for start in range(0, len(shuffled_idx), int(self.batch_size)):
                 x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
                 y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
                 w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
-                yield (x_batch, y_batch, w_batch)
+                x_input = self.get_input_list(self.transform(x_batch))
+                yield (x_input, y_batch, w_batch)
 
 
     def yield_single_class_batch(self, class_label):
@@ -913,7 +907,10 @@ class ClassificationProject(object):
         class_idx = np.where(y_train==class_label)[0]
         while True:
             # shuffle the indices for this class label
-            shuffled_idx = np.random.permutation(class_idx)
+            if self.shuffle:
+                shuffled_idx = np.random.permutation(class_idx)
+            else:
+                shuffled_idx = class_idx
             # yield them batch wise
             for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
                 yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
@@ -996,8 +993,6 @@ class ClassificationProject(object):
     def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
-        logger.info("Reloading (and re-transforming) unshuffled training data")
-        self.load(reload=True)
 
         if mode is not None:
             self._write_info("scores_mode", mode)
@@ -1011,7 +1006,37 @@ class ClassificationProject(object):
             self._dump_to_hdf5("scores_train")
 
 
+    def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
+        "Calculate the scores for the training and test samples"
+
+        if mode is not None:
+            self._write_info("scores_mode", mode)
+
+        def eval_score(data_name):
+            logger.info("Create/Update scores for {} sample".format(data_name))
+            n_events = len(getattr(self, "x_"+data_name))
+            setattr(self, "scores_"+data_name, np.empty(n_events))
+            for start in range(0, n_events, batch_size):
+                stop = start+batch_size
+                getattr(self, "scores_"+data_name)[start:stop] = (
+                    self.predict(
+                        self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
+                        mode=mode
+                    ).reshape(-1)
+                )
+            self._dump_to_hdf5("scores_"+data_name)
+
+        if do_test:
+            eval_score("test")
+        if do_train:
+            eval_score("train")
+
+
     def predict(self, x, mode=None):
+        """
+        Calculate the scores for a (transformed) array of input values.
+        If the array is not transformed, use `evaluate` instead.
+        """
         if mode is None:
             # normal output - after activation function output layer
             return self.model.predict(x)
@@ -1031,6 +1056,10 @@ class ClassificationProject(object):
 
     def evaluate(self, x_eval, mode=None):
+        """
+        Calculate the scores for an array of input values.
+        All necessary transformations are applied.
+        """
         logger.debug("Evaluate score for {}".format(x_eval))
         x_eval = self.transform(x_eval)
         logger.debug("Evaluate for transformed array: {}".format(x_eval))
@@ -1179,8 +1208,6 @@ class ClassificationProject(object):
             ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
             label = branch
-            if self.data_transformed:
-                label += " (transformed)"
             ax.set_xlabel(label)
         if fig is not None:
             plot_dir = os.path.join(self.project_dir, "plots")
@@ -1689,7 +1716,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
 
         if reload:
             self.data_loaded = False
-            self.data_transformed = False
             self._x_train = None
             self._x_test = None
             self._y_train = None
@@ -1700,9 +1726,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
 
             self.data_loaded = True
 
-        if not self.data_transformed:
-            self._transform_data()
-
 
 class ClassificationProjectRNN(ClassificationProject):
 
@@ -1838,7 +1861,10 @@ class ClassificationProjectRNN(ClassificationProject):
 
     def get_input_list(self, x):
-        "Format the input starting from flat ntuple"
+        """
+        Returns a list of 3-dimensional inputs for each
+        recurrent layer and a 2-dimensional one for the normal flat inputs.
+        """
         x_input = []
         for field_idx in self.recurrent_field_idx:
             x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
@@ -1849,7 +1875,7 @@ class ClassificationProjectRNN(ClassificationProject):
 
     def get_input_flat(self, x):
-        "Transform input back to flat ntuple"
+        "Transform the multiple inputs back to a flat ntuple"
         nevent = x[0].shape[0]
         x_flat = np.empty((nevent, len(self.fields)), dtype=np.float)
         # recurrent fields
@@ -1864,58 +1890,6 @@ class ClassificationProjectRNN(ClassificationProject):
         return x_flat
 
 
-    def yield_batch(self):
-        x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
-        train_idx = list(self.train_val_idx[0])
-        np.random.seed(self.shuffle_seed+1)
-        logger.info("Generating training batches from {} signal and {} background events"
-                    .format(len(np.where(self.y_train[train_idx]==1)[0]),
-                            len(np.where(self.y_train[train_idx]==0)[0])))
-        while True:
-            shuffled_idx = np.random.permutation(train_idx)
-            for start in range(0, len(shuffled_idx), int(self.batch_size)):
-                x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
-                y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
-                w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
-                x_input = self.get_input_list(x_batch)
-                yield (x_input, y_batch, w_batch)
-
-
-    @property
-    def validation_data(self):
-        "class weighted validation data. Attention: Shuffle training data before using this!"
-        x_val, y_val, w_val = super(ClassificationProjectRNN, self).validation_data
-        x_val_input = self.get_input_list(x_val)
-        return x_val_input, y_val, w_val
-
-
-    def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
-        logger.info("Reloading (and re-transforming) unshuffled training data")
-        self.load(reload=True)
-
-        if mode is not None:
-            self._write_info("scores_mode", mode)
-
-        def eval_score(data_name):
-            logger.info("Create/Update scores for {} sample".format(data_name))
-            n_events = len(getattr(self, "x_"+data_name))
-            setattr(self, "scores_"+data_name, np.empty(n_events))
-            for start in range(0, n_events, batch_size):
-                stop = start+batch_size
-                getattr(self, "scores_"+data_name)[start:stop] = (
-                    self.predict(
-                        self.get_input_list(getattr(self, "x_"+data_name)[start:stop]),
-                        mode=mode
-                    ).reshape(-1)
-                )
-            self._dump_to_hdf5("scores_"+data_name)
-
-        if do_test:
-            eval_score("test")
-        if do_train:
-            eval_score("train")
-
-
     def evaluate(self, x_eval, mode=None):
         logger.debug("Evaluate score for {}".format(x_eval))
         x_eval = self.transform(x_eval)
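
Note on the pattern this patch converges on: `x_train` and `x_test` now stay in their original (untransformed) units, and the scaler is applied on the fly - per batch in `yield_batch`, on the split-off indices in `validation_data`/`training_data`, and batch-wise in `evaluate_train_test`. A minimal, self-contained sketch of that idea, assuming a fitted sklearn `StandardScaler` stands in for the project's scaler (`yield_scaled_batches` is an illustrative helper, not part of the toolkit API):

import numpy as np
from sklearn.preprocessing import StandardScaler

def yield_scaled_batches(x, y, w, scaler, batch_size=256, shuffle=True, seed=1234):
    "Yield (x, y, w) batches endlessly, transforming each x batch on the fly."
    rng = np.random.RandomState(seed)
    idx = np.arange(len(x))
    while True:
        # re-shuffle before each epoch, like yield_batch does when self.shuffle is True
        batch_order = rng.permutation(idx) if shuffle else idx
        for start in range(0, len(batch_order), batch_size):
            sel = batch_order[start:start+batch_size]
            # transform only the current batch; the stored arrays stay untouched
            yield scaler.transform(x[sel]), y[sel], w[sel]

# usage sketch on toy data
x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 20))
y = np.random.randint(2, size=1000)
w = np.ones(1000)
scaler = StandardScaler().fit(x)  # fit once on the full training sample
x_batch, y_batch, w_batch = next(yield_scaled_batches(x, y, w, scaler))

Transforming per batch is what makes `_transform_data` and the `data_transformed`/`data_shuffled` bookkeeping removable: there is no destructive in-place transformation left whose state would have to be tracked, and the untransformed arrays remain available for the input plots.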