diff --git a/__init__.py b/__init__.py
index e938f9e61047d1a58d7e0b1ef1c36243b77e682a..c2807c606b193f18802503a3c93270bb5636cbf3 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,3 @@
-from .toolkit import ClassificationProject
-from .compare import overlay_ROC, overlay_loss
-from .add_friend import add_friend
+from .toolkit import *
+from .compare import *
+from .add_friend import *
diff --git a/plotting.py b/plotting.py
index db24def073d5f9aaa7a1eb55f86e3e508ec3bb11..ae25dfc096feffa92f3470e134155b7439638f6c 100644
--- a/plotting.py
+++ b/plotting.py
@@ -120,11 +120,13 @@ def plot_NN_vs_var_2D(plotname, means,
 
 def plot_NN_vs_var_2D_all(plotname, model, means,
-                          var1_index, var1_range,
-                          var2_index, var2_range,
+                          varx_index,
+                          vary_index,
+                          nbinsx, xmin, xmax,
+                          nbinsy, ymin, ymax,
                           transform_function=None,
-                          var1_label=None,
-                          var2_label=None,
+                          varx_label=None,
+                          vary_label=None,
                           zrange=None, logz=False,
                           plot_last_layer=False,
                           log_default_ymin=1e-5,
@@ -132,15 +134,15 @@
 
     "Similar to plot_NN_vs_var_2D, but creates a grid of plots for all neurons."
 
-    var1_vals = np.arange(*var1_range)
-    var2_vals = np.arange(*var2_range)
+    varx_vals = np.linspace(xmin, xmax, nbinsx)
+    vary_vals = np.linspace(ymin, ymax, nbinsy)
 
     # create the events for which we want to fetch the activations
-    events = np.tile(means, len(var1_vals)*len(var2_vals)).reshape(len(var2_vals), len(var1_vals), -1)
-    for i, y in enumerate(var2_vals):
-        for j, x in enumerate(var1_vals):
-            events[i][j][var1_index] = x
-            events[i][j][var2_index] = y
+    events = np.tile(means, len(varx_vals)*len(vary_vals)).reshape(len(vary_vals), len(varx_vals), -1)
+    for i, y in enumerate(vary_vals):
+        for j, x in enumerate(varx_vals):
+            events[i][j][varx_index] = x
+            events[i][j][vary_index] = y
 
     # convert back into 1d array
     events = events.reshape(-1, len(means))
@@ -187,7 +189,7 @@
     for layer in range(layers):
         for neuron in range(len(acts[layer][0])):
             acts_neuron = acts[layer][:,neuron]
-            acts_neuron = acts_neuron.reshape(len(var2_vals), len(var1_vals))
+            acts_neuron = acts_neuron.reshape(len(vary_vals), len(varx_vals))
             ax = grid_array[neuron][layer]
             extra_opts = {}
             if not (plot_last_layer and layer == layers-1):
@@ -200,12 +202,12 @@
                 extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
             else:
                 extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
-            im = ax.pcolormesh(var1_vals, var2_vals, acts_neuron, cmap=cmap, linewidth=0, rasterized=True, **extra_opts)
+            im = ax.pcolormesh(varx_vals, vary_vals, acts_neuron, cmap=cmap, linewidth=0, rasterized=True, **extra_opts)
             ax.set_facecolor("black")
-            if var1_label is not None:
-                ax.set_xlabel(var1_label)
-            if var2_label is not None:
-                ax.set_ylabel(var2_label)
+            if varx_label is not None:
+                ax.set_xlabel(varx_label)
+            if vary_label is not None:
+                ax.set_ylabel(vary_label)
             ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
 
     cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
@@ -342,6 +344,8 @@ if __name__ == "__main__":
 
     def test_mean_signal():
 
+        c._load_data() # untransformed
+
         mean_signal = get_mean_event(c.x_test, c.y_test, 1)
 
         print("Mean signal: ")
@@ -371,9 +375,11 @@ if __name__ == "__main__":
         plot_NN_vs_var_2D_all("mt_vs_met_all.pdf", means=mean_signal,
                               model=c.model, transform_function=c.scaler.transform,
-                              var1_index=c.fields.index("met"), var1_range=(0, 1000, 10),
-                              var2_index=c.fields.index("mt"), var2_range=(0, 500, 10),
-                              var1_label="met [GeV]", var2_label="mt [GeV]")
+                              varx_index=c.fields.index("met"),
+                              vary_index=c.fields.index("mt"),
+                              nbinsx=100, xmin=0, xmax=1000,
+                              nbinsy=100, ymin=0, ymax=500,
+                              varx_label="met [GeV]", vary_label="mt [GeV]")
 
         plot_NN_vs_var_2D("mt_vs_met_crosscheck.pdf", means=mean_signal,
                           scorefun=get_single_neuron_function(c.model, layer=3, neuron=0, scaler=c.scaler),
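
Annotation (not part of the patch): the old `var1_range`/`var2_range` tuples were expanded with `np.arange`, while the new `nbinsx`/`xmin`/`xmax` arguments go through `np.linspace`, so the upper edge is now included and the number of sample points is explicit. A quick standalone comparison:

    import numpy as np

    # old convention: (start, stop, step) via np.arange -- upper edge excluded
    old_vals = np.arange(*(0, 1000, 10))    # 100 points: 0, 10, ..., 990

    # new convention: explicit point count via np.linspace -- upper edge included
    new_vals = np.linspace(0, 1000, 100)    # 100 points: 0, ~10.1, ..., 1000
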
diff --git a/scripts/eval_model.py b/scripts/eval_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..63c4bbde062d0532c7abc632d86e83e8c3969e8e
--- /dev/null
+++ b/scripts/eval_model.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+import os
+import argparse
+
+import keras
+import h5py
+from sklearn.metrics import roc_curve, auc
+import matplotlib.pyplot as plt
+import numpy as np
+
+from KerasROOTClassification import ClassificationProject
+
+parser = argparse.ArgumentParser(description='Evaluate a model from a classification project using the given '
+                                             'weights and plot the ROC curve and train/test overlayed scores')
+parser.add_argument("project_dir")
+parser.add_argument("weights")
+parser.add_argument("-p", "--plot-prefix", default="eval_nn")
+args = parser.parse_args()
+
+c = ClassificationProject(args.project_dir)
+
+c.model.load_weights(args.weights)
+
+print("Predicting for test sample ...")
+scores_test = c.evaluate(c.x_test)
+print("Done")
+
+fpr, tpr, threshold = roc_curve(c.y_test, scores_test, sample_weight = c.w_test)
+fpr = 1.0 - fpr
+try:
+    roc_auc = auc(tpr, fpr)
+except ValueError:
+    print("Got a value error from auc - trying to rerun with reorder=True")
+    roc_auc = auc(tpr, fpr, reorder=True)
+
+plt.grid(color='gray', linestyle='--', linewidth=1)
+plt.plot(tpr, fpr, label=str(c.name + " (AUC = {})".format(roc_auc)))
+plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
+plt.ylabel("Background rejection")
+plt.xlabel("Signal efficiency")
+plt.title('Receiver operating characteristic')
+plt.xlim(0,1)
+plt.ylim(0,1)
+plt.xticks(np.arange(0,1,0.1))
+plt.yticks(np.arange(0,1,0.1))
+plt.legend(loc='lower left', framealpha=1.0)
+plt.savefig(args.plot_prefix+"_ROC.pdf")
+plt.clf()
+
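
Annotation (not part of the patch): the script flips the false positive rate into a background rejection before integrating. A self-contained toy example of the same convention, with invented labels, scores and weights:

    import numpy as np
    from sklearn.metrics import roc_curve, auc

    y_true = np.array([0, 0, 1, 1])            # invented labels
    scores = np.array([0.1, 0.4, 0.35, 0.8])   # invented classifier outputs
    weights = np.array([1.0, 1.0, 2.0, 0.5])   # invented event weights

    fpr, tpr, _ = roc_curve(y_true, scores, sample_weight=weights)
    bkg_rejection = 1.0 - fpr
    # tpr is non-decreasing, so auc can integrate directly; reorder=True is
    # only needed as a fallback, as in the except branch above
    print(auc(tpr, bkg_rejection))
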
diff --git a/scripts/plot_NN_2D.py b/scripts/plot_NN_2D.py
index ba54b1eb6551d2bb2baefb985d9cbfc7b43557b3..108c35d53d6e18cf59956b6a11cefde384b51b00 100755
--- a/scripts/plot_NN_2D.py
+++ b/scripts/plot_NN_2D.py
@@ -7,15 +7,20 @@ logging.basicConfig()
 
 import numpy as np
 
+import ROOT
+ROOT.gROOT.SetBatch()
+ROOT.PyConfig.IgnoreCommandLineOptions = True
+
 from KerasROOTClassification import ClassificationProject
 from KerasROOTClassification.plotting import (
     get_mean_event,
     plot_NN_vs_var_2D,
     plot_profile_2D,
     plot_hist_2D_events,
-    plot_cond_avg_actmax_2D
+    plot_cond_avg_actmax_2D,
+    plot_NN_vs_var_2D_all,
 )
-from KerasROOTClassification.utils import get_single_neuron_function, get_max_activation_events
+from KerasROOTClassification.utils import get_single_neuron_function, get_max_activation_events, weighted_quantile
 
 parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
 parser.add_argument("project_dir")
@@ -27,6 +32,7 @@ parser.add_argument("-m", "--mode", default="mean_sig")
 parser.add_argument("-l", "--layer", type=int, help="Layer index (takes last layer by default)")
 parser.add_argument("-n", "--neuron", type=int, default=0, help="Neuron index (takes first neuron by default)")
+parser.add_argument("-a", "--all-neurons", action="store_true", help="Create a summary plot for all neurons in all hidden layers")
 parser.add_argument("--log", action="store_true", help="Plot in color in log scale")
 parser.add_argument("--contour", action="store_true", help="Interpolate with contours")
 parser.add_argument("-b", "--nbins", default=20, type=int, help="Number of bins in x and y direction")
@@ -42,6 +48,9 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa
 
 args = parser.parse_args()
 
+if args.all_neurons and (not args.mode.startswith("mean")):
+    parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
+
 if args.verbose:
     logging.getLogger().setLevel(logging.DEBUG)
@@ -64,8 +73,13 @@ else:
     varx_label = args.varx
     vary_label = args.vary
 
-percentilesx = np.percentile(c.x_test[:,varx_index], [1,99])
-percentilesy = np.percentile(c.x_test[:,vary_index], [1,99])
+# percentilesx = np.percentile(c.x_test[:,varx_index], [1,99])
+# percentilesy = np.percentile(c.x_test[:,vary_index], [1,99])
+
+total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]
+
+percentilesx = weighted_quantile(c.x_test[:,varx_index], [0.1, 0.99], sample_weight=total_weights)
+percentilesy = weighted_quantile(c.x_test[:,vary_index], [0.1, 0.99], sample_weight=total_weights)
 
 if args.xrange is not None:
     if len(args.xrange) < 3:
@@ -90,17 +104,31 @@ if args.mode.startswith("mean"):
     elif args.mode == "mean_bkg":
         means = get_mean_event(c.x_test, c.y_test, 0)
 
-    plot_NN_vs_var_2D(
-        args.output_filename,
-        means=means,
-        varx_index=varx_index,
-        vary_index=vary_index,
-        scorefun=get_single_neuron_function(c.model, layer, neuron, scaler=c.scaler),
-        xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
-        ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
-        varx_label=varx_label, vary_label=vary_label,
-        logscale=args.log, only_pixels=(not args.contour)
-    )
+    if not args.all_neurons:
+        plot_NN_vs_var_2D(
+            args.output_filename,
+            means=means,
+            varx_index=varx_index,
+            vary_index=vary_index,
+            scorefun=get_single_neuron_function(c.model, layer, neuron, scaler=c.scaler),
+            xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
+            ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
+            varx_label=varx_label, vary_label=vary_label,
+            logscale=args.log, only_pixels=(not args.contour)
+        )
+    else:
+        plot_NN_vs_var_2D_all(
+            args.output_filename,
+            means=means,
+            model=c.model,
+            transform_function=c.scaler.transform,
+            varx_index=varx_index,
+            vary_index=vary_index,
+            xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
+            ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
+            logz=args.log,
+            plot_last_layer=False,
+        )
 
 elif args.mode.startswith("profile"):
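
Annotation (not part of the patch): `total_weights` combines per-event weights with per-class weights through integer fancy indexing. A toy numpy sketch of the same lookup (all numbers invented):

    import numpy as np

    w_test = np.array([1.0, 0.5, 2.0, 1.5])   # per-event weights
    y_test = np.array([0., 1., 1., 0.])       # labels stored as floats/bools
    class_weight = [0.1, 10.0]                # [background, signal], hypothetical

    total_weights = w_test * np.array(class_weight)[y_test.astype(int)]
    print(total_weights)                      # [ 0.1   5.   20.    0.15]
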
diff --git a/toolkit.py b/toolkit.py
index ce6307879ed23daef2818f61ed40284a47429901..ada4e54c9118a414b70a644e6e44dad9b78f5764 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 
+__all__ = ["ClassificationProject", "ClassificationProjectDataFrame"]
+
 from sys import version_info
 
 if version_info[0] > 2:
@@ -10,13 +12,13 @@ else:
 
 import os
 import json
-import yaml
 import pickle
 import importlib
 import csv
 import math
 import glob
 import shutil
+import gc
 
 import logging
 logger = logging.getLogger("KerasROOTClassification")
@@ -32,7 +34,7 @@ from sklearn.metrics import roc_curve, auc
 from keras.models import Sequential
 from keras.layers import Dense, Dropout
 from keras.models import model_from_json
-from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint
+from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
 from keras.optimizers import SGD
 import keras.optimizers
 import matplotlib.pyplot as plt
@@ -41,15 +43,15 @@ from .utils import WeightedRobustScaler, weighted_quantile
 
 # configure number of cores
 # this doesn't seem to work, but at least with these settings keras only uses 4 processes
-import tensorflow as tf
-from keras import backend as K
-num_cores = 1
-config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
-                        inter_op_parallelism_threads=num_cores,
-                        allow_soft_placement=True,
-                        device_count = {'CPU': num_cores})
-session = tf.Session(config=config)
-K.set_session(session)
+# import tensorflow as tf
+# from keras import backend as K
+# num_cores = 1
+# config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
+#                         inter_op_parallelism_threads=num_cores,
+#                         allow_soft_placement=True,
+#                         device_count = {'CPU': num_cores})
+# session = tf.Session(config=config)
+# K.set_session(session)
 
 import ROOT
@@ -69,6 +71,7 @@ def byteify(input):
 if version_info[0] > 2:
     byteify = lambda input : input
 
+
 class ClassificationProject(object):
 
     """Simple framework to load data from ROOT TTrees and train Keras
@@ -110,7 +113,9 @@
     :param nodes: list number of nodes in each layer. If only a single number is given,
                   use this number for every layer
 
-    :param dropout: dropout fraction after each hidden layer. Set to None for no Dropout
+    :param dropout: dropout fraction after each hidden layer. You can also pass a list for dropout fractions for each layer. Set to None for no Dropout.
+
+    :param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.
 
     :param batch_size: size of the training batches
@@ -128,6 +133,10 @@
     :param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)
 
+    :param stop_train: stop after this number of events for reading in training events
+
+    :param stop_test: stop after this number of events for reading in test events
+
     :param optimizer: name of optimizer class in keras.optimizers
 
     :param optimizer_opts: dictionary of options for the optimizer
@@ -143,6 +152,10 @@
     you change the format of the saved model weights it has to be
     of the form "weights*.h5"
 
+    :param use_tensorboard: if True, use the tensorboard callback to write logs for tensorboard
+
+    :param tensorboard_opts: options for the TensorBoard callback
+
     :param balance_dataset: if True, balance the dataset instead of applying class weights.
                             Only a fraction of the overrepresented class will be used
                             in each epoch, but different subsets of the
@@ -196,6 +209,7 @@ class ClassificationProject(object):
                  layers=3,
                  nodes=64,
                  dropout=None,
+                 dropout_input=None,
                  batch_size=128,
                  validation_split=0.33,
                  activation_function='relu',
@@ -203,12 +217,16 @@
                  scaler_type="WeightedRobustScaler",
                  step_signal=2,
                  step_bkg=2,
+                 stop_train=None,
+                 stop_test=None,
                  optimizer="SGD",
                  optimizer_opts=None,
                  use_earlystopping=True,
                  earlystopping_opts=None,
                  use_modelcheckpoint=True,
                  modelcheckpoint_opts=None,
+                 use_tensorboard=False,
+                 tensorboard_opts=None,
                  random_seed=1234,
                  balance_dataset=False,
                  loss='binary_crossentropy'):
@@ -243,6 +261,11 @@
             logger.warning("Number of layers not equal to the given nodes "
                            "per layer - adjusted to " + str(self.layers))
         self.dropout = dropout
+        if not isinstance(self.dropout, list):
+            self.dropout = [self.dropout for i in range(self.layers)]
+        if len(self.dropout) != self.layers:
+            raise ValueError("List of dropout fractions has to be of equal size as the number of layers!")
+        self.dropout_input = dropout_input
         self.batch_size = batch_size
         self.validation_split = validation_split
         self.activation_function = activation_function
@@ -250,9 +273,12 @@
         self.scaler_type = scaler_type
         self.step_signal = step_signal
         self.step_bkg = step_bkg
+        self.stop_train = stop_train
+        self.stop_test = stop_test
         self.optimizer = optimizer
         self.use_earlystopping = use_earlystopping
         self.use_modelcheckpoint = use_modelcheckpoint
+        self.use_tensorboard = use_tensorboard
         if optimizer_opts is None:
             optimizer_opts = dict()
         self.optimizer_opts = optimizer_opts
@@ -266,6 +292,11 @@
                 filepath="weights.h5"
             )
         self.modelcheckpoint_opts = modelcheckpoint_opts
+        self.tensorboard_opts = dict(
+            log_dir=os.path.join(self.project_dir, "tensorboard"),
+        )
+        if tensorboard_opts is not None:
+            self.tensorboard_opts.update(**tensorboard_opts)
         self.random_seed = random_seed
         self.balance_dataset = balance_dataset
         self.loss = loss
@@ -349,19 +380,19 @@
             self.s_train = tree2array(signal_chain,
                                       branches=self.branches+[self.weight_expr]+self.identifiers,
                                       selection=self.selection,
-                                      start=0, step=self.step_signal)
+                                      start=0, step=self.step_signal, stop=self.stop_train)
             self.b_train = tree2array(bkg_chain,
                                       branches=self.branches+[self.weight_expr]+self.identifiers,
                                       selection=self.selection,
-                                      start=0, step=self.step_bkg)
+                                      start=0, step=self.step_bkg, stop=self.stop_train)
             self.s_test = tree2array(signal_chain,
                                      branches=self.branches+[self.weight_expr],
                                      selection=self.selection,
-                                     start=1, step=self.step_signal)
+                                     start=1, step=self.step_signal, stop=self.stop_test)
             self.b_test = tree2array(bkg_chain,
                                      branches=self.branches+[self.weight_expr],
                                      selection=self.selection,
-                                     start=1, step=self.step_bkg)
+                                     start=1, step=self.step_bkg, stop=self.stop_test)
 
             self.rename_fields(self.s_train)
             self.rename_fields(self.b_train)
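
Annotation (not part of the patch): a hedged sketch of a constructor call exercising the options added above; tree names, file paths and all numbers are made up, only the keyword names follow the new signature:

    c = ClassificationProject(
        "example_project",                        # hypothetical project name
        signal_trees=[("signal.root", "tree")],   # hypothetical inputs
        bkg_trees=[("background.root", "tree")],
        branches=["met", "mt"],
        weight_expr="eventWeight",
        dropout=[0.5, 0.3, None],    # one fraction per layer, len must equal `layers`
        dropout_input=0.1,           # Dropout applied directly to the input fields
        stop_train=100000,           # cap on events read for training
        stop_test=100000,            # cap on events read for testing
        use_tensorboard=True,
        tensorboard_opts=dict(histogram_freq=1),  # merged on top of the default log_dir
    )
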
                                       self.b_train[self.weight_expr]))
-        self.w_test = self.s_test[self.weight_expr]
-        self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
-
-        self.y_train = np.empty(len(self.x_train))
+        self.y_train = np.empty(len(self.x_train), dtype=np.bool)
         self.y_train[:len(self.s_train)] = 1
         self.y_train[len(self.s_train):] = 0
-        self.y_test = np.empty(len(self.x_test))
+        self.b_train = None
+        self.s_train = None
+
+        self.x_test = rec2array(self.s_test[self.fields])
+        self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
+        self.w_test = self.s_test[self.weight_expr]
+        self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
+        self.y_test = np.empty(len(self.x_test), dtype=np.bool)
         self.y_test[:len(self.s_test)] = 1
         self.y_test[len(self.s_test):] = 0
+        self.b_test = None
+        self.s_test = None
 
         self._dump_to_hdf5(*self.dataset_names_tree)
@@ -474,6 +509,8 @@
             if not os.path.dirname(mc.filepath) == self.project_dir:
                 mc.filepath = os.path.join(self.project_dir, mc.filepath)
                 logger.debug("Prepending project dir to ModelCheckpoint filepath: {}".format(mc.filepath))
+        if self.use_tensorboard:
+            self._callbacks_list.append(TensorBoard(**self.tensorboard_opts))
         self._callbacks_list.append(CSVLogger(os.path.join(self.project_dir, "training.log"), append=True))
         return self._callbacks_list
@@ -499,7 +536,10 @@
             else:
                 raise ValueError("Scaler type {} unknown".format(self.scaler_type))
             logger.info("Fitting {} to training data".format(self.scaler_type))
+            orig_copy_setting = self.scaler.copy
+            self.scaler.copy = False
             self._scaler.fit(self.x_train, **scaler_fit_kwargs)
+            self.scaler.copy = orig_copy_setting
             joblib.dump(self._scaler, filename)
         return self._scaler
@@ -541,9 +581,12 @@
             logger.debug("training data before transformation: {}".format(self.x_train))
             logger.debug("minimum values: {}".format([np.min(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
             logger.debug("maximum values: {}".format([np.max(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
+            orig_copy_setting = self.scaler.copy
+            self.scaler.copy = False
             self.x_train = self.scaler.transform(self.x_train)
             logger.debug("training data after transformation: {}".format(self.x_train))
             self.x_test = self.scaler.transform(self.x_test)
+            self.scaler.copy = orig_copy_setting
             self.data_transformed = True
             logger.info("Training and test data transformed")
@@ -588,15 +631,21 @@
 
             self._model = Sequential()
 
-            # first hidden layer
-            self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
-            # the other hidden layers
-            for node_count, layer_number in zip(self.nodes[1:], range(self.layers-1)):
+            if self.dropout_input is None:
+                self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
+                # in case of no Dropout we already have the first hidden layer
+                start_layer = 1
+            else:
+                self._model.add(Dropout(rate=self.dropout_input, input_shape=(len(self.fields),)))
+                start_layer = 0
+            # the (other) hidden layers
+            for node_count, dropout_fraction in zip(self.nodes[start_layer:], self.dropout[start_layer:]):
                 self._model.add(Dense(node_count, activation=self.activation_function))
-                if self.dropout is not None:
-                    self._model.add(Dropout(rate=self.dropout))
+                if (dropout_fraction is not None) and (dropout_fraction > 0):
+                    self._model.add(Dropout(rate=dropout_fraction))
             # last layer is one neuron (binary classification)
             self._model.add(Dense(1, activation=self.activation_function_output))
+
             logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
             Optimizer = getattr(keras.optimizers, self.optimizer)
             optimizer = Optimizer(**self.optimizer_opts)
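
Annotation (not part of the patch): with `dropout_input` set, the input shape is declared on the leading Dropout layer instead of the first Dense layer, and the dropout list is consumed from `start_layer = 0`. A standalone Keras sketch of the architecture the builder produces (sizes, rates and the output activation are hypothetical):

    from keras.models import Sequential
    from keras.layers import Dense, Dropout

    model = Sequential()
    model.add(Dropout(rate=0.1, input_shape=(2,)))  # dropout_input on 2 input fields
    model.add(Dense(64, activation="relu"))         # hidden layer 0
    model.add(Dropout(rate=0.5))                    # dropout[0] = 0.5
    model.add(Dense(64, activation="relu"))         # hidden layer 1, dropout[1] = None -> no Dropout
    model.add(Dense(1, activation="sigmoid"))       # single output neuron
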
@@ -605,11 +654,14 @@
             np.random.seed(self.random_seed)
             self._model.compile(optimizer=optimizer,
                                 loss=self.loss,
-                                metrics=['accuracy'])
+                                weighted_metrics=['accuracy']
+                                )
             np.random.set_state(rn_state)
+
             if os.path.exists(os.path.join(self.project_dir, "weights.h5")):
                 if self.is_training:
-                    continue_training = self.query_yn("Found previously trained weights - continue training (choosing N will restart)? (Y/N) ")
+                    continue_training = self.query_yn("Found previously trained weights - "
+                                                      "continue training (choosing N will restart)? (Y/N) ")
                 else:
                     continue_training = True
                 if continue_training:
@@ -946,34 +998,32 @@
         plt.close(fig)
 
 
-    def plot_ROC(self, truth=True):
+    def plot_ROC(self, xlim=(0,1), ylim=(0,1)):
 
         logger.info("Plot ROC curve")
-        fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test, sample_weight = self.w_test)
-        fpr = 1.0 - fpr
-        try:
-            roc_auc = auc(tpr, fpr, reorder=True)
-        except ValueError:
-            logger.warning("Got a value error from auc - trying to rerun with reorder=True")
-            roc_auc = auc(tpr, fpr, reorder=True)
+        plt.grid(color='gray', linestyle='--', linewidth=1)
 
-        if truth:
-            plot_name = "ROC_truth.pdf"
-            legend_name = "Truth test"
-        else:
-            plot_name = "ROC_reco.pdf"
-            legend_name = "Reco test"
+        for y, scores, weight, label in [
+                (self.y_train, self.scores_train, self.w_train, "train"),
+                (self.y_test, self.scores_test, self.w_test, "test")
+        ]:
+            fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
+            fpr = 1.0 - fpr # background rejection
+            try:
+                roc_auc = auc(tpr, fpr)
+            except ValueError:
+                logger.warning("Got a value error from auc - trying to rerun with reorder=True")
+                roc_auc = auc(tpr, fpr, reorder=True)
+            plt.plot(tpr, fpr, label=str(self.name + " {} (AUC = {:.3f})".format(label, roc_auc)))
 
-        plt.grid(color='gray', linestyle='--', linewidth=1)
-        plt.plot(tpr, fpr, label=str(legend_name + " (AUC = {})".format(roc_auc)))
         plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
         plt.ylabel("Background rejection")
         plt.xlabel("Signal efficiency")
-        # plt.title('Receiver operating characteristic')
-        plt.xlim(0,1)
-        plt.ylim(0,1)
-        plt.xticks(np.arange(0,1,0.1))
-        plt.yticks(np.arange(0,1,0.1))
+        plt.title('Receiver operating characteristic')
+        plt.xlim(*xlim)
+        plt.ylim(*ylim)
+        # plt.xticks(np.arange(0,1,0.1))
+        # plt.yticks(np.arange(0,1,0.1))
         plt.legend(loc='lower left', framealpha=1.0)
-        plt.savefig(os.path.join(self.project_dir, plot_name))
+        plt.savefig(os.path.join(self.project_dir, "ROC.pdf"))
         plt.clf()
@@ -1097,7 +1147,7 @@
         plt.clf()
 
 
-    def plot_accuracy(self, all_trainings=False, log=False):
+    def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
         """
         Plot the value of the accuracy metric for each epoch
 
@@ -1109,14 +1159,14 @@
         else:
             hist_dict = self.history.history
 
-        if (not 'acc' in hist_dict) or (not 'val_acc' in hist_dict):
+        if (not acc_suffix in hist_dict) or (not 'val_'+acc_suffix in hist_dict):
             logger.warning("No previous history found for plotting, try global history")
             hist_dict = self.csv_hist
 
         logger.info("Plot accuracy")
 
-        plt.plot(hist_dict['acc'])
-        plt.plot(hist_dict['val_acc'])
+        plt.plot(hist_dict[acc_suffix])
+        plt.plot(hist_dict['val_'+acc_suffix])
         plt.title('model accuracy')
         plt.ylabel('accuracy')
         plt.xlabel('epoch')
@@ -1129,11 +1179,30 @@
 
     def plot_all(self):
         self.plot_ROC()
-        self.plot_accuracy()
+        # self.plot_accuracy()
         self.plot_loss()
         self.plot_score()
         self.plot_weights()
-        self.plot_significance()
+        # self.plot_significance()
+
+
+    def to_DataFrame(self):
+        df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
+        df["weight"] = np.concatenate([self.w_train, self.w_test])
+        df["labels"] = pd.Categorical.from_codes(
+            np.concatenate([self.y_train, self.y_test]),
+            categories=["background", "signal"]
+        )
+        for identifier in self.identifiers:
+            try:
+                df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
+                                                 self.b_eventlist_train[identifier],
+                                                 -1*np.ones(len(self.x_test), dtype="i8")])
+            except IOError:
+                logger.warning("Can't find eventlist - DataFrame won't contain identifiers")
+        df["is_train"] = np.concatenate([np.ones(len(self.x_train), dtype=np.bool),
+                                         np.zeros(len(self.x_test), dtype=np.bool)])
+        return df
 
 
 def create_getter(dataset_name):
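
Annotation (not part of the patch): `to_DataFrame` writes an `is_train` column matching the default `split_column` of `ClassificationProjectDataFrame` below, so its output can be fed straight back in. A short sketch, assuming `c` is a loaded project:

    df = c.to_DataFrame()
    print(df["labels"].value_counts())   # counts of "signal" vs "background"
    print(df[df["is_train"]].head())     # the training block
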
@@ -1154,6 +1223,125 @@
 for dataset_name in ClassificationProject.dataset_names:
     setattr(ClassificationProject, dataset_name, property(create_getter(dataset_name),
                                                           create_setter(dataset_name)))
 
 
+class ClassificationProjectDataFrame(ClassificationProject):
+
+    """
+    A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees
+    """
+
+    def __init__(self,
+                 name,
+                 df,
+                 input_columns,
+                 weight_column="weights",
+                 label_column="labels",
+                 signal_label="signal",
+                 background_label="background",
+                 split_mode="split_column",
+                 split_column="is_train",
+                 **kwargs):
+
+        self.df = df
+        self.input_columns = input_columns
+        self.weight_column = weight_column
+        self.label_column = label_column
+        self.signal_label = signal_label
+        self.background_label = background_label
+        if split_mode != "split_column":
+            raise NotImplementedError("'split_column' is the only currently supported split mode")
+        self.split_mode = split_mode
+        self.split_column = split_column
+        super(ClassificationProjectDataFrame, self).__init__(name,
+                                                             signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
+                                                             **kwargs)
+        self._x_train = None
+        self._x_test = None
+        self._y_train = None
+        self._y_test = None
+        self._w_train = None
+        self._w_test = None
+
+    @property
+    def x_train(self):
+        if self._x_train is None:
+            self._x_train = self.df[self.df[self.split_column]][self.input_columns].values
+        return self._x_train
+
+    @x_train.setter
+    def x_train(self, value):
+        self._x_train = value
+
+    @property
+    def x_test(self):
+        if self._x_test is None:
+            self._x_test = self.df[~self.df[self.split_column]][self.input_columns].values
+        return self._x_test
+
+    @x_test.setter
+    def x_test(self, value):
+        self._x_test = value
+
+    @property
+    def y_train(self):
+        if self._y_train is None:
+            self._y_train = (self.df[self.df[self.split_column]][self.label_column] == self.signal_label).values
+        return self._y_train
+
+    @y_train.setter
+    def y_train(self, value):
+        self._y_train = value
+
+    @property
+    def y_test(self):
+        if self._y_test is None:
+            self._y_test = (self.df[~self.df[self.split_column]][self.label_column] == self.signal_label).values
+        return self._y_test
+
+    @y_test.setter
+    def y_test(self, value):
+        self._y_test = value
+
+    @property
+    def w_train(self):
+        if self._w_train is None:
+            self._w_train = self.df[self.df[self.split_column]][self.weight_column].values
+        return self._w_train
+
+    @w_train.setter
+    def w_train(self, value):
+        self._w_train = value
+
+    @property
+    def w_test(self):
+        if self._w_test is None:
+            self._w_test = self.df[~self.df[self.split_column]][self.weight_column].values
+        return self._w_test
+
+    @w_test.setter
+    def w_test(self, value):
+        self._w_test = value
+
+    @property
+    def fields(self):
+        return self.input_columns
+
+
+    def load(self, reload=False):
+
+        if reload:
+            self.data_loaded = False
+            self.data_transformed = False
+            self._x_train = None
+            self._x_test = None
+            self._y_train = None
+            self._y_test = None
+            self._w_train = None
+            self._w_test = None
+
+        if not self.data_transformed:
+            self._transform_data()
+
 
 if __name__ == "__main__":
 
     logging.basicConfig()
@@ -1172,8 +1360,8 @@ if __name__ == "__main__":
                             optimizer="Adam",
                             #optimizer="SGD",
                             #optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9),
-                            earlystopping_opts=dict(monitor='val_loss',
-                                                    min_delta=0, patience=2, verbose=0, mode='auto'),
+                            earlystopping_opts=dict(monitor='val_loss',
+                                                    min_delta=0, patience=2, verbose=0, mode='auto'),
                             selection="1",
                             branches = ["met", "mt"],
                             weight_expr = "eventWeight*genWeight",
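
Annotation (not part of the patch): a minimal sketch of the new DataFrame-based initialization. The DataFrame is assumed to come from `to_DataFrame`; note that `to_DataFrame` writes a "weight" column while the constructor default is "weights", so the name is passed explicitly:

    df = c.to_DataFrame()
    c_df = ClassificationProjectDataFrame(
        "example_project_df",         # hypothetical project name
        df,
        input_columns=["met", "mt"],
        weight_column="weight",
        label_column="labels",
        split_column="is_train",
    )
    c_df.load()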