Skip to content
Snippets Groups Projects
Unverified Commit eaa2b28f authored by Eric Schanet's avatar Eric Schanet
Browse files

Merge branch 'master' of gitlab.physik.uni-muenchen.de:Nikolai.Hartmann/KerasROOTClassification

* 'master' of gitlab.physik.uni-muenchen.de:Nikolai.Hartmann/KerasROOTClassification:
  plot train and test for ROC by default
  commenting number of cores for session
  starting eval_model script
  allow both None and 0 for deactivating dropout
  try to avoid copying also when fitting scaler
  further memory improvements
  Memory improvements
  Experimental support for initialising from pandas DataFrame (memory intense ...)
  starting initialisation from pandas df
  support for tensorboard
  include identifiers in to_DataFrame
  adding to_DataFrame function
  Extended dropout support
  weighted accuracy
  weighted quantile for plot_NN_2D
  all neuron plot in plot_NN_2D script
parents 6ce509e4 71b89893
No related branches found
No related tags found
No related merge requests found
from .toolkit import ClassificationProject from .toolkit import *
from .compare import overlay_ROC, overlay_loss from .compare import *
from .add_friend import add_friend from .add_friend import *
...@@ -120,11 +120,13 @@ def plot_NN_vs_var_2D(plotname, means, ...@@ -120,11 +120,13 @@ def plot_NN_vs_var_2D(plotname, means,
def plot_NN_vs_var_2D_all(plotname, model, means, def plot_NN_vs_var_2D_all(plotname, model, means,
var1_index, var1_range, varx_index,
var2_index, var2_range, vary_index,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None, transform_function=None,
var1_label=None, varx_label=None,
var2_label=None, vary_label=None,
zrange=None, logz=False, zrange=None, logz=False,
plot_last_layer=False, plot_last_layer=False,
log_default_ymin=1e-5, log_default_ymin=1e-5,
...@@ -132,15 +134,15 @@ def plot_NN_vs_var_2D_all(plotname, model, means, ...@@ -132,15 +134,15 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
"Similar to plot_NN_vs_var_2D, but creates a grid of plots for all neurons." "Similar to plot_NN_vs_var_2D, but creates a grid of plots for all neurons."
var1_vals = np.arange(*var1_range) varx_vals = np.linspace(xmin, xmax, nbinsx)
var2_vals = np.arange(*var2_range) vary_vals = np.linspace(ymin, ymax, nbinsy)
# create the events for which we want to fetch the activations # create the events for which we want to fetch the activations
events = np.tile(means, len(var1_vals)*len(var2_vals)).reshape(len(var2_vals), len(var1_vals), -1) events = np.tile(means, len(varx_vals)*len(vary_vals)).reshape(len(vary_vals), len(varx_vals), -1)
for i, y in enumerate(var2_vals): for i, y in enumerate(vary_vals):
for j, x in enumerate(var1_vals): for j, x in enumerate(varx_vals):
events[i][j][var1_index] = x events[i][j][varx_index] = x
events[i][j][var2_index] = y events[i][j][vary_index] = y
# convert back into 1d array # convert back into 1d array
events = events.reshape(-1, len(means)) events = events.reshape(-1, len(means))
...@@ -187,7 +189,7 @@ def plot_NN_vs_var_2D_all(plotname, model, means, ...@@ -187,7 +189,7 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
for layer in range(layers): for layer in range(layers):
for neuron in range(len(acts[layer][0])): for neuron in range(len(acts[layer][0])):
acts_neuron = acts[layer][:,neuron] acts_neuron = acts[layer][:,neuron]
acts_neuron = acts_neuron.reshape(len(var2_vals), len(var1_vals)) acts_neuron = acts_neuron.reshape(len(vary_vals), len(varx_vals))
ax = grid_array[neuron][layer] ax = grid_array[neuron][layer]
extra_opts = {} extra_opts = {}
if not (plot_last_layer and layer == layers-1): if not (plot_last_layer and layer == layers-1):
...@@ -200,12 +202,12 @@ def plot_NN_vs_var_2D_all(plotname, model, means, ...@@ -200,12 +202,12 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1]) extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else: else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max) extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
im = ax.pcolormesh(var1_vals, var2_vals, acts_neuron, cmap=cmap, linewidth=0, rasterized=True, **extra_opts) im = ax.pcolormesh(varx_vals, vary_vals, acts_neuron, cmap=cmap, linewidth=0, rasterized=True, **extra_opts)
ax.set_facecolor("black") ax.set_facecolor("black")
if var1_label is not None: if varx_label is not None:
ax.set_xlabel(var1_label) ax.set_xlabel(varx_label)
if var2_label is not None: if vary_label is not None:
ax.set_ylabel(var2_label) ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white") ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal") cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
...@@ -342,6 +344,8 @@ if __name__ == "__main__": ...@@ -342,6 +344,8 @@ if __name__ == "__main__":
def test_mean_signal(): def test_mean_signal():
c._load_data() # untransformed
mean_signal = get_mean_event(c.x_test, c.y_test, 1) mean_signal = get_mean_event(c.x_test, c.y_test, 1)
print("Mean signal: ") print("Mean signal: ")
...@@ -371,9 +375,11 @@ if __name__ == "__main__": ...@@ -371,9 +375,11 @@ if __name__ == "__main__":
plot_NN_vs_var_2D_all("mt_vs_met_all.pdf", means=mean_signal, plot_NN_vs_var_2D_all("mt_vs_met_all.pdf", means=mean_signal,
model=c.model, transform_function=c.scaler.transform, model=c.model, transform_function=c.scaler.transform,
var1_index=c.fields.index("met"), var1_range=(0, 1000, 10), varx_index=c.fields.index("met"),
var2_index=c.fields.index("mt"), var2_range=(0, 500, 10), vary_index=c.fields.index("mt"),
var1_label="met [GeV]", var2_label="mt [GeV]") nbinsx=100, xmin=0, xmax=1000,
nbinsy=100, ymin=0, ymax=500,
varx_label="met [GeV]", vary_label="mt [GeV]")
plot_NN_vs_var_2D("mt_vs_met_crosscheck.pdf", means=mean_signal, plot_NN_vs_var_2D("mt_vs_met_crosscheck.pdf", means=mean_signal,
scorefun=get_single_neuron_function(c.model, layer=3, neuron=0, scaler=c.scaler), scorefun=get_single_neuron_function(c.model, layer=3, neuron=0, scaler=c.scaler),
......
#!/usr/bin/env python

"""Evaluate a model from a classification project using the given weights
and plot the ROC curve for the test sample."""

import os
import argparse
import logging

import keras
import h5py
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

from KerasROOTClassification import ClassificationProject

# module-level logger so the auc fallback below can actually log
# (previously `logger` was referenced without being defined)
logging.basicConfig()
logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(description='Evaluate a model from a classification project using the given '
                                             'weights and plot the ROC curve and train/test overlayed scores')
parser.add_argument("project_dir")
parser.add_argument("weights")
parser.add_argument("-p", "--plot-prefix", default="eval_nn")
args = parser.parse_args()

c = ClassificationProject(args.project_dir)
c.model.load_weights(args.weights)

print("Predicting for test sample ...")
scores_test = c.evaluate(c.x_test)
print("Done")

fpr, tpr, threshold = roc_curve(c.y_test, scores_test, sample_weight=c.w_test)
fpr = 1.0 - fpr  # convert false-positive rate into background rejection
try:
    # first attempt without reorder; fall back only if auc complains
    roc_auc = auc(tpr, fpr)
except ValueError:
    logger.warning("Got a value error from auc - trying to rerun with reorder=True")
    roc_auc = auc(tpr, fpr, reorder=True)

plt.grid(color='gray', linestyle='--', linewidth=1)
plt.plot(tpr, fpr, label=str(c.name + " (AUC = {})".format(roc_auc)))
plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
plt.ylabel("Background rejection")
plt.xlabel("Signal efficiency")
plt.title('Receiver operating characteristic')
plt.xlim(0,1)
plt.ylim(0,1)
plt.xticks(np.arange(0,1,0.1))
plt.yticks(np.arange(0,1,0.1))
plt.legend(loc='lower left', framealpha=1.0)
plt.savefig(args.plot_prefix+"_ROC.pdf")
plt.clf()
...@@ -7,15 +7,20 @@ logging.basicConfig() ...@@ -7,15 +7,20 @@ logging.basicConfig()
import numpy as np import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import ClassificationProject from KerasROOTClassification import ClassificationProject
from KerasROOTClassification.plotting import ( from KerasROOTClassification.plotting import (
get_mean_event, get_mean_event,
plot_NN_vs_var_2D, plot_NN_vs_var_2D,
plot_profile_2D, plot_profile_2D,
plot_hist_2D_events, plot_hist_2D_events,
plot_cond_avg_actmax_2D plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
) )
from KerasROOTClassification.utils import get_single_neuron_function, get_max_activation_events from KerasROOTClassification.utils import get_single_neuron_function, get_max_activation_events, weighted_quantile
parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron') parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
parser.add_argument("project_dir") parser.add_argument("project_dir")
...@@ -27,6 +32,7 @@ parser.add_argument("-m", "--mode", ...@@ -27,6 +32,7 @@ parser.add_argument("-m", "--mode",
default="mean_sig") default="mean_sig")
parser.add_argument("-l", "--layer", type=int, help="Layer index (takes last layer by default)") parser.add_argument("-l", "--layer", type=int, help="Layer index (takes last layer by default)")
parser.add_argument("-n", "--neuron", type=int, default=0, help="Neuron index (takes first neuron by default)") parser.add_argument("-n", "--neuron", type=int, default=0, help="Neuron index (takes first neuron by default)")
parser.add_argument("-a", "--all-neurons", action="store_true", help="Create a summary plot for all neurons in all hidden layers")
parser.add_argument("--log", action="store_true", help="Plot in color in log scale") parser.add_argument("--log", action="store_true", help="Plot in color in log scale")
parser.add_argument("--contour", action="store_true", help="Interpolate with contours") parser.add_argument("--contour", action="store_true", help="Interpolate with contours")
parser.add_argument("-b", "--nbins", default=20, type=int, help="Number of bins in x and y direction") parser.add_argument("-b", "--nbins", default=20, type=int, help="Number of bins in x and y direction")
...@@ -42,6 +48,9 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa ...@@ -42,6 +48,9 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa
args = parser.parse_args() args = parser.parse_args()
if args.all_neurons and (not args.mode.startswith("mean")):
parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
if args.verbose: if args.verbose:
logging.getLogger().setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG)
...@@ -64,8 +73,13 @@ else: ...@@ -64,8 +73,13 @@ else:
varx_label = args.varx varx_label = args.varx
vary_label = args.vary vary_label = args.vary
percentilesx = np.percentile(c.x_test[:,varx_index], [1,99]) # percentilesx = np.percentile(c.x_test[:,varx_index], [1,99])
percentilesy = np.percentile(c.x_test[:,vary_index], [1,99]) # percentilesy = np.percentile(c.x_test[:,vary_index], [1,99])
total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]
percentilesx = weighted_quantile(c.x_test[:,varx_index], [0.1, 0.99], sample_weight=total_weights)
percentilesy = weighted_quantile(c.x_test[:,vary_index], [0.1, 0.99], sample_weight=total_weights)
if args.xrange is not None: if args.xrange is not None:
if len(args.xrange) < 3: if len(args.xrange) < 3:
...@@ -90,17 +104,31 @@ if args.mode.startswith("mean"): ...@@ -90,17 +104,31 @@ if args.mode.startswith("mean"):
elif args.mode == "mean_bkg": elif args.mode == "mean_bkg":
means = get_mean_event(c.x_test, c.y_test, 0) means = get_mean_event(c.x_test, c.y_test, 0)
plot_NN_vs_var_2D( if not args.all_neurons:
args.output_filename, plot_NN_vs_var_2D(
means=means, args.output_filename,
varx_index=varx_index, means=means,
vary_index=vary_index, varx_index=varx_index,
scorefun=get_single_neuron_function(c.model, layer, neuron, scaler=c.scaler), vary_index=vary_index,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2], scorefun=get_single_neuron_function(c.model, layer, neuron, scaler=c.scaler),
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2], xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
varx_label=varx_label, vary_label=vary_label, ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
logscale=args.log, only_pixels=(not args.contour) varx_label=varx_label, vary_label=vary_label,
) logscale=args.log, only_pixels=(not args.contour)
)
else:
plot_NN_vs_var_2D_all(
args.output_filename,
means=means,
model=c.model,
transform_function=c.scaler.transform,
varx_index=varx_index,
vary_index=vary_index,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
logz=args.log,
plot_last_layer=False,
)
elif args.mode.startswith("profile"): elif args.mode.startswith("profile"):
......
#!/usr/bin/env python #!/usr/bin/env python
__all__ = ["ClassificationProject", "ClassificationProjectDataFrame"]
from sys import version_info from sys import version_info
if version_info[0] > 2: if version_info[0] > 2:
...@@ -10,13 +12,13 @@ else: ...@@ -10,13 +12,13 @@ else:
import os import os
import json import json
import yaml
import pickle import pickle
import importlib import importlib
import csv import csv
import math import math
import glob import glob
import shutil import shutil
import gc
import logging import logging
logger = logging.getLogger("KerasROOTClassification") logger = logging.getLogger("KerasROOTClassification")
...@@ -32,7 +34,7 @@ from sklearn.metrics import roc_curve, auc ...@@ -32,7 +34,7 @@ from sklearn.metrics import roc_curve, auc
from keras.models import Sequential from keras.models import Sequential
from keras.layers import Dense, Dropout from keras.layers import Dense, Dropout
from keras.models import model_from_json from keras.models import model_from_json
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD from keras.optimizers import SGD
import keras.optimizers import keras.optimizers
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -41,15 +43,15 @@ from .utils import WeightedRobustScaler, weighted_quantile ...@@ -41,15 +43,15 @@ from .utils import WeightedRobustScaler, weighted_quantile
# configure number of cores # configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes # this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf # import tensorflow as tf
from keras import backend as K # from keras import backend as K
num_cores = 1 # num_cores = 1
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, # config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
inter_op_parallelism_threads=num_cores, # inter_op_parallelism_threads=num_cores,
allow_soft_placement=True, # allow_soft_placement=True,
device_count = {'CPU': num_cores}) # device_count = {'CPU': num_cores})
session = tf.Session(config=config) # session = tf.Session(config=config)
K.set_session(session) # K.set_session(session)
import ROOT import ROOT
...@@ -69,6 +71,7 @@ def byteify(input): ...@@ -69,6 +71,7 @@ def byteify(input):
if version_info[0] > 2: if version_info[0] > 2:
byteify = lambda input : input byteify = lambda input : input
class ClassificationProject(object): class ClassificationProject(object):
"""Simple framework to load data from ROOT TTrees and train Keras """Simple framework to load data from ROOT TTrees and train Keras
...@@ -110,7 +113,9 @@ class ClassificationProject(object): ...@@ -110,7 +113,9 @@ class ClassificationProject(object):
:param nodes: list number of nodes in each layer. If only a single number is given, use this number for every layer :param nodes: list number of nodes in each layer. If only a single number is given, use this number for every layer
:param dropout: dropout fraction after each hidden layer. Set to None for no Dropout :param dropout: dropout fraction after each hidden layer. You can also pass a list for dropout fractions for each layer. Set to None for no Dropout.
:param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.
:param batch_size: size of the training batches :param batch_size: size of the training batches
...@@ -128,6 +133,10 @@ class ClassificationProject(object): ...@@ -128,6 +133,10 @@ class ClassificationProject(object):
:param step_bkg: step size when selecting background training events (e.g. 2 means take every second event) :param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)
:param stop_train: stop after this number of events for reading in training events
:param stop_test: stop after this number of events for reading in test events
:param optimizer: name of optimizer class in keras.optimizers :param optimizer: name of optimizer class in keras.optimizers
:param optimizer_opts: dictionary of options for the optimizer :param optimizer_opts: dictionary of options for the optimizer
...@@ -143,6 +152,10 @@ class ClassificationProject(object): ...@@ -143,6 +152,10 @@ class ClassificationProject(object):
you change the format of the saved model weights it has to be of you change the format of the saved model weights it has to be of
the form "weights*.h5" the form "weights*.h5"
:param use_tensorboard: if True, use the tensorboard callback to write logs for tensorboard
:param tensorboard_opts: options for the TensorBoard callback
:param balance_dataset: if True, balance the dataset instead of :param balance_dataset: if True, balance the dataset instead of
applying class weights. Only a fraction of the overrepresented applying class weights. Only a fraction of the overrepresented
class will be used in each epoch, but different subsets of the class will be used in each epoch, but different subsets of the
...@@ -196,6 +209,7 @@ class ClassificationProject(object): ...@@ -196,6 +209,7 @@ class ClassificationProject(object):
layers=3, layers=3,
nodes=64, nodes=64,
dropout=None, dropout=None,
dropout_input=None,
batch_size=128, batch_size=128,
validation_split=0.33, validation_split=0.33,
activation_function='relu', activation_function='relu',
...@@ -203,12 +217,16 @@ class ClassificationProject(object): ...@@ -203,12 +217,16 @@ class ClassificationProject(object):
scaler_type="WeightedRobustScaler", scaler_type="WeightedRobustScaler",
step_signal=2, step_signal=2,
step_bkg=2, step_bkg=2,
stop_train=None,
stop_test=None,
optimizer="SGD", optimizer="SGD",
optimizer_opts=None, optimizer_opts=None,
use_earlystopping=True, use_earlystopping=True,
earlystopping_opts=None, earlystopping_opts=None,
use_modelcheckpoint=True, use_modelcheckpoint=True,
modelcheckpoint_opts=None, modelcheckpoint_opts=None,
use_tensorboard=False,
tensorboard_opts=None,
random_seed=1234, random_seed=1234,
balance_dataset=False, balance_dataset=False,
loss='binary_crossentropy'): loss='binary_crossentropy'):
...@@ -243,6 +261,11 @@ class ClassificationProject(object): ...@@ -243,6 +261,11 @@ class ClassificationProject(object):
logger.warning("Number of layers not equal to the given nodes " logger.warning("Number of layers not equal to the given nodes "
"per layer - adjusted to " + str(self.layers)) "per layer - adjusted to " + str(self.layers))
self.dropout = dropout self.dropout = dropout
if not isinstance(self.dropout, list):
self.dropout = [self.dropout for i in range(self.layers)]
if len(self.dropout) != self.layers:
raise ValueError("List of dropout fractions has to be of equal size as the number of layers!")
self.dropout_input = dropout_input
self.batch_size = batch_size self.batch_size = batch_size
self.validation_split = validation_split self.validation_split = validation_split
self.activation_function = activation_function self.activation_function = activation_function
...@@ -250,9 +273,12 @@ class ClassificationProject(object): ...@@ -250,9 +273,12 @@ class ClassificationProject(object):
self.scaler_type = scaler_type self.scaler_type = scaler_type
self.step_signal = step_signal self.step_signal = step_signal
self.step_bkg = step_bkg self.step_bkg = step_bkg
self.stop_train = stop_train
self.stop_test = stop_test
self.optimizer = optimizer self.optimizer = optimizer
self.use_earlystopping = use_earlystopping self.use_earlystopping = use_earlystopping
self.use_modelcheckpoint = use_modelcheckpoint self.use_modelcheckpoint = use_modelcheckpoint
self.use_tensorboard = use_tensorboard
if optimizer_opts is None: if optimizer_opts is None:
optimizer_opts = dict() optimizer_opts = dict()
self.optimizer_opts = optimizer_opts self.optimizer_opts = optimizer_opts
...@@ -266,6 +292,11 @@ class ClassificationProject(object): ...@@ -266,6 +292,11 @@ class ClassificationProject(object):
filepath="weights.h5" filepath="weights.h5"
) )
self.modelcheckpoint_opts = modelcheckpoint_opts self.modelcheckpoint_opts = modelcheckpoint_opts
self.tensorboard_opts = dict(
log_dir=os.path.join(self.project_dir, "tensorboard"),
)
if tensorboard_opts is not None:
self.tensorboard_opts.update(**tensorboard_opts)
self.random_seed = random_seed self.random_seed = random_seed
self.balance_dataset = balance_dataset self.balance_dataset = balance_dataset
self.loss = loss self.loss = loss
...@@ -349,19 +380,19 @@ class ClassificationProject(object): ...@@ -349,19 +380,19 @@ class ClassificationProject(object):
self.s_train = tree2array(signal_chain, self.s_train = tree2array(signal_chain,
branches=self.branches+[self.weight_expr]+self.identifiers, branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection, selection=self.selection,
start=0, step=self.step_signal) start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain, self.b_train = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr]+self.identifiers, branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection, selection=self.selection,
start=0, step=self.step_bkg) start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain, self.s_test = tree2array(signal_chain,
branches=self.branches+[self.weight_expr], branches=self.branches+[self.weight_expr],
selection=self.selection, selection=self.selection,
start=1, step=self.step_signal) start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain, self.b_test = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr], branches=self.branches+[self.weight_expr],
selection=self.selection, selection=self.selection,
start=1, step=self.step_bkg) start=1, step=self.step_bkg, stop=self.stop_test)
self.rename_fields(self.s_train) self.rename_fields(self.s_train)
self.rename_fields(self.b_train) self.rename_fields(self.b_train)
...@@ -380,19 +411,23 @@ class ClassificationProject(object): ...@@ -380,19 +411,23 @@ class ClassificationProject(object):
# the first block will be signals, the second block backgrounds # the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields]) self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields]))) self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.x_test = rec2array(self.s_test[self.fields])
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_train = self.s_train[self.weight_expr] self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr])) self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
self.w_test = self.s_test[self.weight_expr] self.y_train = np.empty(len(self.x_train), dtype=np.bool)
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_train = np.empty(len(self.x_train))
self.y_train[:len(self.s_train)] = 1 self.y_train[:len(self.s_train)] = 1
self.y_train[len(self.s_train):] = 0 self.y_train[len(self.s_train):] = 0
self.y_test = np.empty(len(self.x_test)) self.b_train = None
self.s_train = None
self.x_test = rec2array(self.s_test[self.fields])
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_test = np.empty(len(self.x_test), dtype=np.bool)
self.y_test[:len(self.s_test)] = 1 self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0 self.y_test[len(self.s_test):] = 0
self.b_test = None
self.s_test = None
self._dump_to_hdf5(*self.dataset_names_tree) self._dump_to_hdf5(*self.dataset_names_tree)
...@@ -474,6 +509,8 @@ class ClassificationProject(object): ...@@ -474,6 +509,8 @@ class ClassificationProject(object):
if not os.path.dirname(mc.filepath) == self.project_dir: if not os.path.dirname(mc.filepath) == self.project_dir:
mc.filepath = os.path.join(self.project_dir, mc.filepath) mc.filepath = os.path.join(self.project_dir, mc.filepath)
logger.debug("Prepending project dir to ModelCheckpoint filepath: {}".format(mc.filepath)) logger.debug("Prepending project dir to ModelCheckpoint filepath: {}".format(mc.filepath))
if self.use_tensorboard:
self._callbacks_list.append(TensorBoard(**self.tensorboard_opts))
self._callbacks_list.append(CSVLogger(os.path.join(self.project_dir, "training.log"), append=True)) self._callbacks_list.append(CSVLogger(os.path.join(self.project_dir, "training.log"), append=True))
return self._callbacks_list return self._callbacks_list
...@@ -499,7 +536,10 @@ class ClassificationProject(object): ...@@ -499,7 +536,10 @@ class ClassificationProject(object):
else: else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type)) raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type)) logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self._scaler.fit(self.x_train, **scaler_fit_kwargs) self._scaler.fit(self.x_train, **scaler_fit_kwargs)
self.scaler.copy = orig_copy_setting
joblib.dump(self._scaler, filename) joblib.dump(self._scaler, filename)
return self._scaler return self._scaler
...@@ -541,9 +581,12 @@ class ClassificationProject(object): ...@@ -541,9 +581,12 @@ class ClassificationProject(object):
logger.debug("training data before transformation: {}".format(self.x_train)) logger.debug("training data before transformation: {}".format(self.x_train))
logger.debug("minimum values: {}".format([np.min(self.x_train[:,i]) for i in range(self.x_train.shape[1])])) logger.debug("minimum values: {}".format([np.min(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
logger.debug("maximum values: {}".format([np.max(self.x_train[:,i]) for i in range(self.x_train.shape[1])])) logger.debug("maximum values: {}".format([np.max(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self.x_train = self.scaler.transform(self.x_train) self.x_train = self.scaler.transform(self.x_train)
logger.debug("training data after transformation: {}".format(self.x_train)) logger.debug("training data after transformation: {}".format(self.x_train))
self.x_test = self.scaler.transform(self.x_test) self.x_test = self.scaler.transform(self.x_test)
self.scaler.copy = orig_copy_setting
self.data_transformed = True self.data_transformed = True
logger.info("Training and test data transformed") logger.info("Training and test data transformed")
...@@ -588,15 +631,21 @@ class ClassificationProject(object): ...@@ -588,15 +631,21 @@ class ClassificationProject(object):
self._model = Sequential() self._model = Sequential()
# first hidden layer if self.dropout_input is None:
self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function)) self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
# the other hidden layers # in case of no Dropout we already have the first hidden layer
for node_count, layer_number in zip(self.nodes[1:], range(self.layers-1)): start_layer = 1
else:
self._model.add(Dropout(rate=self.dropout_input, input_shape=(len(self.fields),)))
start_layer = 0
# the (other) hidden layers
for node_count, dropout_fraction in zip(self.nodes[start_layer:], self.dropout[start_layer:]):
self._model.add(Dense(node_count, activation=self.activation_function)) self._model.add(Dense(node_count, activation=self.activation_function))
if self.dropout is not None: if (dropout_fraction is not None) and (dropout_fraction > 0):
self._model.add(Dropout(rate=self.dropout)) self._model.add(Dropout(rate=dropout_fraction))
# last layer is one neuron (binary classification) # last layer is one neuron (binary classification)
self._model.add(Dense(1, activation=self.activation_function_output)) self._model.add(Dense(1, activation=self.activation_function_output))
logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts)) logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
Optimizer = getattr(keras.optimizers, self.optimizer) Optimizer = getattr(keras.optimizers, self.optimizer)
optimizer = Optimizer(**self.optimizer_opts) optimizer = Optimizer(**self.optimizer_opts)
...@@ -605,11 +654,14 @@ class ClassificationProject(object): ...@@ -605,11 +654,14 @@ class ClassificationProject(object):
np.random.seed(self.random_seed) np.random.seed(self.random_seed)
self._model.compile(optimizer=optimizer, self._model.compile(optimizer=optimizer,
loss=self.loss, loss=self.loss,
metrics=['accuracy']) weighted_metrics=['accuracy']
)
np.random.set_state(rn_state) np.random.set_state(rn_state)
if os.path.exists(os.path.join(self.project_dir, "weights.h5")): if os.path.exists(os.path.join(self.project_dir, "weights.h5")):
if self.is_training: if self.is_training:
continue_training = self.query_yn("Found previously trained weights - continue training (choosing N will restart)? (Y/N) ") continue_training = self.query_yn("Found previously trained weights - "
"continue training (choosing N will restart)? (Y/N) ")
else: else:
continue_training = True continue_training = True
if continue_training: if continue_training:
...@@ -946,34 +998,32 @@ class ClassificationProject(object): ...@@ -946,34 +998,32 @@ class ClassificationProject(object):
plt.close(fig) plt.close(fig)
def plot_ROC(self, truth=True): def plot_ROC(self, xlim=(0,1), ylim=(0,1)):
logger.info("Plot ROC curve") logger.info("Plot ROC curve")
fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test, sample_weight = self.w_test) plt.grid(color='gray', linestyle='--', linewidth=1)
fpr = 1.0 - fpr
try:
roc_auc = auc(tpr, fpr, reorder=True)
except ValueError:
logger.warning("Got a value error from auc - trying to rerun with reorder=True")
roc_auc = auc(tpr, fpr, reorder=True)
if truth: for y, scores, weight, label in [
plot_name = "ROC_truth.pdf" (self.y_train, self.scores_train, self.w_train, "train"),
legend_name = "Truth test" (self.y_test, self.scores_test, self.w_test, "test")
else: ]:
plot_name = "ROC_reco.pdf" fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
legend_name = "Reco test" fpr = 1.0 - fpr # background rejection
try:
roc_auc = auc(tpr, fpr)
except ValueError:
logger.warning("Got a value error from auc - trying to rerun with reorder=True")
roc_auc = auc(tpr, fpr, reorder=True)
plt.plot(tpr, fpr, label=str(self.name + " {} (AUC = {:.3f})".format(label, roc_auc)))
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.plot(tpr, fpr, label=str(legend_name + " (AUC = {})".format(roc_auc)))
plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck') plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
plt.ylabel("Background rejection") plt.ylabel("Background rejection")
plt.xlabel("Signal efficiency") plt.xlabel("Signal efficiency")
# plt.title('Receiver operating characteristic') plt.title('Receiver operating characteristic')
plt.xlim(0,1) plt.xlim(*xlim)
plt.ylim(0,1) plt.ylim(*ylim)
plt.xticks(np.arange(0,1,0.1)) # plt.xticks(np.arange(0,1,0.1))
plt.yticks(np.arange(0,1,0.1)) # plt.yticks(np.arange(0,1,0.1))
plt.legend(loc='lower left', framealpha=1.0) plt.legend(loc='lower left', framealpha=1.0)
plt.savefig(os.path.join(self.project_dir, plot_name)) plt.savefig(os.path.join(self.project_dir, plot_name))
plt.clf() plt.clf()
...@@ -1097,7 +1147,7 @@ class ClassificationProject(object): ...@@ -1097,7 +1147,7 @@ class ClassificationProject(object):
plt.clf() plt.clf()
def plot_accuracy(self, all_trainings=False, log=False): def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
""" """
Plot the value of the accuracy metric for each epoch Plot the value of the accuracy metric for each epoch
...@@ -1109,14 +1159,14 @@ class ClassificationProject(object): ...@@ -1109,14 +1159,14 @@ class ClassificationProject(object):
else: else:
hist_dict = self.history.history hist_dict = self.history.history
if (not 'acc' in hist_dict) or (not 'val_acc' in hist_dict): if (not acc_suffix in hist_dict) or (not 'val_'+acc_suffix in hist_dict):
logger.warning("No previous history found for plotting, try global history") logger.warning("No previous history found for plotting, try global history")
hist_dict = self.csv_hist hist_dict = self.csv_hist
logger.info("Plot accuracy") logger.info("Plot accuracy")
plt.plot(hist_dict['acc']) plt.plot(hist_dict[acc_suffix])
plt.plot(hist_dict['val_acc']) plt.plot(hist_dict['val_'+acc_suffix])
plt.title('model accuracy') plt.title('model accuracy')
plt.ylabel('accuracy') plt.ylabel('accuracy')
plt.xlabel('epoch') plt.xlabel('epoch')
...@@ -1129,11 +1179,30 @@ class ClassificationProject(object): ...@@ -1129,11 +1179,30 @@ class ClassificationProject(object):
def plot_all(self):
    """Produce the standard set of diagnostic plots for this project.

    Runs the ROC, loss, score and weight plots in sequence. The accuracy
    and significance plots are disabled in the merged version of this
    method (kept here as comments so they are easy to re-enable).
    """
    self.plot_ROC()
    # self.plot_accuracy()
    self.plot_loss()
    self.plot_score()
    self.plot_weights()
    # self.plot_significance()
def to_DataFrame(self):
    """Export all training and testing events into a single pandas DataFrame.

    Returns a DataFrame with one column per input field (``self.fields``),
    plus:

    - ``weight``: the per-event weights (train followed by test),
    - ``labels``: a Categorical with codes 0 -> "background", 1 -> "signal",
    - one column per entry in ``self.identifiers`` (test events get -1,
      since identifiers are only stored for the training event lists),
    - ``is_train``: boolean flag, True for training rows, False for test rows.

    Rows are ordered training events first, then test events.
    """
    df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
    df["weight"] = np.concatenate([self.w_train, self.w_test])
    # label codes come straight from the 0/1 truth arrays
    df["labels"] = pd.Categorical.from_codes(
        np.concatenate([self.y_train, self.y_test]),
        categories=["background", "signal"]
    )
    for identifier in self.identifiers:
        try:
            # NOTE(review): the event lists are presumably read lazily from
            # disk, which is why IOError is caught here — confirm upstream.
            df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
                                             self.b_eventlist_train[identifier],
                                             # test events carry no identifiers
                                             -1*np.ones(len(self.x_test), dtype="i8")])
        except IOError:
            logger.warning("Can't find eventlist - DataFrame won't contain identifiers")
    # np.bool was a deprecated alias removed in NumPy >= 1.24; the builtin
    # bool dtype is equivalent and works on all NumPy versions.
    df["is_train"] = np.concatenate([np.ones(len(self.x_train), dtype=bool),
                                     np.zeros(len(self.x_test), dtype=bool)])
    return df
def create_getter(dataset_name): def create_getter(dataset_name):
...@@ -1154,6 +1223,125 @@ for dataset_name in ClassificationProject.dataset_names: ...@@ -1154,6 +1223,125 @@ for dataset_name in ClassificationProject.dataset_names:
create_setter(dataset_name))) create_setter(dataset_name)))
class ClassificationProjectDataFrame(ClassificationProject):
    """
    A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees.

    The DataFrame is expected to provide the input features, a weight
    column, a label column and a boolean column that marks which rows
    belong to the training set. The train/test arrays are derived lazily
    from the DataFrame on first access and cached in private attributes.
    """

    def __init__(self,
                 name,
                 df,
                 input_columns,
                 weight_column="weights",
                 label_column="labels",
                 signal_label="signal",
                 background_label="background",
                 split_mode="split_column",
                 split_column="is_train",
                 **kwargs):
        # name: project name, forwarded to ClassificationProject
        # df: pandas DataFrame holding all events (train and test)
        # input_columns: list of column names used as network inputs
        # weight_column: column with per-event weights
        # label_column: column with class labels
        # signal_label: value in label_column that marks signal events
        # background_label: value that marks background (currently unused
        #   in the lazy getters below, which only test against signal_label)
        # split_mode: only "split_column" is implemented
        # split_column: boolean column, True for training rows
        self.df = df
        self.input_columns = input_columns
        self.weight_column = weight_column
        self.label_column = label_column
        self.signal_label = signal_label
        self.background_label = background_label
        if split_mode != "split_column":
            raise NotImplementedError("'split_column' is the only currently supported split mode")
        self.split_mode = split_mode
        self.split_column = split_column
        # Call the base constructor with empty tree/branch arguments since
        # all data comes from the DataFrame instead of ROOT files.
        super(ClassificationProjectDataFrame, self).__init__(name,
                                                             signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
                                                             **kwargs)
        # Reset the lazy caches AFTER super().__init__ so that anything the
        # base constructor assigned through the property setters is
        # discarded and the arrays are rebuilt from the DataFrame.
        # NOTE(review): this assumes the base __init__ does not rely on the
        # getters below before this point — confirm against the base class.
        self._x_train = None
        self._x_test = None
        self._y_train = None
        self._y_test = None
        self._w_train = None
        self._w_test = None

    @property
    def x_train(self):
        # Lazily extract the training feature matrix (rows where the
        # split column is True) and cache it.
        if self._x_train is None:
            self._x_train = self.df[self.df[self.split_column]][self.input_columns].values
        return self._x_train

    @x_train.setter
    def x_train(self, value):
        self._x_train = value

    @property
    def x_test(self):
        # Lazily extract the test feature matrix (split column False).
        if self._x_test is None:
            self._x_test = self.df[~self.df[self.split_column]][self.input_columns].values
        return self._x_test

    @x_test.setter
    def x_test(self, value):
        self._x_test = value

    @property
    def y_train(self):
        # Boolean truth array: True where the label equals the signal label.
        if self._y_train is None:
            self._y_train = (self.df[self.df[self.split_column]][self.label_column] == self.signal_label).values
        return self._y_train

    @y_train.setter
    def y_train(self, value):
        self._y_train = value

    @property
    def y_test(self):
        # Boolean truth array for the test rows.
        if self._y_test is None:
            self._y_test = (self.df[~self.df[self.split_column]][self.label_column] == self.signal_label).values
        return self._y_test

    @y_test.setter
    def y_test(self, value):
        self._y_test = value

    @property
    def w_train(self):
        # Per-event weights for the training rows.
        if self._w_train is None:
            self._w_train = self.df[self.df[self.split_column]][self.weight_column].values
        return self._w_train

    @w_train.setter
    def w_train(self, value):
        self._w_train = value

    @property
    def w_test(self):
        # Per-event weights for the test rows.
        if self._w_test is None:
            self._w_test = self.df[~self.df[self.split_column]][self.weight_column].values
        return self._w_test

    @w_test.setter
    def w_test(self, value):
        self._w_test = value

    @property
    def fields(self):
        # The input columns double as the field names used elsewhere
        # (e.g. in to_DataFrame / plotting).
        return self.input_columns

    def load(self, reload=False):
        """(Re)load the data from the DataFrame.

        With reload=True all cached arrays are dropped so they are
        re-extracted from the DataFrame on next access; the transformed
        flag is also reset so the data gets re-scaled.
        """
        if reload:
            self.data_loaded = False
            self.data_transformed = False
            self._x_train = None
            self._x_test = None
            self._y_train = None
            self._y_test = None
            self._w_train = None
            self._w_test = None
        # NOTE(review): data_loaded is never set to True here — presumably
        # the lazy properties make an explicit load step unnecessary;
        # confirm against the base-class usage of this flag.
        if not self.data_transformed:
            self._transform_data()
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig() logging.basicConfig()
...@@ -1172,8 +1360,8 @@ if __name__ == "__main__": ...@@ -1172,8 +1360,8 @@ if __name__ == "__main__":
optimizer="Adam", optimizer="Adam",
#optimizer="SGD", #optimizer="SGD",
#optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9), #optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9),
earlystopping_opts=dict(monitor='val_loss', earlystopping_opts=dict(monitor='val_loss',
min_delta=0, patience=2, verbose=0, mode='auto'), min_delta=0, patience=2, verbose=0, mode='auto'),
selection="1", selection="1",
branches = ["met", "mt"], branches = ["met", "mt"],
weight_expr = "eventWeight*genWeight", weight_expr = "eventWeight*genWeight",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment