#!/usr/bin/env python
import os
import math
import matplotlib.pyplot as plt
import matplotlib.colors
from matplotlib.ticker import LogFormatter
from mpl_toolkits.axes_grid1 import ImageGrid, make_axes_locatable
import numpy as np
from .keras_visualize_activations.read_activations import get_activations
from .utils import get_grad_function, max_activation_wrt_input, create_random_event
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
"""
Some further plotting functions
"""
def save_show(plt, fig, filename, **kwargs):
"Save a figure and show it in case we are in ipython or jupyter notebook."
fig.savefig(filename, **kwargs)
try:
get_ipython
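# referencing get_ipython (only defined inside IPython/Jupyter) raises NameError
# in plain Python, which is caught below to decide whether to show or close the figure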
plt.show()
return fig
except NameError:
plt.close(fig)
return None
def get_mean_event(x, y, class_label, mask_value=None):
means = []
for var_index in range(x.shape[1]):
if mask_value is not None:
masked_values = np.where(x[:,var_index] != mask_value)[0]
x = x[masked_values]
y = y[masked_values]
means.append(np.mean(x[y==class_label][:,var_index]))
return means
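# Note: x and y are filtered per variable inside the loop, so events masked in an
# earlier variable are also excluded from the means of all later variables.
# Illustrative usage (a sketch, mask value -999 is hypothetical):
#   means = get_mean_event(c.x_test, c.y_test, 1, mask_value=-999)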
def plot_NN_vs_var_1D(plotname, means, scorefun, var_index, var_range, var_label=None):
"Plot the NN output vs one variable with the other variables set to the given mean values"
# example: vary var1
logger.info("Creating varied events (1d)")
sequence = np.arange(*var_range)
events = np.tile(means, len(sequence)).reshape(-1, len(means))
events[:,var_index] = sequence
logger.info("Predicting scores")
scores = scorefun(events)
fig, ax = plt.subplots()
ax.plot(sequence, scores)
if var_label is not None:
ax.set_xlabel(var_label)
ax.set_ylabel("NN output")
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D(plotname, means,
scorefun,
varx_index,
vary_index,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
varx_label=None,
vary_label=None,
logscale=False,
ncontours=20,
only_pixels=False,
black_contourlines=False,
cmap="inferno"):
logger.info("Creating varied events (2d)")
sequence1 = np.linspace(xmin, xmax, nbinsx)
sequence2 = np.linspace(ymin, ymax, nbinsy)
# the following is a 2d array of events (so effectively 3D)
events = np.tile(means, len(sequence1)*len(sequence2)).reshape(len(sequence2), len(sequence1), -1)
# fill in the varied values
# (probably there is a more clever way, but sufficient here)
for i, y in enumerate(sequence2):
for j, x in enumerate(sequence1):
events[i][j][varx_index] = x
events[i][j][vary_index] = y
# convert back into 1d array
events = events.reshape(-1, len(means))
logger.info("Predicting scores")
scores = scorefun(events)
# convert scores into 2d array
scores = scores.reshape(len(sequence2), len(sequence1))
fig, ax = plt.subplots()
zmin = np.min(scores)
zmax = np.max(scores)
if logscale:
if zmin <= 0:
zmin = 1e-5
logger.info("Setting zmin to {}".format(zmin))
lvls = np.logspace(math.log10(zmin), math.log10(zmax), ncontours)
if only_pixels:
pcm = ax.pcolormesh(sequence1, sequence2, scores, norm=matplotlib.colors.LogNorm(vmin=zmin, vmax=zmax), cmap=cmap, linewidth=0, rasterized=True)
else:
pcm = ax.contourf(sequence1, sequence2, scores, levels=lvls, norm=matplotlib.colors.LogNorm(vmin=zmin, vmax=zmax), cmap=cmap)
if black_contourlines:
ax.contour(sequence1, sequence2, scores, levels=lvls, colors="k", linewidths=1)
l_f = LogFormatter(10, labelOnlyBase=False, minor_thresholds=(np.inf, np.inf))
cbar = fig.colorbar(pcm, ax=ax, extend='max', ticks=lvls, format=l_f)
else:
if only_pixels:
pcm = ax.pcolormesh(sequence1, sequence2, scores, norm=matplotlib.colors.Normalize(vmin=zmin, vmax=zmax), cmap=cmap, linewidth=0, rasterized=True)
else:
pcm = ax.contourf(sequence1, sequence2, scores, ncontours, norm=matplotlib.colors.Normalize(vmin=zmin, vmax=zmax), cmap=cmap)
if black_contourlines:
ax.contour(sequence1, sequence2, scores, ncontours, colors="k", linewidths=1)
cbar = fig.colorbar(pcm, ax=ax, extend='max')
cbar.set_label("NN output")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D_all(plotname, model, means,
varx_index,
vary_index,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None,
varx_label=None,
vary_label=None,
zrange=None, logz=False,
plot_last_layer=False,
log_default_ymin=1e-5,
cmap="inferno"):
"Similar to plot_NN_vs_var_2D, but creates a grid of plots for all neurons."
varx_vals = np.linspace(xmin, xmax, nbinsx)
vary_vals = np.linspace(ymin, ymax, nbinsy)
# create the events for which we want to fetch the activations
events = np.tile(means, len(varx_vals)*len(vary_vals)).reshape(len(vary_vals), len(varx_vals), -1)
for i, y in enumerate(vary_vals):
for j, x in enumerate(varx_vals):
events[i][j][varx_index] = x
events[i][j][vary_index] = y
# convert back into 1d array
events = events.reshape(-1, len(means))
# transform
if transform_function is not None:
events = transform_function(events)
acts = get_activations(model, events, print_shape_only=True)
if plot_last_layer:
n_neurons = [len(i[0]) for i in acts]
else:
n_neurons = [len(i[0]) for i in acts[:-1]]
layers = len(n_neurons)
nrows_ncols = (layers, max(n_neurons))
fig = plt.figure(1, figsize=nrows_ncols)
grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
label_mode="1",
aspect=False,
cbar_location="top",
cbar_mode="single",
cbar_pad=.2,
cbar_size="5%",)
grid_array = np.array(grid)
grid_array = grid_array.reshape(*nrows_ncols[::-1])
# leave out the last layer
global_min = min([np.min(ar_layer) for ar_layer in acts[:-1]])
global_max = max([np.max(ar_layer) for ar_layer in acts[:-1]])
logger.info("global_min: {}".format(global_min))
logger.info("global_max: {}".format(global_max))
output_min_default = 0
output_max_default = 1
if global_min <= 0 and logz:
global_min = log_default_ymin
logger.info("Changing global_min to {}".format(log_default_ymin))
ims = []
for layer in range(layers):
for neuron in range(len(acts[layer][0])):
acts_neuron = acts[layer][:,neuron]
acts_neuron = acts_neuron.reshape(len(vary_vals), len(varx_vals))
ax = grid_array[neuron][layer]
extra_opts = {}
if not (plot_last_layer and layer == layers-1):
# for hidden layers, plot the same z-scale
if logz:
norm = matplotlib.colors.LogNorm
else:
norm = matplotlib.colors.Normalize
if zrange is not None:
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
im = ax.pcolormesh(varx_vals, vary_vals, acts_neuron, cmap=cmap, linewidth=0, rasterized=True, **extra_opts)
ax.set_facecolor("black")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
save_show(plt, fig, plotname, bbox_inches='tight')
def plot_profile_2D_all(plotname, model, events,
valsx, valsy,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None,
varx_label=None,
vary_label=None,
zrange=None, logz=False,
plot_last_layer=False,
log_default_ymin=1e-5,
global_norm=True,
cmap="inferno", **kwargs):
"Similar to plot_profile_2D, but creates a grid of plots for all neurons."
# transform
if transform_function is not None:
events = transform_function(events)
logger.info("Reading activations for all neurons")
acts = get_activations(model, events, print_shape_only=True)
logger.info("Done")
if plot_last_layer:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
else:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
layers = len(n_neurons)
nrows_ncols = (layers, max(n_neurons))
fig = plt.figure(1, figsize=nrows_ncols)
grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
label_mode="1",
aspect=False,
cbar_location="top",
cbar_mode="single",
cbar_pad=.2,
cbar_size="5%",)
grid_array = np.array(grid)
grid_array = grid_array.reshape(*nrows_ncols[::-1])
global_min = None
global_max = None
logger.info("Creating profile histograms")
ims = []
reg_plots = []
for layer in range(layers):
neurons_acts = acts[layer]
neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
for neuron in range(len(neurons_acts[0])):
acts_neuron = neurons_acts[:,neuron]
ax = grid_array[neuron][layer]
extra_opts = {}
if not (plot_last_layer and layer == layers-1):
# for hidden layers, plot the same z-scale
if logz:
norm = matplotlib.colors.LogNorm
else:
norm = matplotlib.colors.Normalize
if zrange is not None:
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
hist, xedges, yedges = get_profile_2D(
valsx, valsy, acts_neuron,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
**kwargs
)
if global_min is None or hist.min() < global_min:
global_min = hist.min()
if global_max is None or hist.max() > global_max:
global_max = hist.max()
X, Y = np.meshgrid(xedges, yedges)
reg_plots.append((layer, neuron, ax, (X, Y, hist), dict(cmap=cmap, linewidth=0, rasterized=True, **extra_opts)))
logger.info("Done")
logger.info("global_min: {}".format(global_min))
logger.info("global_max: {}".format(global_max))
if global_min <= 0 and logz:
global_min = log_default_ymin
logger.info("Changing global_min to {}".format(log_default_ymin))
for layer, neuron, ax, args, kwargs in reg_plots:
if zrange is None:
kwargs["norm"].vmin = global_min
kwargs["norm"].vmax = global_max
if not global_norm:
kwargs["norm"] = None
im = ax.pcolormesh(*args, **kwargs)
ax.set_facecolor("black")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
logger.info("Rendering")
save_show(plt, fig, plotname, bbox_inches='tight')
logger.info("Done")
def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"):
X, Y = np.meshgrid(xedges, yedges)
fig, ax = plt.subplots()
extraopts = dict()
if log:
extraopts.update(norm=matplotlib.colors.LogNorm(vmin=np.min(hist[hist>0]), vmax=np.max(hist)))
ax.set_facecolor("black")
pcm = ax.pcolormesh(X, Y, hist, cmap="inferno", linewidth=0, rasterized=True, **extraopts)
cbar = fig.colorbar(pcm, ax=ax)
cbar.set_label(zlabel)
ax.set_ylabel(vary_label)
ax.set_xlabel(varx_label)
save_show(plt, fig, plotname)
def plot_hist_2D_events(plotname, valsx, valsy, nbinsx, xmin, xmax, nbinsy, ymin, ymax,
weights=None,
varx_label=None, vary_label=None, log=False):
xedges = np.linspace(xmin, xmax, nbinsx)
yedges = np.linspace(ymin, ymax, nbinsy)
hist, xedges, yedges = np.histogram2d(valsx, valsy, bins=(xedges, yedges), weights=weights)
hist = hist.T
plot_hist_2D(plotname, xedges, yedges, hist, varx_label, vary_label, log)
def plot_cond_avg_actmax_2D(plotname, model, layer, neuron, ranges,
varx_index, vary_index,
nbinsx, xmin, xmax, nbinsy, ymin, ymax,
transform=None, inverse_transform=None,
ntries=20,
step=1,
maxit=1,
**kwargs):
transform_given = [fn is not None for fn in [transform, inverse_transform]]
if any(transform_given) and not all(transform_given):
raise ValueError("Need to pass both transform and inverse_transform if data should be transformed")
xedges = np.linspace(xmin, xmax, nbinsx)
yedges = np.linspace(ymin, ymax, nbinsy)
hist = np.zeros(int(nbinsx*nbinsy)).reshape(int(nbinsx), int(nbinsy))
gradient_function = get_grad_function(model, layer, neuron)
for ix, x in enumerate(xedges):
for iy, y in enumerate(yedges):
random_event = create_random_event(ranges)
if inverse_transform is not None:
random_event = inverse_transform(random_event)
for index, val in [(varx_index, x), (vary_index, y)]:
random_event[0][index] = val
if transform is not None:
random_event = transform(random_event)
act = np.mean([max_activation_wrt_input(gradient_function, random_event, maxit=maxit, step=step, const_indices=[varx_index, vary_index])[0][0] for i in range(ntries)])
hist[ix][iy] = act
hist = hist.T
plot_hist_2D(plotname, xedges, yedges, hist, zlabel="Neuron output", **kwargs)
def get_profile_2D(valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean, weights=None):
xedges = np.linspace(xmin, xmax, nbinsx)
yedges = np.linspace(ymin, ymax, nbinsy)
binindices_x = np.digitize(valsx, xedges)
binindices_y = np.digitize(valsy, yedges)
# create profile histogram
hist = []
for binindex_x in range(1, len(xedges)+1):
line = []
for binindex_y in range(1, len(yedges)+1):
binindices_xy = (binindices_x == binindex_x) & (binindices_y == binindex_y)
scores_bin = scores[binindices_xy]
if len(scores_bin) > 0:
metric_kwargs = dict()
if weights is not None:
metric_kwargs["weights"] = weights[binindices_xy]
prof_score = metric(scores_bin, **metric_kwargs)
else:
prof_score = 0
line.append(prof_score)
hist.append(line)
hist = np.array(hist)
hist = hist.T # had a list of columns - needs to be list of rows
return hist, xedges, yedges
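# Minimal usage sketch (random data, purely illustrative): profile the mean score in 10x10 bins
#   valsx, valsy = np.random.rand(1000), np.random.rand(1000)
#   scores = np.random.rand(1000)
#   hist, xedges, yedges = get_profile_2D(valsx, valsy, scores, 10, 0, 1, 10, 0, 1)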
def plot_profile_2D(plotname, valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean,
weights=None,
**kwargs):
kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
hist, xedges, yedges = get_profile_2D(
valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=metric, weights=weights
)
plot_hist_2D(plotname, xedges, yedges, hist, **kwargs)
if __name__ == "__main__":
import sys
from .toolkit import ClassificationProject
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
from .utils import get_single_neuron_function, get_max_activation_events
import meme
# meme.setOptions(overrideCache="/scratch-local/nhartmann/meme_cache")
if len(sys.argv) < 2:
c = ClassificationProject("/project/etp/nhartmann/p/keras/021-check-low-vs-high-fewvar/all_high")
else:
c = ClassificationProject(sys.argv[1])
def test_mean_signal():
c._load_data() # untransformed
mean_signal = get_mean_event(c.x_test, c.y_test, 1)
print("Mean signal: ")
for branch_index, val in enumerate(mean_signal):
print("{:>20}: {:<10.3f}".format(c.fields[branch_index], val))
plot_NN_vs_var_1D("met.pdf", mean_signal,
scorefun=c.evaluate,
var_index=c.fields.index("met"),
var_range=(0, 1000, 10),
var_label="met [GeV]")
plot_NN_vs_var_1D("mt.pdf", mean_signal,
scorefun=c.evaluate,
var_index=c.fields.index("mt"),
var_range=(0, 500, 10),
var_label="mt [GeV]")
plot_NN_vs_var_2D("mt_vs_met.pdf", means=mean_signal,
scorefun=c.evaluate,
varx_index=c.fields.index("met"),
vary_index=c.fields.index("mt"),
nbinsx=100, xmin=0, xmax=1000,
nbinsy=100, ymin=0, ymax=500,
varx_label="met [GeV]", vary_label="mt [GeV]")
plot_NN_vs_var_2D_all("mt_vs_met_all.pdf", means=mean_signal,
model=c.model, transform_function=c.transform,
varx_index=c.fields.index("met"),
vary_index=c.fields.index("mt"),
nbinsx=100, xmin=0, xmax=1000,
nbinsy=100, ymin=0, ymax=500,
varx_label="met [GeV]", vary_label="mt [GeV]")
input_transform = c.transform
if hasattr(c, "get_input_list"):
input_transform = lambda x : c.get_input_list(c.transform(x))
plot_NN_vs_var_2D("mt_vs_met_crosscheck.pdf", means=mean_signal,
scorefun=get_single_neuron_function(c.model, layer=3, neuron=0, input_transform=input_transform),
varx_index=c.fields.index("met"),
vary_index=c.fields.index("mt"),
nbinsx=100, xmin=0, xmax=1000,
nbinsy=100, ymin=0, ymax=500,
varx_label="met [GeV]", vary_label="mt [GeV]")
def test_max_act():
# transformed events
c.load(reload=True)
ranges = [np.percentile(c.x_test[:,var_index], [1,99]) for var_index in range(len(c.fields))]
losses, events = get_max_activation_events(c.model, ranges, ntries=100000, layer=3, neuron=0, threshold=0.2)
events = c.inverse_transform(events)
plot_hist_2D_events(
"mt_vs_met_actmaxhist.pdf",
events[:,c.fields.index("met")],
events[:,c.fields.index("mt")],
100, 0, 1000,
100, 0, 500,
varx_label="met [GeV]", vary_label="mt [GeV]",
)
plot_hist_2D_events(
"mt_vs_output_actmax.pdf",
events[:,c.fields.index("mt")],
losses,
100, 0, 500,
100, 0, 1,
varx_label="mt [GeV]", vary_label="NN output",
log=True,
)
def test_cond_max_act():
c.load(reload=True)
ranges = [np.percentile(c.x_test[:,var_index], [1,99]) for var_index in range(len(c.fields))]
plot_cond_avg_actmax_2D(
"mt_vs_met_cond_actmax.pdf",
c.model, 3, 0, ranges,
c.fields.index("met"),
c.fields.index("mt"),
30, 0, 1000,
30, 0, 500,
transform=c.transform, inverse_transform=c.inverse_transform,
varx_label="met [GeV]", vary_label="mt [GeV]",
)
def test_xtest_vs_output():
c.load(reload=True)
utrf_x_test = c.inverse_transform(c.x_test)
plot_hist_2D_events(
"mt_vs_output_signal_test.pdf",
utrf_x_test[c.y_test==1][:,c.fields.index("mt")],
c.scores_test[c.y_test==1].reshape(-1),
100, 0, 1000,
100, 0, 1,
varx_label="mt [GeV]", vary_label="NN output",
log=True,
)
plot_hist_2D_events(
"mt_vs_met_signal.pdf",
utrf_x_test[c.y_test==1][:,c.fields.index("met")],
utrf_x_test[c.y_test==1][:,c.fields.index("mt")],
100, 0, 1000,
100, 0, 500,
varx_label="met [GeV]",
vary_label="mt [GeV]",
log=True,
)
plot_hist_2D_events(
"mt_vs_met_backgound.pdf",
utrf_x_test[c.y_test==0][:,c.fields.index("met")],
utrf_x_test[c.y_test==0][:,c.fields.index("mt")],
100, 0, 1000,
100, 0, 500,
varx_label="met [GeV]",
vary_label="mt [GeV]",
log=True,
)
# plot_hist_2D_events(
# "apl_vs_output_actmax.pdf",
# events[:,c.fields.index("LepAplanarity")],
# losses,
# 100, 0, 0.1,
# 100, 0, 1,
# varx_label="Aplanarity", vary_label="NN output",
# )
def test_profile():
c.load(reload=True)
utrf_x_test = c.inverse_transform(c.x_test)
plot_profile_2D(
"mt_vs_met_profilemean_sig.pdf",
utrf_x_test[c.y_test==1][:,c.fields.index("met")],
utrf_x_test[c.y_test==1][:,c.fields.index("mt")],
c.scores_test[c.y_test==1].reshape(-1),
20, 0, 500,
20, 0, 1000,
varx_label="met [GeV]", vary_label="mt [GeV]",
)
plot_profile_2D(
"mt_vs_met_profilemax_sig.pdf",
utrf_x_test[c.y_test==1][:,c.fields.index("met")],
utrf_x_test[c.y_test==1][:,c.fields.index("mt")],
c.scores_test[c.y_test==1].reshape(-1),
20, 0, 500,
20, 0, 1000,
metric=np.max,
varx_label="met [GeV]", vary_label="mt [GeV]",
)
for obj in dir():
if obj.startswith("test_") and callable(locals()[obj]):
if (len(sys.argv) > 2) and (not sys.argv[2] == obj):
continue
print("Running {}".format(obj))
locals()[obj]()
#!/usr/bin/env python
import os
import argparse
import keras
import h5py
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
from KerasROOTClassification import ClassificationProject
parser = argparse.ArgumentParser(description='Evaluate a model from a classification project using the given '
'weights and plot the ROC curve and train/test overlaid scores')
parser.add_argument("project_dir")
parser.add_argument("weights")
parser.add_argument("-p", "--plot-prefix", default="eval_nn")
args = parser.parse_args()
c = ClassificationProject(args.project_dir)
c.model.load_weights(args.weights)
print("Predicting for test sample ...")
scores_test = c.evaluate(c.x_test)
print("Done")
fpr, tpr, threshold = roc_curve(c.y_test, scores_test, sample_weight = c.w_test)
fpr = 1.0 - fpr
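# after this, fpr actually holds the background rejection (1 - false positive rate),
# which is what gets plotted on the y-axis below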
try:
roc_auc = auc(tpr, fpr)
except ValueError:
print("Got a ValueError from auc - rerunning with reorder=True")
roc_auc = auc(tpr, fpr, reorder=True)
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.plot(tpr, fpr, label=str(c.name + " (AUC = {})".format(roc_auc)))
plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
plt.ylabel("Background rejection")
plt.xlabel("Signal efficiency")
plt.title('Receiver operating characteristic')
plt.xlim(0,1)
plt.ylim(0,1)
plt.xticks(np.arange(0,1,0.1))
plt.yticks(np.arange(0,1,0.1))
plt.legend(loc='lower left', framealpha=1.0)
plt.savefig(args.plot_prefix+"_ROC.pdf")
plt.clf()
#!/usr/bin/env python
import sys, argparse, re, random
parser = argparse.ArgumentParser(description="generate events that maximise the activation for a given neuron")
parser.add_argument("input_project")
parser.add_argument("output_file")
parser.add_argument("-n", "--nevents", default=100000, type=int)
parser.add_argument("-j", "--mask-jets", action="store_true",
help="mask variables called jet*Pt/Eta/Phi and generate a random uniform distribution of the number of jets (only nescessary for non-recurrent NN)")
args = parser.parse_args()
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
import h5py
import numpy as np
from KerasROOTClassification.utils import (
weighted_quantile,
get_max_activation_events,
create_random_event,
get_ranges
)
from KerasROOTClassification import load_from_dir
import meme
meme.setOptions(deactivated=True)
input_project = args.input_project
output_file = args.output_file
c = load_from_dir(input_project)
c._load_data()
ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
def mask_uniform(x, mask_value, recurrent_field_idx):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for rec_idx in recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(rec_idx.shape[1]+1))
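# draw a uniform random number of objects to keep (0 up to the number of recurrent
# entries for this field); every object from that index onwards is set to mask_value below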
for obj_number, line_idx in enumerate(rec_idx.reshape(*rec_idx.shape[1:])):
if obj_number == nobj:
masked=True
if masked:
evt[line_idx] = mask_value
def get_input_flat(x):
return x[0].reshape(-1, len(c.fields))
if args.mask_jets:
jet_fields = {}
for field_name in c.fields:
if any(field_name.startswith("jet") and field_name.endswith(suffix) for suffix in ["Pt", "Eta", "Phi"]):
jet_number = re.findall("[0-9]+", field_name)[0]
if jet_number not in jet_fields:
jet_fields[jet_number] = []
jet_fields[jet_number].append(c.fields.index(field_name))
jet_fields = [np.array([[v for k, v in sorted(jet_fields.items(), key=lambda l:l[0])]])]
def input_transform(x):
x = np.array(x)
if hasattr(c, "mask_uniform"):
c.mask_uniform(x)
return c.get_input_list(x)
elif args.mask_jets:
mask_uniform(x, c.mask_value, jet_fields)
return x
opt_kwargs = dict()
if hasattr(c, "mask_uniform"):
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = c.get_input_flat
if args.mask_jets:
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = get_input_flat
evts = get_max_activation_events(
c.model, ranges,
ntries=args.nevents,
layer=len(c.model.layers)-1,
neuron=0,
maxit=10,
seed=45,
threshold=0,
**opt_kwargs
)
with h5py.File(output_file, "w") as f:
f.create_dataset("actmax", data=evts[1])
#!/usr/bin/env python
import sys
import argparse
parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
parser.add_argument("project_dir")
parser.add_argument("output_filename")
parser.add_argument("varx")
parser.add_argument("vary")
parser.add_argument("-m", "--mode",
choices=["mean_sig", "mean_bkg", "profile_sig", "profile_bkg", "hist_sig", "hist_bkg", "hist_actmax", "cond_actmax"],
default="mean_sig")
parser.add_argument("-l", "--layer", type=int, help="Layer index (takes last layer by default)")
parser.add_argument("-n", "--neuron", type=int, default=0, help="Neuron index (takes first neuron by default)")
parser.add_argument("-a", "--all-neurons", action="store_true", help="Create a summary plot for all neurons in all hidden layers")
parser.add_argument("--log", action="store_true", help="Plot in color in log scale")
parser.add_argument("--contour", action="store_true", help="Interpolate with contours")
parser.add_argument("-b", "--nbins", default=20, type=int, help="Number of bins in x and y direction")
parser.add_argument("-x", "--xrange", type=float, nargs="+", help="xrange (low, high)")
parser.add_argument("-y", "--yrange", type=float, nargs="+", help="yrange (low, high)")
parser.add_argument("-p", "--profile-metric", help="metric for profile modes", default="average", choices=["mean", "average", "max"])
parser.add_argument("--ntries-cond-actmax", help="number of random events to be maximised and averaged per bin", default=20, type=int)
parser.add_argument("--nit-cond-actmax", help="number of iterations for maximisation per bin", default=1, type=int)
parser.add_argument("--ntries-actmax", help="number of random events to be maximised for hist_actmax", default=10000, type=int)
parser.add_argument("-t", "--threshold", help="minimum activation threshold", default=0.2, type=float)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-s", "--step-size", help="step size for activation maximisation", default=1., type=float)
args = parser.parse_args()
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
if args.all_neurons and (not args.mode.startswith("mean")):
parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
c = load_from_dir(args.project_dir)
plot_vs_activation = (args.vary == "activation")
layer = args.layer
neuron = args.neuron
if layer is None:
layer = len(c.model.layers)-1
varx_index = c.fields.index(args.varx)
if not plot_vs_activation:
vary_index = c.fields.index(args.vary)
else:
vary_index = 0 # dummy value in this case
varx_label = args.varx
vary_label = args.vary
total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]
try:
mask_value = c.mask_value
except AttributeError:
mask_value = None
# varx_test = c.x_test[:,varx_index]
# vary_test = c.x_test[:,vary_index]
# x_not_masked = np.where(varx_test != mask_value)[0]
# y_not_masked = np.where(vary_test != mask_value)[0]
# percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked])
# percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked])
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index, max_evts=10000)[0][0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index, max_evts=10000)[0][0]
if args.xrange is not None:
if len(args.xrange) < 3:
args.xrange.append(args.nbins)
varx_range = args.xrange
else:
varx_range = (percentilesx[0], percentilesx[1], args.nbins)
if args.yrange is not None:
if len(args.yrange) < 3:
args.yrange.append(args.nbins)
vary_range = args.yrange
else:
vary_range = (percentilesy[0], percentilesy[1], args.nbins)
if plot_vs_activation:
vary_range = (0, 1, args.nbins)
if args.mode.startswith("mean"):
if args.mode == "mean_sig":
means = get_mean_event(c.x_test, c.y_test, 1, mask_value=mask_value)
elif args.mode == "mean_bkg":
means = get_mean_event(c.x_test, c.y_test, 0, mask_value=mask_value)
print(means)
if hasattr(c, "get_input_list"):
input_transform = lambda x : c.get_input_list(c.transform(x))
else:
input_transform = c.transform
if not args.all_neurons:
plot_NN_vs_var_2D(
args.output_filename,
means=means,
varx_index=varx_index,
vary_index=vary_index,
scorefun=get_single_neuron_function(
c.model, layer, neuron,
input_transform=input_transform
),
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
varx_label=varx_label, vary_label=vary_label,
logscale=args.log, only_pixels=(not args.contour)
)
else:
if hasattr(c, "get_input_list"):
transform_function = lambda inp : c.get_input_list(c.scaler.transform(inp))
else:
transform_function = c.scaler.transform
plot_NN_vs_var_2D_all(
args.output_filename,
means=means,
model=c.model,
transform_function=transform_function,
varx_index=varx_index,
vary_index=vary_index,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
logz=args.log,
plot_last_layer=False,
)
elif args.mode.startswith("profile"):
def my_average(x, weights):
if weights.sum() <= 0:
return np.nan
else:
return np.average(x, weights=weights)
metric_dict = {
"mean" : np.mean,
"max" : np.max,
"average" : my_average,
}
if args.mode == "profile_sig":
class_index = 1
else:
class_index = 0
valsx = c.x_test[c.y_test==class_index][:,varx_index]
valsy = c.x_test[c.y_test==class_index][:,vary_index]
scores = c.scores_test[c.y_test==class_index].reshape(-1)
opt_kwargs = dict()
if args.profile_metric == "average":
opt_kwargs["weights"] = c.w_test[c.y_test==class_index]
plot_profile_2D(
args.output_filename,
valsx, valsy, scores,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
metric=metric_dict[args.profile_metric],
varx_label=varx_label, vary_label=vary_label,
log=args.log,
**opt_kwargs
)
elif args.mode.startswith("hist"):
if not args.mode == "hist_actmax":
if args.mode == "hist_sig":
class_index = 1
else:
class_index = 0
valsx = c.x_test[c.y_test==class_index][:,varx_index]
if not plot_vs_activation:
valsy = c.x_test[c.y_test==class_index][:,vary_index]
else:
valsy = c.scores_test[c.y_test==class_index].reshape(-1)
weights = c.w_test[c.y_test==class_index]
else:
# ranges in which to sample the random events
x_test_scaled = c.transform(c.x_test)
ranges = get_ranges(x_test_scaled, [0.01, 0.99], total_weights, mask_value=mask_value)
kwargs = {}
if hasattr(c, "get_input_list"):
kwargs["input_transform"] = c.get_input_list
kwargs["input_inverse_transform"] = c.get_input_flat
losses, events = get_max_activation_events(c.model, ranges,
ntries=args.ntries_actmax,
step=args.step_size,
layer=layer,
neuron=neuron,
threshold=args.threshold,
**kwargs)
events = c.inverse_transform(events)
valsx = events[:,varx_index]
if not plot_vs_activation:
valsy = events[:,vary_index]
else:
valsy = losses
weights = None
plot_hist_2D_events(
args.output_filename,
valsx, valsy,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
weights=weights,
varx_label=varx_label, vary_label=vary_label,
log=args.log,
)
elif args.mode.startswith("cond_actmax"):
x_test_scaled = c.transform(c.x_test)
# ranges in which to sample the random events
ranges = get_ranges(x_test_scaled, [0.01, 0.99], total_weights, mask_value=mask_value)
plot_cond_avg_actmax_2D(
args.output_filename,
c.model, layer, neuron, ranges,
varx_index,
vary_index,
xmin=varx_range[0], xmax=varx_range[1], nbinsx=varx_range[2],
ymin=vary_range[0], ymax=vary_range[1], nbinsy=vary_range[2],
transform=c.transform, inverse_transform=c.inverse_transform,
ntries=args.ntries_cond_actmax,
maxit=args.nit_cond_actmax,
step=args.step_size,
varx_label=varx_label, vary_label=vary_label,
log=args.log,
)
#!/usr/bin/env python
"""
Write new TTrees with the signal parameters as branches. For the
backgrounds, the parameters are sampled from the combined distribution
over all signals. The counts are taken per complete tuple of signal
parameters, so that correlations between the signal parameters are
taken into account.
"""
import argparse, re, os
import ROOT
from root_numpy import list_trees
from root_pandas import read_root
import numpy as np
if __name__ == "__main__":
input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
param_names = ["mg", "mc", "mn"]
param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
output_signal_treename = "GG_oneStep_NoSys"
bkg_trees = [
"diboson_Sherpa221_NoSys",
"singletop_NoSys",
"ttbar_NoSys",
"ttv_NoSys",
"wjets_Sherpa221_NoSys",
"zjets_Sherpa221_NoSys",
]
# read in the number of events for each combination of parameters
f = ROOT.TFile.Open(input_filename)
count_dict = {}
for key in f.GetListOfKeys():
tree_name = key.GetName()
match = re.match(param_match, tree_name)
if match is not None:
tree = f.Get(tree_name)
params = tuple([float(i) for i in match.groups()])
if params not in count_dict:
count_dict[params] = 0
# TODO: might be better to use sum of weights
count_dict[params] += tree.GetEntries()
f.Close()
# calculate cumulative sum of counts to sample signal parameters for background from
numbers = np.array(list(count_dict.keys()), dtype=float)
counts = np.array(list(count_dict.values()), dtype=float)
probs = counts/counts.sum()
prob_bins = np.cumsum(probs)
# read and write the rest in chunks
if os.path.exists(output_filename):
os.remove(output_filename)
for tree_name in list_trees(input_filename):
match_signal = re.match(param_match, tree_name)
if match_signal is not None or tree_name in bkg_trees:
print("Writing {}".format(tree_name))
nwritten = 0
for df in read_root(input_filename, tree_name, chunksize=100000):
print("Writing event {}".format(nwritten))
if match_signal is None:
rnd = np.random.random(len(df))
rnd_idx = np.digitize(rnd, prob_bins)
param_values = numbers[rnd_idx]
for param_idx, param_name in enumerate(param_names):
df[param_name] = param_values[:,param_idx]
df["training_weight"] = df["eventWeight"]*df["genWeight"]
else:
for param_name, param_value in zip(param_names, match_signal.groups()):
df[param_name] = float(param_value)
df["training_weight"] = df["eventWeight"]
if match_signal is None:
out_tree_name = tree_name
else:
out_tree_name = output_signal_treename
df.to_root(output_filename, mode="a", key=out_tree_name)
nwritten += len(df)
import pytest
import numpy as np
import root_numpy
import pandas as pd
from sklearn.datasets import make_classification
from keras.layers import GRU
from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
def create_dataset(path):
# create example dataset with (low-weighted) noise added
X, y = make_classification(n_samples=10000, random_state=1)
X2 = np.random.normal(size=20*10000).reshape(-1, 20)
y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
X = np.concatenate([X, X2])
y = np.concatenate([y, y2])
w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
# shift and scale randomly (to check if transformation is working)
shift = np.random.rand(20)*100
scale = np.random.rand(20)*1000
X *= scale
X += shift
# write to root files
branches = ["var_{}".format(i) for i in range(len(X[0]))]
df = pd.DataFrame(X, columns=branches)
df["class"] = y
df["weight"] = w
tree_path_bkg = str(path / "bkg.root")
tree_path_sig = str(path / "sig.root")
root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
return branches, tree_path_sig, tree_path_bkg
def test_ClassificationProject(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProject(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
def test_ClassificationProjectRNN(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProjectRNN(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
recurrent_field_names=[
[
["var_1", "var_2", "var_3"],
["var_4", "var_5", "var_6"]
],
[
["var_10", "var_11", "var_12"],
["var_13", "var_14", "var_15"]
],
],
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
#!/usr/bin/env python
__all__ = ["load_from_dir", "ClassificationProject", "ClassificationProjectDataFrame", "ClassificationProjectRNN"]
from sys import version_info
if version_info[0] > 2:
raw_input = input
izip = zip
else:
from itertools import izip
import os
import json
import pickle
import importlib
import csv
import math
import glob
import shutil
import gc
import random
import logging
logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())
from root_numpy import tree2array, rec2array, array2root
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.utils.extmath import stable_cumsum
from sklearn.model_selection import KFold
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, LSTM, concatenate, SimpleRNN
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
from keras.activations import relu
import keras.initializers
import keras.optimizers
from keras.utils.vis_utils import model_to_dot
from keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile, poisson_asimov_significance
from .plotting import save_show
import ROOT
def byteify(input):
"From stackoverflow https://stackoverflow.com/a/13105359"
if isinstance(input, dict):
return {byteify(key): byteify(value)
for key, value in input.iteritems()}
elif isinstance(input, list):
return [byteify(element) for element in input]
elif isinstance(input, unicode):
return input.encode('utf-8')
else:
return input
if version_info[0] > 2:
byteify = lambda input : input
def set_session_threads(n_cpu=None):
"Set the number of threads based on OMP_NUM_THREADS or the given argument"
if n_cpu is None:
if os.environ.get('OMP_NUM_THREADS'):
n_cpu = int(os.environ.get('OMP_NUM_THREADS'))
else:
return
# not sure if this is the best configuration ...
config = tf.ConfigProto(intra_op_parallelism_threads=n_cpu,
inter_op_parallelism_threads=1,
allow_soft_placement=True,
#log_device_placement=True,
device_count = {'CPU': n_cpu})
session = tf.Session(config=config)
K.set_session(session)
def load_from_dir(path):
"Load a project and the options from a directory"
try:
with open(os.path.join(path, "info.json")) as f:
info = json.load(f)
project_type = info["project_type"]
if project_type == "ClassificationProjectRNN":
return ClassificationProjectRNN(path)
except (KeyError, IOError):
pass
return ClassificationProject(path)
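# Illustrative usage (the path is hypothetical):
#   c = load_from_dir("/path/to/existing/project")
#   # returns a ClassificationProjectRNN if info.json says so, otherwise a ClassificationProject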
class ClassificationProject(object):
verbose = 1 # verbosity of the fit method
"""Simple framework to load data from ROOT TTrees and train Keras
neural networks for classification according to some global settings.
See the `Keras documentation <https://keras.io>`_ for further information.
All data that is created is stored in the project directory and can be
reused later without having to be recreated.
:param name: Name of the project - this will also be the name of
the project directory in the output dir. If no further arguments
are given, this argument is interpreted as a directory name, from
which a previously created project should be initialised
:param signal_trees: list of tuples (filename, treename) for the data that should be used as signal
:param bkg_trees: list of tuples (filename, treename) for the data that should be used as background
:param branches: list of branch names or expressions to be used as input values for training
:param regression_branches: list of branch names to be used as regression targets
:param rename_branches: dictionary that maps branch expressions to names for better readability
:param weight_expr: expression to weight the events in the loss function
:param data_dir: if given, load the data from a previous project with the given name
instead of creating it from trees. If the data is on the same
disk (and the filesystem supports it), hard links will be used,
otherwise symlinks.
:param identifiers: list of branches or expressions that uniquely
identify events. This is used to store the list of training
events, such that they can be marked later on, for example when
creating friend trees with output score
:param selection: selection expression that events have to fulfill to be considered for training
:param layers: number of layers in the neural network
:param nodes: list of the number of nodes in each layer. If a single number is given, it is used for every layer
:param dropout: dropout fraction after each hidden layer. A list of per-layer dropout fractions can also be passed. Set to None for no dropout.
:param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.
:param use_bias: use a bias constant for each neuron (default: True). A list of booleans, one per layer, can also be passed.
:param batch_size: size of the training batches
:param validation_split: split off this fraction of training events for loss evaluation
:param kfold_splits: if given, split the training data into this number of subsets to perform KFold cross validation
:param kfold_index: index of the subset to leave out for kfold validation
:param activation_function: activation function in the hidden layers
:param activation_function_output: activation function in the output layer
:param leaky_relu_alpha: set this to a non-zero value to use the LeakyReLU variant with a slope in the negative part
:param out_dir: base directory in which the project directories should be stored
:param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")
:param step_signal: step size when selecting signal training events (e.g. 2 means take every second event)
:param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)
:param stop_train: stop after this number of events for reading in training events
:param stop_test: stop after this number of events for reading in test events
:param optimizer: name of optimizer class in keras.optimizers
:param optimizer_opts: dictionary of options for the optimizer
:param use_earlystopping: set true to use the keras EarlyStopping callback
:param earlystopping_opts: options for the keras EarlyStopping callback
:param use_modelcheckpoint: save the model weights after each epoch, by default only when the validation loss improves (unless the options below say otherwise).
:param modelcheckpoint_opts: options for the Keras ModelCheckpoint
callback. After training, the newest saved weight will be used. If
you change the format of the saved model weights it has to be of
the form "weights*.h5"
:param use_tensorboard: if True, use the tensorboard callback to write logs for tensorboard
:param tensorboard_opts: options for the TensorBoard callback
:param balance_dataset: if True, balance the dataset instead of
applying class weights. Only a fraction of the overrepresented
class is used in each epoch, with a different subset chosen
every epoch.
:param random_seed: use this seed value when initialising the model to produce consistent results.
:param shuffle_seed: use this seed for shuffling the training data
the first time. This seed (increased by one) is used again before
training when keras shuffling is used.
:param loss: loss function name (or list of names in case of regression targets)
:param loss_weights: (optional) list of weights to weight the individual losses (for multiple targets)
:param mask_value: value that is used for non-existent entries (e.g. 4th jet pt in events with 3 jets)
:param apply_class_weight: apply a weight that scales the events such that sumw(signal) = sumw(background)
:param normalize_weights: normalize the weights to mean 1
:param ignore_neg_weights: ignore events with negative weights in training - not recommended! (default: False)
:param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
:param shuffle: shuffle training data after (and before first) epoch
"""
# Datasets that are stored to (and dynamically loaded from) hdf5
dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test"]
# Datasets that are retrieved from ROOT trees the first time
dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]
def __init__(self, name, *args, **kwargs):
if len(args) < 1 and len(kwargs) < 1:
# if no further arguments given, interpret as directory name
self._init_from_dir(name)
else:
# otherwise initialise new project
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
pickle.dump(dict(args=args, kwargs=kwargs), of)
def _init_from_dir(self, dirname):
if not os.path.exists(os.path.join(dirname, "options.pickle")):
# for backward compatibility
with open(os.path.join(dirname, "options.json")) as f:
options = byteify(json.load(f))
else:
with open(os.path.join(dirname, "options.pickle"), "rb") as f:
options = pickle.load(f)
options["kwargs"]["project_dir"] = dirname
self._init_from_args(os.path.basename(dirname), *options["args"], **options["kwargs"])
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr,
regression_branches=None,
rename_branches=None,
project_dir=None,
data_dir=None,
identifiers=None,
selection=None,
layers=3,
nodes=64,
dropout=None,
dropout_input=None,
use_bias=True,
batch_size=128,
validation_split=0.33,
kfold_splits=None,
kfold_index=0,
activation_function='relu',
leaky_relu_alpha=None,
activation_function_output='sigmoid',
scaler_type="WeightedRobustScaler",
step_signal=2,
step_bkg=2,
stop_train=None,
stop_test=None,
optimizer="SGD",
optimizer_opts=None,
use_earlystopping=True,
earlystopping_opts=None,
use_modelcheckpoint=True,
modelcheckpoint_opts=None,
use_tensorboard=False,
tensorboard_opts=None,
random_seed=1234,
shuffle_seed=42,
balance_dataset=False,
loss='binary_crossentropy',
loss_weights=None,
mask_value=None,
apply_class_weight=True,
normalize_weights=True,
ignore_neg_weights=False,
kernel_initializer=None,
shuffle=True,
):
self.name = name
self.signal_trees = signal_trees
self.bkg_trees = bkg_trees
self.branches = branches
if rename_branches is None:
rename_branches = {}
self.rename_branches = rename_branches
if regression_branches is None:
regression_branches = []
self.regression_branches = regression_branches
self.weight_expr = weight_expr
self.selection = selection
self.project_dir = project_dir
if self.project_dir is None:
self.project_dir = name
if not os.path.exists(self.project_dir):
os.mkdir(self.project_dir)
self.data_dir = data_dir
if identifiers is None:
identifiers = []
self.identifiers = identifiers
self.layers = layers
self.nodes = nodes
if not isinstance(self.nodes, list):
self.nodes = [self.nodes for i in range(self.layers)]
if len(self.nodes) != self.layers:
self.layers = len(self.nodes)
logger.warning("Number of layers not equal to the given nodes "
"per layer - adjusted to " + str(self.layers))
self.dropout = dropout
if not isinstance(self.dropout, list):
self.dropout = [self.dropout for i in range(self.layers)]
self.dropout_input = dropout_input
self.use_bias = use_bias
if not isinstance(self.use_bias, list):
self.use_bias = [self.use_bias for i in range(self.layers)]
if len(self.dropout) != self.layers:
raise ValueError("List of dropout fractions has to have the same length as the number of layers!")
if len(self.use_bias) != self.layers:
raise ValueError("List of biases has to have the same length as the number of layers!")
self.batch_size = batch_size
self.validation_split = validation_split
self.kfold_splits = kfold_splits
self.kfold_index = kfold_index
self.activation_function = activation_function
self.leaky_relu_alpha = leaky_relu_alpha
if self.activation_function == "relu" and self.leaky_relu_alpha:
self.activation_function = lambda x : relu(x, alpha=self.leaky_relu_alpha)
self.activation_function_output = activation_function_output
self.scaler_type = scaler_type
self.step_signal = step_signal
self.step_bkg = step_bkg
self.stop_train = stop_train
self.stop_test = stop_test
self.optimizer = optimizer
self.use_earlystopping = use_earlystopping
self.use_modelcheckpoint = use_modelcheckpoint
self.use_tensorboard = use_tensorboard
if optimizer_opts is None:
optimizer_opts = dict()
self.optimizer_opts = optimizer_opts
if earlystopping_opts is None:
earlystopping_opts = dict()
self.earlystopping_opts = earlystopping_opts
if modelcheckpoint_opts is None:
modelcheckpoint_opts = dict(
save_best_only=True,
verbose=True,
filepath="weights.h5"
)
self.modelcheckpoint_opts = modelcheckpoint_opts
self.tensorboard_opts = dict(
log_dir=os.path.join(self.project_dir, "tensorboard"),
)
if tensorboard_opts is not None:
self.tensorboard_opts.update(**tensorboard_opts)
self.random_seed = random_seed
self.shuffle_seed = shuffle_seed
self.balance_dataset = balance_dataset
self.loss = loss
self.loss_weights = loss_weights
if self.regression_branches and (not isinstance(self.loss, list)):
self.loss = [self.loss]+["mean_squared_error"]*len(self.regression_branches)
self.mask_value = mask_value
self.apply_class_weight = apply_class_weight
self.normalize_weights = normalize_weights
self.ignore_neg_weights = ignore_neg_weights
self.kernel_initializer = kernel_initializer
self.shuffle = shuffle
self.s_train = None
self.b_train = None
self.s_test = None
self.b_test = None
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
self._scores_train = None
self._scores_test = None
# class weighted training data (divided by mean)
self._w_train_tot = None
self._s_eventlist_train = None
self._b_eventlist_train = None
self._scaler = None
self._scaler_target = None
self._class_weight = None
self._balanced_class_weight = None
self._model = None
self._history = None
self._callbacks_list = []
self._train_val_idx = None
# track the number of epochs this model has been trained
self.total_epochs = 0
self.data_loaded = False
# track if we are currently training
self.is_training = False
self._fields = None
self._target_fields = None
@property
def fields(self):
"Renamed branch expressions"
if self._fields is None:
self._fields = []
for branch_expr in self.branches:
self._fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._fields
@property
def target_fields(self):
"Renamed branch expressions for regression targets"
if self._target_fields is None:
self._target_fields = []
for branch_expr in self.regression_branches:
self._target_fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._target_fields
def rename_fields(self, ar):
"Rename fields of structured array"
fields = list(ar.dtype.names)
renamed_fields = []
for old_name in fields:
renamed_fields.append(self.rename_branches.get(old_name, old_name))
ar.dtype.names = tuple(renamed_fields)
def _load_data(self):
self._w_train_tot = None
try:
# if those don't exist, we need to load them from ROOT trees first
self._load_from_hdf5(*self.dataset_names_tree)
except (KeyError, IOError):
logger.info("Couldn't load all datasets - reading from ROOT trees")
# Read signal and background trees into structured numpy arrays
signal_chain = ROOT.TChain()
bkg_chain = ROOT.TChain()
for filename, treename in self.signal_trees:
signal_chain.AddFile(filename, -1, treename)
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain,
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain,
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_bkg, stop=self.stop_test)
if self.ignore_neg_weights:
self.s_train = self.s_train[self.s_train[self.weight_expr]>0]
self.b_train = self.b_train[self.b_train[self.weight_expr]>0]
self.rename_fields(self.s_train)
self.rename_fields(self.b_train)
self.rename_fields(self.s_test)
self.rename_fields(self.b_test)
self.s_eventlist_train = self.s_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self._dump_training_list()
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
def fill_target(x, s, b):
if not self.target_fields:
y = np.empty(len(x), dtype=np.bool)
y[:len(s)] = 1
y[len(s):] = 0
else:
y = np.empty((len(x), 1+len(self.target_fields)), dtype=np.float)
y[:len(s),0] = 1
y[len(s):,0] = 0
y[:len(s),1:] = rec2array(s[self.target_fields])
y[len(s):,1:] = rec2array(b[self.target_fields])
return y
self.y_train = fill_target(self.x_train, self.s_train, self.b_train)
self.b_train = None
self.s_train = None
self.x_test = rec2array(self.s_test[self.fields])
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_test = fill_target(self.x_test, self.s_test, self.b_test)
self.b_test = None
self.s_test = None
self._dump_to_hdf5(*self.dataset_names_tree)
self.data_loaded = True
def _dump_training_list(self):
s_eventlist_df = pd.DataFrame(self.s_eventlist_train)
b_eventlist_df = pd.DataFrame(self.b_eventlist_train)
s_eventlist_df.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
b_eventlist_df.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
@property
def s_eventlist_train(self):
if self._s_eventlist_train is None:
df = pd.read_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
self._s_eventlist_train = df.to_records()[self.identifiers]
return self._s_eventlist_train
@s_eventlist_train.setter
def s_eventlist_train(self, value):
self._s_eventlist_train = value
@property
def b_eventlist_train(self):
if self._b_eventlist_train is None:
df = pd.read_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
self._b_eventlist_train = df.to_records()[self.identifiers]
return self._b_eventlist_train
@b_eventlist_train.setter
def b_eventlist_train(self, value):
self._b_eventlist_train = value
def _dump_to_hdf5(self, *dataset_names):
if len(dataset_names) < 1:
dataset_names = self.dataset_names
for dataset_name in dataset_names:
filename = os.path.join(self.project_dir, dataset_name+".h5")
logger.info("Writing {} to {}".format(dataset_name, filename))
with h5py.File(filename, "w") as hf:
hf.create_dataset(dataset_name, data=getattr(self, dataset_name))
def _load_from_hdf5(self, *dataset_names):
if len(dataset_names) < 1:
dataset_names = self.dataset_names
for dataset_name in dataset_names:
filename = os.path.join(self.project_dir, dataset_name+".h5")
if (self.data_dir is not None) and (not os.path.exists(filename)):
srcpath = os.path.abspath(os.path.join(self.data_dir, dataset_name+".h5"))
try:
os.link(srcpath, filename)
logger.info("Created hardlink from {} to {}".format(srcpath, filename))
except OSError:
os.symlink(srcpath, filename)
logger.info("Created symlink from {} to {}".format(srcpath, filename))
logger.info("Trying to load {} from {}".format(dataset_name, filename))
with h5py.File(filename, "r") as hf:
setattr(self, dataset_name, hf[dataset_name][:])
logger.info("Data loaded")
@property
def callbacks_list(self):
self._callbacks_list = []
self._callbacks_list.append(self.history)
if self.use_earlystopping:
self._callbacks_list.append(EarlyStopping(**self.earlystopping_opts))
if self.use_modelcheckpoint:
mc = ModelCheckpoint(**self.modelcheckpoint_opts)
self._callbacks_list.append(mc)
if not os.path.dirname(mc.filepath) == self.project_dir:
mc.filepath = os.path.join(self.project_dir, mc.filepath)
logger.debug("Prepending project dir to ModelCheckpoint filepath: {}".format(mc.filepath))
if self.use_tensorboard:
self._callbacks_list.append(TensorBoard(**self.tensorboard_opts))
self._callbacks_list.append(CSVLogger(os.path.join(self.project_dir, "training.log"), append=True))
return self._callbacks_list
@property
def scaler(self):
# create the scaler (and fit to training data) if not existent
if self._scaler is None:
filename = os.path.join(self.project_dir, "scaler.pkl")
try:
self._scaler = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {}".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
scaler_fit_kwargs["mask_value"] = self.mask_value
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self._scaler.fit(self.x_train, **scaler_fit_kwargs)
self.scaler.copy = orig_copy_setting
joblib.dump(self._scaler, filename)
return self._scaler
@property
def scaler_target(self):
"same as scaler, but for scaling regression targets"
# create the scaler (and fit to training data) if not existent
if self._scaler_target is None:
filename = os.path.join(self.project_dir, "scaler_target.pkl")
try:
self._scaler_target = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {} for scaling the targets".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler_target = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler_target = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler_target = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self._scaler_target.copy
self._scaler_target.copy = False
self._scaler_target.fit(self.y_train, **scaler_fit_kwargs)
# don't scale the classification target here (only the regression targets)
self._scaler_target.center_[0] = 0.
self._scaler_target.scale_[0] = 1.
self._scaler_target.copy = orig_copy_setting
joblib.dump(self._scaler_target, filename)
return self._scaler_target
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.transform, batch_size)
else:
return self.scaler.transform(x)
def inverse_transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
else:
return self.scaler.inverse_transform(x)
def transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.transform, batch_size)
else:
return self.scaler_target.transform(y)
def inverse_transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.inverse_transform, batch_size)
else:
return self.scaler_target.inverse_transform(y)
@property
def history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
if self._history is None:
self._history = History()
if os.path.exists(params_file) and os.path.exists(history_file):
try:
with open(params_file) as f:
self._history.params = json.load(f)
with open(history_file) as f:
self._history.history = json.load(f)
except ValueError:
logger.warning("Couldn't load history - starting with empty one")
return self._history
@history.setter
def history(self, value):
self._history = value
def _dump_history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
with open(params_file, "w") as of:
json.dump(self.history.params, of)
with open(history_file, "w") as of:
json.dump(self.history.history, of)
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
with open(filename, "w") as of:
json.dump({}, of)
with open(filename) as f:
info = json.load(f)
return info.get(key, default)
def _write_info(self, key, value):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
with open(filename, "w") as of:
json.dump({}, of)
with open(filename) as f:
info = json.load(f)
info[key] = value
with open(filename, "w") as of:
json.dump(info, of)
@staticmethod
def query_yn(text):
result = None
while result is None:
input_text = raw_input(text)
if len(input_text) > 0:
if input_text.upper()[0] == "Y":
result = True
elif input_text.upper()[0] == "N":
result = False
return result
@property
def model(self):
"Simple MLP"
if self._model is None:
# input
input_layer = Input((len(self.fields),))
# optional dropout on inputs
if self.dropout_input is None:
hidden_layer = input_layer
else:
hidden_layer = Dropout(rate=self.dropout_input)(input_layer)
# densely connected hidden layers
for node_count, dropout_fraction, use_bias in zip(
self.nodes,
self.dropout,
self.use_bias,
):
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
hidden_layer = Dense(node_count, activation=self.activation_function, use_bias=use_bias, **extra_opts)(hidden_layer)
if (dropout_fraction is not None) and (dropout_fraction > 0):
hidden_layer = Dropout(rate=dropout_fraction)(hidden_layer)
# optional regression targets
extra_targets = []
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(hidden_layer)
extra_targets.append(extra_target)
if not self.target_fields:
# one output node for binary classification
output_layer = Dense(1, activation=self.activation_function_output)(hidden_layer)
outputs = [output_layer]
else:
# add another hidden layer on top of the regression targets and previous hidden layers
merge = concatenate([hidden_layer]+extra_targets)
hidden_layer2 = Dense(64, activation=self.activation_function)(merge)
output_class = Dense(1, activation=self.activation_function_output)(hidden_layer2)
outputs = [output_class]+extra_targets
self._model = Model(inputs=[input_layer], outputs=outputs)
self._compile_or_load_model()
return self._model
def _compile_or_load_model(self):
logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
Optimizer = getattr(keras.optimizers, self.optimizer)
optimizer = Optimizer(**self.optimizer_opts)
logger.info("Compile model")
rn_state = np.random.get_state()
np.random.seed(self.random_seed)
self._model.compile(optimizer=optimizer,
loss=self.loss,
loss_weights=self.loss_weights,
weighted_metrics=['accuracy']
)
np.random.set_state(rn_state)
if os.path.exists(os.path.join(self.project_dir, "weights.h5")):
if self.is_training:
continue_training = self.query_yn("Found previously trained weights - "
"continue training (choosing N will restart)? (Y/N) ")
else:
continue_training = True
if continue_training:
self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
logger.info("Found and loaded previously trained weights")
else:
logger.info("Starting completely new model")
else:
logger.info("No weights found, starting completely new model")
# dump to json for documentation
with open(os.path.join(self.project_dir, "model.json"), "w") as of:
of.write(self._model.to_json())
# plot model
with open(os.path.join(self.project_dir, "model.svg"), "wb") as of:
of.write(model_to_dot(self._model, show_shapes=True).create("dot", format="svg"))
@property
def class_weight(self):
if self._class_weight is None:
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated class_weight: {}".format(self._class_weight))
return self._class_weight
@property
def balanced_class_weight(self):
"""
Class weight for the balance_dataset method
Since each batch contains an equal number of signal and background
events, the class weights only have to balance the average weight
per event between the two classes.
"""
if self._balanced_class_weight is None:
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
# use sumw *per event* in this case
sumw_bkg /= len(self.w_train[self.l_train == 0])
sumw_sig /= len(self.w_train[self.l_train == 1])
self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight))
return self._balanced_class_weight
def load(self, reload=False):
"Load all data needed for plotting and training"
if reload:
self.data_loaded = False
if not self.data_loaded:
self._load_data()
@property
def l_train(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_train
else:
return self.y_train[:,0]
@property
def l_test(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_test
else:
return self.y_test[:,0]
@property
def w_train_tot(self):
"(sample weight * class weight), divided by mean"
if not self.balance_dataset:
class_weight = self.class_weight
else:
class_weight = self.balanced_class_weight
if not self.data_loaded:
raise ValueError("Data not loaded! can't calculate total weight")
if self._w_train_tot is None:
if self.apply_class_weight:
self._w_train_tot = self.w_train*np.array(class_weight)[self.l_train.astype(int)]
else:
self._w_train_tot = np.array(self.w_train)
if self.normalize_weights:
self._w_train_tot /= np.mean(self._w_train_tot)
return self._w_train_tot
@property
def validation_data(self):
"(Transformed) validation data for loss evaluation"
idx = self.train_val_idx[1]
x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_val_input = self.get_input_list(self.transform(x_val))
y_val_output = self.get_output_list(self.transform_target(y_val))
w_val_list = self.get_weight_list(w_val)
return x_val_input, y_val_output, w_val_list
@property
def training_data(self):
"(Transformed) Training data with validation data split off"
idx = self.train_val_idx[0]
x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_train_input = self.get_input_list(self.transform(x_train))
y_train_output = self.get_output_list(self.transform_target(y_train))
w_train_list = self.get_weight_list(w_train)
return x_train_input, y_train_output, w_train_list
@property
def train_val_idx(self):
if self._train_val_idx is None:
if self.kfold_splits is not None:
kfold = KFold(self.kfold_splits, shuffle=self.shuffle, random_state=self.shuffle_seed)
for i, train_val_idx in enumerate(kfold.split(self.x_train)):
if i == self.kfold_index:
self._train_val_idx = train_val_idx
break
else:
raise IndexError("Index {} out of range for kfold (requested {} splits)".format(self.kfold_index, self.kfold_splits))
else:
split_index = int((1-self.validation_split)*len(self.x_train))
np.random.seed(self.shuffle_seed)
if self.shuffle:
shuffled_idx = np.random.permutation(len(self.x_train))
else:
shuffled_idx = np.arange(len(self.x_train))
self._train_val_idx = (shuffled_idx[:split_index], shuffled_idx[split_index:])
return self._train_val_idx
@property
def steps_per_epoch(self):
return int(float(len(self.train_val_idx[0]))/float(self.batch_size))
def get_input_list(self, x):
"For the standard Dense models with single input, this does nothing"
return x
def get_output_list(self, y):
"Split target vector column wise in case of regression targets"
if not self.target_fields:
return y
else:
return np.hsplit(y, len(self.target_fields)+1)
def get_weight_list(self, w):
"Repeat weight n times for regression targets"
if not self.target_fields:
return w
else:
return [w]*(len(self.target_fields)+1)
def yield_batch(self):
"Batch generator - optionally shuffle the indices after each epoch"
x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
train_idx = list(self.train_val_idx[0])
np.random.seed(self.shuffle_seed+1)
logger.info("Generating training batches from {} signal and {} background events"
.format(len(np.where(self.l_train[train_idx]==1)[0]),
len(np.where(self.l_train[train_idx]==0)[0])))
while True:
if self.shuffle:
shuffled_idx = np.random.permutation(train_idx)
else:
shuffled_idx = train_idx
for start in range(0, len(shuffled_idx), int(self.batch_size)):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
x_input = self.get_input_list(self.transform(x_batch))
y_output = self.get_output_list(self.transform_target(y_batch))
w_list = self.get_weight_list(w_batch)
yield (x_input, y_output, w_list)
def yield_single_class_batch(self, class_label):
"""
Generate batches of half batch size, containing only entries for the given class label.
The weights are multiplied by balanced_class_weight.
"""
x_train, y_train, w_train = self.training_data
l_train = y_train[:,0] if self.target_fields else y_train
class_idx = np.where(l_train==class_label)[0]
while True:
# shuffle the indices for this class label
if self.shuffle:
shuffled_idx = np.random.permutation(class_idx)
else:
shuffled_idx = class_idx
# yield them batch wise
for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
y_train[shuffled_idx[start:start+int(self.batch_size/2)]],
w_train[shuffled_idx[start:start+int(self.batch_size/2)]])
def yield_balanced_batch(self):
"generate batches with equal amounts of both classes"
logcounter = 0
for batch_0, batch_1 in izip(self.yield_single_class_batch(0),
self.yield_single_class_batch(1)):
if logcounter == 10:
logger.debug("\rSumw sig*balanced_class_weight[1]: {}".format(np.sum(batch_1[2])))
logger.debug("\rSumw bkg*balanced_class_weight[0]: {}".format(np.sum(batch_0[2])))
logcounter = 0
logcounter += 1
yield (np.concatenate((batch_0[0], batch_1[0])),
np.concatenate((batch_0[1], batch_1[1])),
np.concatenate((batch_0[2], batch_1[2])))
def train(self, epochs=10, skip_checkpoint=False):
self.load()
self.total_epochs = self._read_info("epochs", 0)
set_session_threads()
logger.info("Train model")
if not self.balance_dataset:
try:
self.is_training = True
self.model.fit_generator(self.yield_batch(),
steps_per_epoch=self.steps_per_epoch,
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
else:
try:
self.is_training = True
labels, label_counts = np.unique(self.l_train, return_counts=True)
logger.info("Training on balanced batches")
# note: the batches have balanced_class_weight already applied
self.model.fit_generator(self.yield_balanced_batch(),
steps_per_epoch=int(min(label_counts)/self.batch_size),
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
if not skip_checkpoint:
self.checkpoint_model()
def checkpoint_model(self):
logger.info("Save history")
self._dump_history()
if not self.use_modelcheckpoint:
logger.info("Save weights")
self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))
else:
weight_file = sorted(glob.glob(os.path.join(self.project_dir, "weights*.h5")), key=lambda f:os.path.getmtime(f))[-1]
if not os.path.basename(weight_file) == "weights.h5":
logger.info("Copying latest weight file {} to weights.h5".format(weight_file))
shutil.copy(weight_file, os.path.join(self.project_dir, "weights.h5"))
logger.info("Reloading weights file since we are using model checkpoint!")
self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
self.total_epochs += self.history.epoch[-1]+1
self._write_info("epochs", self.total_epochs)
def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
"Calculate scores for training and test sample"
if mode is not None:
self._write_info("scores_mode", mode)
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
outputs = self.predict(
self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
mode=mode
)
if not self.target_fields:
scores_batch = outputs.reshape(-1)
else:
scores_batch = outputs[0].reshape(-1)
getattr(self, "scores_"+data_name)[start:stop] = scores_batch
self._dump_to_hdf5("scores_"+data_name)
if do_test:
eval_score("test")
if do_train:
eval_score("train")
def predict(self, x, mode=None):
"""
Calculate the scores for a (transformed) array of input values.
If the array is not transformed, use `evaluate` instead
"""
if mode is None:
# normal output - after activation function output layer
return self.model.predict(x)
elif mode == "skip_activation":
# output before applying activation function
# (after weighted sum + bias of last hidden layer)
if isinstance(self.model.input, list):
feed_dict={tuple(self.model.input) : x}
else:
feed_dict={self.model.input : x}
return K.get_session().run(
self.model.output.op.inputs[0],
feed_dict=feed_dict
)
else:
raise ValueError("Unknown mode {}".format(mode))
def evaluate(self, x_eval, mode=None):
"""
Calculate the scores for an array of input values.
All necessary transformations are applied.
"""
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
logger.debug("Evaluate for transformed array: {}".format(x_eval))
return self.predict(x_eval, mode=mode)
def write_friend_tree(self, score_name,
source_filename, source_treename,
target_filename, target_treename,
batch_size=100000,
score_mode=None,
fixed_params=None):
"TODO: doesn't work for regression targets"
f = ROOT.TFile.Open(source_filename)
tree = f.Get(source_treename)
entries = tree.GetEntries()
logger.info("Write friend tree for {} in {}".format(source_treename, source_filename))
if os.path.exists(target_filename):
raise IOError("{} already exists, if you want to recreate it, delete it first".format(target_filename))
for start in range(0, entries, batch_size):
logger.info("Evaluating score for entry {}/{}".format(start, entries))
logger.debug("Loading next batch")
x_from_tree = tree2array(tree,
branches=self.branches+self.identifiers,
start=start, stop=start+batch_size)
# for parametrized classifiers
if fixed_params is not None:
for param_name, value in fixed_params.items():
x_from_tree[param_name] = value
x_eval = rec2array(x_from_tree[self.branches])
if len(self.identifiers) > 0:
# create a list of booleans that indicates which events were used for training
df_identifiers = pd.DataFrame(x_from_tree[self.identifiers])
total_train_list = self.s_eventlist_train
total_train_list = np.concatenate((total_train_list, self.b_eventlist_train))
merged = df_identifiers.merge(pd.DataFrame(total_train_list), on=tuple(self.identifiers), indicator=True, how="left")
is_train = np.array(merged["_merge"] == "both")
else:
is_train = np.zeros(len(x_eval))
# join scores and is_train array
scores = self.evaluate(x_eval, mode=score_mode).reshape(-1)
friend_df = pd.DataFrame(np.array(scores, dtype=[(score_name, np.float64)]))
friend_df[score_name+"_is_train"] = is_train
friend_tree = friend_df.to_records()[[score_name, score_name+"_is_train"]]
if start == 0:
mode = "recreate"
else:
mode = "update"
logger.debug("Write to root file")
array2root(friend_tree, target_filename, treename=target_treename, mode=mode)
logger.debug("Done")
def write_all_friend_trees(self):
pass
@staticmethod
def get_bin_centered_hist(x, scale_factor=None, **np_kwargs):
"Return bin centers, histogram and relative (!) errors"
hist, bins = np.histogram(x, **np_kwargs)
centers = (bins[:-1] + bins[1:]) / 2
if "weights" in np_kwargs:
bin_indices = np.digitize(x, bins)
sumw2 = np.array([np.sum(np_kwargs["weights"][bin_indices==i]**2)
for i in range(1, len(bins)+1)])
sumw = np.array([np.sum(np_kwargs["weights"][bin_indices==i])
for i in range(1, len(bins)+1)])
# move overflow to last bin
# (since that's what np.histogram gives us)
sumw2[-2] += sumw2[-1]
sumw2 = sumw2[:-1]
sumw[-2] += sumw[-1]
sumw = sumw[:-1]
# calculate relative error
errors = np.sqrt(sumw2)/sumw
else:
errors = np.sqrt(hist)/hist
if scale_factor is not None:
hist *= scale_factor
return centers, hist, errors
def plot_input(self, var_index, ax=None, from_training_batches=False, max_n_batches=None):
"""
plot a single input variable as a histogram (signal vs background)
:param from_training_batches: use data from training batch generator
:param max_n_batches: if training batch generator is used, just use
this number of batches (otherwise steps_per_epoch is used)
"""
branch = self.fields[var_index]
if ax is None:
fig, ax = plt.subplots()
else:
fig = None
if not from_training_batches:
bkg = self.x_train[:,var_index][self.l_train == 0]
sig = self.x_train[:,var_index][self.l_train == 1]
bkg_weights = self.w_train_tot[self.l_train == 0]
sig_weights = self.w_train_tot[self.l_train == 1]
else:
bkg = None
sig = None
bkg_weights = None
sig_weights = None
if max_n_batches is not None:
n_batches = max_n_batches
else:
n_batches = self.steps_per_epoch
for i_batch, (x, y, w) in enumerate(self.yield_batch()):
if i_batch > n_batches:
break
if self.target_fields:
y = y[0]
try:
x = self.get_input_flat(x)
except AttributeError:  # get_input_flat is only defined for the RNN project
pass
bkg_batch = x[:,var_index][y==0]
sig_batch = x[:,var_index][y==1]
bkg_weights_batch = w[y==0]
sig_weights_batch = w[y==1]
if bkg is None:
bkg = bkg_batch
sig = sig_batch
bkg_weights = bkg_weights_batch
sig_weights = sig_weights_batch
else:
bkg = np.concatenate([bkg, bkg_batch])
sig = np.concatenate([sig, sig_batch])
bkg_weights = np.concatenate([bkg_weights, bkg_weights_batch])
sig_weights = np.concatenate([sig_weights, sig_weights_batch])
if hasattr(self, "mask_value"):
bkg_not_masked = np.where(bkg != self.mask_value)[0]
sig_not_masked = np.where(sig != self.mask_value)[0]
bkg = bkg[bkg_not_masked]
sig = sig[sig_not_masked]
bkg_weights = bkg_weights[bkg_not_masked]
sig_weights = sig_weights[sig_not_masked]
if self.balance_dataset:
if len(sig) < len(bkg):
logger.warning("Plotting only up to {} bkg events, since we use balance_dataset".format(len(sig)))
bkg = bkg[0:len(sig)]
bkg_weights = bkg_weights[0:len(sig)]
else:
logger.warning("Plotting only up to {} sig events, since we use balance_dataset".format(len(bkg)))
sig = sig[0:len(bkg)]
sig_weights = sig_weights[0:len(bkg)]
logger.debug("Plotting bkg (min={}, max={}) from {}".format(np.min(bkg), np.max(bkg), bkg))
logger.debug("Plotting sig (min={}, max={}) from {}".format(np.min(sig), np.max(sig), sig))
# calculate percentiles to get a heuristic for the range to be plotted
x_total = np.concatenate([bkg, sig])
w_total = np.concatenate([bkg_weights, sig_weights])
plot_range = weighted_quantile(
x_total,
[0.01, 0.99],
sample_weight=w_total,
)
logger.debug("Calculated range based on percentiles: {}".format(plot_range))
bins = 50
# check if we have a distribution of integer numbers (e.g. njet or something categorical)
# in that case we want to have a bin for each number
if (x_total == x_total.astype(int)).all():
plot_range = (math.floor(plot_range[0])-0.5, math.ceil(plot_range[1])+0.5)
bins = int(plot_range[1]-plot_range[0])
try:
centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights)
centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights)
except ValueError:
# workaround for a numpy bug (probably not reliable in all cases)
plot_range = (float("{:.3f}".format(plot_range[0])), float("{:.3f}".format(plot_range[1])))
logger.warning("Got a value error during plotting, maybe this is due to a numpy bug - changing range to {}".format(plot_range))
centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights)
centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights)
if bins > 1:
width = centers_sig[1]-centers_sig[0]
else:
width = 1.
ax.bar(centers_bkg, hist_bkg, color="b", alpha=0.5, width=width)
ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
label = branch
ax.set_xlabel(label)
if fig is not None:
plot_dir = os.path.join(self.project_dir, "plots")
if not os.path.exists(plot_dir):
os.mkdir(plot_dir)
return save_show(plt, fig, os.path.join(plot_dir, "var_{}.pdf".format(var_index)))
def plot_all_inputs(self, **kwargs):
nrows = math.ceil(math.sqrt(len(self.fields)))
fig, axes = plt.subplots(nrows=int(nrows), ncols=int(nrows),
figsize=(3*nrows, 3*nrows),
gridspec_kw=dict(wspace=0.4, hspace=0.4))
for i in range(len(self.fields)):
self.plot_input(i, ax=axes.reshape(-1)[i], **kwargs)
return save_show(plt, fig, os.path.join(self.project_dir, "all_inputs.pdf"))
def plot_weights(self, bins=100, range=None):
fig, ax = plt.subplots()
bkg = self.w_train_tot[self.l_train == 0]
sig = self.w_train_tot[self.l_train == 1]
ax.hist(bkg, bins=bins, range=range, color="b", alpha=0.5)
ax.set_yscale("log")
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
fig, ax = plt.subplots()
ax.hist(sig, bins=bins, range=range, color="r", alpha=0.5)
ax.set_yscale("log")
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_sig.pdf"))
def plot_ROC(self, xlim=(0,1), ylim=(0,1)):
logger.info("Plot ROC curve")
fig, ax = plt.subplots()
ax.grid(color='gray', linestyle='--', linewidth=1)
for y, scores, weight, label in [
(self.l_train, self.scores_train, self.w_train, "train"),
(self.l_test, self.scores_test, self.w_test, "test")
]:
fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
fpr = 1.0 - fpr # background rejection
try:
roc_auc = auc(tpr, fpr)
except ValueError:
logger.warning("Got a value error from auc - trying to rerun with reorder=True")
roc_auc = auc(tpr, fpr, reorder=True)
ax.plot(tpr, fpr, label=str(self.name + " {} (AUC = {:.3f})".format(label, roc_auc)))
ax.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
ax.set_ylabel("Background rejection")
ax.set_xlabel("Signal efficiency")
ax.set_title('Receiver operating characteristic')
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
# plt.xticks(np.arange(0,1,0.1))
# plt.yticks(np.arange(0,1,0.1))
ax.legend(loc='lower left', framealpha=1.0)
return save_show(plt, fig, os.path.join(self.project_dir, "ROC.pdf"))
def plot_score(self, log=True, plot_opts=dict(bins=50, range=(0,1)),
ylim=None, xlim=None, density=True,
lumifactor=None, apply_class_weight=True,
invert_activation=False):
if invert_activation:
trf = self.get_inverse_act_fn()
else:
trf = lambda y : y
fig, ax = plt.subplots()
for scores, weights, y, class_label, fn, opts in [
(self.scores_train, self.w_train, self.l_train, 1, ax.bar, dict(color="r", label="signal train")),
(self.scores_train, self.w_train, self.l_train, 0, ax.bar, dict(color="b", label="background train")),
(self.scores_test, self.w_test, self.l_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
(self.scores_test, self.w_test, self.l_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
]:
weights = weights[y==class_label]
if apply_class_weight is True and (lumifactor is not None):
logger.warning("not applying class weight, since lumifactor given")
if apply_class_weight and (lumifactor is None):
weights = weights*self.class_weight[class_label]
if lumifactor is not None:
weights = weights*lumifactor
centers, hist, rel_errors = self.get_bin_centered_hist(
trf(scores[y==class_label].reshape(-1)),
weights=weights,
**plot_opts
)
width = centers[1]-centers[0]
if density:
hist = hist/width
if fn == ax.errorbar:
errors = rel_errors*hist
opts.update(yerr=errors)
else:
opts.update(width=width, alpha=0.5)
fn(centers, hist, **opts)
if log:
ax.set_yscale("log")
if ylim is not None:
ax.set_ylim(*ylim)
if xlim is not None:
ax.set_xlim(*xlim)
ax.set_xlabel("NN output")
if density:
ax.set_ylabel("dN / d(NN output)")
else:
ax.set_ylabel("Events / {:.2f}".format(width))
if apply_class_weight:
ax.set_title("Class weights applied")
ax.legend(loc='upper center', framealpha=0.5)
return save_show(plt, fig, os.path.join(self.project_dir, "scores.pdf"))
def plot_significance_hist(self, lumifactor=1., significance_function=None, plot_opts=dict(bins=50, range=(0, 1)), invert_activation=False):
"""
Plot significances based on a histogram of scores
"""
logger.info("Plot significances")
if invert_activation:
trf = self.get_inverse_act_fn()
else:
trf = lambda y : y
centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==1].reshape(-1)), weights=self.w_train[self.l_train==1], **plot_opts)
centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==0].reshape(-1)), weights=self.w_train[self.l_train==0], **plot_opts)
centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==1].reshape(-1)), weights=self.w_test[self.l_test==1], **plot_opts)
centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==0].reshape(-1)), weights=self.w_test[self.l_test==0], **plot_opts)
significances_train = []
significances_test = []
for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances in [
(hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train),
(hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test),
]:
# factor to rescale due to using only a fraction of events (training and test samples)
# normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
# normfactor_bkg = (np.sum(self.w_train[self.y_train==0])+np.sum(self.w_test[self.y_test==0]))/np.sum(w[y==0])
normfactor_sig = self.step_signal
normfactor_bkg = self.step_bkg
# first set nan values to 0 and multiply by lumi
for arr in hist_sig, hist_bkg, rel_errors_bkg:
arr[np.isnan(arr)] = 0
hist_sig *= lumifactor*normfactor_sig
hist_bkg *= lumifactor*normfactor_bkg
for i in range(len(hist_sig)):
s = sum(hist_sig[i:])
b = sum(hist_bkg[i:])
db = math.sqrt(sum((rel_errors_bkg[i:]*hist_bkg[i:])**2))
if significance_function is None:
try:
z = poisson_asimov_significance(s, 0, b, db)
except (ZeroDivisionError, ValueError) as e:
z = 0
else:
z = significance_function(s, b, db)
if z == float('inf'):
z = 0
logger.debug("s, b, db, z = {}, {}, {}, {}".format(s, b, db, z))
significances.append(z)
fig, ax = plt.subplots()
width = centers_sig_train[1]-centers_sig_train[0]
ax.plot(centers_bkg_train, significances_train, label="train, Z_max={}".format(np.amax(significances_train)))
ax.plot(centers_bkg_test, significances_test, label="test, Z_max={}".format(np.amax(significances_test)))
ax.set_xlabel("Cut on NN score")
ax.set_ylabel("Significance")
ax.legend(loc='lower center', framealpha=0.5)
return save_show(plt, fig, os.path.join(self.project_dir, "significances_hist.pdf"))
@staticmethod
def calc_s_ds_b_db(scores, y, w):
"""
Calculate the sum of weights of signal (s) and background (b), and the
square roots of the corresponding sums of squared weights (ds, db), for
all possible thresholds of the output score.
Following the implementation from sklearn.metrics.ranking._binary_clf_curve
"""
desc_score_indices = np.argsort(scores, kind="mergesort")[::-1]
scores_sorted = scores[desc_score_indices]
y_sorted = y[desc_score_indices]
w_sorted = w[desc_score_indices]
distinct_value_indices = np.where(np.diff(scores_sorted))[0]
threshold_idxs = np.r_[distinct_value_indices, y_sorted.size - 1]
s_sumw = stable_cumsum(y_sorted * w_sorted)[threshold_idxs]
s_sumw2 = stable_cumsum(y_sorted * (w_sorted**2))[threshold_idxs]
b_sumw = stable_cumsum(np.logical_not(y_sorted) * w_sorted)[threshold_idxs]
b_sumw2 = stable_cumsum(np.logical_not(y_sorted) * (w_sorted**2))[threshold_idxs]
return s_sumw, np.sqrt(s_sumw2), b_sumw, np.sqrt(b_sumw2), scores_sorted[threshold_idxs]
def get_inverse_act_fn(self):
if not self.activation_function_output == "sigmoid":
raise NotImplementedError("Inverse function of {} not supported yet - "
"currently only sigmoid"
.format(self.activation_function_output))
return lambda y : np.log(y/(1-y))
def plot_significance(self, significance_function=None, maxsteps=None, lumifactor=1., vectorized=False, invert_activation=False):
"""
Plot the significance when cutting on all possible thresholds, plotted against the signal efficiency.
"""
if significance_function is None:
vectorized = True
significance_function = poisson_asimov_significance
if invert_activation:
trf = self.get_inverse_act_fn()
else:
trf = lambda y : y
fig, ax = plt.subplots()
ax2 = ax.twinx()
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for (scores, y, w, label), col in zip(
[(self.scores_train, self.l_train, self.w_train, "train"),
(self.scores_test, self.l_test, self.w_test, "test")],
colors
):
scores = trf(scores)
s_sumws, s_errs, b_sumws, b_errs, thresholds = self.calc_s_ds_b_db(scores, y, w)
if maxsteps is not None:
stepsize = int(len(s_sumws))/int(maxsteps)
else:
stepsize = 1
if stepsize == 0:
stepsize = 1
s_sumws = s_sumws[::stepsize]*lumifactor*self.step_signal
s_errs = s_errs[::stepsize]*lumifactor*self.step_signal
b_sumws = b_sumws[::stepsize]*lumifactor*self.step_bkg
b_errs = b_errs[::stepsize]*lumifactor*self.step_bkg
nonzero_b = np.where(b_sumws!=0)[0]
s_sumws = s_sumws[nonzero_b]
s_errs = s_errs[nonzero_b]
b_sumws = b_sumws[nonzero_b]
b_errs = b_errs[nonzero_b]
thresholds = thresholds[nonzero_b]
if not vectorized:
zs = []
for s, ds, b, db in zip(s_sumws, s_errs, b_sumws, b_errs):
zs.append(significance_function(s, ds, b, db))
else:
zs = significance_function(s_sumws, s_errs, b_sumws, b_errs)
ax.plot(s_sumws/s_sumws[-1], zs, label=label, color=col)
ax2.plot(s_sumws/s_sumws[-1], thresholds, "--", color=col)
ax.set_xlabel("Signal efficiency")
ax.set_ylabel("Significance")
ax.set_xlim(0, 1)
ax2.set_ylabel("Threshold")
ax.legend()
return save_show(plt, fig, os.path.join(self.project_dir, "significances.pdf"))
@property
def csv_hist(self):
with open(os.path.join(self.project_dir, "training.log")) as f:
reader = csv.reader(f)
history_list = list(reader)
hist_dict = {}
for hist_key_index, hist_key in enumerate(history_list[0]):
hist_dict[hist_key] = [float(line[hist_key_index]) for line in history_list[1:]]
return hist_dict
def plot_loss(self, all_trainings=False, log=False, ylim=None, xlim=None):
"""
Plot the value of the loss function for each epoch
:param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
"""
if all_trainings:
hist_dict = self.csv_hist
else:
hist_dict = self.history.history
if (not 'loss' in hist_dict) or (not 'val_loss' in hist_dict):
logger.warning("No previous history found for plotting, try global history")
hist_dict = self.csv_hist
logger.info("Plot losses")
fig, ax = plt.subplots()
ax.plot(hist_dict['loss'])
ax.plot(hist_dict['val_loss'])
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['training data','validation data'], loc='upper left')
if log:
ax.set_yscale("log")
if xlim is not None:
ax.set_xlim(*xlim)
if ylim is not None:
ax.set_ylim(*ylim)
return save_show(plt, fig, os.path.join(self.project_dir, "losses.pdf"))
def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
"""
Plot the value of the accuracy metric for each epoch
:param all_trainings: set to true if you want to plot all trainings (otherwise the previous history is used)
"""
if all_trainings:
hist_dict = self.csv_hist
else:
hist_dict = self.history.history
if (not acc_suffix in hist_dict) or (not 'val_'+acc_suffix in hist_dict):
logger.warning("No previous history found for plotting, try global history")
hist_dict = self.csv_hist
logger.info("Plot accuracy")
plt.plot(hist_dict[acc_suffix])
plt.plot(hist_dict['val_'+acc_suffix])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['training data', 'validation data'], loc='upper left')
if log:
plt.yscale("log")
plt.savefig(os.path.join(self.project_dir, "accuracy.pdf"))
plt.clf()
def plot_all(self):
self.plot_ROC()
# self.plot_accuracy()
self.plot_loss()
self.plot_score()
self.plot_weights()
# self.plot_significance()
def to_DataFrame(self):
df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
df["weight"] = np.concatenate([self.w_train, self.w_test])
df["labels"] = pd.Categorical.from_codes(
np.concatenate([self.l_train, self.l_test]),
categories=["background", "signal"]
)
for identifier in self.identifiers:
try:
df[identifier] = np.concatenate([self.s_eventlist_train[identifier],
self.b_eventlist_train[identifier],
-1*np.ones(len(self.x_test), dtype="i8")])
except IOError:
logger.warning("Can't find eventlist - DataFrame won't contain identifiers")
df["is_train"] = np.concatenate([np.ones(len(self.x_train), dtype=np.bool),
np.zeros(len(self.x_test), dtype=np.bool)])
return df
def create_getter(dataset_name):
def getx(self):
if getattr(self, "_"+dataset_name) is None:
self._load_from_hdf5(dataset_name)
return getattr(self, "_"+dataset_name)
return getx
def create_setter(dataset_name):
def setx(self, value):
setattr(self, "_"+dataset_name, value)
return setx
# define getters and setters for all datasets
for dataset_name in ClassificationProject.dataset_names:
setattr(ClassificationProject, dataset_name, property(create_getter(dataset_name),
create_setter(dataset_name)))
class ClassificationProjectDataFrame(ClassificationProject):
"""
A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees
"""
def __init__(self, name, *args, **kwargs):
if len(args) < 1 and len(kwargs) < 1:
# if no further arguments given, interpret as directory name
self._init_from_dir(name)
else:
# otherwise initialise new project
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
# don't put the dataframe into options.pickle!
if len(args) > 1:
args = args[1:]
else:
args = []
pickle.dump(dict(args=args, kwargs=kwargs), of)
def _init_from_args(self,
name,
df,
input_columns,
weight_column="weights",
label_column="labels",
signal_label="signal",
background_label="background",
split_mode="split_column",
split_column="is_train",
**kwargs):
self.df = df
self.input_columns = input_columns
self.weight_column = weight_column
self.label_column = label_column
self.signal_label = signal_label
self.background_label = background_label
if split_mode != "split_column":
raise NotImplementedError("'split_column' is the only currently supported split mode")
self.split_mode = split_mode
self.split_column = split_column
super(ClassificationProjectDataFrame, self)._init_from_args(
name,
signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
**kwargs
)
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
@property
def x_train(self):
if self._x_train is None:
self._x_train = self.df[self.df[self.split_column]][self.input_columns].values
return self._x_train
@x_train.setter
def x_train(self, value):
self._x_train = value
@property
def x_test(self):
if self._x_test is None:
self._x_test = self.df[~self.df[self.split_column]][self.input_columns].values
return self._x_test
@x_test.setter
def x_test(self, value):
self._x_test = value
@property
def y_train(self):
if self._y_train is None:
self._y_train = (self.df[self.df[self.split_column]][self.label_column] == self.signal_label).values
return self._y_train
@y_train.setter
def y_train(self, value):
self._y_train = value
@property
def y_test(self):
if self._y_test is None:
self._y_test = (self.df[~self.df[self.split_column]][self.label_column] == self.signal_label).values
return self._y_test
@y_test.setter
def y_test(self, value):
self._y_test = value
@property
def w_train(self):
if self._w_train is None:
self._w_train = self.df[self.df[self.split_column]][self.weight_column].values
return self._w_train
@w_train.setter
def w_train(self, value):
self._w_train = value
@property
def w_test(self):
if self._w_test is None:
self._w_test = self.df[~self.df[self.split_column]][self.weight_column].values
return self._w_test
@w_test.setter
def w_test(self, value):
self._w_test = value
@property
def fields(self):
return self.input_columns
def load(self, reload=False):
if reload:
self.data_loaded = False
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
self._w_train_tot = None
self.data_loaded = True
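# Usage sketch (illustrative only, names and values are placeholders): building
# a project from an in-memory DataFrame with a boolean train/test split column.
# Only the keyword names are taken from the _init_from_args signature below;
# everything else is made up for the example.
#
#   import pandas as pd
#   df = pd.DataFrame({
#       "met": [100., 200., 50.], "mt": [80., 120., 60.],
#       "weights": [1., 1., 1.],
#       "labels": ["signal", "background", "background"],
#       "is_train": [True, True, False],
#   })
#   project = ClassificationProjectDataFrame(
#       "my_df_project", df,
#       input_columns=["met", "mt"],
#       weight_column="weights",
#       label_column="labels",
#       split_column="is_train",
#   )
#   project.train(epochs=10)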
class ClassificationProjectRNN(ClassificationProject):
"""
A little wrapper to use recurrent units for things like jet collections
"""
def _init_from_args(self, name,
recurrent_field_names=None,
rnn_layer_nodes=32,
mask_value=-999,
recurrent_unit_type="GRU",
**kwargs):
"""
recurrent_field_names example (a list of groups, one group per recurrent
channel, each group containing one field-name list per object):
[[["jet1Pt", "jet1Eta", "jet1Phi"],
["jet2Pt", "jet2Eta", "jet2Phi"],
["jet3Pt", "jet3Eta", "jet3Phi"]],
[["lep1Pt", "lep1Eta", "lep1Phi", "lep1flav"],
["lep2Pt", "lep2Eta", "lep2Phi", "lep2flav"]]]
"""
super(ClassificationProjectRNN, self)._init_from_args(name, **kwargs)
self._write_info("project_type", "ClassificationProjectRNN")
self.recurrent_field_names = recurrent_field_names
if self.recurrent_field_names is None:
self.recurrent_field_names = []
self.rnn_layer_nodes = rnn_layer_nodes
self.mask_value = mask_value
self.recurrent_unit_type = recurrent_unit_type
# convert the field name lists to arrays of field indices
self.recurrent_field_idx = []
for field_name_list in self.recurrent_field_names:
field_names = np.array([field_name_list])
if field_names.dtype == np.object:
raise ValueError(
"Invalid entry for recurrent fields: {} - "
"please ensure that the length for all elements in the list is equal"
.format(field_names)
)
field_idx = (
np.array([self.fields.index(field_name)
for field_name in field_names.reshape(-1)])
.reshape(field_names.shape)
)
self.recurrent_field_idx.append(field_idx)
self.flat_fields = []
for field in self.fields:
if any(self.fields.index(field) in field_idx.reshape(-1) for field_idx in self.recurrent_field_idx):
continue
self.flat_fields.append(field)
if self.scaler_type != "WeightedRobustScaler":
raise NotImplementedError(
"Invalid scaler '{}' - only WeightedRobustScaler is currently supported for RNN"
.format(self.scaler_type)
)
@property
def model(self):
if self._model is None:
# following the setup from the tutorial:
# https://github.com/YaleATLAS/CERNDeepLearningTutorial
rnn_inputs = []
rnn_channels = []
for field_idx in self.recurrent_field_idx:
chan_inp = Input(field_idx.shape[1:])
channel = Masking(mask_value=self.mask_value)(chan_inp)
if self.recurrent_unit_type == "GRU":
channel = GRU(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "SimpleRNN":
channel = SimpleRNN(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "LSTM":
channel = LSTM(self.rnn_layer_nodes)(channel)
else:
raise NotImplementedError("{} not implemented".format(self.recurrent_unit_type))
logger.info("Added {} unit".format(self.recurrent_unit_type))
# TODO: configure dropout for recurrent layers
#channel = Dropout(0.3)(channel)
rnn_inputs.append(chan_inp)
rnn_channels.append(channel)
flat_input = Input((len(self.flat_fields),))
if self.dropout_input is not None:
flat_channel = Dropout(rate=self.dropout_input)(flat_input)
else:
flat_channel = flat_input
combined = concatenate(rnn_channels+[flat_channel])
for node_count, dropout_fraction in zip(self.nodes, self.dropout):
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
combined = Dense(node_count, activation=self.activation_function, **extra_opts)(combined)
if (dropout_fraction is not None) and (dropout_fraction > 0):
combined = Dropout(rate=dropout_fraction)(combined)
combined = Dense(1, activation=self.activation_function_output)(combined)
outputs = [combined]
# optional regression targets
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(combined)
outputs.append(extra_target)
self._model = Model(inputs=rnn_inputs+[flat_input], outputs=outputs)
self._compile_or_load_model()
return self._model
def clean_mask(self, x):
"""
Mask recurrent fields such that once a masked value occurs,
all values corresponding to the same and following objects are
masked as well. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
for line_idx in recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:]):
if (evt[line_idx] == self.mask_value).any():
masked=True
if masked:
evt[line_idx] = self.mask_value
def mask_uniform(self, x):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(recurrent_field_idx.shape[1]+1))
for obj_number, line_idx in enumerate(recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:])):
if obj_number == nobj:
masked=True
if masked:
evt[line_idx] = self.mask_value
def get_input_list(self, x):
"""
Returns a list of 3-dimensional inputs for each
recurrent layer and a 2-dimensional one for the normal flat inputs.
"""
x_input = []
for field_idx in self.recurrent_field_idx:
x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
x_input.append(x_recurrent)
x_flat = x[:,[self.fields.index(field_name) for field_name in self.flat_fields]]
x_input.append(x_flat)
return x_input
def get_input_flat(self, x):
"Transform the multiple inputs back to flat ntuple"
nevent = x[0].shape[0]
x_flat = np.empty((nevent, len(self.fields)), dtype=np.float)
# recurrent fields
for rec_ar, idx in zip(x, self.recurrent_field_idx):
idx = idx.reshape(-1)
for source_idx, target_idx in enumerate(idx):
x_flat[:,target_idx] = rec_ar.reshape(nevent, -1)[:,source_idx]
# flat fields
for source_idx, field_name in enumerate(self.flat_fields):
target_idx = self.fields.index(field_name)
x_flat[:,target_idx] = x[-1][:,source_idx]
return x_flat
def evaluate(self, x_eval, mode=None):
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
logger.debug("Evaluate for transformed array: {}".format(x_eval))
return self.predict(self.get_input_list(x_eval), mode=mode)
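# Usage sketch (illustrative only): an RNN project where two jets are fed as a
# sequence through a recurrent unit and the remaining branches stay flat. File
# name, tree names and branches are placeholders; the nested structure of
# recurrent_field_names follows the docstring of _init_from_args above.
#
#   c_rnn = ClassificationProjectRNN(
#       "test_rnn",
#       signal_trees=[(filename, "signal_NoSys")],
#       bkg_trees=[(filename, "ttbar_NoSys")],
#       branches=["jet1Pt", "jet1Eta", "jet2Pt", "jet2Eta", "met", "mt"],
#       weight_expr="eventWeight",
#       recurrent_field_names=[[["jet1Pt", "jet1Eta"],
#                               ["jet2Pt", "jet2Eta"]]],
#       mask_value=-999,
#       scaler_type="WeightedRobustScaler",
#   )
#   c_rnn.train(epochs=10)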
if __name__ == "__main__":
logging.basicConfig()
logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)
#logging.getLogger("KerasROOTClassification").setLevel(logging.DEBUG)
filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
c = ClassificationProject("test4",
signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
bkg_trees = [(filename, "ttbar_NoSys"),
(filename, "wjets_Sherpa221_NoSys")
],
optimizer="Adam",
#optimizer="SGD",
#optimizer_opts=dict(lr=100., decay=1e-6, momentum=0.9),
earlystopping_opts=dict(monitor='val_loss',
min_delta=0, patience=2, verbose=0, mode='auto'),
selection="1",
branches = ["met", "mt"],
weight_expr = "eventWeight*genWeight",
identifiers = ["DatasetNumber", "EventNumber"],
step_bkg = 100)
np.random.seed(42)
c.train(epochs=20)
#c.plot_all()
# c.write_friend_tree("test4_score",
# source_filename=filename, source_treename="GG_oneStep_1705_1105_505_NoSys",
# target_filename="friend.root", target_treename="test4_score")
# c.write_friend_tree("test4_score",
# source_filename=filename, source_treename="ttbar_NoSys",
# target_filename="friend_ttbar_NoSys.root", target_treename="test4_score")
"Helper functions using keras or tensorflow"
import logging
import numpy as np
import keras.backend as K
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing.data import _handle_zeros_in_scale
from meme import cache
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
def get_single_neuron_function(model, layer, neuron, input_transform=None):
inp = model.input
if not isinstance(inp, list):
inp = [inp]
f = K.function(inp+[K.learning_phase()], [model.layers[layer].output[:,neuron]])
def eval_single_neuron(x):
if input_transform is not None:
x_eval = input_transform(x)
else:
x_eval = x
if not isinstance(x_eval, list):
x_eval = [x_eval]
return f(x_eval)[0]
return eval_single_neuron
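# Example sketch (model, layer and neuron index are placeholders): get a
# callable that returns the activation of one neuron for a batch of already
# transformed events.
#
#   f = get_single_neuron_function(model, layer=2, neuron=0)
#   activations = f(x_transformed)  # one value per event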
def create_random_event(ranges, mask_probs=None, mask_value=None):
random_event = np.array([p[0]+(p[1]-p[0])*np.random.rand() for p in ranges])
random_event = random_event.reshape(-1, len(random_event))
# if given, mask values with a certain probability
if mask_probs is not None:
if mask_value is None:
raise ValueError("Need to provide mask_value if random events should be masked")
for var_index, mask_prob in enumerate(mask_probs):
random_event[:,var_index][np.random.rand(len(random_event)) < mask_prob] = mask_value
return random_event
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None, max_evts=None):
"Get ranges for plotting or random event generation based on quantiles"
ranges = []
mask_probs = []
if max_evts is not None:
rnd_idx = np.random.permutation(np.arange(len(x)))
rnd_idx = rnd_idx[:max_evts]
for var_index in range(x.shape[1]):
if (filter_index is not None) and (var_index != filter_index):
continue
x_var = x[:,var_index]
if max_evts is not None:
x_var = x_var[rnd_idx]
not_masked = np.where(x_var != mask_value)[0]
masked = np.where(x_var == mask_value)[0]
ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=weights[not_masked]))
mask_probs.append(float(len(masked))/float(len(x_var)))
return ranges, mask_probs
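# Example sketch combining the two helpers above (x_train, w_train and the mask
# value are placeholders): derive per-variable ranges and masking probabilities
# from training data, then draw one random event within those ranges.
#
#   ranges, mask_probs = get_ranges(x_train, [0.01, 0.99], w_train,
#                                   mask_value=-999, max_evts=10000)
#   event = create_random_event(ranges, mask_probs=mask_probs, mask_value=-999)
#   # event has shape (1, n_variables); masked entries are set to -999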
def max_activation_wrt_input(gradient_function, random_event, threshold=None, maxthreshold=None, maxit=100, step=1, const_indices=[],
input_transform=None, input_inverse_transform=None):
if input_transform is not None:
random_event = input_transform(random_event)
if not isinstance(random_event, list):
random_event = [random_event]
def iterate(random_event):
for i in range(maxit):
grads_out = gradient_function(random_event)
loss_value = grads_out[0][0]
grads_values = grads_out[1:]
# follow gradient for all inputs
for i, (grads_value, input_event) in enumerate(zip(grads_values, random_event)):
for const_index in const_indices:
grads_value[0][const_index] = 0
if threshold is not None:
if loss_value > threshold and (maxthreshold is None or loss_value < maxthreshold):
# found an event within the thresholds
return loss_value, random_event
elif (maxthreshold is not None and loss_value > maxthreshold):
random_event[i] -= grads_value*step
else:
random_event[i] += grads_value*step
else:
random_event[i] += grads_value*step
else:
if threshold is not None:
# no event found for the given threshold
return None, None
# otherwise return last status
return loss_value, random_event
loss_value, random_event = iterate(random_event)
if input_inverse_transform is not None and random_event is not None:
random_event = input_inverse_transform(random_event)
elif random_event is None:
return None
return loss_value, random_event
def get_grad_function(model, layer, neuron):
loss = model.layers[layer].output[:,neuron]
grads = K.gradients(loss, model.input)
# trick from https://blog.keras.io/how-convolutional-neural-networks-see-the-world.html
norm_grads = [grad/(K.sqrt(K.mean(K.square(grad))) + 1e-5) for grad in grads]
inp = model.input
if not isinstance(inp, list):
inp = [inp]
return K.function(inp, [loss]+norm_grads)
@cache(useJSON=True,
argHashFunctions=[
lambda model: [hash(i.tostring()) for i in model.get_weights()],
lambda ranges: [hash(i.tostring()) for i in ranges],
],
ignoreKwargs=["input_transform", "input_inverse_transform"],
)
def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, mask_probs=None, mask_value=None, **kwargs):
gradient_function = get_grad_function(model, layer, neuron)
events = None
losses = None
np.random.seed(seed)
for i in range(ntries):
if not (i%100):
logger.info(i)
res = max_activation_wrt_input(
gradient_function,
create_random_event(
ranges,
mask_probs=mask_probs,
mask_value=mask_value
),
**kwargs
)
if res is not None:
loss, event = res
loss = np.array([loss])
else:
continue
if events is None:
events = event
losses = loss
else:
events = np.concatenate([events, event])
losses = np.concatenate([losses, loss])
return losses, events
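# Example sketch (layer/neuron indices, ranges and the scaler are placeholders):
# search for input events that push the activation of a given neuron above a
# threshold, doing the gradient ascent in the transformed (scaled) space.
#
#   losses, events = get_max_activation_events(
#       model, ranges, ntries=1000, layer=-1, neuron=0,
#       threshold=0.9, maxit=50, step=0.1,
#       mask_probs=mask_probs, mask_value=-999,
#       input_transform=scaler.transform,
#       input_inverse_transform=scaler.inverse_transform,
#   )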
"from https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy#29677616"
def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False):
""" Very close to np.percentile, but supports weights.
NOTE: quantiles should be in [0, 1]!
:param values: np.array with data
:param quantiles: array-like with many quantiles needed
:param sample_weight: array-like of the same length as `values`
:param values_sorted: bool, if True, then will avoid sorting of initial array
:param old_style: if True, will correct output to be consistent with np.percentile.
:return: np.array with computed quantiles.
"""
values = np.array(values)
quantiles = np.array(quantiles)
if sample_weight is None:
sample_weight = np.ones(len(values))
sample_weight = np.array(sample_weight)
assert np.all(quantiles >= 0) and np.all(quantiles <= 1), 'quantiles should be in [0, 1]'
if not values_sorted:
sorter = np.argsort(values)
values = values[sorter]
sample_weight = sample_weight[sorter]
weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
if old_style:
# To be consistent with np.percentile
weighted_quantiles -= weighted_quantiles[0]
weighted_quantiles /= weighted_quantiles[-1]
else:
weighted_quantiles /= np.sum(sample_weight)
return np.interp(quantiles, weighted_quantiles, values)
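# Small worked example: with unit weights this reproduces the usual
# interpolated median, while up-weighting an entry pulls the quantile
# towards it.
#
#   weighted_quantile([1, 2, 3, 4], [0.5], sample_weight=[1, 1, 1, 1])
#   # -> array([2.5])
#   weighted_quantile([1, 2, 3, 4], [0.5], sample_weight=[1, 1, 1, 3])
#   # -> array([3.25])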
class WeightedRobustScaler(RobustScaler):
def fit(self, X, y=None, weights=None, mask_value=None):
if not np.isnan(X).any() and mask_value is None and weights is None:
# these checks don't work for nan values
return super(WeightedRobustScaler, self).fit(X, y)
else:
if weights is None:
weights = np.ones(len(X))
wqs = []
for i in range(X.shape[1]):
mask = ~np.isnan(X[:,i])
if mask_value is not None:
mask &= (X[:,i] != mask_value)
wqs.append(
weighted_quantile(
X[:,i][mask],
[0.25, 0.5, 0.75],
sample_weight=weights[mask]
)
)
wqs = np.array(wqs)
self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0]
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
return self
def transform(self, X):
if np.isnan(X).any():
# we'd like to ignore nan values, so let's calculate without further checks
X -= self.center_
X /= self.scale_
return X
else:
return super(WeightedRobustScaler, self).transform(X)
def inverse_transform(self, X):
if np.isnan(X).any():
X *= self.scale_
X += self.center_
return X
else:
return super(WeightedRobustScaler, self).inverse_transform(X)
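# Usage sketch (x_train, w_train and the mask value are placeholders): fit on
# weighted training data while excluding masked entries from the quantiles.
# For transforming, the surrounding project first replaces the mask value by
# nan (see ClassificationProject._batch_transform), so that masked entries are
# passed through and restored unchanged.
#
#   scaler = WeightedRobustScaler()
#   scaler.fit(x_train, weights=w_train, mask_value=-999)
#   x_scaled = scaler.transform(x_train_with_nan_for_masked)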
def poisson_asimov_significance(s, ds, b, db):
"see `<http://www.pp.rhul.ac.uk/~cowan/stat/medsig/medsigNote.pdf>`_)"
db = np.sqrt(db**2+ds**2)
return np.sqrt(2*((s+b)*np.log(((s+b)*(b+db**2))/(b**2+(s+b)*db**2))-(b**2)/(db**2)*np.log(1+(db**2*s)/(b*(b+db**2)))))
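# Rough sanity check (values rounded by hand, not a reference result): for
# s=50 signal events on top of b=100 background events with an absolute
# background uncertainty db=10 (and ds=0),
#
#   poisson_asimov_significance(50, 0, 100, 10)  # ~ 3.17
#
# ds is added in quadrature to db before evaluating the formula, so a nonzero
# signal uncertainty simply inflates db.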