Compare revisions: Eric.Schanet/KerasROOTClassification ↔ Nikolai.Hartmann/KerasROOTClassification

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (85)
......@@ -20,6 +20,9 @@ def overlay_ROC(filename, *projects, **kwargs):
ylim = kwargs.pop("ylim", (0,1))
plot_thresholds = kwargs.pop("plot_thresholds", False)
threshold_log = kwargs.pop("threshold_log", True)
lumifactor = kwargs.pop("lumifactor", None)
tight_layout = kwargs.pop("tight_layout", False)
show_auc = kwargs.pop("show_auc", True)
if kwargs:
raise KeyError("Unknown kwargs: {}".format(kwargs))
......@@ -33,11 +36,15 @@ def overlay_ROC(filename, *projects, **kwargs):
if threshold_log:
ax2.set_yscale("log")
if lumifactor is not None:
ax_abs_b = ax.twinx()
ax_abs_s = ax.twiny()
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for p, color in zip(projects, colors):
fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
fpr = 1.0 - fpr
try:
roc_auc = auc(tpr, fpr)
......@@ -46,21 +53,39 @@ def overlay_ROC(filename, *projects, **kwargs):
roc_auc = auc(tpr, fpr, reorder=True)
ax.grid(color='gray', linestyle='--', linewidth=1)
ax.plot(tpr, fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
if show_auc:
label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
else:
label = p.name
ax.plot(tpr, fpr, label=label, color=color)
if plot_thresholds:
ax2.plot(tpr, threshold, "--", color=color)
if lumifactor is not None:
sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)
if xlim is not None:
ax.set_xlim(*xlim)
if ylim is not None:
ax.set_ylim(*ylim)
if lumifactor is not None:
ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
ax_abs_b.set_xlim(*ax.get_xlim())
ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
ax_abs_s.set_ylim(*ax.get_ylim())
ax_abs_b.set_ylabel("Number of background events")
ax_abs_s.set_xlabel("Number of signal events")
# plt.xticks(np.arange(0,1,0.1))
# plt.yticks(np.arange(0,1,0.1))
ax.legend(loc='lower left', framealpha=1.0)
ax.set_title('Receiver operating characteristic')
if lumifactor is None:
ax.set_title('Receiver operating characteristic')
ax.set_ylabel("Background rejection")
ax.set_xlabel("Signal efficiency")
if plot_thresholds:
if plot_thresholds or tight_layout:
# to fit the right y-axis label
fig.tight_layout()
return save_show(plt, fig, filename)
......
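The lumifactor option above draws absolute-event-count scales by plotting invisible (alpha=0) curves on twin axes and then synchronising their limits with the main ROC axes. A minimal, self-contained sketch of that twin-axis technique (sums of weights invented):

import numpy as np
import matplotlib.pyplot as plt

sumw_s, sumw_b = 1000., 50000.         # invented weighted event counts
tpr = np.linspace(0., 1., 100)         # signal efficiency
rej = 1. - 0.9*np.sqrt(tpr)            # toy background-rejection curve

fig, ax = plt.subplots()
ax.plot(tpr, rej)
ax_abs_b = ax.twinx()                  # right axis: background events
ax_abs_s = ax.twiny()                  # top axis: signal events
ax_abs_b.plot(tpr, (1.-rej)*sumw_b, alpha=0)  # invisible, only sets ranges
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, rej, alpha=0)
ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
ax_abs_b.set_ylabel("Number of background events")
ax_abs_s.set_xlabel("Number of signal events")
fig.savefig("roc_sketch.pdf")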
import keras.backend as K
from keras.layers import Masking, InputLayer
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
print('----- activations -----')
......@@ -13,12 +12,8 @@ def get_activations(model, model_inputs, print_shape_only=False, layer_name=None
inp = [inp]
model_multi_inputs_cond = False
# all layer outputs
# skip input and masking layers
outputs = [layer.output for layer in model.layers if
(layer.name == layer_name or layer_name is None)
and not isinstance(layer, InputLayer)
and not isinstance(layer, Masking)]
layer.name == layer_name or layer_name is None] # all layer outputs
funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs] # evaluation functions
......
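The evaluation functions built in get_activations can be exercised standalone; a minimal sketch with a small Sequential model (Keras 2 / TF1 backend, as used in this repository):

import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(4, input_dim=3, activation="relu"),
                    Dense(1, activation="sigmoid")])
outputs = [layer.output for layer in model.layers]        # all layer outputs
funcs = [K.function([model.input, K.learning_phase()], [out])
         for out in outputs]                              # evaluation functions
x = np.random.rand(5, 3)
activations = [f([x, 0.])[0] for f in funcs]              # 0. = test phase
for act in activations:
    print(act.shape)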
......@@ -20,9 +20,9 @@ logger.addHandler(logging.NullHandler())
Some further plotting functions
"""
def save_show(plt, fig, filename):
def save_show(plt, fig, filename, **kwargs):
"Save a figure and show it in case we are in ipython or jupyter notebook."
fig.savefig(filename)
fig.savefig(filename, **kwargs)
try:
get_ipython
plt.show()
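With the **kwargs forwarding added here, savefig options pass straight through save_show, e.g.:

save_show(plt, fig, "plot.pdf", bbox_inches="tight", dpi=300)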
......@@ -60,8 +60,7 @@ def plot_NN_vs_var_1D(plotname, means, scorefun, var_index, var_range, var_label
if var_label is not None:
ax.set_xlabel(var_label)
ax.set_ylabel("NN output")
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D(plotname, means,
......@@ -134,8 +133,7 @@ def plot_NN_vs_var_2D(plotname, means,
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D_all(plotname, model, means,
......@@ -233,8 +231,115 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
fig.savefig(plotname, bbox_inches='tight')
plt.close(fig)
save_show(plt, fig, plotname, bbox_inches='tight')
def plot_profile_2D_all(plotname, model, events,
valsx, valsy,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None,
varx_label=None,
vary_label=None,
zrange=None, logz=False,
plot_last_layer=False,
log_default_ymin=1e-5,
global_norm=True,
cmap="inferno", **kwargs):
"Similar to plot_profile_2D, but creates a grid of plots for all neurons."
# transform
if transform_function is not None:
events = transform_function(events)
logger.info("Reading activations for all neurons")
acts = get_activations(model, events, print_shape_only=True)
logger.info("Done")
if plot_last_layer:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
else:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
layers = len(n_neurons)
nrows_ncols = (layers, max(n_neurons))
fig = plt.figure(1, figsize=nrows_ncols)
grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
label_mode="1",
aspect=False,
cbar_location="top",
cbar_mode="single",
cbar_pad=.2,
cbar_size="5%",)
grid_array = np.array(grid)
grid_array = grid_array.reshape(*nrows_ncols[::-1])
global_min = None
global_max = None
logger.info("Creating profile histograms")
ims = []
reg_plots = []
for layer in range(layers):
neurons_acts = acts[layer]
neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
for neuron in range(len(neurons_acts[0])):
acts_neuron = neurons_acts[:,neuron]
ax = grid_array[neuron][layer]
extra_opts = {}
if not (plot_last_layer and layer == layers-1):
# for hidden layers, plot the same z-scale
if logz:
norm = matplotlib.colors.LogNorm
else:
norm = matplotlib.colors.Normalize
if zrange is not None:
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
hist, xedges, yedges = get_profile_2D(
valsx, valsy, acts_neuron,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
**kwargs
)
if global_min is None or hist.min() < global_min:
global_min = hist.min()
if global_max is None or hist.max() > global_max:
global_max = hist.max()
X, Y = np.meshgrid(xedges, yedges)
reg_plots.append((layer, neuron, ax, (X, Y, hist), dict(cmap="inferno", linewidth=0, rasterized=True, **extra_opts)))
logger.info("Done")
logger.info("global_min: {}".format(global_min))
logger.info("global_max: {}".format(global_max))
if global_min <= 0 and logz:
global_min = log_default_ymin
logger.info("Changing global_min to {}".format(log_default_ymin))
for layer, neuron, ax, args, kwargs in reg_plots:
if zrange is None:
kwargs["norm"].vmin = global_min
kwargs["norm"].vmax = global_max
if not global_norm:
kwargs["norm"] = None
im = ax.pcolormesh(*args, **kwargs)
ax.set_facecolor("black")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
logger.info("Rendering")
save_show(plt, fig, plotname, bbox_inches='tight')
logger.info("Done")
def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"):
......@@ -252,9 +357,7 @@ def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=Non
cbar.set_label(zlabel)
ax.set_ylabel(vary_label)
ax.set_xlabel(varx_label)
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_hist_2D_events(plotname, valsx, valsy, nbinsx, xmin, xmax, nbinsy, ymin, ymax,
......@@ -308,15 +411,10 @@ def plot_cond_avg_actmax_2D(plotname, model, layer, neuron, ranges,
plot_hist_2D(plotname, xedges, yedges, hist, zlabel="Neuron output", **kwargs)
def plot_profile_2D(plotname, valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean,
weights=None,
**kwargs):
kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
def get_profile_2D(valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean, weights=None):
xedges = np.linspace(xmin, xmax, nbinsx)
yedges = np.linspace(ymin, ymax, nbinsy)
......@@ -342,6 +440,25 @@ def plot_profile_2D(plotname, valsx, valsy, scores,
hist = np.array(hist)
hist = hist.T # had a list of columns - needs to be list of rows
return hist, xedges, yedges
def plot_profile_2D(plotname, valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean,
weights=None,
**kwargs):
kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
hist, xedges, yedges = get_profile_2D(
valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=metric, weights=weights
)
plot_hist_2D(plotname, xedges, yedges, hist, **kwargs)
......
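The refactoring above splits the histogramming (get_profile_2D) from the drawing (plot_hist_2D), so profiles can be reused directly, as in plot_profile_2D_all. A small usage sketch with toy data:

import numpy as np

valsx = np.random.uniform(0., 1., 10000)
valsy = np.random.uniform(0., 1., 10000)
scores = valsx + 0.1*np.random.rand(10000)   # score rises with x

hist, xedges, yedges = get_profile_2D(valsx, valsy, scores,
                                      20, 0., 1.,
                                      20, 0., 1.,
                                      metric=np.mean)
plot_hist_2D("profile.pdf", xedges, yedges, hist, zlabel="Profile")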
#!/usr/bin/env python
import sys, argparse, re, random
parser = argparse.ArgumentParser(description="generate events that maximise the activation for a given neuron")
parser.add_argument("input_project")
parser.add_argument("output_file")
parser.add_argument("-n", "--nevents", default=100000, type=int)
parser.add_argument("-j", "--mask-jets", action="store_true",
help="mask variables called jet*Pt/Eta/Phi and generate a random uniform distribution of the number of jets (only nescessary for non-recurrent NN)")
args = parser.parse_args()
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
import h5py
import numpy as np
from KerasROOTClassification.utils import (
weighted_quantile,
get_max_activation_events,
create_random_event,
get_ranges
)
from KerasROOTClassification import load_from_dir
import meme
meme.setOptions(deactivated=True)
input_project = args.input_project
output_file = args.output_file
c = load_from_dir(input_project)
c._load_data()
ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
def mask_uniform(x, mask_value, recurrent_field_idx):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for rec_idx in recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(rec_idx.shape[1]+1))
for obj_number, line_idx in enumerate(rec_idx.reshape(*rec_idx.shape[1:])):
if obj_number == nobj:
masked=True
if masked:
evt[line_idx] = mask_value
def get_input_flat(x):
return x[0].reshape(-1, len(c.fields))
if args.mask_jets:
jet_fields = {}
for field_name in c.fields:
if any(field_name.startswith("jet") and field_name.endswith(suffix) for suffix in ["Pt", "Eta", "Phi"]):
jet_number = re.findall("[0-9]+", field_name)[0]
if jet_number not in jet_fields:
jet_fields[jet_number] = []
jet_fields[jet_number].append(c.fields.index(field_name))
jet_fields = [np.array([[v for k, v in sorted(jet_fields.items(), key=lambda l:l[0])]])]
def input_transform(x):
x = np.array(x)
if hasattr(c, "mask_uniform"):
c.mask_uniform(x)
return c.get_input_list(x)
elif args.mask_jets:
mask_uniform(x, c.mask_value, jet_fields)
return x
opt_kwargs = dict()
if hasattr(c, "mask_uniform"):
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = c.get_input_flat
if args.mask_jets:
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = get_input_flat
evts = get_max_activation_events(
c.model, ranges,
ntries=args.nevents,
layer=len(c.model.layers)-1,
neuron=0,
maxit=10,
seed=45,
threshold=0,
**opt_kwargs
)
with h5py.File(output_file, "w") as f:
f.create_dataset("actmax", data=evts[1])
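As a toy illustration of the mask_uniform helper above (values invented): with two events of two recurrent objects times three fields each, a random number of trailing objects per event is overwritten with the mask value:

import random
import numpy as np

random.seed(0)
x = np.arange(12, dtype=float).reshape(2, 6)    # 2 events, 2 objects x 3 fields
rec_idx = [np.array([[[0, 1, 2], [3, 4, 5]]])]  # column indices per object
mask_uniform(x, -999., rec_idx)
print(x)   # trailing objects of each event may now be -999.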
......@@ -2,30 +2,6 @@
import sys
import argparse
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
parser.add_argument("project_dir")
......@@ -53,6 +29,31 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa
args = parser.parse_args()
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
if args.all_neurons and (not args.mode.startswith("mean")):
parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
......@@ -82,7 +83,7 @@ total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]
try:
mask_value = c.mask_value
except NameError:
except AttributeError:
mask_value = None
# varx_test = c.x_test[:,varx_index]
......@@ -94,8 +95,8 @@ except NameError:
# percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked])
# percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked])
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index)[0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index)[0]
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index, max_evts=10000)[0][0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index, max_evts=10000)[0][0]
if args.xrange is not None:
......
#!/usr/bin/env python
"""
Write new TTrees with signal parameters as branches. For the
backgrounds, the parameters are generated following the total
distribution over all signals. The discrete values of the whole tuple
of signal parameters are counted, so that correlations between
signal parameters are taken into account.
"""
import argparse, re, os
import ROOT
from root_numpy import list_trees
from root_pandas import read_root
import numpy as np
if __name__ == "__main__":
input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
param_names = ["mg", "mc", "mn"]
param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
output_signal_treename = "GG_oneStep_NoSys"
bkg_trees = [
"diboson_Sherpa221_NoSys",
"singletop_NoSys",
"ttbar_NoSys",
"ttv_NoSys",
"wjets_Sherpa221_NoSys",
"zjets_Sherpa221_NoSys",
]
# read in the number of events for each combination of parameters
f = ROOT.TFile.Open(input_filename)
count_dict = {}
for key in f.GetListOfKeys():
tree_name = key.GetName()
match = re.match(param_match, tree_name)
if match is not None:
tree = f.Get(tree_name)
params = tuple([float(i) for i in match.groups()])
if params not in count_dict:
count_dict[params] = 0
# TODO: might be better to use sum of weights
count_dict[params] += tree.GetEntries()
f.Close()
# calculate cumulative sum of counts to sample signal parameters for background from
numbers = np.array(list(count_dict.keys()), dtype=np.float)
counts = np.array(list(count_dict.values()), dtype=np.float)
probs = counts/counts.sum()
prob_bins = np.cumsum(probs)
# read and write the rest in chunks
if os.path.exists(output_filename):
os.remove(output_filename)
for tree_name in list_trees(input_filename):
match_signal = re.match(param_match, tree_name)
if match_signal is not None or tree_name in bkg_trees:
print("Writing {}".format(tree_name))
nwritten = 0
for df in read_root(input_filename, tree_name, chunksize=100000):
print("Writing event {}".format(nwritten))
if match_signal is None:
rnd = np.random.random(len(df))
rnd_idx = np.digitize(rnd, prob_bins)
param_values = numbers[rnd_idx]
for param_idx, param_name in enumerate(param_names):
df[param_name] = param_values[:,param_idx]
df["training_weight"] = df["eventWeight"]*df["genWeight"]
else:
for param_name, param_value in zip(param_names, match_signal.groups()):
df[param_name] = float(param_value)
df["training_weight"] = df["eventWeight"]
if match_signal is None:
out_tree_name = tree_name
else:
out_tree_name = output_signal_treename
df.to_root(output_filename, mode="a", key=out_tree_name)
nwritten += len(df)
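The background parametrisation above is inverse-transform sampling: uniform random numbers are digitised against the cumulative probabilities, so each signal-parameter tuple is drawn proportionally to its event count. Standalone, with invented counts:

import numpy as np

numbers = np.array([(1000., 500., 100.), (1200., 600., 100.)])  # (mg, mc, mn)
counts = np.array([300., 700.])
prob_bins = np.cumsum(counts/counts.sum())   # [0.3, 1.0]

rnd = np.random.random(10)
rnd_idx = np.digitize(rnd, prob_bins)        # 0 with p=0.3, 1 with p=0.7
param_values = numbers[rnd_idx]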
import pytest
import numpy as np
import root_numpy
import pandas as pd
from sklearn.datasets import make_classification
from keras.layers import GRU
from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
def create_dataset(path):
# create example dataset with (low-weighted) noise added
X, y = make_classification(n_samples=10000, random_state=1)
X2 = np.random.normal(size=20*10000).reshape(-1, 20)
y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
X = np.concatenate([X, X2])
y = np.concatenate([y, y2])
w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
# shift and scale randomly (to check if transformation is working)
shift = np.random.rand(20)*100
scale = np.random.rand(20)*1000
X *= scale
X += shift
# write to root files
branches = ["var_{}".format(i) for i in range(len(X[0]))]
df = pd.DataFrame(X, columns=branches)
df["class"] = y
df["weight"] = w
tree_path_bkg = str(path / "bkg.root")
tree_path_sig = str(path / "sig.root")
root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
return branches, tree_path_sig, tree_path_bkg
def test_ClassificationProject(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProject(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
def test_ClassificationProjectRNN(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProjectRNN(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
recurrent_field_names=[
[
["var_1", "var_2", "var_3"],
["var_4", "var_5", "var_6"]
],
[
["var_10", "var_11", "var_12"],
["var_13", "var_14", "var_15"]
],
],
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
......@@ -19,6 +19,7 @@ import math
import glob
import shutil
import gc
import random
import logging
logger = logging.getLogger("KerasROOTClassification")
......@@ -32,33 +33,24 @@ from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.utils.extmath import stable_cumsum
from sklearn.model_selection import KFold
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, concatenate
from keras.layers import Dense, Dropout, Input, Masking, GRU, LSTM, concatenate, SimpleRNN
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
from keras.activations import relu
import keras.initializers
import keras.optimizers
from keras.utils.vis_utils import model_to_dot
from keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile, poisson_asimov_significance
from .plotting import save_show
# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
# import tensorflow as tf
# from keras import backend as K
# num_cores = 1
# config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
# inter_op_parallelism_threads=num_cores,
# allow_soft_placement=True,
# device_count = {'CPU': num_cores})
# session = tf.Session(config=config)
# K.set_session(session)
import ROOT
def byteify(input):
"From stackoverflow https://stackoverflow.com/a/13105359"
if isinstance(input, dict):
......@@ -75,6 +67,25 @@ if version_info[0] > 2:
byteify = lambda input : input
def set_session_threads(n_cpu=None):
"Set the number of threads based on OMP_NUM_THREADS or the given argument"
if n_cpu is None:
if os.environ.get('OMP_NUM_THREADS'):
n_cpu = int(os.environ.get('OMP_NUM_THREADS'))
else:
return
# not sure if this is the best configuration ...
config = tf.ConfigProto(intra_op_parallelism_threads=n_cpu,
inter_op_parallelism_threads=1,
allow_soft_placement=True,
#log_device_placement=True,
device_count = {'CPU': n_cpu})
session = tf.Session(config=config)
K.set_session(session)
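# Usage sketch for set_session_threads (TF1-style sessions, as used here);
# the environment variable is read when no argument is given:
#
#     import os
#     os.environ["OMP_NUM_THREADS"] = "4"
#     set_session_threads()          # equivalent to set_session_threads(n_cpu=4)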
def load_from_dir(path):
"Load a project and the options from a directory"
try:
......@@ -90,6 +101,8 @@ def load_from_dir(path):
class ClassificationProject(object):
verbose = 1 # verbosity of the fit method
"""Simple framework to load data from ROOT TTrees and train Keras
neural networks for classification according to some global settings.
......@@ -109,6 +122,8 @@ class ClassificationProject(object):
:param branches: list of branch names or expressions to be used as input values for training
:param regression_branches: list of branch names to be used as regression targets
:param rename_branches: dictionary that maps branch expressions to names for better readability
:param weight_expr: expression to weight the events in the loss function
......@@ -133,14 +148,22 @@ class ClassificationProject(object):
:param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.
:param use_bias: use bias constant for each neuron? (default: True). You can also pass a list of booleans for each layer.
:param batch_size: size of the training batches
:param validation_split: split off this fraction of training events for loss evaluation
:param kfold_splits: if given, split into this number of subsets to perform KFold cross validation
:param kfold_index: index of the subset to leave out for kfold validation
:param activation_function: activation function in the hidden layers
:param activation_function_output: activation function in the output layer
:param leaky_relu_alpha: set this to a non-zero value to use the LeakyReLU variant with a slope in the negative part
:param out_dir: base directory in which the project directories should be stored
:param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")
......@@ -177,11 +200,27 @@ class ClassificationProject(object):
class will be used in each epoch, but different subsets of the
overrepresented class will be used in each epoch.
:param random_seed: use this seed value when initialising the model and produce consistent results. Note:
random data is also used for shuffling the training data, so results may vary still. To
produce consistent results, set the numpy random seed before training.
:param random_seed: use this seed value when initialising the model and produce consistent results.
:param shuffle_seed: use this seed for shuffling the training data
the first time. This seed (increased by one) is used again before
training when keras shuffling is used.
:param loss: loss function name (or list of names in case of regression targets)
:param loss_weights: (optional) list of weights to weight the individual losses (for multiple targets)
:param loss: loss function name
:param mask_value: value that is used for non-existent entries (e.g. 4th jet pt in events with 3 jets)
:param apply_class_weight: apply a weight that scales the events such that sumw(signal) = sumw(background)
:param normalize_weights: normalize the weights to mean 1
:param ignore_neg_weights: ignore events with negative weights in training - not recommended! (default: False)
:param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
:param shuffle: shuffle training data after (and before first) epoch
"""
......@@ -217,6 +256,7 @@ class ClassificationProject(object):
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr,
regression_branches=None,
rename_branches=None,
project_dir=None,
data_dir=None,
......@@ -226,9 +266,13 @@ class ClassificationProject(object):
nodes=64,
dropout=None,
dropout_input=None,
use_bias=True,
batch_size=128,
validation_split=0.33,
kfold_splits=None,
kfold_index=0,
activation_function='relu',
leaky_relu_alpha=None,
activation_function_output='sigmoid',
scaler_type="WeightedRobustScaler",
step_signal=2,
......@@ -244,8 +288,17 @@ class ClassificationProject(object):
use_tensorboard=False,
tensorboard_opts=None,
random_seed=1234,
shuffle_seed=42,
balance_dataset=False,
loss='binary_crossentropy'):
loss='binary_crossentropy',
loss_weights=None,
mask_value=None,
apply_class_weight=True,
normalize_weights=True,
ignore_neg_weights=False,
kernel_initializer=None,
shuffle=True,
):
self.name = name
self.signal_trees = signal_trees
......@@ -254,6 +307,9 @@ class ClassificationProject(object):
if rename_branches is None:
rename_branches = {}
self.rename_branches = rename_branches
if regression_branches is None:
regression_branches = []
self.regression_branches = regression_branches
self.weight_expr = weight_expr
self.selection = selection
......@@ -279,12 +335,22 @@ class ClassificationProject(object):
self.dropout = dropout
if not isinstance(self.dropout, list):
self.dropout = [self.dropout for i in range(self.layers)]
self.dropout_input = dropout_input
self.use_bias = use_bias
if not isinstance(self.use_bias, list):
self.use_bias = [self.use_bias for i in range(self.layers)]
if len(self.dropout) != self.layers:
raise ValueError("List of dropout fractions has to be of equal size as the number of layers!")
self.dropout_input = dropout_input
if len(self.use_bias) != self.layers:
raise ValueError("List biases has to be of equal size as the number of layers!")
self.batch_size = batch_size
self.validation_split = validation_split
self.kfold_splits = kfold_splits
self.kfold_index = kfold_index
self.activation_function = activation_function
self.leaky_relu_alpha = leaky_relu_alpha
if self.activation_function == "relu" and self.leaky_relu_alpha:
self.activation_function = lambda x : relu(x, alpha=self.leaky_relu_alpha)
self.activation_function_output = activation_function_output
self.scaler_type = scaler_type
self.step_signal = step_signal
......@@ -314,8 +380,19 @@ class ClassificationProject(object):
if tensorboard_opts is not None:
self.tensorboard_opts.update(**tensorboard_opts)
self.random_seed = random_seed
self.shuffle_seed = shuffle_seed
self.balance_dataset = balance_dataset
self.loss = loss
self.loss_weights = loss_weights
if self.regression_branches and (not isinstance(self.loss, list)):
self.loss = [self.loss]+["mean_squared_error"]*len(self.regression_branches)
self.mask_value = mask_value
self.apply_class_weight = apply_class_weight
self.normalize_weights = normalize_weights
self.ignore_neg_weights = ignore_neg_weights
self.kernel_initializer = kernel_initializer
self.shuffle = shuffle
self.s_train = None
self.b_train = None
......@@ -338,23 +415,24 @@ class ClassificationProject(object):
self._b_eventlist_train = None
self._scaler = None
self._scaler_target = None
self._class_weight = None
self._balanced_class_weight = None
self._model = None
self._history = None
self._callbacks_list = []
self._train_val_idx = None
# track the number of epochs this model has been trained
self.total_epochs = 0
self.data_loaded = False
self.data_transformed = False
self.data_shuffled = False
# track if we are currently training
self.is_training = False
self._fields = None
self._target_fields = None
@property
......@@ -367,6 +445,16 @@ class ClassificationProject(object):
return self._fields
@property
def target_fields(self):
"Renamed branch expressions for regression targets"
if self._target_fields is None:
self._target_fields = []
for branch_expr in self.regression_branches:
self._target_fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._target_fields
def rename_fields(self, ar):
"Rename fields of structured array"
fields = list(ar.dtype.names)
......@@ -378,12 +466,14 @@ class ClassificationProject(object):
def _load_data(self):
self._w_train_tot = None
try:
# if those don't exist, we need to load them from ROOT trees first
self._load_from_hdf5(*self.dataset_names_tree)
except KeyError:
except (KeyError, IOError):
logger.info("Couldn't load all datasets - reading from ROOT trees")
......@@ -395,22 +485,26 @@ class ClassificationProject(object):
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain,
branches=self.branches+[self.weight_expr],
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr],
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_bkg, stop=self.stop_test)
if self.ignore_neg_weights:
self.s_train = self.s_train[self.s_train[self.weight_expr]>0]
self.b_train = self.b_train[self.b_train[self.weight_expr]>0]
self.rename_fields(self.s_train)
self.rename_fields(self.b_train)
self.rename_fields(self.s_test)
......@@ -420,19 +514,27 @@ class ClassificationProject(object):
self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self._dump_training_list()
# now we don't need the identifiers anymore
self.s_train = self.s_train[self.fields+[self.weight_expr]]
self.b_train = self.b_train[self.fields+[self.weight_expr]]
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
self.y_train = np.empty(len(self.x_train), dtype=np.bool)
self.y_train[:len(self.s_train)] = 1
self.y_train[len(self.s_train):] = 0
def fill_target(x, s, b):
if not self.target_fields:
y = np.empty(len(x), dtype=np.bool)
y[:len(s)] = 1
y[len(s):] = 0
else:
y = np.empty((len(x), 1+len(self.target_fields)), dtype=np.float)
y[:len(s),0] = 1
y[len(s):,0] = 0
y[:len(s),1:] = rec2array(s[self.target_fields])
y[len(s):,1:] = rec2array(b[self.target_fields])
return y
self.y_train = fill_target(self.x_train, self.s_train, self.b_train)
self.b_train = None
self.s_train = None
......@@ -440,16 +542,14 @@ class ClassificationProject(object):
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_test = np.empty(len(self.x_test), dtype=np.bool)
self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0
self.y_test = fill_target(self.x_test, self.s_test, self.b_test)
self.b_test = None
self.s_test = None
self._dump_to_hdf5(*self.dataset_names_tree)
self.data_loaded = True
self.data_shuffled = False
def _dump_training_list(self):
......@@ -510,7 +610,7 @@ class ClassificationProject(object):
os.symlink(srcpath, filename)
logger.info("Created symlink from {} to {}".format(srcpath, filename))
logger.info("Trying to load {} from {}".format(dataset_name, filename))
with h5py.File(filename) as hf:
with h5py.File(filename, "r") as hf:
setattr(self, dataset_name, hf[dataset_name][:])
logger.info("Data loaded")
......@@ -551,6 +651,7 @@ class ClassificationProject(object):
elif self.scaler_type == "WeightedRobustScaler":
self._scaler = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
scaler_fit_kwargs["mask_value"] = self.mask_value
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
......@@ -562,12 +663,82 @@ class ClassificationProject(object):
return self._scaler
def transform(self, x):
return self.scaler.transform(x)
@property
def scaler_target(self):
"same as scaler, but for scaling regression targets"
# create the scaler (and fit to training data) if not existent
if self._scaler_target is None:
filename = os.path.join(self.project_dir, "scaler_target.pkl")
try:
self._scaler_target = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {} for scaling the targets".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler_target = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler_target = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler_target = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self._scaler_target.fit(self.y_train, **scaler_fit_kwargs)
# we don't want to scale the classification target here
self._scaler_target.center_[0] = 0.
self._scaler_target.scale_[0] = 1.
self.scaler.copy = orig_copy_setting
joblib.dump(self._scaler_target, filename)
return self._scaler_target
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.transform, batch_size)
else:
return self.scaler.transform(x)
def inverse_transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
else:
return self.scaler.inverse_transform(x)
def transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.transform, batch_size)
else:
return self.scaler_target.transform(y)
def inverse_transform(self, x):
return self.scaler.inverse_transform(x)
def inverse_transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.inverse_transform, batch_size)
else:
return self.scaler_target.inverse_transform(y)
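# A minimal round-trip sketch of the mask-value handling in _batch_transform,
# with a plain StandardScaler standing in for the project's scaler
# (values invented):
#
#     import numpy as np
#     from sklearn.preprocessing import StandardScaler
#     mask_value = -999.
#     x_batch = np.array([[1., 2.], [3., mask_value]])
#     scaler = StandardScaler().fit(np.array([[1., 2.], [3., 4.], [5., 6.]]))
#     x_batch[x_batch == mask_value] = np.nan             # hide masked entries
#     x_batch = (x_batch - scaler.mean_) / scaler.scale_  # NaN stays NaN
#     x_batch[np.isnan(x_batch)] = mask_value             # restore the mask value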
@property
......@@ -601,24 +772,6 @@ class ClassificationProject(object):
json.dump(self.history.history, of)
def _transform_data(self):
if not self.data_transformed:
if logger.level <= logging.DEBUG:
logger.debug("training data before transformation: {}".format(self.x_train))
logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
for i in range(self.x_train.shape[1])]))
logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
for i in range(self.x_train.shape[1])]))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self.x_train = self.scaler.transform(self.x_train)
logger.debug("training data after transformation: {}".format(self.x_train))
self.x_test = self.scaler.transform(self.x_test)
self.scaler.copy = orig_copy_setting
self.data_transformed = True
logger.info("Training and test data transformed")
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
......@@ -660,23 +813,47 @@ class ClassificationProject(object):
if self._model is None:
self._model = Sequential()
# input
input_layer = Input((len(self.fields),))
# optional dropout on inputs
if self.dropout_input is None:
self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
# in case of no Dropout we already have the first hidden layer
start_layer = 1
hidden_layer = input_layer
else:
self._model.add(Dropout(rate=self.dropout_input, input_shape=(len(self.fields),)))
start_layer = 0
# the (other) hidden layers
for node_count, dropout_fraction in zip(self.nodes[start_layer:], self.dropout[start_layer:]):
self._model.add(Dense(node_count, activation=self.activation_function))
hidden_layer = Dropout(rate=self.dropout_input)(input_layer)
# densely connected hidden layers
for node_count, dropout_fraction, use_bias in zip(
self.nodes,
self.dropout,
self.use_bias,
):
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
hidden_layer = Dense(node_count, activation=self.activation_function, use_bias=use_bias, **extra_opts)(hidden_layer)
if (dropout_fraction is not None) and (dropout_fraction > 0):
self._model.add(Dropout(rate=dropout_fraction))
# last layer is one neuron (binary classification)
self._model.add(Dense(1, activation=self.activation_function_output))
hidden_layer = Dropout(rate=dropout_fraction)(hidden_layer)
# optional regression targets
extra_targets = []
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(hidden_layer)
extra_targets.append(extra_target)
if not self.target_fields:
# one output node for binary classification
output_layer = Dense(1, activation=self.activation_function_output)(hidden_layer)
outputs = [output_layer]
else:
# add another hidden layer on top of the regression targets and previous hidden layers
merge = concatenate([hidden_layer]+extra_targets)
hidden_layer2 = Dense(64, activation=self.activation_function)(merge)
output_class = Dense(1, activation=self.activation_function_output)(hidden_layer2)
outputs = [output_class]+extra_targets
self._model = Model(inputs=[input_layer], outputs=outputs)
self._compile_or_load_model()
return self._model
......@@ -691,6 +868,7 @@ class ClassificationProject(object):
np.random.seed(self.random_seed)
self._model.compile(optimizer=optimizer,
loss=self.loss,
loss_weights=self.loss_weights,
weighted_metrics=['accuracy']
)
np.random.set_state(rn_state)
......@@ -715,14 +893,14 @@ class ClassificationProject(object):
# plot model
with open(os.path.join(self.project_dir, "model.svg"), "wb") as of:
of.write(model_to_dot(self._model).create("dot", format="svg"))
of.write(model_to_dot(self._model, show_shapes=True).create("dot", format="svg"))
@property
def class_weight(self):
if self._class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated class_weight: {}".format(self._class_weight))
return self._class_weight
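# Worked example of the class weights: with sumw_bkg = 90 and sumw_sig = 10,
# class_weight = [100/(2*90), 100/(2*10)] = [0.556, 5.0], so the weighted
# sums match afterwards: 90*0.556 = 10*5.0 = 50 for each class.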
......@@ -737,11 +915,11 @@ class ClassificationProject(object):
event with class weights
"""
if self._balanced_class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
# use sumw *per event* in this case
sumw_bkg /= len(self.w_train[self.y_train == 0])
sumw_sig /= len(self.w_train[self.y_train == 1])
sumw_bkg /= len(self.w_train[self.l_train == 0])
sumw_sig /= len(self.w_train[self.l_train == 1])
self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight))
return self._balanced_class_weight
......@@ -752,29 +930,27 @@ class ClassificationProject(object):
if reload:
self.data_loaded = False
self.data_transformed = False
if not self.data_loaded:
self._load_data()
if not self.data_transformed:
self._transform_data()
@property
def l_train(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_train
else:
return self.y_train[:,0]
def shuffle_training_data(self):
rn_state = np.random.get_state()
np.random.shuffle(self.x_train)
np.random.set_state(rn_state)
np.random.shuffle(self.y_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train_tot)
if self._scores_train is not None:
logger.info("Shuffling scores, since they are also there")
np.random.set_state(rn_state)
np.random.shuffle(self._scores_train)
self.data_shuffled = True
@property
def l_test(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_test
else:
return self.y_test[:,0]
@property
......@@ -784,28 +960,109 @@ class ClassificationProject(object):
class_weight = self.class_weight
else:
class_weight = self.balanced_class_weight
if not self.data_loaded:
raise ValueError("Data not loaded! can't calculate total weight")
if self._w_train_tot is None:
self._w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)]
self._w_train_tot /= np.mean(self._w_train_tot)
if self.apply_class_weight:
self._w_train_tot = self.w_train*np.array(class_weight)[self.l_train.astype(int)]
else:
self._w_train_tot = np.array(self.w_train)
if self.normalize_weights:
self._w_train_tot /= np.mean(self._w_train_tot)
return self._w_train_tot
@property
def validation_data(self):
"Validation data. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[split_index:], self.y_train[split_index:], self.w_train_tot[split_index:]
"(Transformed) validation data for loss evaluation"
idx = self.train_val_idx[1]
x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_val_input = self.get_input_list(self.transform(x_val))
y_val_output = self.get_output_list(self.transform_target(y_val))
w_val_list = self.get_weight_list(w_val)
return x_val_input, y_val_output, w_val_list
@property
def training_data(self):
"Training data with validation data split off. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[:split_index], self.y_train[:split_index], self.w_train_tot[:split_index]
"(Transformed) Training data with validation data split off"
idx = self.train_val_idx[0]
x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_train_input = self.get_input_list(self.transform(x_train))
y_train_output = self.get_output_list(self.transform_target(y_train))
w_train_list = self.get_weight_list(w_train)
return x_train_input, y_train_output, w_train_list
@property
def train_val_idx(self):
if self._train_val_idx is None:
if self.kfold_splits is not None:
kfold = KFold(self.kfold_splits, shuffle=self.shuffle, random_state=self.shuffle_seed)
for i, train_val_idx in enumerate(kfold.split(self.x_train)):
if i == self.kfold_index:
self._train_val_idx = train_val_idx
break
else:
raise IndexError("Index {} out of range for kfold (requested {} splits)".format(self.kfold_index, self.kfold_splits))
else:
split_index = int((1-self.validation_split)*len(self.x_train))
np.random.seed(self.shuffle_seed)
if self.shuffle:
shuffled_idx = np.random.permutation(len(self.x_train))
else:
shuffled_idx = np.arange(len(self.x_train))
self._train_val_idx = (shuffled_idx[:split_index], shuffled_idx[split_index:])
return self._train_val_idx
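# Standalone sketch of the kfold branch above: pick the kfold_index-th
# split from sklearn's KFold (toy data):
#
#     import numpy as np
#     from sklearn.model_selection import KFold
#     x_train = np.arange(10).reshape(-1, 1)
#     kfold = KFold(5, shuffle=True, random_state=42)
#     for i, train_val_idx in enumerate(kfold.split(x_train)):
#         if i == 2:                      # kfold_index
#             train_idx, val_idx = train_val_idx
#             break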
@property
def steps_per_epoch(self):
return int(float(len(self.train_val_idx[0]))/float(self.batch_size))
def get_input_list(self, x):
"For the standard Dense models with single input, this does nothing"
return x
def get_output_list(self, y):
"Split target vector column wise in case of regression targets"
if not self.target_fields:
return y
else:
return np.hsplit(y, len(self.target_fields)+1)
def get_weight_list(self, w):
"Repeat weight n times for regression targets"
if not self.target_fields:
return w
else:
return [w]*(len(self.target_fields)+1)
def yield_batch(self):
"Batch generator - optionally shuffle the indices after each epoch"
x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
train_idx = list(self.train_val_idx[0])
np.random.seed(self.shuffle_seed+1)
logger.info("Generating training batches from {} signal and {} background events"
.format(len(np.where(self.l_train[train_idx]==1)[0]),
len(np.where(self.l_train[train_idx]==0)[0])))
while True:
if self.shuffle:
shuffled_idx = np.random.permutation(train_idx)
else:
shuffled_idx = train_idx
for start in range(0, len(shuffled_idx), int(self.batch_size)):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
x_input = self.get_input_list(self.transform(x_batch))
y_output = self.get_output_list(self.transform_target(y_batch))
w_list = self.get_weight_list(w_batch)
yield (x_input, y_output, w_list)
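# This generator is consumed by fit_generator in train() below; schematically:
#
#     c.model.fit_generator(c.yield_batch(),
#                           steps_per_epoch=c.steps_per_epoch,
#                           epochs=10,
#                           validation_data=c.validation_data)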
def yield_single_class_batch(self, class_label):
......@@ -814,10 +1071,14 @@ class ClassificationProject(object):
The weights are multiplied by balanced_class_weight.
"""
x_train, y_train, w_train = self.training_data
class_idx = np.where(y_train==class_label)[0]
l_train = y_train[:,0] if self.target_fields else y_train
class_idx = np.where(l_train==class_label)[0]
while True:
# shuffle the indices for this class label
shuffled_idx = np.random.permutation(class_idx)
if self.shuffle:
shuffled_idx = np.random.permutation(class_idx)
else:
shuffled_idx = class_idx
# yield them batch wise
for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
......@@ -840,51 +1101,45 @@ class ClassificationProject(object):
np.concatenate((batch_0[2], batch_1[2])))
def train(self, epochs=10):
def train(self, epochs=10, skip_checkpoint=False):
self.load()
self.shuffle_training_data()
for branch_index, branch in enumerate(self.fields):
self.plot_input(branch_index)
self.total_epochs = self._read_info("epochs", 0)
set_session_threads()
logger.info("Train model")
if not self.balance_dataset:
try:
self.is_training = True
self.model.fit(self.x_train,
# the reshape might be unnecessary here
self.y_train.reshape(-1, 1),
epochs=epochs,
validation_split=self.validation_split,
# we have to multiply by class weight since keras ignores class weight if sample weight is given
# see https://github.com/keras-team/keras/issues/497
sample_weight=self.w_train_tot,
shuffle=True,
batch_size=self.batch_size,
callbacks=self.callbacks_list)
self.model.fit_generator(self.yield_batch(),
steps_per_epoch=self.steps_per_epoch,
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
else:
try:
self.is_training = True
labels, label_counts = np.unique(self.y_train, return_counts=True)
labels, label_counts = np.unique(self.l_train, return_counts=True)
logger.info("Training on balanced batches")
# note: the batches have balanced_class_weight already applied
self.model.fit_generator(self.yield_balanced_batch(),
steps_per_epoch=int(min(label_counts)/self.batch_size),
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list)
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
self.checkpoint_model()
if not skip_checkpoint:
self.checkpoint_model()
def checkpoint_model(self):
......@@ -907,23 +1162,40 @@ class ClassificationProject(object):
self._write_info("epochs", self.total_epochs)
def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
logger.info("Reloading (and re-transforming) unshuffled training data")
self.load(reload=True)
def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
"Calculate scores for training and test sample"
if mode is not None:
self._write_info("scores_mode", mode)
logger.info("Create/Update scores for train/test sample")
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
outputs = self.predict(
self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
mode=mode
)
if not self.target_fields:
scores_batch = outputs.reshape(-1)
else:
scores_batch = outputs[0].reshape(-1)
getattr(self, "scores_"+data_name)[start:stop] = scores_batch
self._dump_to_hdf5("scores_"+data_name)
if do_test:
self.scores_test = self.predict(self.x_test, mode=mode).reshape(-1)
self._dump_to_hdf5("scores_test")
eval_score("test")
if do_train:
self.scores_train = self.predict(self.x_train, mode=mode).reshape(-1)
self._dump_to_hdf5("scores_train")
eval_score("train")
def predict(self, x, mode=None):
"""
Calculate the scores for a (transformed) array of input values.
If the array is not transformed, use `evaluate` instead
"""
if mode is None:
# normal output - after activation function output layer
return self.model.predict(x)
......@@ -943,6 +1215,10 @@ class ClassificationProject(object):
def evaluate(self, x_eval, mode=None):
"""
Calculate the scores for an array of input values.
All necessary transformations are applied.
"""
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
logger.debug("Evaluate for transformed array: {}".format(x_eval))
......@@ -952,7 +1228,10 @@ class ClassificationProject(object):
def write_friend_tree(self, score_name,
source_filename, source_treename,
target_filename, target_treename,
batch_size=100000):
batch_size=100000,
score_mode=None,
fixed_params=None):
"TODO: doesn't work for regression targets"
f = ROOT.TFile.Open(source_filename)
tree = f.Get(source_treename)
entries = tree.GetEntries()
......@@ -965,8 +1244,11 @@ class ClassificationProject(object):
x_from_tree = tree2array(tree,
branches=self.branches+self.identifiers,
start=start, stop=start+batch_size)
# for parametrized classifiers
if fixed_params is not None:
for param_name, value in fixed_params.items():
x_from_tree[param_name] = value
x_eval = rec2array(x_from_tree[self.branches])
if len(self.identifiers) > 0:
# create list of booleans that indicate which events where used for training
df_identifiers = pd.DataFrame(x_from_tree[self.identifiers])
......@@ -978,7 +1260,7 @@ class ClassificationProject(object):
is_train = np.zeros(len(x_eval))
# join scores and is_train array
scores = self.evaluate(x_eval).reshape(-1)
scores = self.evaluate(x_eval, mode=score_mode).reshape(-1)
friend_df = pd.DataFrame(np.array(scores, dtype=[(score_name, np.float64)]))
friend_df[score_name+"_is_train"] = is_train
friend_tree = friend_df.to_records()[[score_name, score_name+"_is_train"]]
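# Hypothetical call for a parametrised classifier: fixed_params pins the
# signal-parameter branches (names from the parametrisation script above)
# to one mass hypothesis when scoring; file and tree names are invented:
#
#     c.write_friend_tree("score_nn",
#                         "input.root", "ttbar_NoSys",
#                         "friend.root", "score_tree",
#                         fixed_params={"mg": 1600., "mc": 800., "mn": 100.})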
......@@ -1020,17 +1302,57 @@ class ClassificationProject(object):
return centers, hist, errors
def plot_input(self, var_index, ax=None):
"plot a single input variable"
def plot_input(self, var_index, ax=None, from_training_batches=False, max_n_batches=None):
"""
plot a single input variable as a histogram (signal vs background)
:param from_training_batches: use data from training batch generator
:param max_n_batches: if training batch generator is used, just use
this number of batches (otherwise steps_per_epoch is used)
"""
branch = self.fields[var_index]
if ax is None:
fig, ax = plt.subplots()
else:
fig = None
bkg = self.x_train[:,var_index][self.y_train == 0]
sig = self.x_train[:,var_index][self.y_train == 1]
bkg_weights = self.w_train_tot[self.y_train == 0]
sig_weights = self.w_train_tot[self.y_train == 1]
if not from_training_batches:
bkg = self.x_train[:,var_index][self.l_train == 0]
sig = self.x_train[:,var_index][self.l_train == 1]
bkg_weights = self.w_train_tot[self.l_train == 0]
sig_weights = self.w_train_tot[self.l_train == 1]
else:
bkg = None
sig = None
bkg_weights = None
sig_weights = None
if max_n_batches is not None:
n_batches = max_n_batches
else:
n_batches = self.steps_per_epoch
for i_batch, (x, y, w) in enumerate(self.yield_batch()):
if i_batch > n_batches:
break
if self.target_fields:
y = y[0]
try:
x = self.get_input_flat(x)
except AttributeError:
pass
bkg_batch = x[:,var_index][y==0]
sig_batch = x[:,var_index][y==1]
bkg_weights_batch = w[y==0]
sig_weights_batch = w[y==1]
if bkg is None:
bkg = bkg_batch
sig = sig_batch
bkg_weights = bkg_weights_batch
sig_weights = sig_weights_batch
else:
bkg = np.concatenate([bkg, bkg_batch])
sig = np.concatenate([sig, sig_batch])
bkg_weights = np.concatenate([bkg_weights, bkg_weights_batch])
sig_weights = np.concatenate([sig_weights, sig_weights_batch])
if hasattr(self, "mask_value"):
bkg_not_masked = np.where(bkg != self.mask_value)[0]
......@@ -1081,13 +1403,14 @@ class ClassificationProject(object):
centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights)
centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights)
width = centers_sig[1]-centers_sig[0]
if bins > 1:
width = centers_sig[1]-centers_sig[0]
else:
width = 1.
ax.bar(centers_bkg, hist_bkg, color="b", alpha=0.5, width=width)
ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
label = branch
if self.data_transformed:
label += " (transformed)"
ax.set_xlabel(label)
if fig is not None:
plot_dir = os.path.join(self.project_dir, "plots")
......@@ -1096,27 +1419,27 @@ class ClassificationProject(object):
return save_show(plt, fig, os.path.join(plot_dir, "var_{}.pdf".format(var_index)))
def plot_all_inputs(self):
def plot_all_inputs(self, **kwargs):
nrows = math.ceil(math.sqrt(len(self.fields)))
fig, axes = plt.subplots(nrows=int(nrows), ncols=int(nrows),
figsize=(3*nrows, 3*nrows),
gridspec_kw=dict(wspace=0.4, hspace=0.4))
for i in range(len(self.fields)):
self.plot_input(i, ax=axes.reshape(-1)[i])
self.plot_input(i, ax=axes.reshape(-1)[i], **kwargs)
return save_show(plt, fig, os.path.join(self.project_dir, "all_inputs.pdf"))
def plot_weights(self, bins=100, range=None):
fig, ax = plt.subplots()
bkg = self.w_train_tot[self.y_train == 0]
sig = self.w_train_tot[self.y_train == 1]
bkg = self.w_train_tot[self.l_train == 0]
sig = self.w_train_tot[self.l_train == 1]
ax.hist(bkg, bins=bins, range=range, color="b", alpha=0.5)
ax.set_yscale("log")
return save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
fig, ax = plt.subplots()
ax.hist(sig, bins=bins, range=range, color="r", alpha=0.5)
ax.set_yscale("log")
return save_show(plt, fig, os.path.join(self.project_dir, "eventweights_sig.pdf"))
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_sig.pdf"))
def plot_ROC(self, xlim=(0,1), ylim=(0,1)):
......@@ -1127,8 +1450,8 @@ class ClassificationProject(object):
ax.grid(color='gray', linestyle='--', linewidth=1)
for y, scores, weight, label in [
(self.y_train, self.scores_train, self.w_train, "train"),
(self.y_test, self.scores_test, self.w_test, "test")
(self.l_train, self.scores_train, self.w_train, "train"),
(self.l_test, self.scores_test, self.w_test, "test")
]:
fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
fpr = 1.0 - fpr # background rejection
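# Self-contained sketch of the ROC convention used throughout this module:
# sklearn's roc_curve returns the false positive rate, which is converted to
# background rejection (1 - fpr); the toy labels/scores are illustrative only.
import numpy as np
from sklearn.metrics import roc_curve

y_toy = np.array([0, 0, 1, 1])
scores_toy = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = roc_curve(y_toy, scores_toy, sample_weight=np.ones(4))
bkg_rejection = 1.0 - fpr  # plotted against tpr (signal efficiency)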
......@@ -1161,10 +1484,10 @@ class ClassificationProject(object):
trf = lambda y : y
fig, ax = plt.subplots()
for scores, weights, y, class_label, fn, opts in [
(self.scores_train, self.w_train, self.y_train, 1, ax.bar, dict(color="r", label="signal train")),
(self.scores_train, self.w_train, self.y_train, 0, ax.bar, dict(color="b", label="background train")),
(self.scores_test, self.w_test, self.y_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
(self.scores_test, self.w_test, self.y_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
(self.scores_train, self.w_train, self.l_train, 1, ax.bar, dict(color="r", label="signal train")),
(self.scores_train, self.w_train, self.l_train, 0, ax.bar, dict(color="b", label="background train")),
(self.scores_test, self.w_test, self.l_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
(self.scores_test, self.w_test, self.l_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
]:
weights = weights[y==class_label]
if apply_class_weight is True and (lumifactor is not None):
......@@ -1217,16 +1540,16 @@ class ClassificationProject(object):
else:
trf = lambda y : y
centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==1].reshape(-1)), weights=self.w_train[self.y_train==1], **plot_opts)
centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==0].reshape(-1)), weights=self.w_train[self.y_train==0], **plot_opts)
centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==1].reshape(-1)), weights=self.w_test[self.y_test==1], **plot_opts)
centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==0].reshape(-1)), weights=self.w_test[self.y_test==0], **plot_opts)
centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==1].reshape(-1)), weights=self.w_train[self.l_train==1], **plot_opts)
centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==0].reshape(-1)), weights=self.w_train[self.l_train==0], **plot_opts)
centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==1].reshape(-1)), weights=self.w_test[self.l_test==1], **plot_opts)
centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==0].reshape(-1)), weights=self.w_test[self.l_test==0], **plot_opts)
significances_train = []
significances_test = []
for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [
(hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train),
(hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test),
for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances in [
(hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train),
(hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test),
]:
# factor to rescale due to using only a fraction of events (training and test samples)
# normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
......@@ -1296,7 +1619,7 @@ class ClassificationProject(object):
return lambda y : np.log(y/(1-y))
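# Quick self-contained check that the logit above inverts a sigmoid output
# (`sigmoid` is defined here only for the demonstration):
import numpy as np

sigmoid = lambda x: 1. / (1. + np.exp(-x))
logit = lambda y: np.log(y / (1 - y))
x_toy = np.array([-2., 0., 3.])
assert np.allclose(logit(sigmoid(x_toy)), x_toy)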
def plot_significance(self, significance_function=None, maxsteps=1000, lumifactor=1., vectorized=False, invert_activation=False):
def plot_significance(self, significance_function=None, maxsteps=None, lumifactor=1., vectorized=False, invert_activation=False):
"""
Plot the significance when cutting at all possible thresholds, plotted against the signal efficiency.
"""
......@@ -1315,13 +1638,16 @@ class ClassificationProject(object):
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for (scores, y, w, label), col in zip(
[(self.scores_train, self.y_train, self.w_train, "train"),
(self.scores_test, self.y_test, self.w_test, "test")],
[(self.scores_train, self.l_train, self.w_train, "train"),
(self.scores_test, self.l_test, self.w_test, "test")],
colors
):
scores = trf(scores)
s_sumws, s_errs, b_sumws, b_errs, thresholds = self.calc_s_ds_b_db(scores, y, w)
stepsize = int(len(s_sumws))/int(maxsteps)
if maxsteps is not None:
stepsize = int(len(s_sumws))//int(maxsteps)  # integer division, also under python 3
else:
stepsize = 1
if stepsize == 0:
stepsize = 1
s_sumws = s_sumws[::stepsize]*lumifactor*self.step_signal
......@@ -1377,19 +1703,19 @@ class ClassificationProject(object):
hist_dict = self.csv_hist
logger.info("Plot losses")
plt.plot(hist_dict['loss'])
plt.plot(hist_dict['val_loss'])
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['training data','validation data'], loc='upper left')
fig, ax = plt.subplots()
ax.plot(hist_dict['loss'])
ax.plot(hist_dict['val_loss'])
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['training data','validation data'], loc='upper left')
if log:
plt.yscale("log")
ax.set_yscale("log")
if xlim is not None:
plt.xlim(*xlim)
ax.set_xlim(*xlim)
if ylim is not None:
plt.ylim(*ylim)
plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
plt.clf()
ax.set_ylim(*ylim)
return save_show(plt, fig, os.path.join(self.project_dir, "losses.pdf"))
def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
......@@ -1435,7 +1761,7 @@ class ClassificationProject(object):
df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
df["weight"] = np.concatenate([self.w_train, self.w_test])
df["labels"] = pd.Categorical.from_codes(
np.concatenate([self.y_train, self.y_test]),
np.concatenate([self.l_train, self.l_test]),
categories=["background", "signal"]
)
for identifier in self.identifiers:
......@@ -1474,6 +1800,22 @@ class ClassificationProjectDataFrame(ClassificationProject):
A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees
"""
def __init__(self, name, *args, **kwargs):
if len(args) < 1 and len(kwargs) < 1:
# if no further arguments given, interpret as directory name
self._init_from_dir(name)
else:
# otherwise initialise new project
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
# don't put the dataframe into options.pickle!
if len(args) > 1:
args = args[1:]
else:
args = []
pickle.dump(dict(args=args, kwargs=kwargs), of)
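# Usage sketch for the two construction modes above (the import path and the
# extra keyword arguments are assumptions, not verified against the package):
#
# from KerasROOTClassification import ClassificationProjectDataFrame
# proj = ClassificationProjectDataFrame("my_project", df, split_column="is_test", ...)
# # later: re-open by directory name only, args/kwargs come from options.pickle
# proj = ClassificationProjectDataFrame("my_project")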
def _init_from_args(self,
name,
df,
......@@ -1496,9 +1838,11 @@ class ClassificationProjectDataFrame(ClassificationProject):
raise NotImplementedError("'split_column' is the only currently supported split mode")
self.split_mode = split_mode
self.split_column = split_column
super(ClassificationProjectDataFrame, self).__init__(name,
signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
**kwargs)
super(ClassificationProjectDataFrame, self)._init_from_args(
name,
signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
**kwargs
)
self._x_train = None
self._x_test = None
self._y_train = None
......@@ -1575,16 +1919,15 @@ class ClassificationProjectDataFrame(ClassificationProject):
if reload:
self.data_loaded = False
self.data_transformed = False
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
self._w_train_tot = None
if not self.data_transformed:
self._transform_data()
self.data_loaded = True
class ClassificationProjectRNN(ClassificationProject):
......@@ -1597,6 +1940,7 @@ class ClassificationProjectRNN(ClassificationProject):
recurrent_field_names=None,
rnn_layer_nodes=32,
mask_value=-999,
recurrent_unit_type="GRU",
**kwargs):
"""
recurrent_field_names example:
......@@ -1615,6 +1959,7 @@ class ClassificationProjectRNN(ClassificationProject):
self.recurrent_field_names = []
self.rnn_layer_nodes = rnn_layer_nodes
self.mask_value = mask_value
self.recurrent_unit_type = recurrent_unit_type
# convert the recurrent field names to lists of indices
self.recurrent_field_idx = []
......@@ -1645,14 +1990,6 @@ class ClassificationProjectRNN(ClassificationProject):
)
def _transform_data(self):
self.x_train[self.x_train == self.mask_value] = np.nan
self.x_test[self.x_test == self.mask_value] = np.nan
super(ClassificationProjectRNN, self)._transform_data()
self.x_train[np.isnan(self.x_train)] = self.mask_value
self.x_test[np.isnan(self.x_test)] = self.mask_value
@property
def model(self):
if self._model is None:
......@@ -1663,7 +2000,15 @@ class ClassificationProjectRNN(ClassificationProject):
for field_idx in self.recurrent_field_idx:
chan_inp = Input(field_idx.shape[1:])
channel = Masking(mask_value=self.mask_value)(chan_inp)
channel = GRU(self.rnn_layer_nodes)(channel)
if self.recurrent_unit_type == "GRU":
channel = GRU(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "SimpleRNN":
channel = SimpleRNN(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "LSTM":
channel = LSTM(self.rnn_layer_nodes)(channel)
else:
raise NotImplementedError("{} not implemented".format(self.recurrent_unit_type))
logger.info("Added {} unit".format(self.recurrent_unit_type))
# TODO: configure dropout for recurrent layers
#channel = Dropout(0.3)(channel)
rnn_inputs.append(chan_inp)
......@@ -1675,43 +2020,61 @@ class ClassificationProjectRNN(ClassificationProject):
flat_channel = flat_input
combined = concatenate(rnn_channels+[flat_channel])
for node_count, dropout_fraction in zip(self.nodes, self.dropout):
combined = Dense(node_count, activation=self.activation_function)(combined)
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
combined = Dense(node_count, activation=self.activation_function, **extra_opts)(combined)
if (dropout_fraction is not None) and (dropout_fraction > 0):
combined = Dropout(rate=dropout_fraction)(combined)
combined = Dense(1, activation=self.activation_function_output)(combined)
self._model = Model(inputs=rnn_inputs+[flat_input], outputs=combined)
self._compile_or_load_model()
return self._model
outputs = [combined]
# optional regression targets
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(combined)
outputs.append(extra_target)
def train(self, epochs=10):
self.load()
self.shuffle_training_data()
self._model = Model(inputs=rnn_inputs+[flat_input], outputs=outputs)
self._compile_or_load_model()
return self._model
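# Standalone sketch of the network assembled above, for one masked recurrent
# channel (3 objects x 4 variables), 5 flat inputs and no extra regression
# targets; layer sizes are illustrative (Keras 2 functional API):
from keras.layers import Input, Masking, GRU, Dense, concatenate
from keras.models import Model

chan_inp = Input((3, 4))
channel = Masking(mask_value=-999)(chan_inp)
channel = GRU(32)(channel)                   # one recurrent unit per channel
flat_inp = Input((5,))
combined = concatenate([channel, flat_inp])  # merge RNN and flat branches
combined = Dense(64, activation="relu")(combined)
output = Dense(1, activation="sigmoid")(combined)
model = Model(inputs=[chan_inp, flat_inp], outputs=[output])
model.compile(optimizer="adam", loss="binary_crossentropy")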
for branch_index, branch in enumerate(self.fields):
self.plot_input(branch_index)
self.total_epochs = self._read_info("epochs", 0)
def clean_mask(self, x):
"""
Mask recurrent fields such that once a masked value occurs,
all values corresponding to the same and following objects are
masked as well. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
for line_idx in recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:]):
if (evt[line_idx] == self.mask_value).any():
masked = True
if masked:
evt[line_idx] = self.mask_value
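# Self-contained illustration of the clean_mask invariant: once any value of
# an object is masked, that object and all following ones are fully masked.
# One recurrent block of 3 objects x 2 variables at flat indices [[0,1],[2,3],[4,5]]:
import numpy as np

mask_value = -999.
recurrent_field_idx = np.array([[[0, 1], [2, 3], [4, 5]]])
x = np.array([[1., 2., mask_value, 4., 5., 6.]])
for evt in x:
    masked = False
    for line_idx in recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:]):
        if (evt[line_idx] == mask_value).any():
            masked = True
        if masked:
            evt[line_idx] = mask_value
# x is now [[1., 2., -999., -999., -999., -999.]]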
try:
self.is_training = True
logger.info("Training on batches for RNN")
# note: the batches have class_weight already applied
self.model.fit_generator(self.yield_batch(),
steps_per_epoch=int(len(self.training_data[0])/self.batch_size),
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
self.checkpoint_model()
def mask_uniform(self, x):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(recurrent_field_idx.shape[1]+1))
for obj_number, line_idx in enumerate(recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:])):
if obj_number == nobj:
masked = True
if masked:
evt[line_idx] = self.mask_value
def get_input_list(self, x):
"Format the input starting from flat ntuple"
"""
Returns a list of 3-dimensional inputs for each
recurrent layer and a 2-dimensional one for the normal flat inputs.
"""
x_input = []
for field_idx in self.recurrent_field_idx:
x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
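# Shape sketch for the fancy indexing above: 10 events with 6 flat columns and
# one recurrent block of 2 objects x 2 variables at flat indices [[0,1],[2,3]]
# (toy numbers, illustrative only):
import numpy as np

x = np.arange(60, dtype=float).reshape(10, 6)
field_idx = np.array([[[0, 1], [2, 3]]])       # shape (1, 2, 2)
x_recurrent = x[:, field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
# x_recurrent.shape == (10, 2, 2): events x objects x variables, ready for a Masking+RNN channel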
......@@ -1722,7 +2085,7 @@ class ClassificationProjectRNN(ClassificationProject):
def get_input_flat(self, x):
"Transform input back to flat ntuple"
"Transform the multiple inputs back to flat ntuple"
nevent = x[0].shape[0]
x_flat = np.empty((nevent, len(self.fields)), dtype=float)
# recurrent fields
......@@ -1737,74 +2100,6 @@ class ClassificationProjectRNN(ClassificationProject):
return x_flat
def yield_batch(self):
x_train, y_train, w_train = self.training_data
while True:
shuffled_idx = np.random.permutation(len(x_train))
for start in range(0, len(shuffled_idx), int(self.batch_size)):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
x_input = self.get_input_list(x_batch)
yield (x_input, y_batch, w_batch)
@property
def validation_data(self):
"class weighted validation data. Attention: Shuffle training data before using this!"
x_val, y_val, w_val = super(ClassificationProjectRNN, self).validation_data
x_val_input = self.get_input_list(x_val)
return x_val_input, y_val, w_val
def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
logger.info("Reloading (and re-transforming) unshuffled training data")
self.load(reload=True)
if mode is not None:
self._write_info("scores_mode", mode)
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
getattr(self, "scores_"+data_name)[start:stop] = (
self.predict(
self.get_input_list(getattr(self, "x_"+data_name)[start:stop]),
mode=mode
).reshape(-1)
)
self._dump_to_hdf5("scores_"+data_name)
if do_test:
eval_score("test")
if do_train:
eval_score("train")
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
return self._batch_transform(x, self.scaler.transform, batch_size)
def inverse_transform(self, x, batch_size=10000):
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
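# Self-contained sketch of the mask -> NaN -> transform -> mask round trip in
# _batch_transform, with a manual centering function standing in for the scaler:
import numpy as np

mask_value = -999.
center, scale = np.array([3., 4.]), np.array([2., 2.])
fn = lambda arr: (arr - center) / scale        # stand-in for scaler.transform
x = np.array([[1., 2.], [3., mask_value], [5., 6.]])
x_batch = np.array(x)                          # copy, as in _batch_transform
x_batch[x_batch == mask_value] = np.nan        # masked entries propagate as NaN
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = mask_value        # restore the mask value afterwards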
def evaluate(self, x_eval, mode=None):
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
......
......@@ -33,22 +33,36 @@ def get_single_neuron_function(model, layer, neuron, input_transform=None):
return eval_single_neuron
def create_random_event(ranges):
def create_random_event(ranges, mask_probs=None, mask_value=None):
random_event = np.array([p[0]+(p[1]-p[0])*np.random.rand() for p in ranges])
random_event = random_event.reshape(-1, len(random_event))
# if given, mask values with a certain probability
if mask_probs is not None:
if mask_value is None:
raise ValueError("Need to provide mask_value if random events should be masked")
for var_index, mask_prob in enumerate(mask_probs):
random_event[:,var_index][np.random.rand(len(random_event)) < mask_prob] = mask_value
return random_event
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None):
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None, max_evts=None):
"Get ranges for plotting or random event generation based on quantiles"
ranges = []
mask_probs = []
if max_evts is not None:
rnd_idx = np.random.permutation(np.arange(len(x)))
rnd_idx = rnd_idx[:max_evts]
for var_index in range(x.shape[1]):
if (filter_index is not None) and (var_index != filter_index):
continue
x_var = x[:,var_index]
w_var = weights
if max_evts is not None:
x_var = x_var[rnd_idx]
w_var = w_var[rnd_idx]
not_masked = np.where(x_var != mask_value)[0]
masked = np.where(x_var == mask_value)[0]
ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=w_var[not_masked]))
return ranges
mask_probs.append(float(len(masked))/float(len(x_var)))
return ranges, mask_probs
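# Sketch combining the helpers defined in this module: derive per-variable
# ranges and mask probabilities from toy data, then sample a random event with
# the same masking pattern (toy data illustrative only):
import numpy as np

x_toy = np.random.rand(1000, 3)
x_toy[np.random.rand(1000) < 0.2, 2] = -999.   # variable 2 is partially masked
w_toy = np.ones(len(x_toy))
ranges, mask_probs = get_ranges(x_toy, [0.01, 0.99], w_toy, mask_value=-999., max_evts=500)
event = create_random_event(ranges, mask_probs=mask_probs, mask_value=-999.)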
def max_activation_wrt_input(gradient_function, random_event, threshold=None, maxthreshold=None, maxit=100, step=1, const_indices=[],
......@@ -115,7 +129,7 @@ def get_grad_function(model, layer, neuron):
],
ignoreKwargs=["input_transform", "input_inverse_transform"],
)
def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **kwargs):
def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, mask_probs=None, mask_value=None, **kwargs):
gradient_function = get_grad_function(model, layer, neuron)
......@@ -125,9 +139,15 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **k
for i in range(ntries):
if not (i%100):
logger.info(i)
res = max_activation_wrt_input(gradient_function,
create_random_event(ranges),
**kwargs)
res = max_activation_wrt_input(
gradient_function,
create_random_event(
ranges,
mask_probs=mask_probs,
mask_value=mask_value
),
**kwargs
)
if res is not None:
loss, event = res
loss = np.array([loss])
......@@ -177,18 +197,29 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
class WeightedRobustScaler(RobustScaler):
def fit(self, X, y=None, weights=None):
if not np.isnan(X).any():
def fit(self, X, y=None, weights=None, mask_value=None):
if not np.isnan(X).any() and mask_value is None and weights is None:
# these checks don't work for nan values
super(WeightedRobustScaler, self).fit(X, y)
if weights is None:
return self
return super(WeightedRobustScaler, self).fit(X, y)
else:
wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
if weights is None:
weights = np.ones(len(X))
wqs = []
for i in range(X.shape[1]):
mask = ~np.isnan(X[:,i])
if mask_value is not None:
mask &= (X[:,i] != mask_value)
wqs.append(
weighted_quantile(
X[:,i][mask],
[0.25, 0.5, 0.75],
sample_weight=weights[mask]
)
)
wqs = np.array(wqs)
self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0]
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
logger.debug("scale: {}".format(self.scale_))
return self
......@@ -202,6 +233,15 @@ class WeightedRobustScaler(RobustScaler):
return super(WeightedRobustScaler, self).transform(X)
def inverse_transform(self, X):
if np.isnan(X).any():
X *= self.scale_
X += self.center_
return X
else:
return super(WeightedRobustScaler, self).inverse_transform(X)
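# Usage sketch for the scaler above: weighted fit that ignores the mask value,
# then transforming data where masked entries were set to NaN (toy data only):
import numpy as np

X_toy = np.random.normal(size=(100, 2))
X_toy[::10, 1] = -999.
w_toy = np.random.rand(100)
scaler = WeightedRobustScaler()
scaler.fit(X_toy, weights=w_toy, mask_value=-999.)
X_nan = np.where(X_toy == -999., np.nan, X_toy)
X_trf = scaler.transform(X_nan)                # NaN entries stay NaN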
def poisson_asimov_significance(s, ds, b, db):
"see `<http://www.pp.rhul.ac.uk/~cowan/stat/medsig/medsigNote.pdf>`_)"
db = np.sqrt(db**2+ds**2)
......