
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project options:
  • Eric.Schanet/KerasROOTClassification
  • Nikolai.Hartmann/KerasROOTClassification
Commits on Source (45)
@@ -22,6 +22,7 @@ def overlay_ROC(filename, *projects, **kwargs):
     threshold_log = kwargs.pop("threshold_log", True)
     lumifactor = kwargs.pop("lumifactor", None)
     tight_layout = kwargs.pop("tight_layout", False)
+    show_auc = kwargs.pop("show_auc", True)
     if kwargs:
         raise KeyError("Unknown kwargs: {}".format(kwargs))
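The new `show_auc` flag rides on the existing `kwargs.pop` pattern, so each call can opt out of AUC values in the legend. A minimal usage sketch (the project objects and output file name are hypothetical):

    # hypothetical trained projects; with show_auc=False the legend
    # labels fall back to the bare project names
    overlay_ROC("compare_roc.pdf", project_nominal, project_variation, show_auc=False)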
@@ -43,7 +44,7 @@ def overlay_ROC(filename, *projects, **kwargs):
     colors = prop_cycle.by_key()['color']
     for p, color in zip(projects, colors):
-        fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
+        fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
         fpr = 1.0 - fpr
         try:
             roc_auc = auc(tpr, fpr)
@@ -52,12 +53,16 @@ def overlay_ROC(filename, *projects, **kwargs):
             roc_auc = auc(tpr, fpr, reorder=True)
         ax.grid(color='gray', linestyle='--', linewidth=1)
-        ax.plot(tpr, fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
+        if show_auc:
+            label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
+        else:
+            label = p.name
+        ax.plot(tpr, fpr, label=label, color=color)
         if plot_thresholds:
             ax2.plot(tpr, threshold, "--", color=color)
         if lumifactor is not None:
-            sumw_b = p.w_test[p.y_test==0].sum()*lumifactor
-            sumw_s = p.w_test[p.y_test==1].sum()*lumifactor
+            sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
+            sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
             ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
             ax_abs_b.invert_yaxis()
             ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)
......
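A note on the conventions in these hunks: `fpr` is flipped to `1 - fpr`, so the plotted curves show background rejection versus signal efficiency, and `auc` then integrates over the monotonic `tpr` axis (the try/except falls back to `reorder=True` on scikit-learn versions where plain `auc` refuses the input). A self-contained toy check, with made-up labels and scores:

    import numpy as np
    from sklearn.metrics import roc_curve, auc

    y_true = np.array([0, 0, 1, 1])
    scores = np.array([0.1, 0.4, 0.35, 0.8])
    fpr, tpr, _ = roc_curve(y_true, scores)
    rejection = 1.0 - fpr
    # integrating rejection over efficiency reproduces the usual ROC AUC
    print(auc(tpr, rejection))  # 0.75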
@@ -258,9 +258,9 @@ def plot_profile_2D_all(plotname, model, events,
     logger.info("Done")
     if plot_last_layer:
-        n_neurons = [len(i[0]) for i in acts]
+        n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
     else:
-        n_neurons = [len(i[0]) for i in acts[:-1]]
+        n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
     layers = len(n_neurons)
     nrows_ncols = (layers, max(n_neurons))
@@ -282,8 +282,10 @@ def plot_profile_2D_all(plotname, model, events,
     ims = []
     reg_plots = []
     for layer in range(layers):
-        for neuron in range(len(acts[layer][0])):
-            acts_neuron = acts[layer][:,neuron]
+        neurons_acts = acts[layer]
+        neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
+        for neuron in range(len(neurons_acts[0])):
+            acts_neuron = neurons_acts[:,neuron]
             ax = grid_array[neuron][layer]
             extra_opts = {}
             if not (plot_last_layer and layer == layers-1):
......
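The `reshape(i.shape[0], -1)` change lets `plot_profile_2D_all` treat non-flat activations, such as the (events, timesteps, features) output of a recurrent layer, as one column per neuron. A standalone numpy illustration:

    import numpy as np

    # e.g. activations of a recurrent layer: (events, timesteps, features)
    acts = np.arange(2 * 3 * 4).reshape(2, 3, 4)
    flat = acts.reshape(acts.shape[0], -1)
    print(flat.shape)    # (2, 12) -> 12 "neurons" per event
    print(len(flat[0]))  # 12, what the n_neurons list comprehensions count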
@@ -32,6 +32,7 @@ input_project = args.input_project
 output_file = args.output_file
 c = load_from_dir(input_project)
 c._load_data()
+ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
......
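Only `get_ranges`' interface is visible in this hunk. As a rough sketch of what such a helper plausibly computes (weighted [1%, 99%] ranges per transformed input, skipping masked entries, plus the masked fraction per variable; every internal detail below is an assumption, not the repository implementation):

    import numpy as np

    def get_ranges_sketch(x, quantiles, weights, mask_value=None, max_evts=None):
        # optionally subsample events to cap the cost (assumption)
        if max_evts is not None and len(x) > max_evts:
            idx = np.random.permutation(len(x))[:max_evts]
            x, weights = x[idx], weights[idx]
        ranges, mask_probs = [], []
        for i in range(x.shape[1]):
            col = x[:, i]
            keep = ~np.isnan(col)
            if mask_value is not None:
                keep &= col != mask_value
            mask_probs.append(1.0 - keep.mean())
            # weighted quantiles via the interpolated inverse CDF
            order = np.argsort(col[keep])
            v, w = col[keep][order], weights[keep][order]
            cdf = np.cumsum(w) / w.sum()
            ranges.append(tuple(np.interp(quantiles, cdf, v)))
        return ranges, mask_probs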
#!/usr/bin/env python

"""
Write new TTrees with signal parameters as branches. For the
backgrounds the parameters are generated following the total
distribution for all signals. The discrete values for the whole ntuple
of signal parameters are counted, such that correlations between
signal parameters are taken into account.
"""

import argparse, re, os

import ROOT

from root_numpy import list_trees
from root_pandas import read_root

import numpy as np

if __name__ == "__main__":

    input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
    output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
    param_names = ["mg", "mc", "mn"]
    param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
    output_signal_treename = "GG_oneStep_NoSys"
    bkg_trees = [
        "diboson_Sherpa221_NoSys",
        "singletop_NoSys",
        "ttbar_NoSys",
        "ttv_NoSys",
        "wjets_Sherpa221_NoSys",
        "zjets_Sherpa221_NoSys",
    ]

    # read in the number of events for each combination of parameters
    f = ROOT.TFile.Open(input_filename)
    count_dict = {}
    for key in f.GetListOfKeys():
        tree_name = key.GetName()
        match = re.match(param_match, tree_name)
        if match is not None:
            tree = f.Get(tree_name)
            params = tuple([float(i) for i in match.groups()])
            if params not in count_dict:
                count_dict[params] = 0
            # TODO: might be better to use sum of weights
            count_dict[params] += tree.GetEntries()
    f.Close()

    # calculate cumulative sum of counts to sample signal parameters for background from
    numbers = np.array(list(count_dict.keys()), dtype=float)
    counts = np.array(list(count_dict.values()), dtype=float)
    probs = counts/counts.sum()
    prob_bins = np.cumsum(probs)

    # read and write the rest in chunks
    if os.path.exists(output_filename):
        os.remove(output_filename)
    for tree_name in list_trees(input_filename):
        match_signal = re.match(param_match, tree_name)
        if match_signal is not None or tree_name in bkg_trees:
            print("Writing {}".format(tree_name))
            nwritten = 0
            for df in read_root(input_filename, tree_name, chunksize=100000):
                print("Writing event {}".format(nwritten))
                if match_signal is None:
                    # background: draw signal parameters from the counted distribution
                    rnd = np.random.random(len(df))
                    rnd_idx = np.digitize(rnd, prob_bins)
                    param_values = numbers[rnd_idx]
                    for param_idx, param_name in enumerate(param_names):
                        df[param_name] = param_values[:,param_idx]
                    df["training_weight"] = df["eventWeight"]*df["genWeight"]
                else:
                    # signal: parameters come from the tree name itself
                    for param_name, param_value in zip(param_names, match_signal.groups()):
                        df[param_name] = float(param_value)
                    df["training_weight"] = df["eventWeight"]
                if match_signal is None:
                    out_tree_name = tree_name
                else:
                    out_tree_name = output_signal_treename
                df.to_root(output_filename, mode="a", key=out_tree_name)
                nwritten += len(df)
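The background parametrization in this script hinges on one trick: cumulative count fractions plus `np.digitize` give categorical sampling proportional to per-signal-point statistics. Stripped down to a toy example with made-up (mg, mc, mn) points:

    import numpy as np

    count_dict = {(900., 500., 100.): 120, (1200., 700., 300.): 60, (1500., 900., 500.): 20}
    numbers = np.array(list(count_dict.keys()))
    counts = np.array(list(count_dict.values()), dtype=float)
    prob_bins = np.cumsum(counts / counts.sum())
    # a uniform draw lands in each bin with probability proportional to its count
    rnd_idx = np.digitize(np.random.random(10), prob_bins)
    print(numbers[rnd_idx])  # ten sampled parameter triplets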
import pytest
import numpy as np
import root_numpy
import pandas as pd
from sklearn.datasets import make_classification
from keras.layers import GRU

from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN


def create_dataset(path):

    # create example dataset with (low-weighted) noise added
    X, y = make_classification(n_samples=10000, random_state=1)
    X2 = np.random.normal(size=20*10000).reshape(-1, 20)
    y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
    X = np.concatenate([X, X2])
    y = np.concatenate([y, y2])
    w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])

    # shift and scale randomly (to check if transformation is working)
    shift = np.random.rand(20)*100
    scale = np.random.rand(20)*1000
    X *= scale
    X += shift

    # write to root files
    branches = ["var_{}".format(i) for i in range(len(X[0]))]
    df = pd.DataFrame(X, columns=branches)
    df["class"] = y
    df["weight"] = w
    tree_path_bkg = str(path / "bkg.root")
    tree_path_sig = str(path / "sig.root")
    root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
    root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
    return branches, tree_path_sig, tree_path_bkg


def test_ClassificationProject(tmp_path):
    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
    c = ClassificationProject(
        str(tmp_path / "project"),
        bkg_trees=[(tree_path_bkg, "tree")],
        signal_trees=[(tree_path_sig, "tree")],
        branches=branches,
        weight_expr="weight",
        identifiers=["index"],
        optimizer="Adam",
        earlystopping_opts=dict(patience=5),
        dropout=0.5,
        layers=3,
        nodes=128,
    )
    c.train(epochs=200)
    c.plot_all_inputs()
    c.plot_loss()
    assert min(c.history.history["val_loss"]) < 0.18


def test_ClassificationProjectRNN(tmp_path):
    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
    c = ClassificationProjectRNN(
        str(tmp_path / "project"),
        bkg_trees=[(tree_path_bkg, "tree")],
        signal_trees=[(tree_path_sig, "tree")],
        branches=branches,
        recurrent_field_names=[
            [
                ["var_1", "var_2", "var_3"],
                ["var_4", "var_5", "var_6"],
            ],
            [
                ["var_10", "var_11", "var_12"],
                ["var_13", "var_14", "var_15"],
            ],
        ],
        weight_expr="weight",
        identifiers=["index"],
        optimizer="Adam",
        earlystopping_opts=dict(patience=5),
        dropout=0.5,
        layers=3,
        nodes=128,
    )
    assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
    c.train(epochs=200)
    c.plot_all_inputs()
    c.plot_loss()
    assert min(c.history.history["val_loss"]) < 0.18
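Both tests lean on pytest's built-in `tmp_path` fixture, so the ROOT files and project directories stay sandboxed. Assuming the file is saved as, say, `test_projects.py` (name hypothetical), it can be run from the command line or programmatically:

    import pytest

    # equivalent to running `pytest -q test_projects.py` in a shell
    pytest.main(["-q", "test_projects.py"])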
This diff is collapsed.
@@ -197,14 +197,26 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
 class WeightedRobustScaler(RobustScaler):
 
-    def fit(self, X, y=None, weights=None):
-        if not np.isnan(X).any():
+    def fit(self, X, y=None, weights=None, mask_value=None):
+        if not np.isnan(X).any() and mask_value is None and weights is None:
             # these checks don't work for nan values
-            super(WeightedRobustScaler, self).fit(X, y)
-            if weights is None:
-                return self
+            return super(WeightedRobustScaler, self).fit(X, y)
         else:
-            wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+            if weights is None:
+                weights = np.ones(len(X))
+            wqs = []
+            for i in range(X.shape[1]):
+                mask = ~np.isnan(X[:,i])
+                if mask_value is not None:
+                    mask &= (X[:,i] != mask_value)
+                wqs.append(
+                    weighted_quantile(
+                        X[:,i][mask],
+                        [0.25, 0.5, 0.75],
+                        sample_weight=weights[mask]
+                    )
+                )
+            wqs = np.array(wqs)
             self.center_ = wqs[:,1]
             self.scale_ = wqs[:,2]-wqs[:,0]
             self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
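For reference, `weighted_quantile` (whose signature appears in the hunk header) can be realized as an interpolated inverse CDF; this is only a sketch consistent with that signature, not necessarily the repository's exact implementation:

    import numpy as np

    def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False):
        # weighted analogue of np.percentile via the interpolated inverse CDF
        values = np.asarray(values, dtype=float)
        if sample_weight is None:
            sample_weight = np.ones(len(values))
        sample_weight = np.asarray(sample_weight, dtype=float)
        if not values_sorted:
            order = np.argsort(values)
            values, sample_weight = values[order], sample_weight[order]
        cdf = np.cumsum(sample_weight) - 0.5 * sample_weight
        cdf /= sample_weight.sum()
        return np.interp(quantiles, cdf, values)

    # e.g. the quartiles WeightedRobustScaler.fit turns into center_ and scale_
    print(weighted_quantile([1., 2., 3., 4.], [0.25, 0.5, 0.75], sample_weight=[1, 1, 1, 1]))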
......