Compare revisions

Eric Schanet · Eric Schanet · Eric Schanet · Eric Schanet · Eric Schanet · Eric Schanet
--- a/compare.py
+++ b/compare.py
@@ -20,6 +20,9 @@ def overlay_ROC(filename, *projects, **kwargs):
    ylim = kwargs.pop("ylim", (0,1))
    plot_thresholds = kwargs.pop("plot_thresholds", False)
    threshold_log = kwargs.pop("threshold_log", True)
+    lumifactor = kwargs.pop("lumifactor", None)
+    tight_layout = kwargs.pop("tight_layout", False)
+    show_auc = kwargs.pop("show_auc", True)
    if kwargs:
        raise KeyError("Unknown kwargs: {}".format(kwargs))

@@ -33,11 +36,15 @@ def overlay_ROC(filename, *projects, **kwargs):
        if threshold_log:
            ax2.set_yscale("log")

+    if lumifactor is not None:
+        ax_abs_b = ax.twinx()
+        ax_abs_s = ax.twiny()
+
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']

    for p, color in zip(projects, colors):
-        fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
+        fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
        fpr = 1.0 - fpr
        try:
            roc_auc = auc(tpr, fpr)
@@ -46,21 +53,39 @@ def overlay_ROC(filename, *projects, **kwargs):
            roc_auc = auc(tpr, fpr, reorder=True)

        ax.grid(color='gray', linestyle='--', linewidth=1)
-        ax.plot(tpr,  fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
+        if show_auc:
+            label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
+        else:
+            label = p.name
+        ax.plot(tpr,  fpr, label=label, color=color)
        if plot_thresholds:
            ax2.plot(tpr, threshold, "--", color=color)
+        if lumifactor is not None:
+            sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
+            sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
+            ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
+            ax_abs_b.invert_yaxis()
+            ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)

    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)
+    if lumifactor is not None:
+        ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
+        ax_abs_b.set_xlim(*ax.get_xlim())
+        ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
+        ax_abs_s.set_ylim(*ax.get_ylim())
+        ax_abs_b.set_ylabel("Number of background events")
+        ax_abs_s.set_xlabel("Number of signal events")
    # plt.xticks(np.arange(0,1,0.1))
    # plt.yticks(np.arange(0,1,0.1))
    ax.legend(loc='lower left', framealpha=1.0)
-    ax.set_title('Receiver operating characteristic')
+    if lumifactor is None:
+        ax.set_title('Receiver operating characteristic')
    ax.set_ylabel("Background rejection")
    ax.set_xlabel("Signal efficiency")
-    if plot_thresholds:
+    if plot_thresholds or tight_layout:
        # to fit right y-axis description
        fig.tight_layout()
    return save_show(plt, fig, filename)

--- a/keras_visualize_activations/read_activations.py
+++ b/keras_visualize_activations/read_activations.py
 import keras.backend as K

-from keras.layers import Masking, InputLayer

 def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    print('----- activations -----')
@@ -13,12 +12,8 @@ def get_activations(model, model_inputs, print_shape_only=False, layer_name=None
        inp = [inp]
        model_multi_inputs_cond = False

-    # all layer outputs
-    # skip input and masking layers
    outputs = [layer.output for layer in model.layers if
-               (layer.name == layer_name or layer_name is None)
-               and not isinstance(layer, InputLayer)
-               and not isinstance(layer, Masking)]
+               layer.name == layer_name or layer_name is None]  # all layer outputs

    funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]  # evaluation functions


--- a/plotting.py
+++ b/plotting.py
@@ -20,9 +20,9 @@ logger.addHandler(logging.NullHandler())
 Some further plotting functions
 """

-def save_show(plt, fig, filename):
+def save_show(plt, fig, filename, **kwargs):
    "Save a figure and show it in case we are in ipython or jupyter notebook."
-    fig.savefig(filename)
+    fig.savefig(filename, **kwargs)
    try:
        get_ipython
        plt.show()
@@ -60,8 +60,7 @@ def plot_NN_vs_var_1D(plotname, means, scorefun, var_index, var_range, var_label
    if var_label is not None:
        ax.set_xlabel(var_label)
    ax.set_ylabel("NN output")
-    fig.savefig(plotname)
-    plt.close(fig)
+    save_show(plt, fig, plotname)


 def plot_NN_vs_var_2D(plotname, means,
@@ -134,8 +133,7 @@ def plot_NN_vs_var_2D(plotname, means,
        ax.set_xlabel(varx_label)
    if vary_label is not None:
        ax.set_ylabel(vary_label)
-    fig.savefig(plotname)
-    plt.close(fig)
+    save_show(plt, fig, plotname)


 def plot_NN_vs_var_2D_all(plotname, model, means,
@@ -233,8 +231,115 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
    cb.ax.xaxis.set_ticks_position('top')
    cb.ax.xaxis.set_label_position('top')

-    fig.savefig(plotname, bbox_inches='tight')
-    plt.close(fig)
+    save_show(plt, fig, plotname, bbox_inches='tight')
+
+
+def plot_profile_2D_all(plotname, model, events,
+                        valsx, valsy,
+                        nbinsx, xmin, xmax,
+                        nbinsy, ymin, ymax,
+                        transform_function=None,
+                        varx_label=None,
+                        vary_label=None,
+                        zrange=None, logz=False,
+                        plot_last_layer=False,
+                        log_default_ymin=1e-5,
+                        global_norm=True,
+                        cmap="inferno", **kwargs):
+
+    "Similar to plot_profile_2D, but creates a grid of plots for all neurons."
+
+    # transform
+    if transform_function is not None:
+        events = transform_function(events)
+
+    logger.info("Reading activations for all neurons")
+    acts = get_activations(model, events, print_shape_only=True)
+    logger.info("Done")
+
+    if plot_last_layer:
+        n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
+    else:
+        n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
+    layers = len(n_neurons)
+
+    nrows_ncols = (layers, max(n_neurons))
+    fig = plt.figure(1, figsize=nrows_ncols)
+    grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
+                     label_mode="1",
+                     aspect=False,
+                     cbar_location="top",
+                     cbar_mode="single",
+                     cbar_pad=.2,
+                     cbar_size="5%",)
+    grid_array = np.array(grid)
+    grid_array = grid_array.reshape(*nrows_ncols[::-1])
+
+    global_min = None
+    global_max = None
+
+    logger.info("Creating profile histograms")
+    ims = []
+    reg_plots = []
+    for layer in range(layers):
+        neurons_acts = acts[layer]
+        neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
+        for neuron in range(len(neurons_acts[0])):
+            acts_neuron = neurons_acts[:,neuron]
+            ax = grid_array[neuron][layer]
+            extra_opts = {}
+            if not (plot_last_layer and layer == layers-1):
+                # for hidden layers, plot the same z-scale
+                if logz:
+                    norm = matplotlib.colors.LogNorm
+                else:
+                    norm = matplotlib.colors.Normalize
+                if zrange is not None:
+                    extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
+                else:
+                    extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
+            hist, xedges, yedges = get_profile_2D(
+                valsx, valsy, acts_neuron,
+                nbinsx, xmin, xmax,
+                nbinsy, ymin, ymax,
+                **kwargs
+            )
+            if global_min is None or hist.min() < global_min:
+                global_min = hist.min()
+            if global_max is None or hist.max() > global_max:
+                global_max = hist.max()
+            X, Y = np.meshgrid(xedges, yedges)
+            reg_plots.append((layer, neuron, ax, (X, Y, hist), dict(cmap="inferno", linewidth=0, rasterized=True, **extra_opts)))
+    logger.info("Done")
+
+    logger.info("global_min: {}".format(global_min))
+    logger.info("global_max: {}".format(global_max))
+
+    if global_min <= 0 and logz:
+        global_min = log_default_ymin
+        logger.info("Changing global_min to {}".format(log_default_ymin))
+
+    for layer, neuron, ax, args, kwargs in reg_plots:
+        if zrange is None:
+            kwargs["norm"].vmin = global_min
+            kwargs["norm"].vmax = global_max
+        if not global_norm:
+            kwargs["norm"] = None
+        im = ax.pcolormesh(*args, **kwargs)
+        ax.set_facecolor("black")
+        if varx_label is not None:
+            ax.set_xlabel(varx_label)
+        if vary_label is not None:
+            ax.set_ylabel(vary_label)
+        ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
+
+    cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
+    cb.ax.xaxis.set_ticks_position('top')
+    cb.ax.xaxis.set_label_position('top')
+
+    logger.info("Rendering")
+    save_show(plt, fig, plotname, bbox_inches='tight')
+    logger.info("Done")


 def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"):
@@ -252,9 +357,7 @@ def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=Non
    cbar.set_label(zlabel)
    ax.set_ylabel(vary_label)
    ax.set_xlabel(varx_label)
-    fig.savefig(plotname)
-
-    plt.close(fig)
+    save_show(plt, fig, plotname)


 def plot_hist_2D_events(plotname, valsx, valsy, nbinsx, xmin, xmax, nbinsy, ymin, ymax,
@@ -308,15 +411,10 @@ def plot_cond_avg_actmax_2D(plotname, model, layer, neuron, ranges,
    plot_hist_2D(plotname, xedges, yedges, hist, zlabel="Neuron output", **kwargs)


-def plot_profile_2D(plotname, valsx, valsy, scores,
-                    nbinsx, xmin, xmax,
-                    nbinsy, ymin, ymax,
-                    metric=np.mean,
-                    weights=None,
-                    **kwargs):
-
-    kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
-
+def get_profile_2D(valsx, valsy, scores,
+                   nbinsx, xmin, xmax,
+                   nbinsy, ymin, ymax,
+                   metric=np.mean, weights=None):
    xedges = np.linspace(xmin, xmax, nbinsx)
    yedges = np.linspace(ymin, ymax, nbinsy)

@@ -342,6 +440,25 @@ def plot_profile_2D(plotname, valsx, valsy, scores,
    hist = np.array(hist)
    hist = hist.T # had a list of columns - needs to be list of rows

+    return hist, xedges, yedges
+
+
+def plot_profile_2D(plotname, valsx, valsy, scores,
+                    nbinsx, xmin, xmax,
+                    nbinsy, ymin, ymax,
+                    metric=np.mean,
+                    weights=None,
+                    **kwargs):
+
+    kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
+
+    hist, xedges, yedges = get_profile_2D(
+        valsx, valsy, scores,
+        nbinsx, xmin, xmax,
+        nbinsy, ymin, ymax,
+        metric=metric, weights=weights
+    )
+
    plot_hist_2D(plotname, xedges, yedges, hist, **kwargs)



--- a/scripts/generate_actmax.py
+++ b/scripts/generate_actmax.py
+#!/usr/bin/env python
+
+import sys, argparse, re, random
+
+parser = argparse.ArgumentParser(description="generate events that maximise the activation for a given neuron")
+parser.add_argument("input_project")
+parser.add_argument("output_file")
+parser.add_argument("-n", "--nevents", default=100000, type=int)
+parser.add_argument("-j", "--mask-jets", action="store_true",
+                    help="mask variables called jet*Pt/Eta/Phi and generate a random uniform distribution of the number of jets (only nescessary for non-recurrent NN)")
+args = parser.parse_args()
+
+import logging
+logging.basicConfig()
+logging.getLogger().setLevel(logging.INFO)
+
+import h5py
+import numpy as np
+
+from KerasROOTClassification.utils import (
+    weighted_quantile,
+    get_max_activation_events,
+    create_random_event,
+    get_ranges
+)
+from KerasROOTClassification import load_from_dir
+import meme
+
+meme.setOptions(deactivated=True)
+
+input_project = args.input_project
+output_file = args.output_file
+
+c = load_from_dir(input_project)
+c._load_data()
+
+ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
+
+def mask_uniform(x, mask_value, recurrent_field_idx):
+    """
+    Mask recurrent fields with a random (uniform) number of objects. Works in place.
+    """
+    for rec_idx in recurrent_field_idx:
+        for evt in x:
+            masked = False
+            nobj = int(random.random()*(rec_idx.shape[1]+1))
+            for obj_number, line_idx in enumerate(rec_idx.reshape(*rec_idx.shape[1:])):
+                if obj_number == nobj:
+                    masked=True
+                if masked:
+                    evt[line_idx] = mask_value
+
+def get_input_flat(x):
+    return x[0].reshape(-1, len(c.fields))
+
+
+if args.mask_jets:
+    jet_fields = {}
+    for field_name in c.fields:
+        if any(field_name.startswith("jet") and field_name.endswith(suffix) for suffix in ["Pt", "Eta", "Phi"]):
+            jet_number = re.findall("[0-9]+", field_name)[0]
+            if not jet_number in jet_fields:
+                jet_fields[jet_number] = []
+            jet_fields[jet_number].append(c.fields.index(field_name))
+    jet_fields = [np.array([[v for k, v in sorted(jet_fields.items(), key=lambda l:l[0])]])]
+
+
+def input_transform(x):
+    x = np.array(x)
+    if hasattr(c, "mask_uniform"):
+        c.mask_uniform(x)
+        return c.get_input_list(x)
+    elif args.mask_jets:
+        mask_uniform(x, c.mask_value, jet_fields)
+        return x
+
+
+opt_kwargs = dict()
+if hasattr(c, "mask_uniform"):
+    opt_kwargs["input_transform"] = input_transform
+    opt_kwargs["input_inverse_transform"] = c.get_input_flat
+if args.mask_jets:
+    opt_kwargs["input_transform"] = input_transform
+    opt_kwargs["input_inverse_transform"] = get_input_flat
+
+
+evts = get_max_activation_events(
+    c.model, ranges,
+    ntries=args.nevents,
+    layer=len(c.model.layers)-1,
+    neuron=0,
+    maxit=10,
+    seed=45,
+    threshold=0,
+    **opt_kwargs
+)
+
+with h5py.File(output_file, "w") as f:
+    f.create_dataset("actmax", data=evts[1])
--- a/scripts/plot_NN_2D.py
+++ b/scripts/plot_NN_2D.py
@@ -2,30 +2,6 @@

 import sys
 import argparse
-import logging
-logging.basicConfig()
-
-import numpy as np
-
-import ROOT
-ROOT.gROOT.SetBatch()
-ROOT.PyConfig.IgnoreCommandLineOptions = True
-
-from KerasROOTClassification import load_from_dir
-from KerasROOTClassification.plotting import (
-    get_mean_event,
-    plot_NN_vs_var_2D,
-    plot_profile_2D,
-    plot_hist_2D_events,
-    plot_cond_avg_actmax_2D,
-    plot_NN_vs_var_2D_all,
-)
-from KerasROOTClassification.utils import (
-    get_single_neuron_function,
-    get_max_activation_events,
-    weighted_quantile,
-    get_ranges
-)

 parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
 parser.add_argument("project_dir")
@@ -53,6 +29,31 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa

 args = parser.parse_args()

+import logging
+logging.basicConfig()
+
+import numpy as np
+
+import ROOT
+ROOT.gROOT.SetBatch()
+ROOT.PyConfig.IgnoreCommandLineOptions = True
+
+from KerasROOTClassification import load_from_dir
+from KerasROOTClassification.plotting import (
+    get_mean_event,
+    plot_NN_vs_var_2D,
+    plot_profile_2D,
+    plot_hist_2D_events,
+    plot_cond_avg_actmax_2D,
+    plot_NN_vs_var_2D_all,
+)
+from KerasROOTClassification.utils import (
+    get_single_neuron_function,
+    get_max_activation_events,
+    weighted_quantile,
+    get_ranges
+)
+
 if args.all_neurons and (not args.mode.startswith("mean")):
    parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")

@@ -82,7 +83,7 @@ total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]

 try:
    mask_value = c.mask_value
-except NameError:
+except AttributeError:
    mask_value = None

 # varx_test = c.x_test[:,varx_index]
@@ -94,8 +95,8 @@ except NameError:
 # percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked])
 # percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked])

-percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index)[0]
-percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index)[0]
+percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index, max_evts=10000)[0][0]
+percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index, max_evts=10000)[0][0]


 if args.xrange is not None:

--- a/scripts/write_parametrized.py
+++ b/scripts/write_parametrized.py
+#!/usr/bin/env python
+"""
+Write new TTrees with signal parameters as branches. For the
+backgrounds the parameters are generated following the total
+distribution for all signals. The discrete values for the whole ntuple
+of signal parameters are counted, such that correlations between
+signal parameters are taken into account.
+"""
+
+import argparse, re, os
+
+import ROOT
+
+from root_numpy import list_trees
+from root_pandas import read_root
+import numpy as np
+
+if __name__ == "__main__":
+
+    input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
+    output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
+
+    param_names = ["mg", "mc", "mn"]
+
+    param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
+
+    output_signal_treename = "GG_oneStep_NoSys"
+
+    bkg_trees = [
+        "diboson_Sherpa221_NoSys",
+        "singletop_NoSys",
+        "ttbar_NoSys",
+        "ttv_NoSys",
+        "wjets_Sherpa221_NoSys",
+        "zjets_Sherpa221_NoSys",
+    ]
+
+    # read in the number of events for each combination of parameters
+    f = ROOT.TFile.Open(input_filename)
+    count_dict = {}
+    for key in f.GetListOfKeys():
+        tree_name = key.GetName()
+        match = re.match(param_match, tree_name)
+        if match is not None:
+            tree = f.Get(tree_name)
+            params = tuple([float(i) for i in match.groups()])
+            if not params in count_dict:
+                count_dict[params] = 0
+            # TODO: might be better to use sum of weights
+            count_dict[params] += tree.GetEntries()
+    f.Close()
+
+    # calculate cumulative sum of counts to sample signal parameters for background from
+    numbers = np.array(count_dict.keys(), dtype=np.float)
+    counts = np.array(count_dict.values(), dtype=np.float)
+    probs = counts/counts.sum()
+    prob_bins = np.cumsum(probs)
+
+    # read and write the rest in chunks
+    if os.path.exists(output_filename):
+        os.remove(output_filename)
+    for tree_name in list_trees(input_filename):
+        match_signal = re.match(param_match, tree_name)
+        if match_signal is not None or tree_name in bkg_trees:
+            print("Writing {}".format(tree_name))
+            nwritten = 0
+            for df in read_root(input_filename, tree_name, chunksize=100000):
+                print("Writing event {}".format(nwritten))
+                if match_signal is None:
+                    rnd = np.random.random(len(df))
+                    rnd_idx = np.digitize(rnd, prob_bins)
+                    param_values = numbers[rnd_idx]
+                    for param_idx, param_name in enumerate(param_names):
+                        df[param_name] = param_values[:,param_idx]
+                    df["training_weight"] = df["eventWeight"]*df["genWeight"]
+                else:
+                    for param_name, param_value in zip(param_names, match_signal.groups()):
+                        df[param_name] = float(param_value)
+                    df["training_weight"] = df["eventWeight"]
+                if match_signal is None:
+                    out_tree_name = tree_name
+                else:
+                    out_tree_name = output_signal_treename
+                df.to_root(output_filename, mode="a", key=out_tree_name)
+                nwritten += len(df)
--- a/test/test_toolkit.py
+++ b/test/test_toolkit.py
+import pytest
+import numpy as np
+import root_numpy
+import pandas as pd
+from sklearn.datasets import make_classification
+from keras.layers import GRU
+
+from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
+
+
+def create_dataset(path):
+
+    # create example dataset with (low-weighted) noise added
+    X, y = make_classification(n_samples=10000, random_state=1)
+    X2 = np.random.normal(size=20*10000).reshape(-1, 20)
+    y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
+    X = np.concatenate([X, X2])
+    y = np.concatenate([y, y2])
+    w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
+
+    # shift and scale randomly (to check if transformation is working)
+    shift = np.random.rand(20)*100
+    scale = np.random.rand(20)*1000
+    X *= scale
+    X += shift
+
+    # write to root files
+    branches = ["var_{}".format(i) for i in range(len(X[0]))]
+    df = pd.DataFrame(X, columns=branches)
+    df["class"] = y
+    df["weight"] = w
+    tree_path_bkg = str(path / "bkg.root")
+    tree_path_sig = str(path / "sig.root")
+    root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
+    root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
+    return branches, tree_path_sig, tree_path_bkg
+
+
+def test_ClassificationProject(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProject(
+        str(tmp_path / "project"),
+        bkg_trees = [(tree_path_bkg, "tree")],
+        signal_trees = [(tree_path_sig, "tree")],
+        branches = branches,
+        weight_expr = "weight",
+        identifiers = ["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
+
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
+
+
+def test_ClassificationProjectRNN(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProjectRNN(
+        str(tmp_path / "project"),
+        bkg_trees = [(tree_path_bkg, "tree")],
+        signal_trees = [(tree_path_sig, "tree")],
+        branches = branches,
+        recurrent_field_names=[
+            [
+                ["var_1", "var_2", "var_3"],
+                ["var_4", "var_5", "var_6"]
+            ],
+            [
+                ["var_10", "var_11", "var_12"],
+                ["var_13", "var_14", "var_15"]
+            ],
+        ],
+        weight_expr = "weight",
+        identifiers = ["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
+    assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
--- a/toolkit.py
+++ b/toolkit.py
--- a/utils.py
+++ b/utils.py
@@ -33,22 +33,36 @@ def get_single_neuron_function(model, layer, neuron, input_transform=None):
    return eval_single_neuron


-def create_random_event(ranges):
+def create_random_event(ranges, mask_probs=None, mask_value=None):
    random_event = np.array([p[0]+(p[1]-p[0])*np.random.rand() for p in ranges])
    random_event = random_event.reshape(-1, len(random_event))
+    # if given, mask values with a certain probability
+    if mask_probs is not None:
+        if mask_value is None:
+            raise ValueError("Need to provide mask_value if random events should be masked")
+        for var_index, mask_prob in enumerate(mask_probs):
+            random_event[:,var_index][np.random.rand(len(random_event)) < mask_prob] = mask_value
    return random_event


-def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None):
+def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None, max_evts=None):
    "Get ranges for plotting or random event generation based on quantiles"
    ranges = []
+    mask_probs = []
+    if max_evts is not None:
+        rnd_idx = np.random.permutation(np.arange(len(x)))
+        rnd_idx = rnd_idx[:max_evts]
    for var_index in range(x.shape[1]):
        if (filter_index is not None) and (var_index != filter_index):
            continue
        x_var = x[:,var_index]
+        if max_evts is not None:
+            x_var = x_var[rnd_idx]
        not_masked = np.where(x_var != mask_value)[0]
+        masked = np.where(x_var == mask_value)[0]
        ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=weights[not_masked]))
-    return ranges
+        mask_probs.append(float(len(masked))/float(len(x_var)))
+    return ranges, mask_probs


 def max_activation_wrt_input(gradient_function, random_event, threshold=None, maxthreshold=None, maxit=100, step=1, const_indices=[],
@@ -115,7 +129,7 @@ def get_grad_function(model, layer, neuron):
       ],
       ignoreKwargs=["input_transform", "input_inverse_transform"],
 )
-def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **kwargs):
+def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, mask_probs=None, mask_value=None, **kwargs):

    gradient_function = get_grad_function(model, layer, neuron)

@@ -125,9 +139,15 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **k
    for i in range(ntries):
        if not (i%100):
            logger.info(i)
-        res = max_activation_wrt_input(gradient_function,
-                                       create_random_event(ranges),
-                                       **kwargs)
+        res = max_activation_wrt_input(
+            gradient_function,
+            create_random_event(
+                ranges,
+                mask_probs=mask_probs,
+                mask_value=mask_value
+            ),
+            **kwargs
+        )
        if res is not None:
            loss, event = res
            loss = np.array([loss])
@@ -177,18 +197,29 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False

 class WeightedRobustScaler(RobustScaler):

-    def fit(self, X, y=None, weights=None):
-        if not np.isnan(X).any():
+    def fit(self, X, y=None, weights=None, mask_value=None):
+        if not np.isnan(X).any() and mask_value is not None and weights is None:
            # these checks don't work for nan values
-            super(WeightedRobustScaler, self).fit(X, y)
-        if weights is None:
-            return self
+            return super(WeightedRobustScaler, self).fit(X, y)
        else:
-            wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+            if weights is None:
+                weights = np.ones(len(self.X))
+            wqs = []
+            for i in range(X.shape[1]):
+                mask = ~np.isnan(X[:,i])
+                if mask_value is not None:
+                    mask &= (X[:,i] != mask_value)
+                wqs.append(
+                    weighted_quantile(
+                        X[:,i][mask],
+                        [0.25, 0.5, 0.75],
+                        sample_weight=weights[mask]
+                    )
+                )
+            wqs = np.array(wqs)
            self.center_ = wqs[:,1]
            self.scale_ = wqs[:,2]-wqs[:,0]
            self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
-            print(self.scale_)
            return self


@@ -202,6 +233,15 @@ class WeightedRobustScaler(RobustScaler):
            return super(WeightedRobustScaler, self).transform(X)


+    def inverse_transform(self, X):
+        if np.isnan(X).any():
+            X *= self.scale_
+            X += self.center_
+            return X
+        else:
+            return super(WeightedRobustScaler, self).inverse_transform(X)
+
+
 def poisson_asimov_significance(s, ds, b, db):
    "see `<http://www.pp.rhul.ac.uk/~cowan/stat/medsig/medsigNote.pdf>`_)"
    db = np.sqrt(db**2+ds**2)
No results found