Compare revisions: Eric.Schanet/KerasROOTClassification ↔ Nikolai.Hartmann/KerasROOTClassification

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (85)
......@@ -20,6 +20,9 @@ def overlay_ROC(filename, *projects, **kwargs):
ylim = kwargs.pop("ylim", (0,1))
plot_thresholds = kwargs.pop("plot_thresholds", False)
threshold_log = kwargs.pop("threshold_log", True)
lumifactor = kwargs.pop("lumifactor", None)
tight_layout = kwargs.pop("tight_layout", False)
show_auc = kwargs.pop("show_auc", True)
if kwargs:
raise KeyError("Unknown kwargs: {}".format(kwargs))
......@@ -33,11 +36,15 @@ def overlay_ROC(filename, *projects, **kwargs):
if threshold_log:
ax2.set_yscale("log")
if lumifactor is not None:
ax_abs_b = ax.twinx()
ax_abs_s = ax.twiny()
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for p, color in zip(projects, colors):
fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
fpr = 1.0 - fpr
try:
roc_auc = auc(tpr, fpr)
......@@ -46,21 +53,39 @@ def overlay_ROC(filename, *projects, **kwargs):
roc_auc = auc(tpr, fpr, reorder=True)
ax.grid(color='gray', linestyle='--', linewidth=1)
ax.plot(tpr, fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
if show_auc:
label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
else:
label = p.name
ax.plot(tpr, fpr, label=label, color=color)
if plot_thresholds:
ax2.plot(tpr, threshold, "--", color=color)
if lumifactor is not None:
sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)
if xlim is not None:
ax.set_xlim(*xlim)
if ylim is not None:
ax.set_ylim(*ylim)
if lumifactor is not None:
ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
ax_abs_b.set_xlim(*ax.get_xlim())
ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
ax_abs_s.set_ylim(*ax.get_ylim())
ax_abs_b.set_ylabel("Number of background events")
ax_abs_s.set_xlabel("Number of signal events")
# plt.xticks(np.arange(0,1,0.1))
# plt.yticks(np.arange(0,1,0.1))
ax.legend(loc='lower left', framealpha=1.0)
ax.set_title('Receiver operating characteristic')
if lumifactor is None:
ax.set_title('Receiver operating characteristic')
ax.set_ylabel("Background rejection")
ax.set_xlabel("Signal efficiency")
if plot_thresholds:
if plot_thresholds or tight_layout:
# to fit the right y-axis label
fig.tight_layout()
return save_show(plt, fig, filename)
......
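The lumifactor option above draws absolute-event-count scales by plotting invisible (alpha=0) curves on twin axes and then synchronising their limits with the main ROC axes. A minimal, self-contained sketch of that twin-axis technique (sums of weights invented):

import numpy as np
import matplotlib.pyplot as plt

sumw_s, sumw_b = 1000., 50000.         # invented weighted event counts
tpr = np.linspace(0., 1., 100)         # signal efficiency
rej = 1. - 0.9*np.sqrt(tpr)            # toy background-rejection curve

fig, ax = plt.subplots()
ax.plot(tpr, rej)
ax_abs_b = ax.twinx()                  # right axis: background events
ax_abs_s = ax.twiny()                  # top axis: signal events
ax_abs_b.plot(tpr, (1.-rej)*sumw_b, alpha=0)  # invisible, only sets ranges
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, rej, alpha=0)
ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
ax_abs_b.set_ylabel("Number of background events")
ax_abs_s.set_xlabel("Number of signal events")
fig.savefig("roc_sketch.pdf")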
import keras.backend as K
from keras.layers import Masking, InputLayer
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
print('----- activations -----')
......@@ -13,12 +12,8 @@ def get_activations(model, model_inputs, print_shape_only=False, layer_name=None
inp = [inp]
model_multi_inputs_cond = False
# all layer outputs
# skip input and masking layers
outputs = [layer.output for layer in model.layers if
(layer.name == layer_name or layer_name is None)
and not isinstance(layer, InputLayer)
and not isinstance(layer, Masking)]
layer.name == layer_name or layer_name is None] # all layer outputs
funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs] # evaluation functions
......
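The evaluation functions built in get_activations can be exercised standalone; a minimal sketch with a small Sequential model (Keras 2 / TF1 backend, as used in this repository):

import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(4, input_dim=3, activation="relu"),
                    Dense(1, activation="sigmoid")])
outputs = [layer.output for layer in model.layers]        # all layer outputs
funcs = [K.function([model.input, K.learning_phase()], [out])
         for out in outputs]                              # evaluation functions
x = np.random.rand(5, 3)
activations = [f([x, 0.])[0] for f in funcs]              # 0. = test phase
for act in activations:
    print(act.shape)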
......@@ -20,9 +20,9 @@ logger.addHandler(logging.NullHandler())
Some further plotting functions
"""
def save_show(plt, fig, filename):
def save_show(plt, fig, filename, **kwargs):
"Save a figure and show it in case we are in ipython or jupyter notebook."
fig.savefig(filename)
fig.savefig(filename, **kwargs)
try:
get_ipython
plt.show()
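With the **kwargs forwarding added here, savefig options pass straight through save_show, e.g.:

save_show(plt, fig, "plot.pdf", bbox_inches="tight", dpi=300)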
......@@ -60,8 +60,7 @@ def plot_NN_vs_var_1D(plotname, means, scorefun, var_index, var_range, var_label
if var_label is not None:
ax.set_xlabel(var_label)
ax.set_ylabel("NN output")
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D(plotname, means,
......@@ -134,8 +133,7 @@ def plot_NN_vs_var_2D(plotname, means,
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_NN_vs_var_2D_all(plotname, model, means,
......@@ -233,8 +231,115 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
fig.savefig(plotname, bbox_inches='tight')
plt.close(fig)
save_show(plt, fig, plotname, bbox_inches='tight')
def plot_profile_2D_all(plotname, model, events,
valsx, valsy,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None,
varx_label=None,
vary_label=None,
zrange=None, logz=False,
plot_last_layer=False,
log_default_ymin=1e-5,
global_norm=True,
cmap="inferno", **kwargs):
"Similar to plot_profile_2D, but creates a grid of plots for all neurons."
# transform
if transform_function is not None:
events = transform_function(events)
logger.info("Reading activations for all neurons")
acts = get_activations(model, events, print_shape_only=True)
logger.info("Done")
if plot_last_layer:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
else:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
layers = len(n_neurons)
nrows_ncols = (layers, max(n_neurons))
fig = plt.figure(1, figsize=nrows_ncols)
grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
label_mode="1",
aspect=False,
cbar_location="top",
cbar_mode="single",
cbar_pad=.2,
cbar_size="5%",)
grid_array = np.array(grid)
grid_array = grid_array.reshape(*nrows_ncols[::-1])
global_min = None
global_max = None
logger.info("Creating profile histograms")
ims = []
reg_plots = []
for layer in range(layers):
neurons_acts = acts[layer]
neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
for neuron in range(len(neurons_acts[0])):
acts_neuron = neurons_acts[:,neuron]
ax = grid_array[neuron][layer]
extra_opts = {}
if not (plot_last_layer and layer == layers-1):
# for hidden layers, plot the same z-scale
if logz:
norm = matplotlib.colors.LogNorm
else:
norm = matplotlib.colors.Normalize
if zrange is not None:
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
hist, xedges, yedges = get_profile_2D(
valsx, valsy, acts_neuron,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
**kwargs
)
if global_min is None or hist.min() < global_min:
global_min = hist.min()
if global_max is None or hist.max() > global_max:
global_max = hist.max()
X, Y = np.meshgrid(xedges, yedges)
reg_plots.append((layer, neuron, ax, (X, Y, hist), dict(cmap="inferno", linewidth=0, rasterized=True, **extra_opts)))
logger.info("Done")
logger.info("global_min: {}".format(global_min))
logger.info("global_max: {}".format(global_max))
if global_min <= 0 and logz:
global_min = log_default_ymin
logger.info("Changing global_min to {}".format(log_default_ymin))
for layer, neuron, ax, args, kwargs in reg_plots:
if zrange is None:
kwargs["norm"].vmin = global_min
kwargs["norm"].vmax = global_max
if not global_norm:
kwargs["norm"] = None
im = ax.pcolormesh(*args, **kwargs)
ax.set_facecolor("black")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
logger.info("Rendering")
save_show(plt, fig, plotname, bbox_inches='tight')
logger.info("Done")
def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"):
......@@ -252,9 +357,7 @@ def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=Non
cbar.set_label(zlabel)
ax.set_ylabel(vary_label)
ax.set_xlabel(varx_label)
fig.savefig(plotname)
plt.close(fig)
save_show(plt, fig, plotname)
def plot_hist_2D_events(plotname, valsx, valsy, nbinsx, xmin, xmax, nbinsy, ymin, ymax,
......@@ -308,15 +411,10 @@ def plot_cond_avg_actmax_2D(plotname, model, layer, neuron, ranges,
plot_hist_2D(plotname, xedges, yedges, hist, zlabel="Neuron output", **kwargs)
def plot_profile_2D(plotname, valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean,
weights=None,
**kwargs):
kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
def get_profile_2D(valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean, weights=None):
xedges = np.linspace(xmin, xmax, nbinsx)
yedges = np.linspace(ymin, ymax, nbinsy)
......@@ -342,6 +440,25 @@ def plot_profile_2D(plotname, valsx, valsy, scores,
hist = np.array(hist)
hist = hist.T # had a list of columns - needs to be list of rows
return hist, xedges, yedges
def plot_profile_2D(plotname, valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=np.mean,
weights=None,
**kwargs):
kwargs["zlabel"] = kwargs.get("zlabel", "Profile")
hist, xedges, yedges = get_profile_2D(
valsx, valsy, scores,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
metric=metric, weights=weights
)
plot_hist_2D(plotname, xedges, yedges, hist, **kwargs)
......
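The refactoring above splits the histogramming (get_profile_2D) from the drawing (plot_hist_2D), so profiles can be reused directly, as in plot_profile_2D_all. A small usage sketch with toy data:

import numpy as np

valsx = np.random.uniform(0., 1., 10000)
valsy = np.random.uniform(0., 1., 10000)
scores = valsx + 0.1*np.random.rand(10000)   # score rises with x

hist, xedges, yedges = get_profile_2D(valsx, valsy, scores,
                                      20, 0., 1.,
                                      20, 0., 1.,
                                      metric=np.mean)
plot_hist_2D("profile.pdf", xedges, yedges, hist, zlabel="Profile")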
#!/usr/bin/env python
import sys, argparse, re, random
parser = argparse.ArgumentParser(description="generate events that maximise the activation for a given neuron")
parser.add_argument("input_project")
parser.add_argument("output_file")
parser.add_argument("-n", "--nevents", default=100000, type=int)
parser.add_argument("-j", "--mask-jets", action="store_true",
help="mask variables called jet*Pt/Eta/Phi and generate a random uniform distribution of the number of jets (only nescessary for non-recurrent NN)")
args = parser.parse_args()
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
import h5py
import numpy as np
from KerasROOTClassification.utils import (
weighted_quantile,
get_max_activation_events,
create_random_event,
get_ranges
)
from KerasROOTClassification import load_from_dir
import meme
meme.setOptions(deactivated=True)
input_project = args.input_project
output_file = args.output_file
c = load_from_dir(input_project)
c._load_data()
ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
def mask_uniform(x, mask_value, recurrent_field_idx):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for rec_idx in recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(rec_idx.shape[1]+1))
for obj_number, line_idx in enumerate(rec_idx.reshape(*rec_idx.shape[1:])):
if obj_number == nobj:
masked=True
if masked:
evt[line_idx] = mask_value
def get_input_flat(x):
return x[0].reshape(-1, len(c.fields))
if args.mask_jets:
jet_fields = {}
for field_name in c.fields:
if any(field_name.startswith("jet") and field_name.endswith(suffix) for suffix in ["Pt", "Eta", "Phi"]):
jet_number = re.findall("[0-9]+", field_name)[0]
if jet_number not in jet_fields:
jet_fields[jet_number] = []
jet_fields[jet_number].append(c.fields.index(field_name))
jet_fields = [np.array([[v for k, v in sorted(jet_fields.items(), key=lambda l:l[0])]])]
def input_transform(x):
x = np.array(x)
if hasattr(c, "mask_uniform"):
c.mask_uniform(x)
return c.get_input_list(x)
elif args.mask_jets:
mask_uniform(x, c.mask_value, jet_fields)
return x
opt_kwargs = dict()
if hasattr(c, "mask_uniform"):
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = c.get_input_flat
if args.mask_jets:
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = get_input_flat
evts = get_max_activation_events(
c.model, ranges,
ntries=args.nevents,
layer=len(c.model.layers)-1,
neuron=0,
maxit=10,
seed=45,
threshold=0,
**opt_kwargs
)
with h5py.File(output_file, "w") as f:
f.create_dataset("actmax", data=evts[1])
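As a toy illustration of the mask_uniform helper above (values invented): with two events of two recurrent objects times three fields each, a random number of trailing objects per event is overwritten with the mask value:

import random
import numpy as np

random.seed(0)
x = np.arange(12, dtype=float).reshape(2, 6)    # 2 events, 2 objects x 3 fields
rec_idx = [np.array([[[0, 1, 2], [3, 4, 5]]])]  # column indices per object
mask_uniform(x, -999., rec_idx)
print(x)   # trailing objects of each event may now be -999.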
......@@ -2,30 +2,6 @@
import sys
import argparse
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
parser.add_argument("project_dir")
......@@ -53,6 +29,31 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa
args = parser.parse_args()
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
if args.all_neurons and (not args.mode.startswith("mean")):
parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
......@@ -82,7 +83,7 @@ total_weights = c.w_test*np.array(c.class_weight)[c.y_test.astype(int)]
try:
mask_value = c.mask_value
except NameError:
except AttributeError:
mask_value = None
# varx_test = c.x_test[:,varx_index]
......@@ -94,8 +95,8 @@ except NameError:
# percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked])
# percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked])
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index)[0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index)[0]
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index, max_evts=10000)[0][0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index, max_evts=10000)[0][0]
if args.xrange is not None:
......
#!/usr/bin/env python
"""
Write new TTrees with signal parameters as branches. For the
backgrounds, the parameters are generated following the total
distribution over all signals. The discrete values of the whole tuple
of signal parameters are counted, so that correlations between
signal parameters are taken into account.
"""
import argparse, re, os
import ROOT
from root_numpy import list_trees
from root_pandas import read_root
import numpy as np
if __name__ == "__main__":
input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
param_names = ["mg", "mc", "mn"]
param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
output_signal_treename = "GG_oneStep_NoSys"
bkg_trees = [
"diboson_Sherpa221_NoSys",
"singletop_NoSys",
"ttbar_NoSys",
"ttv_NoSys",
"wjets_Sherpa221_NoSys",
"zjets_Sherpa221_NoSys",
]
# read in the number of events for each combination of parameters
f = ROOT.TFile.Open(input_filename)
count_dict = {}
for key in f.GetListOfKeys():
tree_name = key.GetName()
match = re.match(param_match, tree_name)
if match is not None:
tree = f.Get(tree_name)
params = tuple([float(i) for i in match.groups()])
if params not in count_dict:
count_dict[params] = 0
# TODO: might be better to use sum of weights
count_dict[params] += tree.GetEntries()
f.Close()
# calculate cumulative sum of counts to sample signal parameters for background from
numbers = np.array(list(count_dict.keys()), dtype=np.float)
counts = np.array(list(count_dict.values()), dtype=np.float)
probs = counts/counts.sum()
prob_bins = np.cumsum(probs)
# read and write the rest in chunks
if os.path.exists(output_filename):
os.remove(output_filename)
for tree_name in list_trees(input_filename):
match_signal = re.match(param_match, tree_name)
if match_signal is not None or tree_name in bkg_trees:
print("Writing {}".format(tree_name))
nwritten = 0
for df in read_root(input_filename, tree_name, chunksize=100000):
print("Writing event {}".format(nwritten))
if match_signal is None:
rnd = np.random.random(len(df))
rnd_idx = np.digitize(rnd, prob_bins)
param_values = numbers[rnd_idx]
for param_idx, param_name in enumerate(param_names):
df[param_name] = param_values[:,param_idx]
df["training_weight"] = df["eventWeight"]*df["genWeight"]
else:
for param_name, param_value in zip(param_names, match_signal.groups()):
df[param_name] = float(param_value)
df["training_weight"] = df["eventWeight"]
if match_signal is None:
out_tree_name = tree_name
else:
out_tree_name = output_signal_treename
df.to_root(output_filename, mode="a", key=out_tree_name)
nwritten += len(df)
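The background parametrisation above is inverse-transform sampling: uniform random numbers are digitised against the cumulative probabilities, so each signal-parameter tuple is drawn proportionally to its event count. Standalone, with invented counts:

import numpy as np

numbers = np.array([(1000., 500., 100.), (1200., 600., 100.)])  # (mg, mc, mn)
counts = np.array([300., 700.])
prob_bins = np.cumsum(counts/counts.sum())   # [0.3, 1.0]

rnd = np.random.random(10)
rnd_idx = np.digitize(rnd, prob_bins)        # 0 with p=0.3, 1 with p=0.7
param_values = numbers[rnd_idx]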
import pytest
import numpy as np
import root_numpy
import pandas as pd
from sklearn.datasets import make_classification
from keras.layers import GRU
from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
def create_dataset(path):
# create example dataset with (low-weighted) noise added
X, y = make_classification(n_samples=10000, random_state=1)
X2 = np.random.normal(size=20*10000).reshape(-1, 20)
y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
X = np.concatenate([X, X2])
y = np.concatenate([y, y2])
w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
# shift and scale randomly (to check if transformation is working)
shift = np.random.rand(20)*100
scale = np.random.rand(20)*1000
X *= scale
X += shift
# write to root files
branches = ["var_{}".format(i) for i in range(len(X[0]))]
df = pd.DataFrame(X, columns=branches)
df["class"] = y
df["weight"] = w
tree_path_bkg = str(path / "bkg.root")
tree_path_sig = str(path / "sig.root")
root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
return branches, tree_path_sig, tree_path_bkg
def test_ClassificationProject(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProject(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
def test_ClassificationProjectRNN(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProjectRNN(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
recurrent_field_names=[
[
["var_1", "var_2", "var_3"],
["var_4", "var_5", "var_6"]
],
[
["var_10", "var_11", "var_12"],
["var_13", "var_14", "var_15"]
],
],
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
......@@ -19,6 +19,7 @@ import math
import glob
import shutil
import gc
import random
import logging
logger = logging.getLogger("KerasROOTClassification")
......@@ -32,33 +33,24 @@ from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.utils.extmath import stable_cumsum
from sklearn.model_selection import KFold
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, concatenate
from keras.layers import Dense, Dropout, Input, Masking, GRU, LSTM, concatenate, SimpleRNN
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
from keras.activations import relu
import keras.initializers
import keras.optimizers
from keras.utils.vis_utils import model_to_dot
from keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile, poisson_asimov_significance
from .plotting import save_show
# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
# import tensorflow as tf
# from keras import backend as K
# num_cores = 1
# config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
# inter_op_parallelism_threads=num_cores,
# allow_soft_placement=True,
# device_count = {'CPU': num_cores})
# session = tf.Session(config=config)
# K.set_session(session)
import ROOT
def byteify(input):
"From stackoverflow https://stackoverflow.com/a/13105359"
if isinstance(input, dict):
......@@ -75,6 +67,25 @@ if version_info[0] > 2:
byteify = lambda input : input
def set_session_threads(n_cpu=None):
"Set the number of threads based on OMP_NUM_THREADS or the given argument"
if n_cpu is None:
if os.environ.get('OMP_NUM_THREADS'):
n_cpu = int(os.environ.get('OMP_NUM_THREADS'))
else:
return
# not sure if this is the best configuration ...
config = tf.ConfigProto(intra_op_parallelism_threads=n_cpu,
inter_op_parallelism_threads=1,
allow_soft_placement=True,
#log_device_placement=True,
device_count = {'CPU': n_cpu})
session = tf.Session(config=config)
K.set_session(session)
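# Usage sketch for set_session_threads (TF1-style sessions, as used here);
# the environment variable is read when no argument is given:
#
#     import os
#     os.environ["OMP_NUM_THREADS"] = "4"
#     set_session_threads()          # equivalent to set_session_threads(n_cpu=4)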
def load_from_dir(path):
"Load a project and the options from a directory"
try:
......@@ -90,6 +101,8 @@ def load_from_dir(path):
class ClassificationProject(object):
verbose = 1 # verbosity of the fit method
"""Simple framework to load data from ROOT TTrees and train Keras
neural networks for classification according to some global settings.
......@@ -109,6 +122,8 @@ class ClassificationProject(object):
:param branches: list of branch names or expressions to be used as input values for training
:param regression_branches: list of branch names to be used as regression targets
:param rename_branches: dictionary that maps branch expressions to names for better readability
:param weight_expr: expression to weight the events in the loss function
......@@ -133,14 +148,22 @@ class ClassificationProject(object):
:param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.
:param use_bias: use bias constant for each neuron? (default: True). You can also pass a list of booleans for each layer.
:param batch_size: size of the training batches
:param validation_split: split off this fraction of training events for loss evaluation
:param kfold_splits: if given, split into this number of subsets to perform KFold cross validation
:param kfold_index: index of the subset to leave out for kfold validation
:param activation_function: activation function in the hidden layers
:param activation_function_output: activation function in the output layer
:param leaky_relu_alpha: set this to a non-zero value to use the LeakyReLU variant with a slope in the negative part
:param out_dir: base directory in which the project directories should be stored
:param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")
......@@ -177,11 +200,27 @@ class ClassificationProject(object):
class will be used in each epoch, but different subsets of the
overrepresented class will be used in each epoch.
:param random_seed: use this seed value when initialising the model and produce consistent results. Note:
random data is also used for shuffling the training data, so results may vary still. To
produce consistent results, set the numpy random seed before training.
:param random_seed: use this seed value when initialising the model and produce consistent results.
:param shuffle_seed: use this seed for shuffling the training data
the first time. This seed (increased by one) is used again before
training when keras shuffling is used.
:param loss: loss function name (or list of names in case of regression targets)
:param loss_weights: (optional) list of weights to weight the individual losses (for multiple targets)
:param loss: loss function name
:param mask_value: value that is used for non-existent entries (e.g. 4th jet pt in events with 3 jets)
:param apply_class_weight: apply a weight that scales the events such that sumw(signal) = sumw(background)
:param normalize_weights: normalize the weights to mean 1
:param ignore_neg_weights: ignore events with negative weights in training - not recommended! (default: False)
:param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
:param shuffle: shuffle training data after (and before first) epoch
"""
......@@ -217,6 +256,7 @@ class ClassificationProject(object):
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr,
regression_branches=None,
rename_branches=None,
project_dir=None,
data_dir=None,
......@@ -226,9 +266,13 @@ class ClassificationProject(object):
nodes=64,
dropout=None,
dropout_input=None,
use_bias=True,
batch_size=128,
validation_split=0.33,
kfold_splits=None,
kfold_index=0,
activation_function='relu',
leaky_relu_alpha=None,
activation_function_output='sigmoid',
scaler_type="WeightedRobustScaler",
step_signal=2,
......@@ -244,8 +288,17 @@ class ClassificationProject(object):
use_tensorboard=False,
tensorboard_opts=None,
random_seed=1234,
shuffle_seed=42,
balance_dataset=False,
loss='binary_crossentropy'):
loss='binary_crossentropy',
loss_weights=None,
mask_value=None,
apply_class_weight=True,
normalize_weights=True,
ignore_neg_weights=False,
kernel_initializer=None,
shuffle=True,
):
self.name = name
self.signal_trees = signal_trees
......@@ -254,6 +307,9 @@ class ClassificationProject(object):
if rename_branches is None:
rename_branches = {}
self.rename_branches = rename_branches
if regression_branches is None:
regression_branches = []
self.regression_branches = regression_branches
self.weight_expr = weight_expr
self.selection = selection
......@@ -279,12 +335,22 @@ class ClassificationProject(object):
self.dropout = dropout
if not isinstance(self.dropout, list):
self.dropout = [self.dropout for i in range(self.layers)]
self.dropout_input = dropout_input
self.use_bias = use_bias
if not isinstance(self.use_bias, list):
self.use_bias = [self.use_bias for i in range(self.layers)]
if len(self.dropout) != self.layers:
raise ValueError("List of dropout fractions has to be of equal size as the number of layers!")
self.dropout_input = dropout_input
if len(self.use_bias) != self.layers:
raise ValueError("List biases has to be of equal size as the number of layers!")
self.batch_size = batch_size
self.validation_split = validation_split
self.kfold_splits = kfold_splits
self.kfold_index = kfold_index
self.activation_function = activation_function
self.leaky_relu_alpha = leaky_relu_alpha
if self.activation_function == "relu" and self.leaky_relu_alpha:
self.activation_function = lambda x : relu(x, alpha=self.leaky_relu_alpha)
self.activation_function_output = activation_function_output
self.scaler_type = scaler_type
self.step_signal = step_signal
......@@ -314,8 +380,19 @@ class ClassificationProject(object):
if tensorboard_opts is not None:
self.tensorboard_opts.update(**tensorboard_opts)
self.random_seed = random_seed
self.shuffle_seed = shuffle_seed
self.balance_dataset = balance_dataset
self.loss = loss
self.loss_weights = loss_weights
if self.regression_branches and (not isinstance(self.loss, list)):
self.loss = [self.loss]+["mean_squared_error"]*len(self.regression_branches)
self.mask_value = mask_value
self.apply_class_weight = apply_class_weight
self.normalize_weights = normalize_weights
self.ignore_neg_weights = ignore_neg_weights
self.kernel_initializer = kernel_initializer
self.shuffle = shuffle
self.s_train = None
self.b_train = None
......@@ -338,23 +415,24 @@ class ClassificationProject(object):
self._b_eventlist_train = None
self._scaler = None
self._scaler_target = None
self._class_weight = None
self._balanced_class_weight = None
self._model = None
self._history = None
self._callbacks_list = []
self._train_val_idx = None
# track the number of epochs this model has been trained
self.total_epochs = 0
self.data_loaded = False
self.data_transformed = False
self.data_shuffled = False
# track if we are currently training
self.is_training = False
self._fields = None
self._target_fields = None
@property
......@@ -367,6 +445,16 @@ class ClassificationProject(object):
return self._fields
@property
def target_fields(self):
"Renamed branch expressions for regression targets"
if self._target_fields is None:
self._target_fields = []
for branch_expr in self.regression_branches:
self._target_fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._target_fields
def rename_fields(self, ar):
"Rename fields of structured array"
fields = list(ar.dtype.names)
......@@ -378,12 +466,14 @@ class ClassificationProject(object):
def _load_data(self):
self._w_train_tot = None
try:
# if those don't exist, we need to load them from ROOT trees first
self._load_from_hdf5(*self.dataset_names_tree)
except KeyError:
except (KeyError, IOError):
logger.info("Couldn't load all datasets - reading from ROOT trees")
......@@ -395,22 +485,26 @@ class ClassificationProject(object):
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain,
branches=self.branches+[self.weight_expr],
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr],
branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_bkg, stop=self.stop_test)
if self.ignore_neg_weights:
self.s_train = self.s_train[self.s_train[self.weight_expr]>0]
self.b_train = self.b_train[self.b_train[self.weight_expr]>0]
self.rename_fields(self.s_train)
self.rename_fields(self.b_train)
self.rename_fields(self.s_test)
......@@ -420,19 +514,27 @@ class ClassificationProject(object):
self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self._dump_training_list()
# now we don't need the identifiers anymore
self.s_train = self.s_train[self.fields+[self.weight_expr]]
self.b_train = self.b_train[self.fields+[self.weight_expr]]
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
self.y_train = np.empty(len(self.x_train), dtype=np.bool)
self.y_train[:len(self.s_train)] = 1
self.y_train[len(self.s_train):] = 0
def fill_target(x, s, b):
if not self.target_fields:
y = np.empty(len(x), dtype=np.bool)
y[:len(s)] = 1
y[len(s):] = 0
else:
y = np.empty((len(x), 1+len(self.target_fields)), dtype=np.float)
y[:len(s),0] = 1
y[len(s):,0] = 0
y[:len(s),1:] = rec2array(s[self.target_fields])
y[len(s):,1:] = rec2array(b[self.target_fields])
return y
self.y_train = fill_target(self.x_train, self.s_train, self.b_train)
self.b_train = None
self.s_train = None
......@@ -440,16 +542,14 @@ class ClassificationProject(object):
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_test = np.empty(len(self.x_test), dtype=np.bool)
self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0
self.y_test = fill_target(self.x_test, self.s_test, self.b_test)
self.b_test = None
self.s_test = None
self._dump_to_hdf5(*self.dataset_names_tree)
self.data_loaded = True
self.data_shuffled = False
def _dump_training_list(self):
......@@ -510,7 +610,7 @@ class ClassificationProject(object):
os.symlink(srcpath, filename)
logger.info("Created symlink from {} to {}".format(srcpath, filename))
logger.info("Trying to load {} from {}".format(dataset_name, filename))
with h5py.File(filename) as hf:
with h5py.File(filename, "r") as hf:
setattr(self, dataset_name, hf[dataset_name][:])
logger.info("Data loaded")
......@@ -551,6 +651,7 @@ class ClassificationProject(object):
elif self.scaler_type == "WeightedRobustScaler":
self._scaler = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
scaler_fit_kwargs["mask_value"] = self.mask_value
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
......@@ -562,12 +663,82 @@ class ClassificationProject(object):
return self._scaler
def transform(self, x):
return self.scaler.transform(x)
@property
def scaler_target(self):
"same as scaler, but for scaling regression targets"
# create the scaler (and fit to training data) if not existent
if self._scaler_target is None:
filename = os.path.join(self.project_dir, "scaler_target.pkl")
try:
self._scaler_target = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {} for scaling the targets".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler_target = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler_target = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler_target = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self._scaler_target.fit(self.y_train, **scaler_fit_kwargs)
# we don't want to scale the classification target here
self._scaler_target.center_[0] = 0.
self._scaler_target.scale_[0] = 1.
self.scaler.copy = orig_copy_setting
joblib.dump(self._scaler_target, filename)
return self._scaler_target
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.transform, batch_size)
else:
return self.scaler.transform(x)
def inverse_transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
else:
return self.scaler.inverse_transform(x)
def transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.transform, batch_size)
else:
return self.scaler_target.transform(y)
def inverse_transform(self, x):
return self.scaler.inverse_transform(x)
def inverse_transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.inverse_transform, batch_size)
else:
return self.scaler_target.inverse_transform(y)
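# A minimal round-trip sketch of the mask-value handling in _batch_transform,
# with a plain StandardScaler standing in for the project's scaler
# (values invented):
#
#     import numpy as np
#     from sklearn.preprocessing import StandardScaler
#     mask_value = -999.
#     x_batch = np.array([[1., 2.], [3., mask_value]])
#     scaler = StandardScaler().fit(np.array([[1., 2.], [3., 4.], [5., 6.]]))
#     x_batch[x_batch == mask_value] = np.nan             # hide masked entries
#     x_batch = (x_batch - scaler.mean_) / scaler.scale_  # NaN stays NaN
#     x_batch[np.isnan(x_batch)] = mask_value             # restore the mask value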
@property
......@@ -601,24 +772,6 @@ class ClassificationProject(object):
json.dump(self.history.history, of)
def _transform_data(self):
if not self.data_transformed:
if logger.level <= logging.DEBUG:
logger.debug("training data before transformation: {}".format(self.x_train))
logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
for i in range(self.x_train.shape[1])]))
logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
for i in range(self.x_train.shape[1])]))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self.x_train = self.scaler.transform(self.x_train)
logger.debug("training data after transformation: {}".format(self.x_train))
self.x_test = self.scaler.transform(self.x_test)
self.scaler.copy = orig_copy_setting
self.data_transformed = True
logger.info("Training and test data transformed")
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
......@@ -660,23 +813,47 @@ class ClassificationProject(object):
if self._model is None:
self._model = Sequential()
# input
input_layer = Input((len(self.fields),))
# optional dropout on inputs
if self.dropout_input is None:
self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
# in case of no Dropout we already have the first hidden layer
start_layer = 1
hidden_layer = input_layer
else:
self._model.add(Dropout(rate=self.dropout_input, input_shape=(len(self.fields),)))
start_layer = 0
# the (other) hidden layers
for node_count, dropout_fraction in zip(self.nodes[start_layer:], self.dropout[start_layer:]):
self._model.add(Dense(node_count, activation=self.activation_function))
hidden_layer = Dropout(rate=self.dropout_input)(input_layer)
# densely connected hidden layers
for node_count, dropout_fraction, use_bias in zip(
self.nodes,
self.dropout,
self.use_bias,
):
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
hidden_layer = Dense(node_count, activation=self.activation_function, use_bias=use_bias, **extra_opts)(hidden_layer)
if (dropout_fraction is not None) and (dropout_fraction > 0):
self._model.add(Dropout(rate=dropout_fraction))
# last layer is one neuron (binary classification)
self._model.add(Dense(1, activation=self.activation_function_output))
hidden_layer = Dropout(rate=dropout_fraction)(hidden_layer)
# optional regression targets
extra_targets = []
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(hidden_layer)
extra_targets.append(extra_target)
if not self.target_fields:
# one output node for binary classification
output_layer = Dense(1, activation=self.activation_function_output)(hidden_layer)
outputs = [output_layer]
else:
# add another hidden layer on top of the regression targets and previous hidden layers
merge = concatenate([hidden_layer]+extra_targets)
hidden_layer2 = Dense(64, activation=self.activation_function)(merge)
output_class = Dense(1, activation=self.activation_function_output)(hidden_layer2)
outputs = [output_class]+extra_targets
self._model = Model(inputs=[input_layer], outputs=outputs)
self._compile_or_load_model()
return self._model
......@@ -691,6 +868,7 @@ class ClassificationProject(object):
np.random.seed(self.random_seed)
self._model.compile(optimizer=optimizer,
loss=self.loss,
loss_weights=self.loss_weights,
weighted_metrics=['accuracy']
)
np.random.set_state(rn_state)
......@@ -715,14 +893,14 @@ class ClassificationProject(object):
# plot model
with open(os.path.join(self.project_dir, "model.svg"), "wb") as of:
of.write(model_to_dot(self._model).create("dot", format="svg"))
of.write(model_to_dot(self._model, show_shapes=True).create("dot", format="svg"))
@property
def class_weight(self):
if self._class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated class_weight: {}".format(self._class_weight))
return self._class_weight
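# Worked example of the class weights: with sumw_bkg = 90 and sumw_sig = 10,
# class_weight = [100/(2*90), 100/(2*10)] = [0.556, 5.0], so the weighted
# sums match afterwards: 90*0.556 = 10*5.0 = 50 for each class.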
......@@ -737,11 +915,11 @@ class ClassificationProject(object):
event with class weights
"""
if self._balanced_class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
sumw_bkg = np.sum(self.w_train[self.l_train == 0])
sumw_sig = np.sum(self.w_train[self.l_train == 1])
# use sumw *per event* in this case
sumw_bkg /= len(self.w_train[self.y_train == 0])
sumw_sig /= len(self.w_train[self.y_train == 1])
sumw_bkg /= len(self.w_train[self.l_train == 0])
sumw_sig /= len(self.w_train[self.l_train == 1])
self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight))
return self._balanced_class_weight
......@@ -752,29 +930,27 @@ class ClassificationProject(object):
if reload:
self.data_loaded = False
self.data_transformed = False
if not self.data_loaded:
self._load_data()
if not self.data_transformed:
self._transform_data()
@property
def l_train(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_train
else:
return self.y_train[:,0]
def shuffle_training_data(self):
rn_state = np.random.get_state()
np.random.shuffle(self.x_train)
np.random.set_state(rn_state)
np.random.shuffle(self.y_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train_tot)
if self._scores_train is not None:
logger.info("Shuffling scores, since they are also there")
np.random.set_state(rn_state)
np.random.shuffle(self._scores_train)
self.data_shuffled = True
@property
def l_test(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_test
else:
return self.y_test[:,0]
@property
......@@ -784,28 +960,109 @@ class ClassificationProject(object):
class_weight = self.class_weight
else:
class_weight = self.balanced_class_weight
if not self.data_loaded:
raise ValueError("Data not loaded! can't calculate total weight")
if self._w_train_tot is None:
self._w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)]
self._w_train_tot /= np.mean(self._w_train_tot)
if self.apply_class_weight:
self._w_train_tot = self.w_train*np.array(class_weight)[self.l_train.astype(int)]
else:
self._w_train_tot = np.array(self.w_train)
if self.normalize_weights:
self._w_train_tot /= np.mean(self._w_train_tot)
return self._w_train_tot
@property
def validation_data(self):
"Validation data. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[split_index:], self.y_train[split_index:], self.w_train_tot[split_index:]
"(Transformed) validation data for loss evaluation"
idx = self.train_val_idx[1]
x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_val_input = self.get_input_list(self.transform(x_val))
y_val_output = self.get_output_list(self.transform_target(y_val))
w_val_list = self.get_weight_list(w_val)
return x_val_input, y_val_output, w_val_list
@property
def training_data(self):
"Training data with validation data split off. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[:split_index], self.y_train[:split_index], self.w_train_tot[:split_index]
"(Transformed) Training data with validation data split off"
idx = self.train_val_idx[0]
x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
x_train_input = self.get_input_list(self.transform(x_train))
y_train_output = self.get_output_list(self.transform_target(y_train))
w_train_list = self.get_weight_list(w_train)
return x_train_input, y_train_output, w_train_list
@property
def train_val_idx(self):
if self._train_val_idx is None:
if self.kfold_splits is not None:
kfold = KFold(self.kfold_splits, shuffle=self.shuffle, random_state=self.shuffle_seed)
for i, train_val_idx in enumerate(kfold.split(self.x_train)):
if i == self.kfold_index:
self._train_val_idx = train_val_idx
break
else:
raise IndexError("Index {} out of range for kfold (requested {} splits)".format(self.kfold_index, self.kfold_splits))
else:
split_index = int((1-self.validation_split)*len(self.x_train))
np.random.seed(self.shuffle_seed)
if self.shuffle:
shuffled_idx = np.random.permutation(len(self.x_train))
else:
shuffled_idx = np.arange(len(self.x_train))
self._train_val_idx = (shuffled_idx[:split_index], shuffled_idx[split_index:])
return self._train_val_idx
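# Standalone sketch of the kfold branch above: pick the kfold_index-th
# split from sklearn's KFold (toy data):
#
#     import numpy as np
#     from sklearn.model_selection import KFold
#     x_train = np.arange(10).reshape(-1, 1)
#     kfold = KFold(5, shuffle=True, random_state=42)
#     for i, train_val_idx in enumerate(kfold.split(x_train)):
#         if i == 2:                      # kfold_index
#             train_idx, val_idx = train_val_idx
#             break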
@property
def steps_per_epoch(self):
return int(float(len(self.train_val_idx[0]))/float(self.batch_size))
def get_input_list(self, x):
"For the standard Dense models with single input, this does nothing"
return x
def get_output_list(self, y):
"Split target vector column wise in case of regression targets"
if not self.target_fields:
return y
else:
return np.hsplit(y, len(self.target_fields)+1)
def get_weight_list(self, w):
"Repeat weight n times for regression targets"
if not self.target_fields:
return w
else:
return [w]*(len(self.target_fields)+1)
def yield_batch(self):
"Batch generator - optionally shuffle the indices after each epoch"
x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
train_idx = list(self.train_val_idx[0])
np.random.seed(self.shuffle_seed+1)
logger.info("Generating training batches from {} signal and {} background events"
.format(len(np.where(self.l_train[train_idx]==1)[0]),
len(np.where(self.l_train[train_idx]==0)[0])))
while True:
if self.shuffle:
shuffled_idx = np.random.permutation(train_idx)
else:
shuffled_idx = train_idx
for start in range(0, len(shuffled_idx), int(self.batch_size)):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
x_input = self.get_input_list(self.transform(x_batch))
y_output = self.get_output_list(self.transform_target(y_batch))
w_list = self.get_weight_list(w_batch)
yield (x_input, y_output, w_list)
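# This generator is consumed by fit_generator in train() below; schematically:
#
#     c.model.fit_generator(c.yield_batch(),
#                           steps_per_epoch=c.steps_per_epoch,
#                           epochs=10,
#                           validation_data=c.validation_data)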
def yield_single_class_batch(self, class_label):
......@@ -814,10 +1071,14 @@ class ClassificationProject(object):
The weights are multiplied by balanced_class_weight.
"""
x_train, y_train, w_train = self.training_data
class_idx = np.where(y_train==class_label)[0]
l_train = y_train[:,0] if self.target_fields else y_train
class_idx = np.where(l_train==class_label)[0]
while True:
# shuffle the indices for this class label
shuffled_idx = np.random.permutation(class_idx)
if self.shuffle:
shuffled_idx = np.random.permutation(class_idx)
else:
shuffled_idx = class_idx
# yield them batch wise
for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
......@@ -840,51 +1101,45 @@ class ClassificationProject(object):
np.concatenate((batch_0[2], batch_1[2])))
def train(self, epochs=10):
def train(self, epochs=10, skip_checkpoint=False):
self.load()
self.shuffle_training_data()
for branch_index, branch in enumerate(self.fields):
self.plot_input(branch_index)
self.total_epochs = self._read_info("epochs", 0)
set_session_threads()
logger.info("Train model")
if not self.balance_dataset:
try:
self.is_training = True
self.model.fit(self.x_train,
# the reshape might be unnecessary here
self.y_train.reshape(-1, 1),
epochs=epochs,
validation_split=self.validation_split,
# we have to multiply by class weight since keras ignores class weight if sample weight is given
# see https://github.com/keras-team/keras/issues/497
sample_weight=self.w_train_tot,
shuffle=True,
batch_size=self.batch_size,
callbacks=self.callbacks_list)
self.model.fit_generator(self.yield_batch(),
steps_per_epoch=self.steps_per_epoch,
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
else:
try:
self.is_training = True
labels, label_counts = np.unique(self.y_train, return_counts=True)
labels, label_counts = np.unique(self.l_train, return_counts=True)
logger.info("Training on balanced batches")
# note: the batches have balanced_class_weight already applied
self.model.fit_generator(self.yield_balanced_batch(),
steps_per_epoch=int(min(label_counts)/self.batch_size),
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list)
callbacks=self.callbacks_list,
verbose=self.verbose)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
self.checkpoint_model()
if not skip_checkpoint:
self.checkpoint_model()
def checkpoint_model(self):
......@@ -907,23 +1162,40 @@ class ClassificationProject(object):
self._write_info("epochs", self.total_epochs)
def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
logger.info("Reloading (and re-transforming) unshuffled training data")
self.load(reload=True)
def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
"Calculate scores for training and test sample"
if mode is not None:
self._write_info("scores_mode", mode)
logger.info("Create/Update scores for train/test sample")
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
outputs = self.predict(
self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
mode=mode
)
if not self.target_fields:
scores_batch = outputs.reshape(-1)
else:
scores_batch = outputs[0].reshape(-1)
getattr(self, "scores_"+data_name)[start:stop] = scores_batch
self._dump_to_hdf5("scores_"+data_name)
if do_test:
self.scores_test = self.predict(self.x_test, mode=mode).reshape(-1)
self._dump_to_hdf5("scores_test")
eval_score("test")
if do_train:
self.scores_train = self.predict(self.x_train, mode=mode).reshape(-1)
self._dump_to_hdf5("scores_train")
eval_score("train")
def predict(self, x, mode=None):
"""
Calculate the scores for a (transformed) array of input values.
If the array is not transformed, use `evaluate` instead
"""
if mode is None:
# normal output - after activation function output layer
return self.model.predict(x)
......@@ -943,6 +1215,10 @@ class ClassificationProject(object):
def evaluate(self, x_eval, mode=None):
"""
Calculate the scores for an array of input values.
All necessary transformations are applied.
"""
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
logger.debug("Evaluate for transformed array: {}".format(x_eval))
......@@ -952,7 +1228,10 @@ class ClassificationProject(object):
def write_friend_tree(self, score_name,
source_filename, source_treename,
target_filename, target_treename,
batch_size=100000):
batch_size=100000,
score_mode=None,
fixed_params=None):
"TODO: doesn't work for regression targets"
f = ROOT.TFile.Open(source_filename)
tree = f.Get(source_treename)
entries = tree.GetEntries()
......@@ -965,8 +1244,11 @@ class ClassificationProject(object):
x_from_tree = tree2array(tree,
branches=self.branches+self.identifiers,
start=start, stop=start+batch_size)
# for parametrized classifiers
if fixed_params is not None:
for param_name, value in fixed_params.items():
x_from_tree[param_name] = value
x_eval = rec2array(x_from_tree[self.branches])
if len(self.identifiers) > 0:
# create list of booleans that indicate which events where used for training
df_identifiers = pd.DataFrame(x_from_tree[self.identifiers])
......@@ -978,7 +1260,7 @@ class ClassificationProject(object):
is_train = np.zeros(len(x_eval))
# join scores and is_train array
scores = self.evaluate(x_eval).reshape(-1)
scores = self.evaluate(x_eval, mode=score_mode).reshape(-1)
friend_df = pd.DataFrame(np.array(scores, dtype=[(score_name, np.float64)]))
friend_df[score_name+"_is_train"] = is_train
friend_tree = friend_df.to_records()[[score_name, score_name+"_is_train"]]
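# Hypothetical call for a parametrised classifier: fixed_params pins the
# signal-parameter branches (names from the parametrisation script above)
# to one mass hypothesis when scoring; file and tree names are invented:
#
#     c.write_friend_tree("score_nn",
#                         "input.root", "ttbar_NoSys",
#                         "friend.root", "score_tree",
#                         fixed_params={"mg": 1600., "mc": 800., "mn": 100.})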
......@@ -1020,17 +1302,57 @@ class ClassificationProject(object):
return centers, hist, errors
def plot_input(self, var_index, ax=None):
"plot a single input variable"
def plot_input(self, var_index, ax=None, from_training_batches=False, max_n_batches=None):
"""
plot a single input variable as a histogram (signal vs background)
:param from_training_batches: use data from training batch generator
:param max_n_batches: if training batch generator is used, just use
this number of batches (otherwise steps_per_epoch is used)
"""
branch = self.fields[var_index]
if ax is None:
fig, ax = plt.subplots()
else:
fig = None
bkg = self.x_train[:,var_index][self.y_train == 0]
sig = self.x_train[:,var_index][self.y_train == 1]
bkg_weights = self.w_train_tot[self.y_train == 0]
sig_weights = self.w_train_tot[self.y_train == 1]
if not from_training_batches:
bkg = self.x_train[:,var_index][self.l_train == 0]
sig = self.x_train[:,var_index][self.l_train == 1]
bkg_weights = self.w_train_tot[self.l_train == 0]
sig_weights = self.w_train_tot[self.l_train == 1]
else:
bkg = None
sig = None
bkg_weights = None
sig_weights = None
if max_n_batches is not None:
n_batches = max_n_batches
else:
n_batches = self.steps_per_epoch
for i_batch, (x, y, w) in enumerate(self.yield_batch()):
if i_batch > n_batches:
break
if self.target_fields:
y = y[0]
try:
x = self.get_input_flat(x)
except AttributeError:
pass
bkg_batch = x[:,var_index][y==0]
sig_batch = x[:,var_index][y==1]
bkg_weights_batch = w[y==0]
sig_weights_batch = w[y==1]
if bkg is None:
bkg = bkg_batch
sig = sig_batch
bkg_weights = bkg_weights_batch
sig_weights = sig_weights_batch
else:
bkg = np.concatenate([bkg, bkg_batch])
sig = np.concatenate([sig, sig_batch])
bkg_weights = np.concatenate([bkg_weights, bkg_weights_batch])
sig_weights = np.concatenate([sig_weights, sig_weights_batch])
if hasattr(self, "mask_value"):
bkg_not_masked = np.where(bkg != self.mask_value)[0]
......@@ -1081,13 +1403,14 @@ class ClassificationProject(object):
centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights)
centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights)
width = centers_sig[1]-centers_sig[0]
if bins > 1:
width = centers_sig[1]-centers_sig[0]
else:
width = 1.
ax.bar(centers_bkg, hist_bkg, color="b", alpha=0.5, width=width)
ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
label = branch
if self.data_transformed:
label += " (transformed)"
ax.set_xlabel(label)
if fig is not None:
plot_dir = os.path.join(self.project_dir, "plots")
......@@ -1096,27 +1419,27 @@ class ClassificationProject(object):
return save_show(plt, fig, os.path.join(plot_dir, "var_{}.pdf".format(var_index)))
def plot_all_inputs(self):
def plot_all_inputs(self, **kwargs):
nrows = math.ceil(math.sqrt(len(self.fields)))
fig, axes = plt.subplots(nrows=int(nrows), ncols=int(nrows),
figsize=(3*nrows, 3*nrows),
gridspec_kw=dict(wspace=0.4, hspace=0.4))
for i in range(len(self.fields)):
self.plot_input(i, ax=axes.reshape(-1)[i])
self.plot_input(i, ax=axes.reshape(-1)[i], **kwargs)
return save_show(plt, fig, os.path.join(self.project_dir, "all_inputs.pdf"))
def plot_weights(self, bins=100, range=None):
fig, ax = plt.subplots()
bkg = self.w_train_tot[self.y_train == 0]
sig = self.w_train_tot[self.y_train == 1]
bkg = self.w_train_tot[self.l_train == 0]
sig = self.w_train_tot[self.l_train == 1]
ax.hist(bkg, bins=bins, range=range, color="b", alpha=0.5)
ax.set_yscale("log")
return save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
fig, ax = plt.subplots()
ax.hist(sig, bins=bins, range=range, color="r", alpha=0.5)
ax.set_yscale("log")
return save_show(plt, fig, os.path.join(self.project_dir, "eventweights_sig.pdf"))
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_sig.pdf"))
def plot_ROC(self, xlim=(0,1), ylim=(0,1)):
......@@ -1127,8 +1450,8 @@ class ClassificationProject(object):
ax.grid(color='gray', linestyle='--', linewidth=1)
for y, scores, weight, label in [
(self.y_train, self.scores_train, self.w_train, "train"),
(self.y_test, self.scores_test, self.w_test, "test")
(self.l_train, self.scores_train, self.w_train, "train"),
(self.l_test, self.scores_test, self.w_test, "test")
]:
fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
fpr = 1.0 - fpr # background rejection
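# Self-contained sketch of the ROC convention used throughout this module:
# sklearn's roc_curve returns the false positive rate, which is converted to
# background rejection (1 - fpr); the toy labels/scores are illustrative only.
import numpy as np
from sklearn.metrics import roc_curve

y_toy = np.array([0, 0, 1, 1])
scores_toy = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = roc_curve(y_toy, scores_toy, sample_weight=np.ones(4))
bkg_rejection = 1.0 - fpr  # plotted against tpr (signal efficiency)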
......@@ -1161,10 +1484,10 @@ class ClassificationProject(object):
trf = lambda y : y
fig, ax = plt.subplots()
for scores, weights, y, class_label, fn, opts in [
(self.scores_train, self.w_train, self.y_train, 1, ax.bar, dict(color="r", label="signal train")),
(self.scores_train, self.w_train, self.y_train, 0, ax.bar, dict(color="b", label="background train")),
(self.scores_test, self.w_test, self.y_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
(self.scores_test, self.w_test, self.y_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
(self.scores_train, self.w_train, self.l_train, 1, ax.bar, dict(color="r", label="signal train")),
(self.scores_train, self.w_train, self.l_train, 0, ax.bar, dict(color="b", label="background train")),
(self.scores_test, self.w_test, self.l_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
(self.scores_test, self.w_test, self.l_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
]:
weights = weights[y==class_label]
if apply_class_weight is True and (lumifactor is not None):
......@@ -1217,16 +1540,16 @@ class ClassificationProject(object):
else:
trf = lambda y : y
centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==1].reshape(-1)), weights=self.w_train[self.y_train==1], **plot_opts)
centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==0].reshape(-1)), weights=self.w_train[self.y_train==0], **plot_opts)
centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==1].reshape(-1)), weights=self.w_test[self.y_test==1], **plot_opts)
centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==0].reshape(-1)), weights=self.w_test[self.y_test==0], **plot_opts)
centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==1].reshape(-1)), weights=self.w_train[self.l_train==1], **plot_opts)
centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==0].reshape(-1)), weights=self.w_train[self.l_train==0], **plot_opts)
centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==1].reshape(-1)), weights=self.w_test[self.l_test==1], **plot_opts)
centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==0].reshape(-1)), weights=self.w_test[self.l_test==0], **plot_opts)
significances_train = []
significances_test = []
for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [
(hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train),
(hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test),
for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances in [
(hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train),
(hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test),
]:
# factor to rescale due to using only a fraction of events (training and test samples)
# normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
......@@ -1296,7 +1619,7 @@ class ClassificationProject(object):
return lambda y : np.log(y/(1-y))
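# Quick self-contained check that the logit above inverts a sigmoid output
# (`sigmoid` is defined here only for the demonstration):
import numpy as np

sigmoid = lambda x: 1. / (1. + np.exp(-x))
logit = lambda y: np.log(y / (1 - y))
x_toy = np.array([-2., 0., 3.])
assert np.allclose(logit(sigmoid(x_toy)), x_toy)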
def plot_significance(self, significance_function=None, maxsteps=1000, lumifactor=1., vectorized=False, invert_activation=False):
def plot_significance(self, significance_function=None, maxsteps=None, lumifactor=1., vectorized=False, invert_activation=False):
"""
Plot the significance when cutting at all possible thresholds, plotted against the signal efficiency.
"""
......@@ -1315,13 +1638,16 @@ class ClassificationProject(object):
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for (scores, y, w, label), col in zip(
[(self.scores_train, self.y_train, self.w_train, "train"),
(self.scores_test, self.y_test, self.w_test, "test")],
[(self.scores_train, self.l_train, self.w_train, "train"),
(self.scores_test, self.l_test, self.w_test, "test")],
colors
):
scores = trf(scores)
s_sumws, s_errs, b_sumws, b_errs, thresholds = self.calc_s_ds_b_db(scores, y, w)
stepsize = int(len(s_sumws))/int(maxsteps)
if maxsteps is not None:
stepsize = int(len(s_sumws))//int(maxsteps)  # integer division, also under python 3
else:
stepsize = 1
if stepsize == 0:
stepsize = 1
s_sumws = s_sumws[::stepsize]*lumifactor*self.step_signal
......@@ -1377,19 +1703,19 @@ class ClassificationProject(object):
hist_dict = self.csv_hist
logger.info("Plot losses")
plt.plot(hist_dict['loss'])
plt.plot(hist_dict['val_loss'])
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['training data','validation data'], loc='upper left')
fig, ax = plt.subplots()
ax.plot(hist_dict['loss'])
ax.plot(hist_dict['val_loss'])
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['training data','validation data'], loc='upper left')
if log:
plt.yscale("log")
ax.set_yscale("log")
if xlim is not None:
plt.xlim(*xlim)
ax.set_xlim(*xlim)
if ylim is not None:
plt.ylim(*ylim)
plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
plt.clf()
ax.set_ylim(*ylim)
return save_show(plt, fig, os.path.join(self.project_dir, "losses.pdf"))
def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
......@@ -1435,7 +1761,7 @@ class ClassificationProject(object):
df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
df["weight"] = np.concatenate([self.w_train, self.w_test])
df["labels"] = pd.Categorical.from_codes(
np.concatenate([self.y_train, self.y_test]),
np.concatenate([self.l_train, self.l_test]),
categories=["background", "signal"]
)
for identifier in self.identifiers:
......@@ -1474,6 +1800,22 @@ class ClassificationProjectDataFrame(ClassificationProject):
A little hack to initialize a ClassificationProject from a pandas DataFrame instead of ROOT TTrees
"""
def __init__(self, name, *args, **kwargs):
if len(args) < 1 and len(kwargs) < 1:
# if no further arguments given, interpret as directory name
self._init_from_dir(name)
else:
# otherwise initialise new project
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
# don't put the dataframe into options.pickle!
if len(args) > 1:
args = args[1:]
else:
args = []
pickle.dump(dict(args=args, kwargs=kwargs), of)
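# Usage sketch for the two construction modes above (the import path and the
# extra keyword arguments are assumptions, not verified against the package):
#
# from KerasROOTClassification import ClassificationProjectDataFrame
# proj = ClassificationProjectDataFrame("my_project", df, split_column="is_test", ...)
# # later: re-open by directory name only, args/kwargs come from options.pickle
# proj = ClassificationProjectDataFrame("my_project")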
def _init_from_args(self,
name,
df,
......@@ -1496,9 +1838,11 @@ class ClassificationProjectDataFrame(ClassificationProject):
raise NotImplementedError("'split_column' is the only currently supported split mode")
self.split_mode = split_mode
self.split_column = split_column
super(ClassificationProjectDataFrame, self).__init__(name,
signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
**kwargs)
super(ClassificationProjectDataFrame, self)._init_from_args(
name,
signal_trees=[], bkg_trees=[], branches=[], weight_expr="1",
**kwargs
)
self._x_train = None
self._x_test = None
self._y_train = None
......@@ -1575,16 +1919,15 @@ class ClassificationProjectDataFrame(ClassificationProject):
if reload:
self.data_loaded = False
self.data_transformed = False
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
self._w_train_tot = None
if not self.data_transformed:
self._transform_data()
self.data_loaded = True
class ClassificationProjectRNN(ClassificationProject):
......@@ -1597,6 +1940,7 @@ class ClassificationProjectRNN(ClassificationProject):
recurrent_field_names=None,
rnn_layer_nodes=32,
mask_value=-999,
recurrent_unit_type="GRU",
**kwargs):
"""
recurrent_field_names example:
......@@ -1615,6 +1959,7 @@ class ClassificationProjectRNN(ClassificationProject):
self.recurrent_field_names = []
self.rnn_layer_nodes = rnn_layer_nodes
self.mask_value = mask_value
self.recurrent_unit_type = recurrent_unit_type
# convert the recurrent field names to lists of indices
self.recurrent_field_idx = []
......@@ -1645,14 +1990,6 @@ class ClassificationProjectRNN(ClassificationProject):
)
def _transform_data(self):
self.x_train[self.x_train == self.mask_value] = np.nan
self.x_test[self.x_test == self.mask_value] = np.nan
super(ClassificationProjectRNN, self)._transform_data()
self.x_train[np.isnan(self.x_train)] = self.mask_value
self.x_test[np.isnan(self.x_test)] = self.mask_value
@property
def model(self):
if self._model is None:
......@@ -1663,7 +2000,15 @@ class ClassificationProjectRNN(ClassificationProject):
for field_idx in self.recurrent_field_idx:
chan_inp = Input(field_idx.shape[1:])
channel = Masking(mask_value=self.mask_value)(chan_inp)
channel = GRU(self.rnn_layer_nodes)(channel)
if self.recurrent_unit_type == "GRU":
channel = GRU(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "SimpleRNN":
channel = SimpleRNN(self.rnn_layer_nodes)(channel)
elif self.recurrent_unit_type == "LSTM":
channel = LSTM(self.rnn_layer_nodes)(channel)
else:
raise NotImplementedError("{} not implemented".format(self.recurrent_unit_type))
logger.info("Added {} unit".format(self.recurrent_unit_type))
# TODO: configure dropout for recurrent layers
#channel = Dropout(0.3)(channel)
rnn_inputs.append(chan_inp)
......@@ -1675,43 +2020,61 @@ class ClassificationProjectRNN(ClassificationProject):
flat_channel = flat_input
combined = concatenate(rnn_channels+[flat_channel])
for node_count, dropout_fraction in zip(self.nodes, self.dropout):
combined = Dense(node_count, activation=self.activation_function)(combined)
extra_opts = dict()
if self.kernel_initializer is not None:
extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
combined = Dense(node_count, activation=self.activation_function, **extra_opts)(combined)
if (dropout_fraction is not None) and (dropout_fraction > 0):
combined = Dropout(rate=dropout_fraction)(combined)
combined = Dense(1, activation=self.activation_function_output)(combined)
self._model = Model(inputs=rnn_inputs+[flat_input], outputs=combined)
self._compile_or_load_model()
return self._model
outputs = [combined]
# optional regression targets
for target_field in self.target_fields:
extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(combined)
outputs.append(extra_target)
def train(self, epochs=10):
self.load()
self.shuffle_training_data()
self._model = Model(inputs=rnn_inputs+[flat_input], outputs=outputs)
self._compile_or_load_model()
return self._model
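# Standalone sketch of the network assembled above, for one masked recurrent
# channel (3 objects x 4 variables), 5 flat inputs and no extra regression
# targets; layer sizes are illustrative (Keras 2 functional API):
from keras.layers import Input, Masking, GRU, Dense, concatenate
from keras.models import Model

chan_inp = Input((3, 4))
channel = Masking(mask_value=-999)(chan_inp)
channel = GRU(32)(channel)                   # one recurrent unit per channel
flat_inp = Input((5,))
combined = concatenate([channel, flat_inp])  # merge RNN and flat branches
combined = Dense(64, activation="relu")(combined)
output = Dense(1, activation="sigmoid")(combined)
model = Model(inputs=[chan_inp, flat_inp], outputs=[output])
model.compile(optimizer="adam", loss="binary_crossentropy")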
for branch_index, branch in enumerate(self.fields):
self.plot_input(branch_index)
self.total_epochs = self._read_info("epochs", 0)
def clean_mask(self, x):
"""
Mask recurrent fields such that once a masked value occurs,
all values corresponding to the same and following objects are
masked as well. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
for line_idx in recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:]):
if (evt[line_idx] == self.mask_value).any():
masked = True
if masked:
evt[line_idx] = self.mask_value
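# Self-contained illustration of the clean_mask invariant: once any value of
# an object is masked, that object and all following ones are fully masked.
# One recurrent block of 3 objects x 2 variables at flat indices [[0,1],[2,3],[4,5]]:
import numpy as np

mask_value = -999.
recurrent_field_idx = np.array([[[0, 1], [2, 3], [4, 5]]])
x = np.array([[1., 2., mask_value, 4., 5., 6.]])
for evt in x:
    masked = False
    for line_idx in recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:]):
        if (evt[line_idx] == mask_value).any():
            masked = True
        if masked:
            evt[line_idx] = mask_value
# x is now [[1., 2., -999., -999., -999., -999.]]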
try:
self.is_training = True
logger.info("Training on batches for RNN")
# note: the batches have class_weight already applied
self.model.fit_generator(self.yield_batch(),
steps_per_epoch=int(len(self.training_data[0])/self.batch_size),
epochs=epochs,
validation_data=self.validation_data,
callbacks=self.callbacks_list)
self.is_training = False
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
self.checkpoint_model()
def mask_uniform(self, x):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for recurrent_field_idx in self.recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(recurrent_field_idx.shape[1]+1))
for obj_number, line_idx in enumerate(recurrent_field_idx.reshape(*recurrent_field_idx.shape[1:])):
if obj_number == nobj:
masked = True
if masked:
evt[line_idx] = self.mask_value
def get_input_list(self, x):
"Format the input starting from flat ntuple"
"""
Returns a list of 3-dimensional inputs for each
recurrent layer and a 2-dimensional one for the normal flat inputs.
"""
x_input = []
for field_idx in self.recurrent_field_idx:
x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
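# Shape sketch for the fancy indexing above: 10 events with 6 flat columns and
# one recurrent block of 2 objects x 2 variables at flat indices [[0,1],[2,3]]
# (toy numbers, illustrative only):
import numpy as np

x = np.arange(60, dtype=float).reshape(10, 6)
field_idx = np.array([[[0, 1], [2, 3]]])       # shape (1, 2, 2)
x_recurrent = x[:, field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
# x_recurrent.shape == (10, 2, 2): events x objects x variables, ready for a Masking+RNN channel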
......@@ -1722,7 +2085,7 @@ class ClassificationProjectRNN(ClassificationProject):
def get_input_flat(self, x):
"Transform input back to flat ntuple"
"Transform the multiple inputs back to flat ntuple"
nevent = x[0].shape[0]
x_flat = np.empty((nevent, len(self.fields)), dtype=float)
# recurrent fields
......@@ -1737,74 +2100,6 @@ class ClassificationProjectRNN(ClassificationProject):
return x_flat
def yield_batch(self):
x_train, y_train, w_train = self.training_data
while True:
shuffled_idx = np.random.permutation(len(x_train))
for start in range(0, len(shuffled_idx), int(self.batch_size)):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
x_input = self.get_input_list(x_batch)
yield (x_input, y_batch, w_batch)
@property
def validation_data(self):
"class weighted validation data. Attention: Shuffle training data before using this!"
x_val, y_val, w_val = super(ClassificationProjectRNN, self).validation_data
x_val_input = self.get_input_list(x_val)
return x_val_input, y_val, w_val
def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
logger.info("Reloading (and re-transforming) unshuffled training data")
self.load(reload=True)
if mode is not None:
self._write_info("scores_mode", mode)
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
getattr(self, "scores_"+data_name)[start:stop] = (
self.predict(
self.get_input_list(getattr(self, "x_"+data_name)[start:stop]),
mode=mode
).reshape(-1)
)
self._dump_to_hdf5("scores_"+data_name)
if do_test:
eval_score("test")
if do_train:
eval_score("train")
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
return self._batch_transform(x, self.scaler.transform, batch_size)
def inverse_transform(self, x, batch_size=10000):
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
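# Self-contained sketch of the mask -> NaN -> transform -> mask round trip in
# _batch_transform, with a manual centering function standing in for the scaler:
import numpy as np

mask_value = -999.
center, scale = np.array([3., 4.]), np.array([2., 2.])
fn = lambda arr: (arr - center) / scale        # stand-in for scaler.transform
x = np.array([[1., 2.], [3., mask_value], [5., 6.]])
x_batch = np.array(x)                          # copy, as in _batch_transform
x_batch[x_batch == mask_value] = np.nan        # masked entries propagate as NaN
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = mask_value        # restore the mask value afterwards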
def evaluate(self, x_eval, mode=None):
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
......
......@@ -33,22 +33,36 @@ def get_single_neuron_function(model, layer, neuron, input_transform=None):
return eval_single_neuron
def create_random_event(ranges):
def create_random_event(ranges, mask_probs=None, mask_value=None):
random_event = np.array([p[0]+(p[1]-p[0])*np.random.rand() for p in ranges])
random_event = random_event.reshape(-1, len(random_event))
# if given, mask values with a certain probability
if mask_probs is not None:
if mask_value is None:
raise ValueError("Need to provide mask_value if random events should be masked")
for var_index, mask_prob in enumerate(mask_probs):
random_event[:,var_index][np.random.rand(len(random_event)) < mask_prob] = mask_value
return random_event
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None):
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None, max_evts=None):
"Get ranges for plotting or random event generation based on quantiles"
ranges = []
mask_probs = []
if max_evts is not None:
rnd_idx = np.random.permutation(np.arange(len(x)))
rnd_idx = rnd_idx[:max_evts]
for var_index in range(x.shape[1]):
if (filter_index is not None) and (var_index != filter_index):
continue
x_var = x[:,var_index]
w_var = weights
if max_evts is not None:
x_var = x_var[rnd_idx]
w_var = w_var[rnd_idx]
not_masked = np.where(x_var != mask_value)[0]
masked = np.where(x_var == mask_value)[0]
ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=w_var[not_masked]))
return ranges
mask_probs.append(float(len(masked))/float(len(x_var)))
return ranges, mask_probs
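# Sketch combining the helpers defined in this module: derive per-variable
# ranges and mask probabilities from toy data, then sample a random event with
# the same masking pattern (toy data illustrative only):
import numpy as np

x_toy = np.random.rand(1000, 3)
x_toy[np.random.rand(1000) < 0.2, 2] = -999.   # variable 2 is partially masked
w_toy = np.ones(len(x_toy))
ranges, mask_probs = get_ranges(x_toy, [0.01, 0.99], w_toy, mask_value=-999., max_evts=500)
event = create_random_event(ranges, mask_probs=mask_probs, mask_value=-999.)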
def max_activation_wrt_input(gradient_function, random_event, threshold=None, maxthreshold=None, maxit=100, step=1, const_indices=[],
......@@ -115,7 +129,7 @@ def get_grad_function(model, layer, neuron):
],
ignoreKwargs=["input_transform", "input_inverse_transform"],
)
def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **kwargs):
def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, mask_probs=None, mask_value=None, **kwargs):
gradient_function = get_grad_function(model, layer, neuron)
......@@ -125,9 +139,15 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **k
for i in range(ntries):
if not (i%100):
logger.info(i)
res = max_activation_wrt_input(gradient_function,
create_random_event(ranges),
**kwargs)
res = max_activation_wrt_input(
gradient_function,
create_random_event(
ranges,
mask_probs=mask_probs,
mask_value=mask_value
),
**kwargs
)
if res is not None:
loss, event = res
loss = np.array([loss])
......@@ -177,18 +197,29 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
class WeightedRobustScaler(RobustScaler):
def fit(self, X, y=None, weights=None):
if not np.isnan(X).any():
def fit(self, X, y=None, weights=None, mask_value=None):
if not np.isnan(X).any() and mask_value is None and weights is None:
# these checks don't work for nan values
super(WeightedRobustScaler, self).fit(X, y)
if weights is None:
return self
return super(WeightedRobustScaler, self).fit(X, y)
else:
wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
if weights is None:
weights = np.ones(len(X))
wqs = []
for i in range(X.shape[1]):
mask = ~np.isnan(X[:,i])
if mask_value is not None:
mask &= (X[:,i] != mask_value)
wqs.append(
weighted_quantile(
X[:,i][mask],
[0.25, 0.5, 0.75],
sample_weight=weights[mask]
)
)
wqs = np.array(wqs)
self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0]
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
logger.debug("scale: {}".format(self.scale_))
return self
......@@ -202,6 +233,15 @@ class WeightedRobustScaler(RobustScaler):
return super(WeightedRobustScaler, self).transform(X)
def inverse_transform(self, X):
if np.isnan(X).any():
X *= self.scale_
X += self.center_
return X
else:
return super(WeightedRobustScaler, self).inverse_transform(X)
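# Usage sketch for the scaler above: weighted fit that ignores the mask value,
# then transforming data where masked entries were set to NaN (toy data only):
import numpy as np

X_toy = np.random.normal(size=(100, 2))
X_toy[::10, 1] = -999.
w_toy = np.random.rand(100)
scaler = WeightedRobustScaler()
scaler.fit(X_toy, weights=w_toy, mask_value=-999.)
X_nan = np.where(X_toy == -999., np.nan, X_toy)
X_trf = scaler.transform(X_nan)                # NaN entries stay NaN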
def poisson_asimov_significance(s, ds, b, db):
"see `<http://www.pp.rhul.ac.uk/~cowan/stat/medsig/medsigNote.pdf>`_)"
db = np.sqrt(db**2+ds**2)
......