diff --git a/test/test_toolkit.py b/test/test_toolkit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecd3aa244cd2269ae24a3386eaf990272d0c4871
--- /dev/null
+++ b/test/test_toolkit.py
@@ -0,0 +1,88 @@
+import pytest
+import numpy as np
+import root_numpy
+import pandas as pd
+from sklearn.datasets import make_classification
+from keras.layers import GRU
+
+from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
+
+def create_dataset(path):
+
+    # create an example dataset with (low-weighted) noise added;
+    # seed numpy so the noise, shift and scale below are reproducible
+    np.random.seed(42)
+    X, y = make_classification(n_samples=10000, random_state=1)
+    X2 = np.random.normal(size=20*10000).reshape(-1, 20)
+    y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
+    X = np.concatenate([X, X2])
+    y = np.concatenate([y, y2])
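+    # the noise features carry no information about their labels, so this
+    # block is down-weighted by a factor 100 to keep the task solvable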
+    w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
+
+    # shift and scale randomly (to check if transformation is working)
+    shift = np.random.rand(20)*100
+    scale = np.random.rand(20)*1000
+    X *= scale
+    X += shift
+
+    # write to root files
+    branches = ["var_{}".format(i) for i in range(len(X[0]))]
+    df = pd.DataFrame(X, columns=branches)
+    df["class"] = y
+    df["weight"] = w
+    tree_path_bkg = str(path / "bkg.root")
+    tree_path_sig = str(path / "sig.root")
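+    # root_numpy.array2root writes to a TTree named "tree" by default,
+    # matching the (path, "tree") tuples used in the tests below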
+    root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
+    root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
+    return branches, tree_path_sig, tree_path_bkg
+
+
+def test_ClassificationProject(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProject(
+        str(tmp_path / "project"),
+        bkg_trees=[(tree_path_bkg, "tree")],
+        signal_trees=[(tree_path_sig, "tree")],
+        branches=branches,
+        weight_expr="weight",
+        identifiers=["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
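+    # with patience=5, early stopping should terminate training well
+    # before the maximum of 200 epochs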
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
+
+
+def test_ClassificationProjectRNN(tmp_path):
+    branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
+    c = ClassificationProjectRNN(
+        str(tmp_path / "project"),
+        bkg_trees=[(tree_path_bkg, "tree")],
+        signal_trees=[(tree_path_sig, "tree")],
+        branches=branches,
+        recurrent_field_names=[
+            [
+                ["var_1", "var_2", "var_3"],
+                ["var_4", "var_5", "var_6"]
+            ],
+            [
+                ["var_10", "var_11", "var_12"],
+                ["var_13", "var_14", "var_15"]
+            ],
+        ],
+        weight_expr="weight",
+        identifiers=["index"],
+        optimizer="Adam",
+        earlystopping_opts=dict(patience=5),
+        dropout=0.5,
+        layers=3,
+        nodes=128,
+    )
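+    # one GRU layer is expected per recurrent field group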
+    assert sum(isinstance(layer, GRU) for layer in c.model.layers) == 2
+    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
+    assert min(c.history.history["val_loss"]) < 0.18
diff --git a/toolkit.py b/toolkit.py
index a0be319112eca728362cb312e91d792b46fafa0f..eb9234407031b53cff85267e961a3c660169559c 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -193,6 +193,8 @@ class ClassificationProject(object):
 
     :param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
 
+    :param shuffle: shuffle the training data before the first and after each epoch
+
     """
 
 
@@ -264,7 +266,9 @@ class ClassificationProject(object):
                         apply_class_weight=True,
                         normalize_weights=True,
                         ignore_neg_weights=False,
-                        kernel_initializer=None):
+                        kernel_initializer=None,
+                        shuffle=True,
+    ):
 
         self.name = name
         self.signal_trees = signal_trees
@@ -348,6 +352,7 @@ class ClassificationProject(object):
         self.normalize_weights = normalize_weights
         self.ignore_neg_weights = ignore_neg_weights
         self.kernel_initializer = kernel_initializer
+        self.shuffle = shuffle
 
         self.s_train = None
         self.b_train = None
@@ -381,8 +386,6 @@ class ClassificationProject(object):
         self.total_epochs = 0
 
         self.data_loaded = False
-        self.data_transformed = False
-        self.data_shuffled = False
 
         # track if we are currently training
         self.is_training = False
@@ -488,7 +491,6 @@ class ClassificationProject(object):
             self._dump_to_hdf5(*self.dataset_names_tree)
 
         self.data_loaded = True
-        self.data_shuffled = False
 
 
     def _dump_training_list(self):
@@ -659,30 +661,6 @@ class ClassificationProject(object):
             json.dump(self.history.history, of)
 
 
-    def _transform_data(self):
-        if not self.data_transformed:
-            if self.mask_value is not None:
-                self.x_train[self.x_train == self.mask_value] = np.nan
-                self.x_test[self.x_test == self.mask_value] = np.nan
-            if logger.level <= logging.DEBUG:
-                logger.debug("training data before transformation: {}".format(self.x_train))
-                logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
-                                                          for i in range(self.x_train.shape[1])]))
-                logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
-                                                          for i in range(self.x_train.shape[1])]))
-            orig_copy_setting = self.scaler.copy
-            self.scaler.copy = False
-            self.x_train = self.scaler.transform(self.x_train)
-            logger.debug("training data after transformation: {}".format(self.x_train))
-            self.x_test = self.scaler.transform(self.x_test)
-            self.scaler.copy = orig_copy_setting
-            if self.mask_value is not None:
-                self.x_train[np.isnan(self.x_train)] = self.mask_value
-                self.x_test[np.isnan(self.x_test)] = self.mask_value
-            self.data_transformed = True
-            logger.info("Training and test data transformed")
-
-
     def _read_info(self, key, default):
         filename = os.path.join(self.project_dir, "info.json")
         if not os.path.exists(filename):
@@ -825,14 +803,10 @@ class ClassificationProject(object):
 
         if reload:
             self.data_loaded = False
-            self.data_transformed = False
 
         if not self.data_loaded:
             self._load_data()
 
-        if not self.data_transformed:
-            self._transform_data()
-
 
     @property
     def w_train_tot(self):
@@ -855,23 +829,27 @@ class ClassificationProject(object):
 
     @property
     def validation_data(self):
-        "Validation data"
+        "(Transformed) validation data for loss evaluation"
         idx = self.train_val_idx[1]
-        return self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_val_input = self.get_input_list(self.transform(x_val))
+        return x_val_input, y_val, w_val
 
 
     @property
     def training_data(self):
-        "Training data with validation data split off"
+        "(Transformed) training data with the validation data split off"
         idx = self.train_val_idx[0]
-        return self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
+        x_train_input = self.get_input_list(self.transform(x_train))
+        return x_train_input, y_train, w_train
 
 
     @property
     def train_val_idx(self):
         if self._train_val_idx is None:
             if self.kfold_splits is not None:
-                kfold = KFold(self.kfold_splits, shuffle=True, random_state=self.shuffle_seed)
+                kfold = KFold(self.kfold_splits, shuffle=self.shuffle,
+                              random_state=self.shuffle_seed if self.shuffle else None)
                 for i, train_val_idx in enumerate(kfold.split(self.x_train)):
                     if i == self.kfold_index:
                         self._train_val_idx = train_val_idx
@@ -881,7 +859,10 @@ class ClassificationProject(object):
             else:
                 split_index = int((1-self.validation_split)*len(self.x_train))
                 np.random.seed(self.shuffle_seed)
-                shuffled_idx = np.random.permutation(len(self.x_train))
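+                # take the validation events from a random permutation or,
+                # if shuffling is disabled, from the end of the dataset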
+                if self.shuffle:
+                    shuffled_idx = np.random.permutation(len(self.x_train))
+                else:
+                    shuffled_idx = np.arange(len(self.x_train))
                 self._train_val_idx = (shuffled_idx[:split_index], shuffled_idx[split_index:])
         return self._train_val_idx
 
@@ -891,17 +872,30 @@ class ClassificationProject(object):
         return int(float(len(self.train_val_idx[0]))/float(self.batch_size))
 
 
+    def get_input_list(self, x):
+        "For the standard Dense models with a single input this is a no-op"
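+        # subclasses (e.g. ClassificationProjectRNN) override this to split
+        # the flat array into one input per network input layer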
+        return x
+
+
     def yield_batch(self):
+        "Batch generator - optionally shuffles the indices after each epoch"
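+        # intended for use with keras fit_generator together with the
+        # steps_per_epoch property defined above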
         x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
         train_idx = list(self.train_val_idx[0])
         np.random.seed(self.shuffle_seed+1)
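+        # the transformation is applied batch by batch, so the full
+        # training array never has to be transformed (and copied) up front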
+        logger.info("Generating training batches from {} signal and {} background events"
+                    .format(len(np.where(self.y_train[train_idx]==1)[0]),
+                            len(np.where(self.y_train[train_idx]==0)[0])))
         while True:
-            shuffled_idx = np.random.permutation(train_idx)
+            if self.shuffle:
+                shuffled_idx = np.random.permutation(train_idx)
+            else:
+                shuffled_idx = train_idx
             for start in range(0, len(shuffled_idx), int(self.batch_size)):
                 x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
                 y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
                 w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
-                yield (x_batch, y_batch, w_batch)
+                x_input = self.get_input_list(self.transform(x_batch))
+                yield (x_input, y_batch, w_batch)
 
 
     def yield_single_class_batch(self, class_label):
@@ -913,7 +907,10 @@ class ClassificationProject(object):
         class_idx = np.where(y_train==class_label)[0]
         while True:
             # shuffle the indices for this class label
-            shuffled_idx = np.random.permutation(class_idx)
+            if self.shuffle:
+                shuffled_idx = np.random.permutation(class_idx)
+            else:
+                shuffled_idx = class_idx
             # yield them batch wise
             for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
                 yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
@@ -996,8 +993,6 @@ class ClassificationProject(object):
 
 
     def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
-        logger.info("Reloading (and re-transforming) unshuffled training data")
-        self.load(reload=True)
 
         if mode is not None:
             self._write_info("scores_mode", mode)
@@ -1011,7 +1006,37 @@ class ClassificationProject(object):
             self._dump_to_hdf5("scores_train")
 
 
+    def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
+        "Calculate scores for the training and test samples"
+
+        if mode is not None:
+            self._write_info("scores_mode", mode)
+
+        def eval_score(data_name):
+            logger.info("Create/Update scores for {} sample".format(data_name))
+            n_events = len(getattr(self, "x_"+data_name))
+            setattr(self, "scores_"+data_name, np.empty(n_events))
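+            # predict in chunks of batch_size so that only one chunk at a
+            # time needs to be transformed and copied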
+            for start in range(0, n_events, batch_size):
+                stop = start+batch_size
+                getattr(self, "scores_"+data_name)[start:stop] = (
+                    self.predict(
+                        self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
+                        mode=mode
+                    ).reshape(-1)
+                )
+            self._dump_to_hdf5("scores_"+data_name)
+
+        if do_test:
+            eval_score("test")
+        if do_train:
+            eval_score("train")
+
+
     def predict(self, x, mode=None):
+        """
+        Calculate the scores for a (transformed) array of input values.
+        If the array is not transformed, use `evaluate` instead.
+        """
         if mode is None:
             # normal output - after activation function output layer
             return self.model.predict(x)
@@ -1031,6 +1056,10 @@ class ClassificationProject(object):
 
 
     def evaluate(self, x_eval, mode=None):
+        """
+        Calculate the scores for an array of input values.
+        All necessary transformations are applied.
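+
+        Example: `scores = project.evaluate(x_raw)` where `x_raw` is an
+        untransformed array with one column per field.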
+        """
         logger.debug("Evaluate score for {}".format(x_eval))
         x_eval = self.transform(x_eval)
         logger.debug("Evaluate for transformed array: {}".format(x_eval))
@@ -1179,8 +1208,6 @@ class ClassificationProject(object):
         ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
 
         label = branch
-        if self.data_transformed:
-            label += " (transformed)"
         ax.set_xlabel(label)
         if fig is not None:
             plot_dir = os.path.join(self.project_dir, "plots")
@@ -1689,7 +1716,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
 
         if reload:
             self.data_loaded = False
-            self.data_transformed = False
             self._x_train = None
             self._x_test = None
             self._y_train = None
@@ -1700,9 +1726,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
 
         self.data_loaded = True
 
-        if not self.data_transformed:
-            self._transform_data()
-
 
 class ClassificationProjectRNN(ClassificationProject):
 
@@ -1838,7 +1861,10 @@ class ClassificationProjectRNN(ClassificationProject):
 
 
     def get_input_list(self, x):
-        "Format the input starting from flat ntuple"
+        """
+        Returns a list of 3-dimensional inputs for each
+        recurrent layer and a 2-dimensional one for the normal flat inputs.
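+
+        For example: with two recurrent field groups of 2 timesteps x 3
+        fields each plus 8 flat fields, an input x of shape (n_events, 20)
+        is split into arrays of shapes (n_events, 2, 3), (n_events, 2, 3)
+        and (n_events, 8).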
+        """
         x_input = []
         for field_idx in self.recurrent_field_idx:
             x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
@@ -1849,7 +1875,7 @@ class ClassificationProjectRNN(ClassificationProject):
 
 
     def get_input_flat(self, x):
-        "Transform input back to flat ntuple"
+        "Transform the multiple inputs back to a flat ntuple"
         nevent = x[0].shape[0]
         x_flat = np.empty((nevent, len(self.fields)), dtype=np.float)
         # recurrent fields
@@ -1864,58 +1890,6 @@ class ClassificationProjectRNN(ClassificationProject):
         return x_flat
 
 
-    def yield_batch(self):
-        x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
-        train_idx = list(self.train_val_idx[0])
-        np.random.seed(self.shuffle_seed+1)
-        logger.info("Generating training batches from {} signal and {} background events"
-                    .format(len(np.where(self.y_train[train_idx]==1)[0]),
-                            len(np.where(self.y_train[train_idx]==0)[0])))
-        while True:
-            shuffled_idx = np.random.permutation(train_idx)
-            for start in range(0, len(shuffled_idx), int(self.batch_size)):
-                x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
-                y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
-                w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
-                x_input = self.get_input_list(x_batch)
-                yield (x_input, y_batch, w_batch)
-
-
-    @property
-    def validation_data(self):
-        "class weighted validation data. Attention: Shuffle training data before using this!"
-        x_val, y_val, w_val = super(ClassificationProjectRNN, self).validation_data
-        x_val_input = self.get_input_list(x_val)
-        return x_val_input, y_val, w_val
-
-
-    def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
-        logger.info("Reloading (and re-transforming) unshuffled training data")
-        self.load(reload=True)
-
-        if mode is not None:
-            self._write_info("scores_mode", mode)
-
-        def eval_score(data_name):
-            logger.info("Create/Update scores for {} sample".format(data_name))
-            n_events = len(getattr(self, "x_"+data_name))
-            setattr(self, "scores_"+data_name, np.empty(n_events))
-            for start in range(0, n_events, batch_size):
-                stop = start+batch_size
-                getattr(self, "scores_"+data_name)[start:stop] = (
-                    self.predict(
-                        self.get_input_list(getattr(self, "x_"+data_name)[start:stop]),
-                        mode=mode
-                    ).reshape(-1)
-                )
-            self._dump_to_hdf5("scores_"+data_name)
-
-        if do_test:
-            eval_score("test")
-        if do_train:
-            eval_score("train")
-
-
     def evaluate(self, x_eval, mode=None):
         logger.debug("Evaluate score for {}".format(x_eval))
         x_eval = self.transform(x_eval)