Compare revisions

Nikolai Hartmann · Nikolai Hartmann · Nikolai Hartmann · Nikolai.Hartmann · Nikolai.Hartmann · Nikolai Hartmann
--- a/compare.py
+++ b/compare.py
@@ -22,6 +22,7 @@ def overlay_ROC(filename, *projects, **kwargs):
    threshold_log = kwargs.pop("threshold_log", True)
    lumifactor = kwargs.pop("lumifactor", None)
    tight_layout = kwargs.pop("tight_layout", False)
+    show_auc = kwargs.pop("show_auc", True)
    if kwargs:
        raise KeyError("Unknown kwargs: {}".format(kwargs))
@@ -43,7 +44,7 @@ def overlay_ROC(filename, *projects, **kwargs):
    colors = prop_cycle.by_key()['color']
    for p, color in zip(projects, colors):
-        fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
+        fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
        fpr = 1.0 - fpr
        try:
            roc_auc = auc(tpr, fpr)
@@ -52,12 +53,16 @@ def overlay_ROC(filename, *projects, **kwargs):
            roc_auc = auc(tpr, fpr, reorder=True)
        ax.grid(color='gray', linestyle='--', linewidth=1)
-        ax.plot(tpr,  fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
+        if show_auc:
+            label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
+        else:
+            label = p.name
+        ax.plot(tpr,  fpr, label=label, color=color)
        if plot_thresholds:
            ax2.plot(tpr, threshold, "--", color=color)
        if lumifactor is not None:
-            sumw_b = p.w_test[p.y_test==0].sum()*lumifactor
+            sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
-            sumw_s = p.w_test[p.y_test==1].sum()*lumifactor
+            sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
            ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
            ax_abs_b.invert_yaxis()
            ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)

--- a/test/test_toolkit.py
+++ b/test/test_toolkit.py
@@ -7,13 +7,24 @@ from keras.layers import GRU
 from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
 def create_dataset(path):
+    # create example dataset with (low-weighted) noise added
    X, y = make_classification(n_samples=10000, random_state=1)
    X2 = np.random.normal(size=20*10000).reshape(-1, 20)
    y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
    X = np.concatenate([X, X2])
    y = np.concatenate([y, y2])
    w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
+    # shift and scale randomly (to check if transformation is working)
+    shift = np.random.rand(20)*100
+    scale = np.random.rand(20)*1000
+    X *= scale
+    X += shift
+    # write to root files
    branches = ["var_{}".format(i) for i in range(len(X[0]))]
    df = pd.DataFrame(X, columns=branches)
    df["class"] = y
@@ -40,7 +51,10 @@ def test_ClassificationProject(tmp_path):
        layers=3,
        nodes=128,
    )
    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
    assert min(c.history.history["val_loss"]) < 0.18
@@ -71,4 +85,6 @@ def test_ClassificationProjectRNN(tmp_path):
    )
    assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
    c.train(epochs=200)
+    c.plot_all_inputs()
+    c.plot_loss()
    assert min(c.history.history["val_loss"]) < 0.18
--- a/toolkit.py
+++ b/toolkit.py
--- a/utils.py
+++ b/utils.py
@@ -197,14 +197,26 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
 class WeightedRobustScaler(RobustScaler):
-    def fit(self, X, y=None, weights=None):
+    def fit(self, X, y=None, weights=None, mask_value=None):
-        if not np.isnan(X).any():
+        if not np.isnan(X).any() and mask_value is not None and weights is None:
            # these checks don't work for nan values
-            super(WeightedRobustScaler, self).fit(X, y)
+            return super(WeightedRobustScaler, self).fit(X, y)
-        if weights is None:
-            return self
        else:
-            wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+            if weights is None:
+                weights = np.ones(len(self.X))
+            wqs = []
+            for i in range(X.shape[1]):
+                mask = ~np.isnan(X[:,i])
+                if mask_value is not None:
+                    mask &= (X[:,i] != mask_value)
+                wqs.append(
+                    weighted_quantile(
+                        X[:,i][mask],
+                        [0.25, 0.5, 0.75],
+                        sample_weight=weights[mask]
+                    )
+                )
+            wqs = np.array(wqs)
            self.center_ = wqs[:,1]
            self.scale_ = wqs[:,2]-wqs[:,0]
            self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
No results found