Implementing WeightedRobustScaler (now default)

96a77543 · Nikolai.Hartmann · b607030c · 96a77543 · 96a77543
Commit 96a77543 authored 6 years ago by Nikolai.Hartmann
--- a/toolkit.py
+++ b/toolkit.py
@@ -29,16 +29,16 @@ import h5py
 from sklearn.preprocessing import StandardScaler, RobustScaler
 from sklearn.externals import joblib
 from sklearn.metrics import roc_curve, auc
-
 from keras.models import Sequential
 from keras.layers import Dense, Dropout
 from keras.models import model_from_json
 from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint
 from keras.optimizers import SGD
 import keras.optimizers
-
 import matplotlib.pyplot as plt

+from .utils import WeightedRobustScaler, weighted_quantile
+
 # configure number of cores
 # this doesn't seem to work, but at least with these settings keras only uses 4 processes
 import tensorflow as tf
@@ -197,7 +197,7 @@ class ClassificationProject(object):
                        validation_split=0.33,
                        activation_function='relu',
                        activation_function_output='sigmoid',
-                        scaler_type="RobustScaler",
+                        scaler_type="WeightedRobustScaler",
                        step_signal=2,
                        step_bkg=2,
                        optimizer="SGD",
@@ -450,18 +450,18 @@ class ClassificationProject(object):
                logger.info("Loaded existing scaler from {}".format(filename))
            except IOError:
                logger.info("Creating new {}".format(self.scaler_type))
+                scaler_fit_kwargs = dict()
                if self.scaler_type == "StandardScaler":
                    self._scaler = StandardScaler()
                elif self.scaler_type == "RobustScaler":
                    self._scaler = RobustScaler()
+                elif self.scaler_type == "WeightedRobustScaler":
+                    self._scaler = WeightedRobustScaler()
+                    scaler_fit_kwargs["weights"] = self.w_train*np.array(self.class_weight)[self.y_train.astype(int)]
                else:
                    raise ValueError("Scaler type {} unknown".format(self.scaler_type))
                logger.info("Fitting {} to training data".format(self.scaler_type))
-                self._scaler.fit(self.x_train)
-                # i think this would refit to test data (and overwrite the parameters)
-                # probably we either want to fit only training data or training and test data together
-                # logger.info("Fitting StandardScaler to test data")
-                # self._scaler.fit(self.x_test)
+                self._scaler.fit(self.x_train, **scaler_fit_kwargs)
                joblib.dump(self._scaler, filename)
        return self._scaler

@@ -866,9 +866,10 @@ class ClassificationProject(object):

        # calculate percentiles to get a heuristic for the range to be plotted
        # (should in principle also be done with weights, but for now do it unweighted)
-        range_sig = np.percentile(sig, [1, 99])
-        range_bkg = np.percentile(sig, [1, 99])
-        plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
+        # range_sig = np.percentile(sig, [1, 99])
+        # range_bkg = np.percentile(sig, [1, 99])
+        # plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
+        plot_range = weighted_quantile(self.x_train[:,var_index], [0.1, 0.99], sample_weight=self.w_train*np.array(self.class_weight)[self.y_train.astype(int)])

        logger.debug("Calculated range based on percentiles: {}".format(plot_range))


--- a/utils.py
+++ b/utils.py
@@ -5,6 +5,7 @@ import logging
 import numpy as np

 import keras.backend as K
+from sklearn.preprocessing import RobustScaler

 from meme import cache

@@ -95,3 +96,49 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **k
            losses = np.concatenate([losses, loss])

    return losses, events
+
+"from https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy#29677616"
+def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False):
+    """ Very close to np.percentile, but supports weights.
+    NOTE: quantiles should be in [0, 1]!
+    :param values: np.array with data
+    :param quantiles: array-like with many quantiles needed
+    :param sample_weight: array-like of the same length as `array`
+    :param values_sorted: bool, if True, then will avoid sorting of initial array
+    :param old_style: if True, will correct output to be consistent with np.percentile.
+    :return: np.array with computed quantiles.
+    """
+    values = np.array(values)
+    quantiles = np.array(quantiles)
+    if sample_weight is None:
+        sample_weight = np.ones(len(values))
+    sample_weight = np.array(sample_weight)
+    assert np.all(quantiles >= 0) and np.all(quantiles <= 1), 'quantiles should be in [0, 1]'
+
+    if not values_sorted:
+        sorter = np.argsort(values)
+        values = values[sorter]
+        sample_weight = sample_weight[sorter]
+
+    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
+    if old_style:
+        # To be convenient with np.percentile
+        weighted_quantiles -= weighted_quantiles[0]
+        weighted_quantiles /= weighted_quantiles[-1]
+    else:
+        weighted_quantiles /= np.sum(sample_weight)
+    return np.interp(quantiles, weighted_quantiles, values)
+
+
+class WeightedRobustScaler(RobustScaler):
+
+    def fit(self, X, y=None, weights=None):
+        RobustScaler.fit(self, X, y)
+        if weights is None:
+            return self
+        else:
+            wqs = np.array([weighted_quantile(X[:,i], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+            self.center_ = wqs[:,1]
+            self.scale_ = wqs[:,2]-wqs[:,0]
+            return self
+