diff --git a/toolkit.py b/toolkit.py
index 776ae2bbcb162de78cadd12cd63329f15f7969ac..a75459e5bbb2e0659d9b83fbd458393a37d8c04e 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -29,16 +29,16 @@ import h5py
 from sklearn.preprocessing import StandardScaler, RobustScaler
 from sklearn.externals import joblib
 from sklearn.metrics import roc_curve, auc
-
 from keras.models import Sequential
 from keras.layers import Dense, Dropout
 from keras.models import model_from_json
 from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint
 from keras.optimizers import SGD
 import keras.optimizers
-
 import matplotlib.pyplot as plt
 
+from .utils import WeightedRobustScaler, weighted_quantile
+
 # configure number of cores
 # this doesn't seem to work, but at least with these settings keras only uses 4 processes
 import tensorflow as tf
@@ -197,7 +197,7 @@ class ClassificationProject(object):
                  validation_split=0.33,
                  activation_function='relu',
                  activation_function_output='sigmoid',
-                 scaler_type="RobustScaler",
+                 scaler_type="WeightedRobustScaler",
                  step_signal=2,
                  step_bkg=2,
                  optimizer="SGD",
@@ -450,18 +450,18 @@ class ClassificationProject(object):
                 logger.info("Loaded existing scaler from {}".format(filename))
             except IOError:
                 logger.info("Creating new {}".format(self.scaler_type))
+                scaler_fit_kwargs = dict()
                 if self.scaler_type == "StandardScaler":
                     self._scaler = StandardScaler()
                 elif self.scaler_type == "RobustScaler":
                     self._scaler = RobustScaler()
+                elif self.scaler_type == "WeightedRobustScaler":
+                    self._scaler = WeightedRobustScaler()
+                    scaler_fit_kwargs["weights"] = self.w_train*np.array(self.class_weight)[self.y_train.astype(int)]
                 else:
                     raise ValueError("Scaler type {} unknown".format(self.scaler_type))
                 logger.info("Fitting {} to training data".format(self.scaler_type))
-                self._scaler.fit(self.x_train)
-                # i think this would refit to test data (and overwrite the parameters)
-                # probably we either want to fit only training data or training and test data together
-                # logger.info("Fitting StandardScaler to test data")
-                # self._scaler.fit(self.x_test)
+                self._scaler.fit(self.x_train, **scaler_fit_kwargs)
                 joblib.dump(self._scaler, filename)
 
         return self._scaler
@@ -866,9 +866,10 @@ class ClassificationProject(object):
 
         # calculate percentiles to get a heuristic for the range to be plotted
         # (should in principle also be done with weights, but for now do it unweighted)
-        range_sig = np.percentile(sig, [1, 99])
-        range_bkg = np.percentile(sig, [1, 99])
-        plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
+        # range_sig = np.percentile(sig, [1, 99])
+        # range_bkg = np.percentile(sig, [1, 99])
+        # plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
+        plot_range = weighted_quantile(self.x_train[:,var_index], [0.01, 0.99], sample_weight=self.w_train*np.array(self.class_weight)[self.y_train.astype(int)])
 
         logger.debug("Calculated range based on percentiles: {}".format(plot_range))
 
diff --git a/utils.py b/utils.py
index 4c6fe8a4f0c518ffb6630bb5b3ca6ea1249575ac..adf213f7edb71086fa8e9148a77062565f098f0e 100644
--- a/utils.py
+++ b/utils.py
@@ -5,6 +5,7 @@ import logging
 import numpy as np
 
 import keras.backend as K
+from sklearn.preprocessing import RobustScaler
 
 from meme import cache
 
@@ -95,3 +96,51 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **kwargs):
         losses = np.concatenate([losses, loss])
 
     return losses, events
+
+
+# from https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy#29677616
+def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False):
+    """ Very close to np.percentile, but supports weights.
+    NOTE: quantiles should be in [0, 1]!
+    :param values: np.array with data
+    :param quantiles: array-like with the quantiles needed
+    :param sample_weight: array-like of the same length as `values`
+    :param values_sorted: bool, if True, then will avoid sorting of initial array
+    :param old_style: if True, will correct output to be consistent with np.percentile.
+    :return: np.array with computed quantiles.
+    """
+    values = np.array(values)
+    quantiles = np.array(quantiles)
+    if sample_weight is None:
+        sample_weight = np.ones(len(values))
+    sample_weight = np.array(sample_weight)
+    assert np.all(quantiles >= 0) and np.all(quantiles <= 1), 'quantiles should be in [0, 1]'
+
+    if not values_sorted:
+        sorter = np.argsort(values)
+        values = values[sorter]
+        sample_weight = sample_weight[sorter]
+
+    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
+    if old_style:
+        # To be consistent with np.percentile
+        weighted_quantiles -= weighted_quantiles[0]
+        weighted_quantiles /= weighted_quantiles[-1]
+    else:
+        weighted_quantiles /= np.sum(sample_weight)
+    return np.interp(quantiles, weighted_quantiles, values)
+
+
+class WeightedRobustScaler(RobustScaler):
+
+    def fit(self, X, y=None, weights=None):
+        # fit the plain RobustScaler first, then override center_ and scale_
+        # with the weighted median and weighted interquartile range per column
+        RobustScaler.fit(self, X, y)
+        if weights is None:
+            return self
+        else:
+            wqs = np.array([weighted_quantile(X[:,i], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+            self.center_ = wqs[:,1]
+            self.scale_ = wqs[:,2]-wqs[:,0]
+            return self
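
A minimal usage sketch of the new utilities (not part of the patch; the toy
data and the names `X` and `w` are illustrative, and `utils` is assumed to be
importable from the working directory):

    import numpy as np
    from utils import WeightedRobustScaler, weighted_quantile

    rng = np.random.RandomState(42)
    X = rng.normal(size=(1000, 3))        # 1000 events, 3 input variables
    w = rng.uniform(0.5, 2.0, size=1000)  # per-event weights

    # with weights, center_/scale_ become the weighted median and weighted IQR
    scaler = WeightedRobustScaler().fit(X, weights=w)
    X_scaled = scaler.transform(X)

    # the same weighted quantiles can be used directly, e.g. for plot ranges
    lo, hi = weighted_quantile(X[:, 0], [0.01, 0.99], sample_weight=w)

Passing weights only through fit (rather than the constructor) keeps the class
a drop-in replacement for RobustScaler when no weights are given.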