Skip to content
Snippets Groups Projects
Commit 96a77543 authored by Nikolai.Hartmann's avatar Nikolai.Hartmann
Browse files

Implementing WeightedRobustScaler (now default)

parent b607030c
No related branches found
No related tags found
No related merge requests found
......@@ -29,16 +29,16 @@ import h5py
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.models import model_from_json
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint
from keras.optimizers import SGD
import keras.optimizers
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile
# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf
......@@ -197,7 +197,7 @@ class ClassificationProject(object):
validation_split=0.33,
activation_function='relu',
activation_function_output='sigmoid',
scaler_type="RobustScaler",
scaler_type="WeightedRobustScaler",
step_signal=2,
step_bkg=2,
optimizer="SGD",
......@@ -450,18 +450,18 @@ class ClassificationProject(object):
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {}".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train*np.array(self.class_weight)[self.y_train.astype(int)]
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
self._scaler.fit(self.x_train)
# i think this would refit to test data (and overwrite the parameters)
# probably we either want to fit only training data or training and test data together
# logger.info("Fitting StandardScaler to test data")
# self._scaler.fit(self.x_test)
self._scaler.fit(self.x_train, **scaler_fit_kwargs)
joblib.dump(self._scaler, filename)
return self._scaler
......@@ -866,9 +866,10 @@ class ClassificationProject(object):
# calculate percentiles to get a heuristic for the range to be plotted
# (should in principle also be done with weights, but for now do it unweighted)
range_sig = np.percentile(sig, [1, 99])
range_bkg = np.percentile(sig, [1, 99])
plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
# range_sig = np.percentile(sig, [1, 99])
# range_bkg = np.percentile(sig, [1, 99])
# plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
plot_range = weighted_quantile(self.x_train[:,var_index], [0.1, 0.99], sample_weight=self.w_train*np.array(self.class_weight)[self.y_train.astype(int)])
logger.debug("Calculated range based on percentiles: {}".format(plot_range))
......
......@@ -5,6 +5,7 @@ import logging
import numpy as np
import keras.backend as K
from sklearn.preprocessing import RobustScaler
from meme import cache
......@@ -95,3 +96,49 @@ def get_max_activation_events(model, ranges, ntries, layer, neuron, seed=42, **k
losses = np.concatenate([losses, loss])
return losses, events
"from https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy#29677616"
def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False, old_style=False):
""" Very close to np.percentile, but supports weights.
NOTE: quantiles should be in [0, 1]!
:param values: np.array with data
:param quantiles: array-like with many quantiles needed
:param sample_weight: array-like of the same length as `array`
:param values_sorted: bool, if True, then will avoid sorting of initial array
:param old_style: if True, will correct output to be consistent with np.percentile.
:return: np.array with computed quantiles.
"""
values = np.array(values)
quantiles = np.array(quantiles)
if sample_weight is None:
sample_weight = np.ones(len(values))
sample_weight = np.array(sample_weight)
assert np.all(quantiles >= 0) and np.all(quantiles <= 1), 'quantiles should be in [0, 1]'
if not values_sorted:
sorter = np.argsort(values)
values = values[sorter]
sample_weight = sample_weight[sorter]
weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
if old_style:
# To be convenient with np.percentile
weighted_quantiles -= weighted_quantiles[0]
weighted_quantiles /= weighted_quantiles[-1]
else:
weighted_quantiles /= np.sum(sample_weight)
return np.interp(quantiles, weighted_quantiles, values)
class WeightedRobustScaler(RobustScaler):
def fit(self, X, y=None, weights=None):
RobustScaler.fit(self, X, y)
if weights is None:
return self
else:
wqs = np.array([weighted_quantile(X[:,i], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0]
return self
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment