Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • Eric.Schanet/KerasROOTClassification
  • Nikolai.Hartmann/KerasROOTClassification
2 results
Show changes
Commits on Source (73)
...@@ -20,6 +20,9 @@ def overlay_ROC(filename, *projects, **kwargs): ...@@ -20,6 +20,9 @@ def overlay_ROC(filename, *projects, **kwargs):
ylim = kwargs.pop("ylim", (0,1)) ylim = kwargs.pop("ylim", (0,1))
plot_thresholds = kwargs.pop("plot_thresholds", False) plot_thresholds = kwargs.pop("plot_thresholds", False)
threshold_log = kwargs.pop("threshold_log", True) threshold_log = kwargs.pop("threshold_log", True)
lumifactor = kwargs.pop("lumifactor", None)
tight_layout = kwargs.pop("tight_layout", False)
show_auc = kwargs.pop("show_auc", True)
if kwargs: if kwargs:
raise KeyError("Unknown kwargs: {}".format(kwargs)) raise KeyError("Unknown kwargs: {}".format(kwargs))
...@@ -33,11 +36,15 @@ def overlay_ROC(filename, *projects, **kwargs): ...@@ -33,11 +36,15 @@ def overlay_ROC(filename, *projects, **kwargs):
if threshold_log: if threshold_log:
ax2.set_yscale("log") ax2.set_yscale("log")
if lumifactor is not None:
ax_abs_b = ax.twinx()
ax_abs_s = ax.twiny()
prop_cycle = plt.rcParams['axes.prop_cycle'] prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color'] colors = prop_cycle.by_key()['color']
for p, color in zip(projects, colors): for p, color in zip(projects, colors):
fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test) fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
fpr = 1.0 - fpr fpr = 1.0 - fpr
try: try:
roc_auc = auc(tpr, fpr) roc_auc = auc(tpr, fpr)
...@@ -46,21 +53,39 @@ def overlay_ROC(filename, *projects, **kwargs): ...@@ -46,21 +53,39 @@ def overlay_ROC(filename, *projects, **kwargs):
roc_auc = auc(tpr, fpr, reorder=True) roc_auc = auc(tpr, fpr, reorder=True)
ax.grid(color='gray', linestyle='--', linewidth=1) ax.grid(color='gray', linestyle='--', linewidth=1)
ax.plot(tpr, fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color) if show_auc:
label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
else:
label = p.name
ax.plot(tpr, fpr, label=label, color=color)
if plot_thresholds: if plot_thresholds:
ax2.plot(tpr, threshold, "--", color=color) ax2.plot(tpr, threshold, "--", color=color)
if lumifactor is not None:
sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)
if xlim is not None: if xlim is not None:
ax.set_xlim(*xlim) ax.set_xlim(*xlim)
if ylim is not None: if ylim is not None:
ax.set_ylim(*ylim) ax.set_ylim(*ylim)
if lumifactor is not None:
ax_abs_b.set_ylim((1-ax.get_ylim()[0])*sumw_b, (1-ax.get_ylim()[1])*sumw_b)
ax_abs_b.set_xlim(*ax.get_xlim())
ax_abs_s.set_xlim(ax.get_xlim()[0]*sumw_s, ax.get_xlim()[1]*sumw_s)
ax_abs_s.set_ylim(*ax.get_ylim())
ax_abs_b.set_ylabel("Number of background events")
ax_abs_s.set_xlabel("Number of signal events")
# plt.xticks(np.arange(0,1,0.1)) # plt.xticks(np.arange(0,1,0.1))
# plt.yticks(np.arange(0,1,0.1)) # plt.yticks(np.arange(0,1,0.1))
ax.legend(loc='lower left', framealpha=1.0) ax.legend(loc='lower left', framealpha=1.0)
ax.set_title('Receiver operating characteristic') if lumifactor is None:
ax.set_title('Receiver operating characteristic')
ax.set_ylabel("Background rejection") ax.set_ylabel("Background rejection")
ax.set_xlabel("Signal efficiency") ax.set_xlabel("Signal efficiency")
if plot_thresholds: if plot_thresholds or tight_layout:
# to fit right y-axis description # to fit right y-axis description
fig.tight_layout() fig.tight_layout()
return save_show(plt, fig, filename) return save_show(plt, fig, filename)
......
import keras.backend as K import keras.backend as K
from keras.layers import Masking, InputLayer
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None): def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
print('----- activations -----') print('----- activations -----')
...@@ -13,12 +12,8 @@ def get_activations(model, model_inputs, print_shape_only=False, layer_name=None ...@@ -13,12 +12,8 @@ def get_activations(model, model_inputs, print_shape_only=False, layer_name=None
inp = [inp] inp = [inp]
model_multi_inputs_cond = False model_multi_inputs_cond = False
# all layer outputs
# skip input and masking layers
outputs = [layer.output for layer in model.layers if outputs = [layer.output for layer in model.layers if
(layer.name == layer_name or layer_name is None) layer.name == layer_name or layer_name is None] # all layer outputs
and not isinstance(layer, InputLayer)
and not isinstance(layer, Masking)]
funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs] # evaluation functions funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs] # evaluation functions
......
...@@ -20,9 +20,9 @@ logger.addHandler(logging.NullHandler()) ...@@ -20,9 +20,9 @@ logger.addHandler(logging.NullHandler())
Some further plotting functions Some further plotting functions
""" """
def save_show(plt, fig, filename): def save_show(plt, fig, filename, **kwargs):
"Save a figure and show it in case we are in ipython or jupyter notebook." "Save a figure and show it in case we are in ipython or jupyter notebook."
fig.savefig(filename) fig.savefig(filename, **kwargs)
try: try:
get_ipython get_ipython
plt.show() plt.show()
...@@ -234,6 +234,114 @@ def plot_NN_vs_var_2D_all(plotname, model, means, ...@@ -234,6 +234,114 @@ def plot_NN_vs_var_2D_all(plotname, model, means,
save_show(plt, fig, plotname, bbox_inches='tight') save_show(plt, fig, plotname, bbox_inches='tight')
def plot_profile_2D_all(plotname, model, events,
valsx, valsy,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
transform_function=None,
varx_label=None,
vary_label=None,
zrange=None, logz=False,
plot_last_layer=False,
log_default_ymin=1e-5,
global_norm=True,
cmap="inferno", **kwargs):
"Similar to plot_profile_2D, but creates a grid of plots for all neurons."
# transform
if transform_function is not None:
events = transform_function(events)
logger.info("Reading activations for all neurons")
acts = get_activations(model, events, print_shape_only=True)
logger.info("Done")
if plot_last_layer:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts]
else:
n_neurons = [len(i.reshape(i.shape[0], -1)[0]) for i in acts[:-1]]
layers = len(n_neurons)
nrows_ncols = (layers, max(n_neurons))
fig = plt.figure(1, figsize=nrows_ncols)
grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols[::-1], axes_pad=0,
label_mode="1",
aspect=False,
cbar_location="top",
cbar_mode="single",
cbar_pad=.2,
cbar_size="5%",)
grid_array = np.array(grid)
grid_array = grid_array.reshape(*nrows_ncols[::-1])
global_min = None
global_max = None
logger.info("Creating profile histograms")
ims = []
reg_plots = []
for layer in range(layers):
neurons_acts = acts[layer]
neurons_acts = neurons_acts.reshape(neurons_acts.shape[0], -1)
for neuron in range(len(neurons_acts[0])):
acts_neuron = neurons_acts[:,neuron]
ax = grid_array[neuron][layer]
extra_opts = {}
if not (plot_last_layer and layer == layers-1):
# for hidden layers, plot the same z-scale
if logz:
norm = matplotlib.colors.LogNorm
else:
norm = matplotlib.colors.Normalize
if zrange is not None:
extra_opts["norm"] = norm(vmin=zrange[0], vmax=zrange[1])
else:
extra_opts["norm"] = norm(vmin=global_min, vmax=global_max)
hist, xedges, yedges = get_profile_2D(
valsx, valsy, acts_neuron,
nbinsx, xmin, xmax,
nbinsy, ymin, ymax,
**kwargs
)
if global_min is None or hist.min() < global_min:
global_min = hist.min()
if global_max is None or hist.max() > global_max:
global_max = hist.max()
X, Y = np.meshgrid(xedges, yedges)
reg_plots.append((layer, neuron, ax, (X, Y, hist), dict(cmap="inferno", linewidth=0, rasterized=True, **extra_opts)))
logger.info("Done")
logger.info("global_min: {}".format(global_min))
logger.info("global_max: {}".format(global_max))
if global_min <= 0 and logz:
global_min = log_default_ymin
logger.info("Changing global_min to {}".format(log_default_ymin))
for layer, neuron, ax, args, kwargs in reg_plots:
if zrange is None:
kwargs["norm"].vmin = global_min
kwargs["norm"].vmax = global_max
if not global_norm:
kwargs["norm"] = None
im = ax.pcolormesh(*args, **kwargs)
ax.set_facecolor("black")
if varx_label is not None:
ax.set_xlabel(varx_label)
if vary_label is not None:
ax.set_ylabel(vary_label)
ax.text(0., 0.5, "{}, {}".format(layer, neuron), transform=ax.transAxes, color="white")
cb = fig.colorbar(im, cax=grid[0].cax, orientation="horizontal")
cb.ax.xaxis.set_ticks_position('top')
cb.ax.xaxis.set_label_position('top')
logger.info("Rendering")
save_show(plt, fig, plotname, bbox_inches='tight')
logger.info("Done")
def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"): def plot_hist_2D(plotname, xedges, yedges, hist, varx_label=None, vary_label=None, log=False, zlabel="# of events"):
X, Y = np.meshgrid(xedges, yedges) X, Y = np.meshgrid(xedges, yedges)
......
#!/usr/bin/env python
import sys, argparse, re, random
parser = argparse.ArgumentParser(description="generate events that maximise the activation for a given neuron")
parser.add_argument("input_project")
parser.add_argument("output_file")
parser.add_argument("-n", "--nevents", default=100000, type=int)
parser.add_argument("-j", "--mask-jets", action="store_true",
help="mask variables called jet*Pt/Eta/Phi and generate a random uniform distribution of the number of jets (only nescessary for non-recurrent NN)")
args = parser.parse_args()
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
import h5py
import numpy as np
from KerasROOTClassification.utils import (
weighted_quantile,
get_max_activation_events,
create_random_event,
get_ranges
)
from KerasROOTClassification import load_from_dir
import meme
meme.setOptions(deactivated=True)
input_project = args.input_project
output_file = args.output_file
c = load_from_dir(input_project)
c._load_data()
ranges, mask_probs = get_ranges(c.transform(c.x_train), [0.01, 0.99], c.w_train_tot, mask_value=c.mask_value, max_evts=10000)
def mask_uniform(x, mask_value, recurrent_field_idx):
"""
Mask recurrent fields with a random (uniform) number of objects. Works in place.
"""
for rec_idx in recurrent_field_idx:
for evt in x:
masked = False
nobj = int(random.random()*(rec_idx.shape[1]+1))
for obj_number, line_idx in enumerate(rec_idx.reshape(*rec_idx.shape[1:])):
if obj_number == nobj:
masked=True
if masked:
evt[line_idx] = mask_value
def get_input_flat(x):
return x[0].reshape(-1, len(c.fields))
if args.mask_jets:
jet_fields = {}
for field_name in c.fields:
if any(field_name.startswith("jet") and field_name.endswith(suffix) for suffix in ["Pt", "Eta", "Phi"]):
jet_number = re.findall("[0-9]+", field_name)[0]
if not jet_number in jet_fields:
jet_fields[jet_number] = []
jet_fields[jet_number].append(c.fields.index(field_name))
jet_fields = [np.array([[v for k, v in sorted(jet_fields.items(), key=lambda l:l[0])]])]
def input_transform(x):
x = np.array(x)
if hasattr(c, "mask_uniform"):
c.mask_uniform(x)
return c.get_input_list(x)
elif args.mask_jets:
mask_uniform(x, c.mask_value, jet_fields)
return x
opt_kwargs = dict()
if hasattr(c, "mask_uniform"):
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = c.get_input_flat
if args.mask_jets:
opt_kwargs["input_transform"] = input_transform
opt_kwargs["input_inverse_transform"] = get_input_flat
evts = get_max_activation_events(
c.model, ranges,
ntries=args.nevents,
layer=len(c.model.layers)-1,
neuron=0,
maxit=10,
seed=45,
threshold=0,
**opt_kwargs
)
with h5py.File(output_file, "w") as f:
f.create_dataset("actmax", data=evts[1])
...@@ -2,30 +2,6 @@ ...@@ -2,30 +2,6 @@
import sys import sys
import argparse import argparse
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron') parser = argparse.ArgumentParser(description='Create various 2D plots for a single neuron')
parser.add_argument("project_dir") parser.add_argument("project_dir")
...@@ -53,6 +29,31 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa ...@@ -53,6 +29,31 @@ parser.add_argument("-s", "--step-size", help="step size for activation maximisa
args = parser.parse_args() args = parser.parse_args()
import logging
logging.basicConfig()
import numpy as np
import ROOT
ROOT.gROOT.SetBatch()
ROOT.PyConfig.IgnoreCommandLineOptions = True
from KerasROOTClassification import load_from_dir
from KerasROOTClassification.plotting import (
get_mean_event,
plot_NN_vs_var_2D,
plot_profile_2D,
plot_hist_2D_events,
plot_cond_avg_actmax_2D,
plot_NN_vs_var_2D_all,
)
from KerasROOTClassification.utils import (
get_single_neuron_function,
get_max_activation_events,
weighted_quantile,
get_ranges
)
if args.all_neurons and (not args.mode.startswith("mean")): if args.all_neurons and (not args.mode.startswith("mean")):
parser.error("--all-neurons currently only supported for mean_sig and mean_bkg") parser.error("--all-neurons currently only supported for mean_sig and mean_bkg")
...@@ -94,8 +95,8 @@ except AttributeError: ...@@ -94,8 +95,8 @@ except AttributeError:
# percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked]) # percentilesx = weighted_quantile(varx_test[x_not_masked], [0.01, 0.99], sample_weight=total_weights[x_not_masked])
# percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked]) # percentilesy = weighted_quantile(vary_test[y_not_masked], [0.01, 0.99], sample_weight=total_weights[y_not_masked])
percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index)[0] percentilesx = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=varx_index, max_evts=10000)[0][0]
percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index)[0] percentilesy = get_ranges(c.x_test, [0.01, 0.99], total_weights, mask_value=mask_value, filter_index=vary_index, max_evts=10000)[0][0]
if args.xrange is not None: if args.xrange is not None:
......
#!/usr/bin/env python
"""
Write new TTrees with signal parameters as branches. For the
backgrounds the parameters are generated following the total
distribution for all signals. The discrete values for the whole ntuple
of signal parameters are counted, such that correlations between
signal parameters are taken into account.
"""
import argparse, re, os
import ROOT
from root_numpy import list_trees
from root_pandas import read_root
import numpy as np
if __name__ == "__main__":
input_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
output_filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys_parametrized.root"
param_names = ["mg", "mc", "mn"]
param_match = "GG_oneStep_(.*?)_(.*?)_(.*?)_NoSys"
output_signal_treename = "GG_oneStep_NoSys"
bkg_trees = [
"diboson_Sherpa221_NoSys",
"singletop_NoSys",
"ttbar_NoSys",
"ttv_NoSys",
"wjets_Sherpa221_NoSys",
"zjets_Sherpa221_NoSys",
]
# read in the number of events for each combination of parameters
f = ROOT.TFile.Open(input_filename)
count_dict = {}
for key in f.GetListOfKeys():
tree_name = key.GetName()
match = re.match(param_match, tree_name)
if match is not None:
tree = f.Get(tree_name)
params = tuple([float(i) for i in match.groups()])
if not params in count_dict:
count_dict[params] = 0
# TODO: might be better to use sum of weights
count_dict[params] += tree.GetEntries()
f.Close()
# calculate cumulative sum of counts to sample signal parameters for background from
numbers = np.array(count_dict.keys(), dtype=np.float)
counts = np.array(count_dict.values(), dtype=np.float)
probs = counts/counts.sum()
prob_bins = np.cumsum(probs)
# read and write the rest in chunks
if os.path.exists(output_filename):
os.remove(output_filename)
for tree_name in list_trees(input_filename):
match_signal = re.match(param_match, tree_name)
if match_signal is not None or tree_name in bkg_trees:
print("Writing {}".format(tree_name))
nwritten = 0
for df in read_root(input_filename, tree_name, chunksize=100000):
print("Writing event {}".format(nwritten))
if match_signal is None:
rnd = np.random.random(len(df))
rnd_idx = np.digitize(rnd, prob_bins)
param_values = numbers[rnd_idx]
for param_idx, param_name in enumerate(param_names):
df[param_name] = param_values[:,param_idx]
df["training_weight"] = df["eventWeight"]*df["genWeight"]
else:
for param_name, param_value in zip(param_names, match_signal.groups()):
df[param_name] = float(param_value)
df["training_weight"] = df["eventWeight"]
if match_signal is None:
out_tree_name = tree_name
else:
out_tree_name = output_signal_treename
df.to_root(output_filename, mode="a", key=out_tree_name)
nwritten += len(df)
import pytest
import numpy as np
import root_numpy
import pandas as pd
from sklearn.datasets import make_classification
from keras.layers import GRU
from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
def create_dataset(path):
# create example dataset with (low-weighted) noise added
X, y = make_classification(n_samples=10000, random_state=1)
X2 = np.random.normal(size=20*10000).reshape(-1, 20)
y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
X = np.concatenate([X, X2])
y = np.concatenate([y, y2])
w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
# shift and scale randomly (to check if transformation is working)
shift = np.random.rand(20)*100
scale = np.random.rand(20)*1000
X *= scale
X += shift
# write to root files
branches = ["var_{}".format(i) for i in range(len(X[0]))]
df = pd.DataFrame(X, columns=branches)
df["class"] = y
df["weight"] = w
tree_path_bkg = str(path / "bkg.root")
tree_path_sig = str(path / "sig.root")
root_numpy.array2root(df[df["class"]==0].to_records(), tree_path_bkg)
root_numpy.array2root(df[df["class"]==1].to_records(), tree_path_sig)
return branches, tree_path_sig, tree_path_bkg
def test_ClassificationProject(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProject(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
def test_ClassificationProjectRNN(tmp_path):
branches, tree_path_sig, tree_path_bkg = create_dataset(tmp_path)
c = ClassificationProjectRNN(
str(tmp_path / "project"),
bkg_trees = [(tree_path_bkg, "tree")],
signal_trees = [(tree_path_sig, "tree")],
branches = branches,
recurrent_field_names=[
[
["var_1", "var_2", "var_3"],
["var_4", "var_5", "var_6"]
],
[
["var_10", "var_11", "var_12"],
["var_13", "var_14", "var_15"]
],
],
weight_expr = "weight",
identifiers = ["index"],
optimizer="Adam",
earlystopping_opts=dict(patience=5),
dropout=0.5,
layers=3,
nodes=128,
)
assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
This diff is collapsed.
...@@ -45,14 +45,19 @@ def create_random_event(ranges, mask_probs=None, mask_value=None): ...@@ -45,14 +45,19 @@ def create_random_event(ranges, mask_probs=None, mask_value=None):
return random_event return random_event
def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None): def get_ranges(x, quantiles, weights, mask_value=None, filter_index=None, max_evts=None):
"Get ranges for plotting or random event generation based on quantiles" "Get ranges for plotting or random event generation based on quantiles"
ranges = [] ranges = []
mask_probs = [] mask_probs = []
if max_evts is not None:
rnd_idx = np.random.permutation(np.arange(len(x)))
rnd_idx = rnd_idx[:max_evts]
for var_index in range(x.shape[1]): for var_index in range(x.shape[1]):
if (filter_index is not None) and (var_index != filter_index): if (filter_index is not None) and (var_index != filter_index):
continue continue
x_var = x[:,var_index] x_var = x[:,var_index]
if max_evts is not None:
x_var = x_var[rnd_idx]
not_masked = np.where(x_var != mask_value)[0] not_masked = np.where(x_var != mask_value)[0]
masked = np.where(x_var == mask_value)[0] masked = np.where(x_var == mask_value)[0]
ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=weights[not_masked])) ranges.append(weighted_quantile(x_var[not_masked], quantiles, sample_weight=weights[not_masked]))
...@@ -192,14 +197,26 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False ...@@ -192,14 +197,26 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
class WeightedRobustScaler(RobustScaler): class WeightedRobustScaler(RobustScaler):
def fit(self, X, y=None, weights=None): def fit(self, X, y=None, weights=None, mask_value=None):
if not np.isnan(X).any(): if not np.isnan(X).any() and mask_value is not None and weights is None:
# these checks don't work for nan values # these checks don't work for nan values
super(WeightedRobustScaler, self).fit(X, y) return super(WeightedRobustScaler, self).fit(X, y)
if weights is None:
return self
else: else:
wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])]) if weights is None:
weights = np.ones(len(self.X))
wqs = []
for i in range(X.shape[1]):
mask = ~np.isnan(X[:,i])
if mask_value is not None:
mask &= (X[:,i] != mask_value)
wqs.append(
weighted_quantile(
X[:,i][mask],
[0.25, 0.5, 0.75],
sample_weight=weights[mask]
)
)
wqs = np.array(wqs)
self.center_ = wqs[:,1] self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0] self.scale_ = wqs[:,2]-wqs[:,0]
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
......