Compare revisions
Changes are shown as if the source revision was being merged into the target revision.
Projects: Eric.Schanet/KerasROOTClassification, Nikolai.Hartmann/KerasROOTClassification
Commits on Source (25)
......@@ -22,6 +22,7 @@ def overlay_ROC(filename, *projects, **kwargs):
threshold_log = kwargs.pop("threshold_log", True)
lumifactor = kwargs.pop("lumifactor", None)
tight_layout = kwargs.pop("tight_layout", False)
show_auc = kwargs.pop("show_auc", True)
if kwargs:
raise KeyError("Unknown kwargs: {}".format(kwargs))
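The new show_auc option follows the function's existing pattern: pop each known keyword with a default, then raise on anything left over so misspelled options fail loudly. A minimal standalone sketch of that pattern (hypothetical function name):

def overlay_example(*projects, **kwargs):
    # pop known options with their defaults ...
    show_auc = kwargs.pop("show_auc", True)
    tight_layout = kwargs.pop("tight_layout", False)
    # ... then reject anything unrecognized instead of silently ignoring it
    if kwargs:
        raise KeyError("Unknown kwargs: {}".format(kwargs))
    return show_auc, tight_layout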
......@@ -43,7 +44,7 @@ def overlay_ROC(filename, *projects, **kwargs):
colors = prop_cycle.by_key()['color']
for p, color in zip(projects, colors):
- fpr, tpr, threshold = roc_curve(p.y_test, p.scores_test, sample_weight = p.w_test)
+ fpr, tpr, threshold = roc_curve(p.l_test, p.scores_test, sample_weight = p.w_test)
fpr = 1.0 - fpr
try:
roc_auc = auc(tpr, fpr)
......@@ -52,12 +53,16 @@ def overlay_ROC(filename, *projects, **kwargs):
roc_auc = auc(tpr, fpr, reorder=True)
ax.grid(color='gray', linestyle='--', linewidth=1)
- ax.plot(tpr, fpr, label=str(p.name+" (AUC = {:.3f})".format(roc_auc)), color=color)
+ if show_auc:
+ label = str(p.name+" (AUC = {:.3f})".format(roc_auc))
+ else:
+ label = p.name
+ ax.plot(tpr, fpr, label=label, color=color)
if plot_thresholds:
ax2.plot(tpr, threshold, "--", color=color)
if lumifactor is not None:
- sumw_b = p.w_test[p.y_test==0].sum()*lumifactor
- sumw_s = p.w_test[p.y_test==1].sum()*lumifactor
+ sumw_b = p.w_test[p.l_test==0].sum()*lumifactor
+ sumw_s = p.w_test[p.l_test==1].sum()*lumifactor
ax_abs_b.plot(tpr, (1.-fpr)*sumw_b, alpha=0)
ax_abs_b.invert_yaxis()
ax_abs_s.plot(tpr*sumw_s, fpr, alpha=0)
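For context on the bookkeeping above: roc_curve accepts per-event weights, the false-positive rate is flipped to background rejection (1 - fpr), and the AUC is then computed against the true-positive rate. A self-contained sketch with toy data:

import numpy as np
from sklearn.metrics import roc_curve, auc

# toy example: signal events (label 1) tend to score high
labels = np.array([1, 1, 1, 0, 0, 0])
scores = np.array([0.9, 0.8, 0.4, 0.35, 0.2, 0.1])
weights = np.array([1.0, 1.0, 0.5, 1.0, 1.0, 2.0])

fpr, tpr, thresholds = roc_curve(labels, scores, sample_weight=weights)
rejection = 1.0 - fpr  # background rejection, as plotted above
roc_auc = auc(tpr, rejection)  # older sklearn versions may need reorder=True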
......
......@@ -7,13 +7,24 @@ from keras.layers import GRU
from KerasROOTClassification import ClassificationProject, ClassificationProjectRNN
def create_dataset(path):
# create example dataset with (low-weighted) noise added
X, y = make_classification(n_samples=10000, random_state=1)
X2 = np.random.normal(size=20*10000).reshape(-1, 20)
y2 = np.concatenate([np.zeros(5000), np.ones(5000)])
X = np.concatenate([X, X2])
y = np.concatenate([y, y2])
w = np.concatenate([np.ones(10000), 0.01*np.ones(10000)])
# shift and scale randomly (to check if transformation is working)
shift = np.random.rand(20)*100
scale = np.random.rand(20)*1000
X *= scale
X += shift
# write to root files
branches = ["var_{}".format(i) for i in range(len(X[0]))]
df = pd.DataFrame(X, columns=branches)
df["class"] = y
......@@ -40,7 +51,10 @@ def test_ClassificationProject(tmp_path):
layers=3,
nodes=128,
)
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
......@@ -71,4 +85,6 @@ def test_ClassificationProjectRNN(tmp_path):
)
assert sum([isinstance(layer, GRU) for layer in c.model.layers]) == 2
c.train(epochs=200)
c.plot_all_inputs()
c.plot_loss()
assert min(c.history.history["val_loss"]) < 0.18
......@@ -38,9 +38,12 @@ from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, LSTM, concatenate, SimpleRNN
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
from keras.activations import relu
import keras.initializers
import keras.optimizers
from keras.utils.vis_utils import model_to_dot
from keras import backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile, poisson_asimov_significance
......@@ -64,6 +67,25 @@ if version_info[0] > 2:
byteify = lambda input : input
def set_session_threads(n_cpu=None):
"Set the number of threads based on OMP_NUM_THREADS or the given argument"
if n_cpu is None:
if os.environ.get('OMP_NUM_THREADS'):
n_cpu = int(os.environ.get('OMP_NUM_THREADS'))
else:
return
# not sure if this is the best configuration ...
config = tf.ConfigProto(intra_op_parallelism_threads=n_cpu,
inter_op_parallelism_threads=1,
allow_soft_placement=True,
#log_device_placement=True,
device_count = {'CPU': n_cpu})
session = tf.Session(config=config)
K.set_session(session)
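set_session_threads targets the TF1-era API (tf.ConfigProto and tf.Session, registered through the Keras backend), so it must run before the model is built. A usage sketch, assuming that environment:

import os

# as if running under a batch system that exports OMP_NUM_THREADS:
os.environ["OMP_NUM_THREADS"] = "4"
set_session_threads()  # n_cpu is None, so the environment variable is used

# or pin the thread count explicitly, ignoring the environment
set_session_threads(n_cpu=8)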
def load_from_dir(path):
"Load a project and the options from a directory"
try:
......@@ -100,6 +122,8 @@ class ClassificationProject(object):
:param branches: list of branch names or expressions to be used as input values for training
:param regression_branches: list of branch names to be used as regression targets
:param rename_branches: dictionary that maps branch expressions to names for better readability
:param weight_expr: expression to weight the events in the loss function
......@@ -138,6 +162,8 @@ class ClassificationProject(object):
:param activation_function_output: activation function in the output layer
:param leaky_relu_alpha: set this to a non-zero value to use the LeakyReLU variant with a slope in the negative part
:param out_dir: base directory in which the project directories should be stored
:param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")
......@@ -180,7 +206,9 @@ class ClassificationProject(object):
the first time. This seed (increased by one) is used again before
training when keras shuffling is used.
- :param loss: loss function name
+ :param loss: loss function name (or list of names in case of regression targets)
:param loss_weights: (optional) list of weights to weight the individual losses (for multiple targets)
:param mask_value: value that is used for non-existent entries (e.g. 4th jet pt in events with 3 jets)
......@@ -188,6 +216,10 @@ class ClassificationProject(object):
:param normalize_weights: normalize the weights to mean 1
:param ignore_neg_weights: ignore events with negative weights in training - not recommended! (default: False)
:param kernel_initializer: weight initializer for the dense layers - if None (default) the keras defaults are used
:param shuffle: shuffle training data after (and before first) epoch
"""
......@@ -224,6 +256,7 @@ class ClassificationProject(object):
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr,
regression_branches=None,
rename_branches=None,
project_dir=None,
data_dir=None,
......@@ -239,6 +272,7 @@ class ClassificationProject(object):
kfold_splits=None,
kfold_index=0,
activation_function='relu',
leaky_relu_alpha=None,
activation_function_output='sigmoid',
scaler_type="WeightedRobustScaler",
step_signal=2,
......@@ -257,9 +291,12 @@ class ClassificationProject(object):
shuffle_seed=42,
balance_dataset=False,
loss='binary_crossentropy',
loss_weights=None,
mask_value=None,
apply_class_weight=True,
normalize_weights=True,
ignore_neg_weights=False,
kernel_initializer=None,
shuffle=True,
):
......@@ -270,6 +307,9 @@ class ClassificationProject(object):
if rename_branches is None:
rename_branches = {}
self.rename_branches = rename_branches
if regression_branches is None:
regression_branches = []
self.regression_branches = regression_branches
self.weight_expr = weight_expr
self.selection = selection
......@@ -308,6 +348,9 @@ class ClassificationProject(object):
self.kfold_splits = kfold_splits
self.kfold_index = kfold_index
self.activation_function = activation_function
self.leaky_relu_alpha = leaky_relu_alpha
if self.activation_function == "relu" and self.leaky_relu_alpha:
self.activation_function = lambda x : relu(x, alpha=self.leaky_relu_alpha)
self.activation_function_output = activation_function_output
self.scaler_type = scaler_type
self.step_signal = step_signal
......@@ -340,9 +383,15 @@ class ClassificationProject(object):
self.shuffle_seed = shuffle_seed
self.balance_dataset = balance_dataset
self.loss = loss
self.loss_weights = loss_weights
if self.regression_branches and (not isinstance(self.loss, list)):
self.loss = [self.loss]+["mean_squared_error"]*len(self.regression_branches)
self.mask_value = mask_value
self.apply_class_weight = apply_class_weight
self.normalize_weights = normalize_weights
self.ignore_neg_weights = ignore_neg_weights
self.kernel_initializer = kernel_initializer
self.shuffle = shuffle
self.s_train = None
......@@ -366,6 +415,7 @@ class ClassificationProject(object):
self._b_eventlist_train = None
self._scaler = None
self._scaler_target = None
self._class_weight = None
self._balanced_class_weight = None
self._model = None
......@@ -377,12 +427,12 @@ class ClassificationProject(object):
self.total_epochs = 0
self.data_loaded = False
- self.data_transformed = False
# track if we are currently training
self.is_training = False
self._fields = None
+ self._target_fields = None
@property
......@@ -395,6 +445,16 @@ class ClassificationProject(object):
return self._fields
@property
def target_fields(self):
"Renamed branch expressions for regression targets"
if self._target_fields is None:
self._target_fields = []
for branch_expr in self.regression_branches:
self._target_fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._target_fields
def rename_fields(self, ar):
"Rename fields of structured array"
fields = list(ar.dtype.names)
......@@ -425,22 +485,26 @@ class ClassificationProject(object):
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
- branches=self.branches+[self.weight_expr]+self.identifiers,
+ branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain,
- branches=self.branches+[self.weight_expr]+self.identifiers,
+ branches=set(self.branches+self.regression_branches+[self.weight_expr]+self.identifiers),
selection=self.selection,
start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain,
- branches=self.branches+[self.weight_expr],
+ branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain,
- branches=self.branches+[self.weight_expr],
+ branches=set(self.branches+self.regression_branches+[self.weight_expr]),
selection=self.selection,
start=1, step=self.step_bkg, stop=self.stop_test)
if self.ignore_neg_weights:
self.s_train = self.s_train[self.s_train[self.weight_expr]>0]
self.b_train = self.b_train[self.b_train[self.weight_expr]>0]
self.rename_fields(self.s_train)
self.rename_fields(self.b_train)
self.rename_fields(self.s_test)
......@@ -450,19 +514,27 @@ class ClassificationProject(object):
self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self._dump_training_list()
# now we don't need the identifiers anymore
self.s_train = self.s_train[self.fields+[self.weight_expr]]
self.b_train = self.b_train[self.fields+[self.weight_expr]]
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
- self.y_train = np.empty(len(self.x_train), dtype=np.bool)
- self.y_train[:len(self.s_train)] = 1
- self.y_train[len(self.s_train):] = 0
+ def fill_target(x, s, b):
+ if not self.target_fields:
+ y = np.empty(len(x), dtype=np.bool)
+ y[:len(s)] = 1
+ y[len(s):] = 0
+ else:
+ y = np.empty((len(x), 1+len(self.target_fields)), dtype=np.float)
+ y[:len(s),0] = 1
+ y[len(s):,0] = 0
+ y[:len(s),1:] = rec2array(s[self.target_fields])
+ y[len(s):,1:] = rec2array(b[self.target_fields])
+ return y
+ self.y_train = fill_target(self.x_train, self.s_train, self.b_train)
self.b_train = None
self.s_train = None
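fill_target packs labels and regression targets into one 2D array: column 0 carries the class label, the remaining columns the targets, which get_output_list later splits back into one array per model output. A small sketch of that layout, assuming two hypothetical target fields:

import numpy as np

n_sig, n_bkg, n_targets = 3, 2, 2
y = np.empty((n_sig + n_bkg, 1 + n_targets))
y[:n_sig, 0] = 1  # signal labels in column 0
y[n_sig:, 0] = 0  # background labels
y[:, 1:] = np.random.normal(size=(n_sig + n_bkg, n_targets))  # targets

# the get_output_list-style split: one (n, 1) column per model output
outputs = np.hsplit(y, 1 + n_targets)
print([o.shape for o in outputs])  # [(5, 1), (5, 1), (5, 1)]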
......@@ -470,9 +542,8 @@ class ClassificationProject(object):
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
- self.y_test = np.empty(len(self.x_test), dtype=np.bool)
- self.y_test[:len(self.s_test)] = 1
- self.y_test[len(self.s_test):] = 0
+ self.y_test = fill_target(self.x_test, self.s_test, self.b_test)
self.b_test = None
self.s_test = None
......@@ -580,6 +651,7 @@ class ClassificationProject(object):
elif self.scaler_type == "WeightedRobustScaler":
self._scaler = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
scaler_fit_kwargs["mask_value"] = self.mask_value
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
......@@ -591,6 +663,39 @@ class ClassificationProject(object):
return self._scaler
@property
def scaler_target(self):
"same as scaler, but for scaling regression targets"
# create the scaler (and fit to training data) if not existent
if self._scaler_target is None:
filename = os.path.join(self.project_dir, "scaler_target.pkl")
try:
self._scaler_target = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
except IOError:
logger.info("Creating new {} for scaling the targets".format(self.scaler_type))
scaler_fit_kwargs = dict()
if self.scaler_type == "StandardScaler":
self._scaler_target = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler_target = RobustScaler()
elif self.scaler_type == "WeightedRobustScaler":
self._scaler_target = WeightedRobustScaler()
scaler_fit_kwargs["weights"] = self.w_train_tot
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
orig_copy_setting = self.scaler.copy
self.scaler.copy = False
self._scaler_target.fit(self.y_train, **scaler_fit_kwargs)
# I don't want to scale the classification target here
self._scaler_target.center_[0] = 0.
self._scaler_target.scale_[0] = 1.
self.scaler.copy = orig_copy_setting
joblib.dump(self._scaler_target, filename)
return self._scaler_target
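Overriding center_[0] and scale_[0] makes the fitted scaler act as the identity on the label column, so only the regression columns are rescaled (and inverse_transform leaves the labels intact). The effect on a plain RobustScaler:

import numpy as np
from sklearn.preprocessing import RobustScaler

y = np.column_stack([
    [0., 1., 0., 1.],          # classification labels in column 0
    [100., 200., 300., 400.],  # a regression target
])
scaler = RobustScaler().fit(y)
scaler.center_[0] = 0.  # identity transform for the label column
scaler.scale_[0] = 1.
print(scaler.transform(y)[:, 0])  # still [0., 1., 0., 1.]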
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
......@@ -618,6 +723,24 @@ class ClassificationProject(object):
return self.scaler.inverse_transform(x)
def transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.transform, batch_size)
else:
return self.scaler_target.transform(y)
def inverse_transform_target(self, y, batch_size=10000):
if not self.target_fields:
return y
if self.mask_value is not None:
return self._batch_transform(y, self.scaler_target.inverse_transform, batch_size)
else:
return self.scaler_target.inverse_transform(y)
@property
def history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
......@@ -649,30 +772,6 @@ class ClassificationProject(object):
json.dump(self.history.history, of)
- def _transform_data(self):
- if not self.data_transformed:
- if self.mask_value is not None:
- self.x_train[self.x_train == self.mask_value] = np.nan
- self.x_test[self.x_test == self.mask_value] = np.nan
- if logger.level <= logging.DEBUG:
- logger.debug("training data before transformation: {}".format(self.x_train))
- logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
- for i in range(self.x_train.shape[1])]))
- logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
- for i in range(self.x_train.shape[1])]))
- orig_copy_setting = self.scaler.copy
- self.scaler.copy = False
- self.x_train = self.scaler.transform(self.x_train)
- logger.debug("training data after transformation: {}".format(self.x_train))
- self.x_test = self.scaler.transform(self.x_test)
- self.scaler.copy = orig_copy_setting
- if self.mask_value is not None:
- self.x_train[np.isnan(self.x_train)] = self.mask_value
- self.x_test[np.isnan(self.x_test)] = self.mask_value
- self.data_transformed = True
- logger.info("Training and test data transformed")
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
......@@ -714,6 +813,7 @@ class ClassificationProject(object):
if self._model is None:
# input
input_layer = Input((len(self.fields),))
......@@ -729,14 +829,31 @@ class ClassificationProject(object):
self.dropout,
self.use_bias,
):
- hidden_layer = Dense(node_count, activation=self.activation_function, use_bias=use_bias)(hidden_layer)
+ extra_opts = dict()
+ if self.kernel_initializer is not None:
+ extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
+ hidden_layer = Dense(node_count, activation=self.activation_function, use_bias=use_bias, **extra_opts)(hidden_layer)
if (dropout_fraction is not None) and (dropout_fraction > 0):
hidden_layer = Dropout(rate=dropout_fraction)(hidden_layer)
- # one output node for binary classification
- output_layer = Dense(1, activation=self.activation_function_output)(hidden_layer)
+ # optional regression targets
+ extra_targets = []
+ for target_field in self.target_fields:
+ extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(hidden_layer)
+ extra_targets.append(extra_target)
+ if not self.target_fields:
+ # one output node for binary classification
+ output_layer = Dense(1, activation=self.activation_function_output)(hidden_layer)
+ outputs = [output_layer]
+ else:
+ # add another hidden layer on top of the regression targets and previous hidden layers
+ merge = concatenate([hidden_layer]+extra_targets)
+ hidden_layer2 = Dense(64, activation=self.activation_function)(merge)
+ output_class = Dense(1, activation=self.activation_function_output)(hidden_layer2)
+ outputs = [output_class]+extra_targets
- self._model = Model(inputs=[input_layer], outputs=[output_layer])
+ self._model = Model(inputs=[input_layer], outputs=outputs)
self._compile_or_load_model()
return self._model
......@@ -751,6 +868,7 @@ class ClassificationProject(object):
np.random.seed(self.random_seed)
self._model.compile(optimizer=optimizer,
loss=self.loss,
loss_weights=self.loss_weights,
weighted_metrics=['accuracy']
)
np.random.set_state(rn_state)
......@@ -781,8 +899,8 @@ class ClassificationProject(object):
@property
def class_weight(self):
if self._class_weight is None:
- sumw_bkg = np.sum(self.w_train[self.y_train == 0])
- sumw_sig = np.sum(self.w_train[self.y_train == 1])
+ sumw_bkg = np.sum(self.w_train[self.l_train == 0])
+ sumw_sig = np.sum(self.w_train[self.l_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated class_weight: {}".format(self._class_weight))
return self._class_weight
......@@ -797,11 +915,11 @@ class ClassificationProject(object):
event with class weights
"""
if self._balanced_class_weight is None:
- sumw_bkg = np.sum(self.w_train[self.y_train == 0])
- sumw_sig = np.sum(self.w_train[self.y_train == 1])
+ sumw_bkg = np.sum(self.w_train[self.l_train == 0])
+ sumw_sig = np.sum(self.w_train[self.l_train == 1])
# use sumw *per event* in this case
- sumw_bkg /= len(self.w_train[self.y_train == 0])
- sumw_sig /= len(self.w_train[self.y_train == 1])
+ sumw_bkg /= len(self.w_train[self.l_train == 0])
+ sumw_sig /= len(self.w_train[self.l_train == 1])
self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight))
return self._balanced_class_weight
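Both properties use the same formula: each class is weighted by (sumw_sig + sumw_bkg) / (2 * sumw_class), so that the two classes contribute equally to the loss; the balanced variant first divides each sum by the number of events in that class. A quick numeric check:

import numpy as np

w = np.array([1., 1., 1., 1., 2., 2.])
labels = np.array([0, 0, 0, 0, 1, 1])
sumw_bkg = w[labels == 0].sum()  # 4.0
sumw_sig = w[labels == 1].sum()  # 4.0
class_weight = [(sumw_sig + sumw_bkg) / (2 * sumw_bkg),
                (sumw_sig + sumw_bkg) / (2 * sumw_sig)]
print(class_weight)  # [1.0, 1.0] -- the classes already balance

# per-event total weight, as in w_train_tot
w_tot = w * np.array(class_weight)[labels]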
......@@ -812,13 +930,27 @@ class ClassificationProject(object):
if reload:
self.data_loaded = False
- self.data_transformed = False
if not self.data_loaded:
self._load_data()
- if not self.data_transformed:
- self._transform_data()
@property
def l_train(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_train
else:
return self.y_train[:,0]
@property
def l_test(self):
"labels (in case y contains regression targets)"
if not self.target_fields:
return self.y_test
else:
return self.y_test[:,0]
@property
......@@ -832,7 +964,7 @@ class ClassificationProject(object):
raise ValueError("Data not loaded! can't calculate total weight")
if self._w_train_tot is None:
if self.apply_class_weight:
- self._w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)]
+ self._w_train_tot = self.w_train*np.array(class_weight)[self.l_train.astype(int)]
else:
self._w_train_tot = np.array(self.w_train)
if self.normalize_weights:
......@@ -842,20 +974,24 @@ class ClassificationProject(object):
@property
def validation_data(self):
"Validation data for loss evaluation"
"(Transformed) validation data for loss evaluation"
idx = self.train_val_idx[1]
x_val, y_val, w_val = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
- x_val_input = self.get_input_list(x_val)
- return x_val_input, y_val, w_val
+ x_val_input = self.get_input_list(self.transform(x_val))
+ y_val_output = self.get_output_list(self.transform_target(y_val))
+ w_val_list = self.get_weight_list(w_val)
+ return x_val_input, y_val_output, w_val_list
@property
def training_data(self):
"Training data with validation data split off"
"(Transformed) Training data with validation data split off"
idx = self.train_val_idx[0]
x_train, y_train, w_train = self.x_train[idx], self.y_train[idx], self.w_train_tot[idx]
- x_train_input = self.get_input_list(x_train)
- return x_train_input, y_train, w_train
+ x_train_input = self.get_input_list(self.transform(x_train))
+ y_train_output = self.get_output_list(self.transform_target(y_train))
+ w_train_list = self.get_weight_list(w_train)
+ return x_train_input, y_train_output, w_train_list
@property
......@@ -890,14 +1026,30 @@ class ClassificationProject(object):
return x
def get_output_list(self, y):
"Split target vector column wise in case of regression targets"
if not self.target_fields:
return y
else:
return np.hsplit(y, len(self.target_fields)+1)
def get_weight_list(self, w):
"Repeat weight n times for regression targets"
if not self.target_fields:
return w
else:
return [w]*(len(self.target_fields)+1)
def yield_batch(self):
"Batch generator - optionally shuffle the indices after each epoch"
x_train, y_train, w_train = self.x_train, self.y_train, self.w_train_tot
train_idx = list(self.train_val_idx[0])
np.random.seed(self.shuffle_seed+1)
logger.info("Generating training batches from {} signal and {} background events"
- .format(len(np.where(self.y_train[train_idx]==1)[0]),
- len(np.where(self.y_train[train_idx]==0)[0])))
+ .format(len(np.where(self.l_train[train_idx]==1)[0]),
+ len(np.where(self.l_train[train_idx]==0)[0])))
while True:
if self.shuffle:
shuffled_idx = np.random.permutation(train_idx)
......@@ -907,8 +1059,10 @@ class ClassificationProject(object):
x_batch = x_train[shuffled_idx[start:start+int(self.batch_size)]]
y_batch = y_train[shuffled_idx[start:start+int(self.batch_size)]]
w_batch = w_train[shuffled_idx[start:start+int(self.batch_size)]]
- x_input = self.get_input_list(x_batch)
- yield (x_input, y_batch, w_batch)
+ x_input = self.get_input_list(self.transform(x_batch))
+ y_output = self.get_output_list(self.transform_target(y_batch))
+ w_list = self.get_weight_list(w_batch)
+ yield (x_input, y_output, w_list)
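yield_batch now applies the scaler transformations per batch instead of relying on pre-transformed arrays. Consuming such a generator is standard Keras; a hedged sketch, with project standing in for a ClassificationProject instance:

# sketch: feed the (x, y, w) batch generator to fit_generator
generator = project.yield_batch()
project.model.fit_generator(
    generator,
    steps_per_epoch=int(len(project.w_train_tot) / project.batch_size),
    epochs=10,
    validation_data=project.validation_data,
)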
def yield_single_class_batch(self, class_label):
......@@ -917,7 +1071,8 @@ class ClassificationProject(object):
The weights are multiplied by balanced_class_weight.
"""
x_train, y_train, w_train = self.training_data
- class_idx = np.where(y_train==class_label)[0]
+ l_train = y_train[:,0] if self.target_fields else y_train
+ class_idx = np.where(l_train==class_label)[0]
while True:
# shuffle the indices for this class label
if self.shuffle:
......@@ -952,6 +1107,8 @@ class ClassificationProject(object):
self.total_epochs = self._read_info("epochs", 0)
set_session_threads()
logger.info("Train model")
if not self.balance_dataset:
try:
......@@ -968,7 +1125,7 @@ class ClassificationProject(object):
else:
try:
self.is_training = True
- labels, label_counts = np.unique(self.y_train, return_counts=True)
+ labels, label_counts = np.unique(self.l_train, return_counts=True)
logger.info("Training on balanced batches")
# note: the batches have balanced_class_weight already applied
self.model.fit_generator(self.yield_balanced_batch(),
......@@ -1005,23 +1162,40 @@ class ClassificationProject(object):
self._write_info("epochs", self.total_epochs)
- def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
+ logger.info("Reloading (and re-transforming) training data")
+ self.load(reload=True)
+ def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
+ "Calculate scores for training and test sample"
if mode is not None:
self._write_info("scores_mode", mode)
logger.info("Create/Update scores for train/test sample")
def eval_score(data_name):
logger.info("Create/Update scores for {} sample".format(data_name))
n_events = len(getattr(self, "x_"+data_name))
setattr(self, "scores_"+data_name, np.empty(n_events))
for start in range(0, n_events, batch_size):
stop = start+batch_size
outputs = self.predict(
self.get_input_list(self.transform(getattr(self, "x_"+data_name)[start:stop])),
mode=mode
)
if not self.target_fields:
scores_batch = outputs.reshape(-1)
else:
scores_batch = outputs[0].reshape(-1)
getattr(self, "scores_"+data_name)[start:stop] = scores_batch
self._dump_to_hdf5("scores_"+data_name)
if do_test:
- self.scores_test = self.predict(self.x_test, mode=mode).reshape(-1)
- self._dump_to_hdf5("scores_test")
+ eval_score("test")
if do_train:
- self.scores_train = self.predict(self.x_train, mode=mode).reshape(-1)
- self._dump_to_hdf5("scores_train")
+ eval_score("train")
def predict(self, x, mode=None):
"""
Calculate the scores for a (transformed) array of input values.
If the array is not transformed, use `evaluate` instead
"""
if mode is None:
# normal output - after activation function output layer
return self.model.predict(x)
......@@ -1041,6 +1215,10 @@ class ClassificationProject(object):
def evaluate(self, x_eval, mode=None):
"""
Calculate the scores for an array of input values.
All necessary transformations are applied.
"""
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
logger.debug("Evaluate for transformed array: {}".format(x_eval))
......@@ -1053,6 +1231,7 @@ class ClassificationProject(object):
batch_size=100000,
score_mode=None,
fixed_params=None):
"TODO: doesn't work for regression targets"
f = ROOT.TFile.Open(source_filename)
tree = f.Get(source_treename)
entries = tree.GetEntries()
......@@ -1123,17 +1302,57 @@ class ClassificationProject(object):
return centers, hist, errors
- def plot_input(self, var_index, ax=None):
- "plot a single input variable"
+ def plot_input(self, var_index, ax=None, from_training_batches=False, max_n_batches=None):
+ """
+ plot a single input variable as a histogram (signal vs background)
+ :param from_training_batches: use data from training batch generator
+ :param max_n_batches: if training batch generator is used, just use
+ this number of batches (otherwise steps_per_epoch is used)
+ """
branch = self.fields[var_index]
if ax is None:
fig, ax = plt.subplots()
else:
fig = None
- bkg = self.x_train[:,var_index][self.y_train == 0]
- sig = self.x_train[:,var_index][self.y_train == 1]
- bkg_weights = self.w_train_tot[self.y_train == 0]
- sig_weights = self.w_train_tot[self.y_train == 1]
+ if not from_training_batches:
+ bkg = self.x_train[:,var_index][self.l_train == 0]
+ sig = self.x_train[:,var_index][self.l_train == 1]
+ bkg_weights = self.w_train_tot[self.l_train == 0]
+ sig_weights = self.w_train_tot[self.l_train == 1]
+ else:
+ bkg = None
+ sig = None
+ bkg_weights = None
+ sig_weights = None
+ if max_n_batches is not None:
+ n_batches = max_n_batches
+ else:
+ n_batches = self.steps_per_epoch
+ for i_batch, (x, y, w) in enumerate(self.yield_batch()):
+ if i_batch > n_batches:
+ break
+ if self.target_fields:
+ y = y[0]
+ try:
+ x = self.get_input_flat(x)
+ except NameError:
+ pass
+ bkg_batch = x[:,var_index][y==0]
+ sig_batch = x[:,var_index][y==1]
+ bkg_weights_batch = w[y==0]
+ sig_weights_batch = w[y==1]
+ if bkg is None:
+ bkg = bkg_batch
+ sig = sig_batch
+ bkg_weights = bkg_weights_batch
+ sig_weights = sig_weights_batch
+ else:
+ bkg = np.concatenate([bkg, bkg_batch])
+ sig = np.concatenate([sig, sig_batch])
+ bkg_weights = np.concatenate([bkg_weights, bkg_weights_batch])
+ sig_weights = np.concatenate([sig_weights, sig_weights_batch])
if hasattr(self, "mask_value"):
bkg_not_masked = np.where(bkg != self.mask_value)[0]
......@@ -1184,13 +1403,14 @@ class ClassificationProject(object):
centers_sig, hist_sig, _ = self.get_bin_centered_hist(sig, bins=bins, range=plot_range, weights=sig_weights)
centers_bkg, hist_bkg, _ = self.get_bin_centered_hist(bkg, bins=bins, range=plot_range, weights=bkg_weights)
- width = centers_sig[1]-centers_sig[0]
+ if bins > 1:
+ width = centers_sig[1]-centers_sig[0]
+ else:
+ width = 1.
ax.bar(centers_bkg, hist_bkg, color="b", alpha=0.5, width=width)
ax.bar(centers_sig, hist_sig, color="r", alpha=0.5, width=width)
label = branch
- if self.data_transformed:
- label += " (transformed)"
ax.set_xlabel(label)
if fig is not None:
plot_dir = os.path.join(self.project_dir, "plots")
......@@ -1199,20 +1419,20 @@ class ClassificationProject(object):
return save_show(plt, fig, os.path.join(plot_dir, "var_{}.pdf".format(var_index)))
- def plot_all_inputs(self):
+ def plot_all_inputs(self, **kwargs):
nrows = math.ceil(math.sqrt(len(self.fields)))
fig, axes = plt.subplots(nrows=int(nrows), ncols=int(nrows),
figsize=(3*nrows, 3*nrows),
gridspec_kw=dict(wspace=0.4, hspace=0.4))
for i in range(len(self.fields)):
- self.plot_input(i, ax=axes.reshape(-1)[i])
+ self.plot_input(i, ax=axes.reshape(-1)[i], **kwargs)
return save_show(plt, fig, os.path.join(self.project_dir, "all_inputs.pdf"))
def plot_weights(self, bins=100, range=None):
fig, ax = plt.subplots()
- bkg = self.w_train_tot[self.y_train == 0]
- sig = self.w_train_tot[self.y_train == 1]
+ bkg = self.w_train_tot[self.l_train == 0]
+ sig = self.w_train_tot[self.l_train == 1]
ax.hist(bkg, bins=bins, range=range, color="b", alpha=0.5)
ax.set_yscale("log")
save_show(plt, fig, os.path.join(self.project_dir, "eventweights_bkg.pdf"))
......@@ -1230,8 +1450,8 @@ class ClassificationProject(object):
ax.grid(color='gray', linestyle='--', linewidth=1)
for y, scores, weight, label in [
- (self.y_train, self.scores_train, self.w_train, "train"),
- (self.y_test, self.scores_test, self.w_test, "test")
+ (self.l_train, self.scores_train, self.w_train, "train"),
+ (self.l_test, self.scores_test, self.w_test, "test")
]:
fpr, tpr, threshold = roc_curve(y, scores, sample_weight = weight)
fpr = 1.0 - fpr # background rejection
......@@ -1264,10 +1484,10 @@ class ClassificationProject(object):
trf = lambda y : y
fig, ax = plt.subplots()
for scores, weights, y, class_label, fn, opts in [
- (self.scores_train, self.w_train, self.y_train, 1, ax.bar, dict(color="r", label="signal train")),
- (self.scores_train, self.w_train, self.y_train, 0, ax.bar, dict(color="b", label="background train")),
- (self.scores_test, self.w_test, self.y_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
- (self.scores_test, self.w_test, self.y_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
+ (self.scores_train, self.w_train, self.l_train, 1, ax.bar, dict(color="r", label="signal train")),
+ (self.scores_train, self.w_train, self.l_train, 0, ax.bar, dict(color="b", label="background train")),
+ (self.scores_test, self.w_test, self.l_test, 1, ax.errorbar, dict(fmt="ro", label="signal test")),
+ (self.scores_test, self.w_test, self.l_test, 0, ax.errorbar, dict(fmt="bo", label="background test")),
]:
weights = weights[y==class_label]
if apply_class_weight is True and (lumifactor is not None):
......@@ -1320,16 +1540,16 @@ class ClassificationProject(object):
else:
trf = lambda y : y
- centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==1].reshape(-1)), weights=self.w_train[self.y_train==1], **plot_opts)
- centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.y_train==0].reshape(-1)), weights=self.w_train[self.y_train==0], **plot_opts)
- centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==1].reshape(-1)), weights=self.w_test[self.y_test==1], **plot_opts)
- centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.y_test==0].reshape(-1)), weights=self.w_test[self.y_test==0], **plot_opts)
+ centers_sig_train, hist_sig_train, rel_errors_sig_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==1].reshape(-1)), weights=self.w_train[self.l_train==1], **plot_opts)
+ centers_bkg_train, hist_bkg_train, rel_errors_bkg_train = self.get_bin_centered_hist(trf(self.scores_train[self.l_train==0].reshape(-1)), weights=self.w_train[self.l_train==0], **plot_opts)
+ centers_sig_test, hist_sig_test, rel_errors_sig_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==1].reshape(-1)), weights=self.w_test[self.l_test==1], **plot_opts)
+ centers_bkg_test, hist_bkg_test, rel_errors_bkg_test = self.get_bin_centered_hist(trf(self.scores_test[self.l_test==0].reshape(-1)), weights=self.w_test[self.l_test==0], **plot_opts)
significances_train = []
significances_test = []
- for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances, w, y in [
- (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train, self.w_train, self.y_train),
- (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test, self.w_test, self.y_test),
+ for hist_sig, hist_bkg, rel_errors_sig, rel_errors_bkg, significances in [
+ (hist_sig_train, hist_bkg_train, rel_errors_sig_train, rel_errors_bkg_train, significances_train),
+ (hist_sig_test, hist_bkg_test, rel_errors_sig_test, rel_errors_bkg_test, significances_test),
]:
# factor to rescale due to using only a fraction of events (training and test samples)
# normfactor_sig = (np.sum(self.w_train[self.y_train==1])+np.sum(self.w_test[self.y_test==1]))/np.sum(w[y==1])
......@@ -1418,8 +1638,8 @@ class ClassificationProject(object):
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
for (scores, y, w, label), col in zip(
- [(self.scores_train, self.y_train, self.w_train, "train"),
- (self.scores_test, self.y_test, self.w_test, "test")],
+ [(self.scores_train, self.l_train, self.w_train, "train"),
+ (self.scores_test, self.l_test, self.w_test, "test")],
colors
):
scores = trf(scores)
......@@ -1483,19 +1703,19 @@ class ClassificationProject(object):
hist_dict = self.csv_hist
logger.info("Plot losses")
- plt.plot(hist_dict['loss'])
- plt.plot(hist_dict['val_loss'])
- plt.ylabel('loss')
- plt.xlabel('epoch')
- plt.legend(['training data','validation data'], loc='upper left')
+ fig, ax = plt.subplots()
+ ax.plot(hist_dict['loss'])
+ ax.plot(hist_dict['val_loss'])
+ ax.set_ylabel('loss')
+ ax.set_xlabel('epoch')
+ ax.legend(['training data','validation data'], loc='upper left')
if log:
plt.yscale("log")
ax.set_yscale("log")
if xlim is not None:
- plt.xlim(*xlim)
+ ax.set_xlim(*xlim)
if ylim is not None:
- plt.ylim(*ylim)
- plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
- plt.clf()
+ ax.set_ylim(*ylim)
+ return save_show(plt, fig, os.path.join(self.project_dir, "losses.pdf"))
def plot_accuracy(self, all_trainings=False, log=False, acc_suffix="weighted_acc"):
......@@ -1541,7 +1761,7 @@ class ClassificationProject(object):
df = pd.DataFrame(np.concatenate([self.x_train, self.x_test]), columns=self.fields)
df["weight"] = np.concatenate([self.w_train, self.w_test])
df["labels"] = pd.Categorical.from_codes(
- np.concatenate([self.y_train, self.y_test]),
+ np.concatenate([self.l_train, self.l_test]),
categories=["background", "signal"]
)
for identifier in self.identifiers:
......@@ -1699,7 +1919,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
if reload:
self.data_loaded = False
- self.data_transformed = False
self._x_train = None
self._x_test = None
self._y_train = None
......@@ -1710,9 +1929,6 @@ class ClassificationProjectDataFrame(ClassificationProject):
self.data_loaded = True
- if not self.data_transformed:
- self._transform_data()
class ClassificationProjectRNN(ClassificationProject):
......@@ -1804,11 +2020,21 @@ class ClassificationProjectRNN(ClassificationProject):
flat_channel = flat_input
combined = concatenate(rnn_channels+[flat_channel])
for node_count, dropout_fraction in zip(self.nodes, self.dropout):
- combined = Dense(node_count, activation=self.activation_function)(combined)
+ extra_opts = dict()
+ if self.kernel_initializer is not None:
+ extra_opts["kernel_initializer"] = getattr(keras.initializers, self.kernel_initializer)()
+ combined = Dense(node_count, activation=self.activation_function, **extra_opts)(combined)
if (dropout_fraction is not None) and (dropout_fraction > 0):
combined = Dropout(rate=dropout_fraction)(combined)
combined = Dense(1, activation=self.activation_function_output)(combined)
- self._model = Model(inputs=rnn_inputs+[flat_input], outputs=combined)
+ outputs = [combined]
+ # optional regression targets
+ for target_field in self.target_fields:
+ extra_target = Dense(1, activation="linear", name="target_{}".format(target_field))(combined)
+ outputs.append(extra_target)
+ self._model = Model(inputs=rnn_inputs+[flat_input], outputs=outputs)
self._compile_or_load_model()
return self._model
......@@ -1874,33 +2100,6 @@ class ClassificationProjectRNN(ClassificationProject):
return x_flat
- def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000, mode=None):
- logger.info("Reloading (and re-transforming) unshuffled training data")
- self.load(reload=True)
- if mode is not None:
- self._write_info("scores_mode", mode)
- def eval_score(data_name):
- logger.info("Create/Update scores for {} sample".format(data_name))
- n_events = len(getattr(self, "x_"+data_name))
- setattr(self, "scores_"+data_name, np.empty(n_events))
- for start in range(0, n_events, batch_size):
- stop = start+batch_size
- getattr(self, "scores_"+data_name)[start:stop] = (
- self.predict(
- self.get_input_list(getattr(self, "x_"+data_name)[start:stop]),
- mode=mode
- ).reshape(-1)
- )
- self._dump_to_hdf5("scores_"+data_name)
- if do_test:
- eval_score("test")
- if do_train:
- eval_score("train")
def evaluate(self, x_eval, mode=None):
logger.debug("Evaluate score for {}".format(x_eval))
x_eval = self.transform(x_eval)
......
......@@ -197,14 +197,26 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
class WeightedRobustScaler(RobustScaler):
- def fit(self, X, y=None, weights=None):
- if not np.isnan(X).any():
+ def fit(self, X, y=None, weights=None, mask_value=None):
+ if not np.isnan(X).any() and mask_value is None and weights is None:
# these checks don't work for nan values
- super(WeightedRobustScaler, self).fit(X, y)
- if weights is None:
- return self
+ return super(WeightedRobustScaler, self).fit(X, y)
else:
- wqs = np.array([weighted_quantile(X[:,i][~np.isnan(X[:,i])], [0.25, 0.5, 0.75], sample_weight=weights) for i in range(X.shape[1])])
+ if weights is None:
+ weights = np.ones(len(X))
+ wqs = []
+ for i in range(X.shape[1]):
+ mask = ~np.isnan(X[:,i])
+ if mask_value is not None:
+ mask &= (X[:,i] != mask_value)
+ wqs.append(
+ weighted_quantile(
+ X[:,i][mask],
+ [0.25, 0.5, 0.75],
+ sample_weight=weights[mask]
+ )
+ )
+ wqs = np.array(wqs)
self.center_ = wqs[:,1]
self.scale_ = wqs[:,2]-wqs[:,0]
self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
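The else-branch builds per-column weighted quartiles, excluding NaNs and masked entries. For reference, a common way a weighted_quantile helper can be implemented (an assumption; the project's own utils may differ in interpolation details):

import numpy as np

def weighted_quantile_sketch(values, quantiles, sample_weight):
    "Sketch: interpolate the requested quantiles on the weighted CDF."
    order = np.argsort(values)
    values, sample_weight = values[order], sample_weight[order]
    cdf = np.cumsum(sample_weight) - 0.5 * sample_weight
    cdf /= np.sum(sample_weight)
    return np.interp(quantiles, cdf, values)

vals = np.array([1., 2., 3., 4.])
print(weighted_quantile_sketch(vals, [0.25, 0.5, 0.75], np.ones(4)))
# prints [1.5 2.5 3.5]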
......