__all__ = ["load_from_dir", "ClassificationProject", "ClassificationProjectDataFrame", "ClassificationProjectRNN"]
from sys import version_info
if version_info[0] > 2:
raw_input = input
izip = zip
else:
from itertools import izip
import logging
logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.utils.extmath import stable_cumsum
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Input, Masking, GRU, concatenate, SimpleRNN
from keras.callbacks import History, EarlyStopping, CSVLogger, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
import keras.optimizers
from keras.utils.vis_utils import model_to_dot
import matplotlib.pyplot as plt
from .utils import WeightedRobustScaler, weighted_quantile, poisson_asimov_significance
from .plotting import save_show
def byteify(input):
    """Recursively encode unicode strings in nested dicts/lists as UTF-8.

    Only needed for Python 2 (see the override below for Python 3).
    From stackoverflow https://stackoverflow.com/a/13105359

    :param input: arbitrarily nested structure of dicts, lists and scalars
    :return: structure of the same shape with unicode objects encoded
    """
    if isinstance(input, unicode):
        return input.encode('utf-8')
    if isinstance(input, list):
        return [byteify(item) for item in input]
    if isinstance(input, dict):
        return dict((byteify(k), byteify(v)) for k, v in input.iteritems())
    # anything else is passed through untouched
    return input


if version_info[0] > 2:
    # nothing to convert on Python 3 - replace by the identity function
    def byteify(input):
        return input
def load_from_dir(path):
    """Load a project and the options from a directory.

    ``info.json`` inside *path* is read and its ``"project_type"`` entry
    decides which class is instantiated. If the file cannot be opened or
    has no ``"project_type"`` key, a plain ``ClassificationProject`` is
    returned.

    :param path: directory of a previously created project
    :return: ``ClassificationProjectRNN`` or ``ClassificationProject`` instance
    """
    try:
        with open(os.path.join(path, "info.json")) as f:
            info = json.load(f)
        project_type = info["project_type"]
        if project_type == "ClassificationProjectRNN":
            return ClassificationProjectRNN(path)
    except (KeyError, IOError):
        # no info file or no stored project type - use the default class
        pass
    return ClassificationProject(path)
class ClassificationProject(object):
    """Simple framework to load data from ROOT TTrees and train Keras
    neural networks for classification according to some global settings.

    See the `Keras documentation <https://keras.io>`_ for further information.

    All needed data that is created is stored in a project dir and can
    be used again later without the need to be recreated.

    :param name: Name of the project - this will also be the name of
                 the project directory in the output dir. If no further arguments
                 are given, this argument is interpreted as a directory name, from
                 which a previously created project should be initialised

    :param signal_trees: list of tuples (filename, treename) for the data that should be used as signal

    :param bkg_trees: list of tuples (filename, treename) for the data that should be used as background

    :param branches: list of branch names or expressions to be used as input values for training

    :param rename_branches: dictionary that maps branch expressions to names for better readability

    :param weight_expr: expression to weight the events in the loss function

    :param data_dir: if given, load the data from a previous project with the given name
                     instead of creating it from trees. If the data is on the same
                     disk (and the filesystem supports it), hard links will be used,
                     otherwise symlinks.

    :param identifiers: list of branches or expressions that uniquely
                        identify events. This is used to store the list of training
                        events, such that they can be marked later on, for example when
                        creating friend trees with output score

    :param selection: selection expression that events have to fulfill to be considered for training

    :param layers: number of layers in the neural network

    :param nodes: list of the number of nodes in each layer. If only a single number is given, use this number for every layer

    :param dropout: dropout fraction after each hidden layer. You can also pass a list of dropout fractions, one for each layer. Set to None for no Dropout.

    :param dropout_input: dropout fraction for the input layer. Set to None for no Dropout.

    :param batch_size: size of the training batches

    :param validation_split: split off this fraction of training events for loss evaluation

    :param activation_function: activation function in the hidden layers

    :param activation_function_output: activation function in the output layer

    :param out_dir: base directory in which the project directories should be stored

    :param scaler_type: sklearn scaler class name to transform the data before training (options: "StandardScaler", "RobustScaler")

    :param step_signal: step size when selecting signal training events (e.g. 2 means take every second event)

    :param step_bkg: step size when selecting background training events (e.g. 2 means take every second event)

    :param stop_train: stop after this number of events for reading in training events

    :param stop_test: stop after this number of events for reading in test events

    :param optimizer: name of optimizer class in keras.optimizers

    :param optimizer_opts: dictionary of options for the optimizer

    :param use_earlystopping: set true to use the keras EarlyStopping callback

    :param earlystopping_opts: options for the keras EarlyStopping callback

    :param use_modelcheckpoint: save model weights after each epoch and don't save after no validation loss improvement (except if the options are set otherwise).

    :param modelcheckpoint_opts: options for the Keras ModelCheckpoint
                                 callback. After training, the newest saved weight will be used. If
                                 you change the format of the saved model weights it has to be of
                                 the form "weights*.h5"

    :param use_tensorboard: if True, use the tensorboard callback to write logs for tensorboard

    :param tensorboard_opts: options for the TensorBoard callback

    :param balance_dataset: if True, balance the dataset instead of
                            applying class weights. Only a fraction of the overrepresented
                            class will be used in each epoch, but different subsets of the
                            overrepresented class will be used in each epoch.

    :param random_seed: use this seed value when initialising the model and produce consistent results. Note:
                        random data is also used for shuffling the training data, so results may vary still. To
                        produce consistent results, set the numpy random seed before training.

    :param mask_value: value that is used for non-existent entries (e.g. 4th jet pt in events with 3 jets)

    :param apply_class_weight: apply a weight that scales the events such that sumw(signal) = sumw(background)

    :param normalize_weights: normalize the weights to mean 1

    """

    # Datasets that are stored to (and dynamically loaded from) hdf5
    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test"]

    # Datasets that are retrieved from ROOT trees the first time
    dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]
if len(args) < 1 and len(kwargs) < 1:
# if no further arguments given, interpret as directory name
self._init_from_dir(name)
else:
# otherwise initialise new project
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.pickle"), "wb") as of:
Nikolai.Hartmann
committed
pickle.dump(dict(args=args, kwargs=kwargs), of)
def _init_from_dir(self, dirname):
Nikolai.Hartmann
committed
if not os.path.exists(os.path.join(dirname, "options.pickle")):
# for backward compatibility
with open(os.path.join(dirname, "options.json")) as f:
options = byteify(json.load(f))
else:
with open(os.path.join(dirname, "options.pickle"), "rb") as f:
Nikolai.Hartmann
committed
options = pickle.load(f)
options["kwargs"]["project_dir"] = dirname
self._init_from_args(os.path.basename(dirname), *options["args"], **options["kwargs"])
signal_trees, bkg_trees, branches, weight_expr,
Nikolai
committed
data_dir=None,
batch_size=128,
validation_split=0.33,
activation_function='relu',
scaler_type="WeightedRobustScaler",
use_tensorboard=False,
tensorboard_opts=None,
random_seed=1234,
apply_class_weight=True,
normalize_weights=True):
self.name = name
self.signal_trees = signal_trees
self.bkg_trees = bkg_trees
self.branches = branches
if rename_branches is None:
rename_branches = {}
self.rename_branches = rename_branches
self.selection = selection
self.project_dir = project_dir
if self.project_dir is None:
self.project_dir = name
if not os.path.exists(self.project_dir):
os.mkdir(self.project_dir)
Nikolai
committed
self.data_dir = data_dir
if identifiers is None:
identifiers = []
self.identifiers = identifiers
self.layers = layers
self.nodes = nodes
if not isinstance(self.nodes, list):
self.nodes = [self.nodes for i in range(self.layers)]
if len(self.nodes) != self.layers:
self.layers = len(self.nodes)
logger.warning("Number of layers not equal to the given nodes "
"per layer - adjusted to " + str(self.layers))
if not isinstance(self.dropout, list):
self.dropout = [self.dropout for i in range(self.layers)]
if len(self.dropout) != self.layers:
raise ValueError("List of dropout fractions has to be of equal size as the number of layers!")
self.dropout_input = dropout_input
self.validation_split = validation_split
self.activation_function = activation_function
self.activation_function_output = activation_function_output
self.scaler_type = scaler_type
self.step_signal = step_signal
self.step_bkg = step_bkg
self.stop_train = stop_train
self.stop_test = stop_test
self.use_earlystopping = use_earlystopping
self.use_modelcheckpoint = use_modelcheckpoint
if optimizer_opts is None:
optimizer_opts = dict()
self.optimizer_opts = optimizer_opts
if earlystopping_opts is None:
earlystopping_opts = dict()
self.earlystopping_opts = earlystopping_opts
if modelcheckpoint_opts is None:
modelcheckpoint_opts = dict(
save_best_only=True,
verbose=True,
)
self.modelcheckpoint_opts = modelcheckpoint_opts
self.tensorboard_opts = dict(
log_dir=os.path.join(self.project_dir, "tensorboard"),
)
if tensorboard_opts is not None:
self.tensorboard_opts.update(**tensorboard_opts)
self.random_seed = random_seed
self.balance_dataset = balance_dataset
self.apply_class_weight = apply_class_weight
self.normalize_weights = normalize_weights
self.s_train = None
self.b_train = None
self.s_test = None
self.b_test = None
self._x_train = None
self._x_test = None
self._y_train = None
self._y_test = None
self._w_train = None
self._w_test = None
self._scores_train = None
self._scores_test = None
# class weighted training data (divided by mean)
self._w_train_tot = None
self._s_eventlist_train = None
self._b_eventlist_train = None
self._balanced_class_weight = None
# track the number of epochs this model has been trained
self.total_epochs = 0
# track if we are currently training
self.is_training = False
self._fields = None
@property
def fields(self):
"Renamed branch expressions"
if self._fields is None:
self._fields = []
for branch_expr in self.branches:
self._fields.append(self.rename_branches.get(branch_expr, branch_expr))
return self._fields
def rename_fields(self, ar):
"Rename fields of structured array"
fields = list(ar.dtype.names)
renamed_fields = []
for old_name in fields:
renamed_fields.append(self.rename_branches.get(old_name, old_name))
ar.dtype.names = tuple(renamed_fields)
# if those don't exist, we need to load them from ROOT trees first
self._load_from_hdf5(*self.dataset_names_tree)
except KeyError:
logger.info("Couldn't load all datasets - reading from ROOT trees")
# Read signal and background trees into structured numpy arrays
signal_chain = ROOT.TChain()
bkg_chain = ROOT.TChain()
for filename, treename in self.signal_trees:
signal_chain.AddFile(filename, -1, treename)
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection,
start=0, step=self.step_signal, stop=self.stop_train)
self.b_train = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection,
start=0, step=self.step_bkg, stop=self.stop_train)
self.s_test = tree2array(signal_chain,
branches=self.branches+[self.weight_expr],
selection=self.selection,
start=1, step=self.step_signal, stop=self.stop_test)
self.b_test = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr],
selection=self.selection,
start=1, step=self.step_bkg, stop=self.stop_test)
self.rename_fields(self.s_train)
self.rename_fields(self.b_train)
self.rename_fields(self.s_test)
self.rename_fields(self.b_test)
self.s_eventlist_train = self.s_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
self.b_eventlist_train = self.b_train[self.identifiers].astype(dtype=[(branchName, "u8") for branchName in self.identifiers])
# now we don't need the identifiers anymore
self.s_train = self.s_train[self.fields+[self.weight_expr]]
self.b_train = self.b_train[self.fields+[self.weight_expr]]
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.fields])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.fields])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
self.y_train = np.empty(len(self.x_train), dtype=np.bool)
self.y_train[:len(self.s_train)] = 1
self.y_train[len(self.s_train):] = 0
self.b_train = None
self.s_train = None
self.x_test = rec2array(self.s_test[self.fields])
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.fields])))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_test = np.empty(len(self.x_test), dtype=np.bool)
self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0
self.b_test = None
self.s_test = None
self._dump_to_hdf5(*self.dataset_names_tree)
s_eventlist_df = pd.DataFrame(self.s_eventlist_train)
b_eventlist_df = pd.DataFrame(self.b_eventlist_train)
s_eventlist_df.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
b_eventlist_df.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
@property
def s_eventlist_train(self):
if self._s_eventlist_train is None:
df = pd.read_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
self._s_eventlist_train = df.to_records()[self.identifiers]
return self._s_eventlist_train
@s_eventlist_train.setter
def s_eventlist_train(self, value):
self._s_eventlist_train = value
@property
def b_eventlist_train(self):
if self._b_eventlist_train is None:
df = pd.read_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
self._b_eventlist_train = df.to_records()[self.identifiers]
return self._b_eventlist_train
@b_eventlist_train.setter
def b_eventlist_train(self, value):
self._b_eventlist_train = value
def _dump_to_hdf5(self, *dataset_names):
    """Write the given datasets to ``<project_dir>/<name>.h5``.

    :param dataset_names: attribute names to write; if none are given,
                          all of ``self.dataset_names`` are written
    """
    names = dataset_names if len(dataset_names) >= 1 else self.dataset_names
    for name in names:
        target = os.path.join(self.project_dir, name+".h5")
        logger.info("Writing {} to {}".format(name, target))
        # one file per dataset, overwriting whatever was there before
        with h5py.File(target, "w") as h5file:
            h5file.create_dataset(name, data=getattr(self, name))
def _load_from_hdf5(self, *dataset_names):
    """Read the given datasets from ``<project_dir>/<name>.h5`` into attributes.

    If ``self.data_dir`` is set and a file does not exist in the project
    directory yet, it is first linked from ``data_dir`` (hard link, falling
    back to a symlink on ``OSError``).

    :param dataset_names: attribute names to load; if none are given,
                          all of ``self.dataset_names`` are loaded
    """
    names = dataset_names if len(dataset_names) >= 1 else self.dataset_names
    for name in names:
        h5_basename = name+".h5"
        target = os.path.join(self.project_dir, h5_basename)
        if self.data_dir is not None and not os.path.exists(target):
            source = os.path.abspath(os.path.join(self.data_dir, h5_basename))
            try:
                os.link(source, target)
            except OSError:
                os.symlink(source, target)
                logger.info("Created symlink from {} to {}".format(source, target))
            else:
                logger.info("Created hardlink from {} to {}".format(source, target))
        logger.info("Trying to load {} from {}".format(name, target))
        # file mode deliberately left at the h5py default, as in the original
        with h5py.File(target) as h5file:
            setattr(self, name, h5file[name][:])
    logger.info("Data loaded")
self._callbacks_list = []
self._callbacks_list.append(self.history)
if self.use_earlystopping:
self._callbacks_list.append(EarlyStopping(**self.earlystopping_opts))
mc = ModelCheckpoint(**self.modelcheckpoint_opts)
self._callbacks_list.append(mc)
if not os.path.dirname(mc.filepath) == self.project_dir:
mc.filepath = os.path.join(self.project_dir, mc.filepath)
logger.debug("Prepending project dir to ModelCheckpoint filepath: {}".format(mc.filepath))
if self.use_tensorboard:
self._callbacks_list.append(TensorBoard(**self.tensorboard_opts))
self._callbacks_list.append(CSVLogger(os.path.join(self.project_dir, "training.log"), append=True))
@property
def scaler(self):
    """Scaler used to transform the input variables.

    On first access ``scaler.pkl`` is loaded from the project directory.
    If that raises ``IOError``, a new scaler of type ``self.scaler_type``
    is created, fitted to ``self.x_train`` and written to ``scaler.pkl``.
    The result is cached in ``self._scaler``.

    :raises ValueError: if ``self.scaler_type`` is not "StandardScaler",
                        "RobustScaler" or "WeightedRobustScaler"
    """
    # create the scaler (and fit to training data) if not existent
    if self._scaler is None:
        filename = os.path.join(self.project_dir, "scaler.pkl")
        try:
            self._scaler = joblib.load(filename)
            logger.info("Loaded existing scaler from {}".format(filename))
        except IOError:
            logger.info("Creating new {}".format(self.scaler_type))
            scaler_fit_kwargs = dict()
            if self.scaler_type == "StandardScaler":
                new_scaler = StandardScaler()
            elif self.scaler_type == "RobustScaler":
                new_scaler = RobustScaler()
            elif self.scaler_type == "WeightedRobustScaler":
                new_scaler = WeightedRobustScaler()
                scaler_fit_kwargs["weights"] = self.w_train_tot
            else:
                raise ValueError("Scaler type {} unknown".format(self.scaler_type))
            logger.info("Fitting {} to training data".format(self.scaler_type))
            # switch the scaler's copy option off only for the duration of the fit
            orig_copy_setting = new_scaler.copy
            new_scaler.copy = False
            try:
                new_scaler.fit(self.x_train, **scaler_fit_kwargs)
            finally:
                new_scaler.copy = orig_copy_setting
            # only cache (and store) a scaler that was fitted successfully
            self._scaler = new_scaler
            joblib.dump(self._scaler, filename)
    return self._scaler
def _batch_transform(self, x, fn, batch_size):
"Transform array in batches, temporarily setting mask_values to nan"
transformed = np.empty(x.shape, dtype=x.dtype)
for start in range(0, len(x), batch_size):
stop = start+batch_size
x_batch = np.array(x[start:stop]) # copy
x_batch[x_batch == self.mask_value] = np.nan
x_batch = fn(x_batch)
x_batch[np.isnan(x_batch)] = self.mask_value
transformed[start:stop] = x_batch
return transformed
def transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.transform, batch_size)
else:
return self.scaler.transform(x)
def inverse_transform(self, x, batch_size=10000):
if self.mask_value is not None:
return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
else:
return self.scaler.inverse_transform(x)
@property
def history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
if self._history is None:
self._history = History()
if os.path.exists(params_file) and os.path.exists(history_file):
Nikolai.Hartmann
committed
try:
with open(params_file) as f:
self._history.params = json.load(f)
with open(history_file) as f:
self._history.history = json.load(f)
except ValueError:
logger.warning("Couldn't load history - starting with empty one")
return self._history
@history.setter
def history(self, value):
self._history = value
def _dump_history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
with open(params_file, "w") as of:
json.dump(self.history.params, of)
with open(history_file, "w") as of:
json.dump(self.history.history, of)
def _transform_data(self):
    """Apply the scaler to ``x_train`` and ``x_test`` (once).

    Entries equal to ``self.mask_value`` are set to nan for the
    transformation and restored afterwards. Nothing happens if
    ``self.data_transformed`` is already set; it is set at the end so that
    repeated ``load()`` calls do not transform the data again.
    """
    if self.data_transformed:
        return
    if self.mask_value is not None:
        self.x_train[self.x_train == self.mask_value] = np.nan
        self.x_test[self.x_test == self.mask_value] = np.nan
    # the min/max scans are only worth doing if the messages are emitted
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("training data before transformation: {}".format(self.x_train))
        logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
                                                  for i in range(self.x_train.shape[1])]))
        logger.debug("maximum values: {}".format([np.max(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
                                                  for i in range(self.x_train.shape[1])]))
    scaler = self.scaler
    # switch the scaler's copy option off only while transforming here, so
    # later transform() calls see the original setting again
    orig_copy_setting = scaler.copy
    scaler.copy = False
    try:
        self.x_train = scaler.transform(self.x_train)
        logger.debug("training data after transformation: {}".format(self.x_train))
        self.x_test = scaler.transform(self.x_test)
    finally:
        scaler.copy = orig_copy_setting
    if self.mask_value is not None:
        self.x_train[np.isnan(self.x_train)] = self.mask_value
        self.x_test[np.isnan(self.x_test)] = self.mask_value
    self.data_transformed = True
    logger.info("Training and test data transformed")
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
with open(filename, "w") as of:
json.dump({}, of)
with open(filename) as f:
info = json.load(f)
return info.get(key, default)
def _write_info(self, key, value):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
with open(filename, "w") as of:
json.dump({}, of)
with open(filename) as f:
info = json.load(f)
info[key] = value
with open(filename, "w") as of:
json.dump(info, of)
@staticmethod
def query_yn(text):
    """Prompt with *text* until the answer starts with Y or N (any case).

    :param text: prompt shown to the user
    :return: True for yes, False for no
    """
    answers = {"Y": True, "N": False}
    while True:
        reply = raw_input(text)
        if len(reply) == 0:
            # empty answer - ask again
            continue
        first_letter = reply.upper()[0]
        if first_letter in answers:
            return answers[first_letter]
@property
def model(self):
    """Simple MLP, built on first access and cached in ``self._model``.

    The network consists of an optional input Dropout, one Dense layer per
    entry of ``self.nodes`` (each optionally followed by Dropout) and a
    single output neuron. After building, ``_compile_or_load_model`` is
    called so that the returned model is ready to use.
    """
    if self._model is None:
        self._model = Sequential()

        if self.dropout_input is None:
            self._model.add(Dense(self.nodes[0], input_dim=len(self.fields), activation=self.activation_function))
            # in case of no Dropout we already have the first hidden layer
            # NOTE(review): self.dropout[0] is not applied in this branch - confirm this is intended
            start_layer = 1
        else:
            self._model.add(Dropout(rate=self.dropout_input, input_shape=(len(self.fields),)))
            start_layer = 0
        # the (other) hidden layers
        for node_count, dropout_fraction in zip(self.nodes[start_layer:], self.dropout[start_layer:]):
            self._model.add(Dense(node_count, activation=self.activation_function))
            if (dropout_fraction is not None) and (dropout_fraction > 0):
                self._model.add(Dropout(rate=dropout_fraction))
        # last layer is one neuron (binary classification)
        self._model.add(Dense(1, activation=self.activation_function_output))

        # self._model is set at this point, so the self.model access inside
        # _compile_or_load_model does not build the network a second time
        self._compile_or_load_model()

    return self._model
def _compile_or_load_model(self):
    """Compile ``self._model`` and load stored weights if there are any.

    The optimizer class ``self.optimizer`` is taken from
    ``keras.optimizers``. The numpy random state is seeded with
    ``self.random_seed`` for the compile call and restored afterwards.
    If ``weights.h5`` exists it is loaded - while training, the user is
    asked first. Finally the model is written to ``model.json`` and
    ``model.svg`` in the project directory.
    """
    logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
    optimizer_class = getattr(keras.optimizers, self.optimizer)
    optimizer = optimizer_class(**self.optimizer_opts)

    logger.info("Compile model")
    saved_rng_state = np.random.get_state()
    np.random.seed(self.random_seed)
    self._model.compile(optimizer=optimizer,
                        loss=self.loss,
                        weighted_metrics=['accuracy']
    )
    np.random.set_state(saved_rng_state)

    weights_path = os.path.join(self.project_dir, "weights.h5")
    if not os.path.exists(weights_path):
        logger.info("No weights found, starting completely new model")
    else:
        # outside of training the stored weights are always used
        continue_training = True
        if self.is_training:
            continue_training = self.query_yn("Found previously trained weights - "
                                              "continue training (choosing N will restart)? (Y/N) ")
        if continue_training:
            self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
            logger.info("Found and loaded previously trained weights")
        else:
            logger.info("Starting completely new model")

    # dump to json for documentation
    with open(os.path.join(self.project_dir, "model.json"), "w") as of:
        of.write(self._model.to_json())

    with open(os.path.join(self.project_dir, "model.svg"), "wb") as of:
        of.write(model_to_dot(self._model, show_shapes=True).create("dot", format="svg"))
@property
def class_weight(self):
if self._class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated class_weight: {}".format(self._class_weight))
@property
def balanced_class_weight(self):
"""
Class weight for the balance_dataset method
Since we have equal number of signal and background events in
each batch, we need to balance the ratio of sum of weights per
event with class weights
"""
if self._balanced_class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
# use sumw *per event* in this case
sumw_bkg /= len(self.w_train[self.y_train == 0])
sumw_sig /= len(self.w_train[self.y_train == 1])
self._balanced_class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
logger.debug("Calculated balanced_class_weight: {}".format(self._balanced_class_weight))
return self._balanced_class_weight
def load(self, reload=False):
"Load all data needed for plotting and training"
if reload:
self.data_loaded = False
self.data_transformed = False
if not self.data_loaded:
self._load_data()
if not self.data_transformed:
self._transform_data()
def shuffle_training_data(self):
rn_state = np.random.get_state()
np.random.shuffle(self.x_train)
np.random.set_state(rn_state)
np.random.shuffle(self.y_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train_tot)
if self._scores_train is not None:
logger.info("Shuffling scores, since they are also there")
np.random.set_state(rn_state)
np.random.shuffle(self._scores_train)
def w_train_tot(self):
"(sample weight * class weight), divided by mean"
if not self.balance_dataset:
class_weight = self.class_weight
else:
class_weight = self.balanced_class_weight
if self._w_train_tot is None:
if self.apply_class_weight:
self._w_train_tot = self.w_train*np.array(class_weight)[self.y_train.astype(int)]
else:
self._w_train_tot = np.array(self.w_train)
if self.normalize_weights:
self._w_train_tot /= np.mean(self._w_train_tot)
@property
def validation_data(self):
"Validation data. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[split_index:], self.y_train[split_index:], self.w_train_tot[split_index:]
@property
def training_data(self):
"Training data with validation data split off. Attention: Shuffle training data before using this!"
if not self.data_shuffled:
raise ValueError("Training data isn't shuffled, can't split of validation data")
split_index = int((1-self.validation_split)*len(self.x_train))
return self.x_train[:split_index], self.y_train[:split_index], self.w_train_tot[:split_index]
def yield_single_class_batch(self, class_label):
"""
Generate batches of half batch size, containing only entries for the given class label.
The weights are multiplied by balanced_class_weight.
"""
x_train, y_train, w_train = self.training_data
class_idx = np.where(y_train==class_label)[0]
while True:
# shuffle the indices for this class label
shuffled_idx = np.random.permutation(class_idx)
# yield them batch wise
for start in range(0, len(shuffled_idx), int(self.batch_size/2)):
yield (x_train[shuffled_idx[start:start+int(self.batch_size/2)]],
y_train[shuffled_idx[start:start+int(self.batch_size/2)]],
w_train[shuffled_idx[start:start+int(self.batch_size/2)]])
def yield_balanced_batch(self):
    """Generate batches with equal amounts of both classes.

    Each yielded tuple (x, y, w) is the concatenation of one class-0 and
    one class-1 half batch from ``yield_single_class_batch``.
    """
    logcounter = 0
    half_batches = izip(self.yield_single_class_batch(0),
                        self.yield_single_class_batch(1))
    for batch_0, batch_1 in half_batches:
        # only report the sums of weights every 10th batch
        if logcounter == 10:
            logger.debug("\rSumw sig*balanced_class_weight[1]: {}".format(np.sum(batch_1[2])))
            logger.debug("\rSumw bkg*balanced_class_weight[0]: {}".format(np.sum(batch_0[2])))
            logcounter = 0
        logcounter += 1
        x_0, y_0, w_0 = batch_0
        x_1, y_1, w_1 = batch_1
        yield (np.concatenate((x_0, x_1)),
               np.concatenate((y_0, y_1)),
               np.concatenate((w_0, w_1)))
def train(self, epochs=10):
    """Fit the model and persist history, weights and the epoch counter.

    Depending on ``self.balance_dataset`` the model is either fitted on
    ``x_train``/``y_train`` directly or on balanced batches from
    ``yield_balanced_batch``. A ``KeyboardInterrupt`` during fitting is
    caught, so that history and weights are still saved.

    :param epochs: number of epochs passed on to the Keras fit call
    """
    # make sure the data is loaded and transformed
    self.load()

    # NOTE(review): no body is visible for the following `for` statement -
    # presumably lines are missing here; confirm against the repository
    for branch_index, branch in enumerate(self.fields):

    # continue counting from the number of epochs stored in info.json
    self.total_epochs = self._read_info("epochs", 0)

    logger.info("Train model")
    if not self.balance_dataset:
        try:
            self.is_training = True
            # NOTE(review): the comment below talks about sample/class weights, but no
            # sample_weight (or validation) argument is visible in this call - confirm
            self.model.fit(self.x_train,
                           # the reshape might be unnescessary here
                           self.y_train.reshape(-1, 1),
                           epochs=epochs,
                           # we have to multiply by class weight since keras ignores class weight if sample weight is given
                           # see https://github.com/keras-team/keras/issues/497
                           shuffle=True,
                           batch_size=self.batch_size,
                           callbacks=self.callbacks_list)
            # NOTE(review): is_training is not reset if the fit is interrupted
            self.is_training = False
        except KeyboardInterrupt:
            logger.info("Interrupt training - continue with rest")
    else:
        try:
            self.is_training = True
            labels, label_counts = np.unique(self.y_train, return_counts=True)
            logger.info("Training on balanced batches")
            # note: the batches have balanced_class_weight already applied
            self.model.fit_generator(self.yield_balanced_batch(),
                                     # number of steps derived from the size of the smaller class
                                     steps_per_epoch=int(min(label_counts)/self.batch_size),
                                     epochs=epochs,
                                     callbacks=self.callbacks_list)
            self.is_training = False
        except KeyboardInterrupt:
            logger.info("Interrupt training - continue with rest")

    logger.info("Save history")
    self._dump_history()
    if not self.use_modelcheckpoint:
        logger.info("Save weights")
        self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))
    # NOTE(review): the following lines look like the ModelCheckpoint case, but no
    # `else:` is visible in front of them - confirm against the repository
    # pick the most recently modified weights*.h5 file in the project directory
    weight_file = sorted(glob.glob(os.path.join(self.project_dir, "weights*.h5")), key=lambda f:os.path.getmtime(f))[-1]
    if not os.path.basename(weight_file) == "weights.h5":
        logger.info("Copying latest weight file {} to weights.h5".format(weight_file))
        shutil.copy(weight_file, os.path.join(self.project_dir, "weights.h5"))
    logger.info("Reloading weights file since we are using model checkpoint!")
    self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))

    # history.epoch holds the epoch indices of the last fit; +1 converts the last index to a count
    self.total_epochs += self.history.epoch[-1]+1
    self._write_info("epochs", self.total_epochs)
def evaluate_train_test(self, do_train=True, do_test=True, mode=None):
    """Compute and store the model scores for the test and/or training sample.

    The data is reloaded first. The scores are flattened, assigned to
    ``scores_test`` / ``scores_train`` and written to hdf5.

    :param do_train: create scores for the training sample
    :param do_test: create scores for the test sample
    :param mode: passed on to ``predict``; if not None it is also stored
                 as "scores_mode" in the project info
    """
    logger.info("Reloading (and re-transforming) unshuffled training data")
    self.load(reload=True)

    if mode is not None:
        self._write_info("scores_mode", mode)

    logger.info("Create/Update scores for train/test sample")
    # test sample first, then training sample
    samples = (
        (do_test, "x_test", "scores_test"),
        (do_train, "x_train", "scores_train"),
    )
    for enabled, input_name, scores_name in samples:
        if not enabled:
            continue
        scores = self.predict(getattr(self, input_name), mode=mode).reshape(-1)
        setattr(self, scores_name, scores)
        self._dump_to_hdf5(scores_name)
def predict(self, x, mode=None):
    """Return the model output for the (already transformed) inputs *x*.

    :param x: input array(s) for the model
    :param mode: None for the normal model output, "skip_activation" for
                 the input of the output layer's last operation
    :raises ValueError: for any other *mode*
    """
    if mode is None:
        # normal output - after activation function output layer
        return self.model.predict(x)
    elif mode == "skip_activation":
        # output before applying activation function
        # (after weighted sum + bias of last hidden layer)
        if isinstance(self.model.input, list):
            feed_dict={tuple(self.model.input) : x}
        else:
            feed_dict={self.model.input : x}
        # the input tensors have to be fed, otherwise the session has no data to evaluate
        return K.get_session().run(
            self.model.output.op.inputs[0],
            feed_dict=feed_dict,
        )
    else:
        raise ValueError("Unknown mode {}".format(mode))
def evaluate(self, x_eval, mode=None):
    """Transform *x_eval* with the project's scaler and return the model output.

    :param x_eval: untransformed input array
    :param mode: passed on to ``predict``
    :return: result of ``predict`` for the transformed inputs
    """
    x_eval = self.transform(x_eval)
    logger.debug("Evaluate for transformed array: {}".format(x_eval))
    return self.predict(x_eval, mode=mode)
def write_friend_tree(self, score_name,
source_filename, source_treename,
target_filename, target_treename,
batch_size=100000):
f = ROOT.TFile.Open(source_filename)
tree = f.Get(source_treename)
entries = tree.GetEntries()
logger.info("Write friend tree for {} in {}".format(source_treename, source_filename))
if os.path.exists(target_filename):
raise IOError("{} already exists, if you want to recreate it, delete it first".format(target_filename))
for start in range(0, entries, batch_size):
logger.info("Evaluating score for entry {}/{}".format(start, entries))
x_from_tree = tree2array(tree,
start=start, stop=start+batch_size)