Newer
Older
import logging
logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())
from root_numpy import tree2array, rec2array
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
from keras.callbacks import History
from keras.optimizers import SGD
import keras.optimizers
import matplotlib.pyplot as plt
# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf
from keras import backend as K
num_cores = 1
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
inter_op_parallelism_threads=num_cores,
allow_soft_placement=True,
device_count = {'CPU': num_cores})
session = tf.Session(config=config)
K.set_session(session)
class KerasROOTClassification(object):
dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]
def __init__(self, name, *args, **kwargs):
self._init_from_args(name, *args, **kwargs)
with open(os.path.join(self.project_dir, "options.json"), "w") as of:
json.dump(dict(args=args, kwargs=kwargs), of)
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr, identifiers,
selection=None,
layers=3,
nodes=64,
batch_size=128,
validation_split=0.33,
activation_function='relu',
out_dir="./outputs",
scaler_type="RobustScaler",
step_signal=2,
step_bkg=2,
optimizer="SGD",
optimizer_opts=None):
self.name = name
self.signal_trees = signal_trees
self.bkg_trees = bkg_trees
self.branches = branches
self.weight_expr = weight_expr
self.selection = selection
self.identifiers = identifiers
self.layers = layers
self.nodes = nodes
self.validation_split = validation_split
self.activation_function = activation_function
self.scaler_type = scaler_type
self.step_signal = step_signal
self.step_bkg = step_bkg
self.optimizer = optimizer
if optimizer_opts is None:
optimizer_opts = dict()
self.optimizer_opts = optimizer_opts
self.project_dir = os.path.join(self.out_dir, name)
if not os.path.exists(self.out_dir):
os.mkdir(self.out_dir)
if not os.path.exists(self.project_dir):
os.mkdir(self.project_dir)
self.s_train = None
self.b_train = None
self.s_test = None
self.b_test = None
self.x_train = None
self.x_test = None
self.y_train = None
self.y_test = None
self.s_eventlist_train = None
self.b_eventlist_train = None
self._bkg_weights = None
self._sig_weights = None
self._scores_train = None
self._scores_test = None
# track the number of epochs this model has been trained
self.total_epochs = 0
try:
self._load_from_hdf5()
except KeyError:
logger.info("Couldn't load all datasets - reading from ROOT trees")
# Read signal and background trees into structured numpy arrays
signal_chain = ROOT.TChain()
bkg_chain = ROOT.TChain()
for filename, treename in self.signal_trees:
signal_chain.AddFile(filename, -1, treename)
for filename, treename in self.bkg_trees:
bkg_chain.AddFile(filename, -1, treename)
self.s_train = tree2array(signal_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection,
start=0, step=self.step_signal)
self.b_train = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr]+self.identifiers,
selection=self.selection,
start=0, step=self.step_bkg)
self.s_test = tree2array(signal_chain,
branches=self.branches+[self.weight_expr],
selection=self.selection,
start=1, step=self.step_signal)
self.b_test = tree2array(bkg_chain,
branches=self.branches+[self.weight_expr],
selection=self.selection,
start=1, step=self.step_bkg)
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
self._dump_training_list()
self.s_eventlist_train = self.s_train[self.identifiers]
self.b_eventlist_train = self.b_train[self.identifiers]
# now we don't need the identifiers anymore
self.s_train = self.s_train[self.branches+[self.weight_expr]]
self.b_train = self.b_train[self.branches+[self.weight_expr]]
# create x (input), y (target) and w (weights) arrays
# the first block will be signals, the second block backgrounds
self.x_train = rec2array(self.s_train[self.branches])
self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.branches])))
self.x_test = rec2array(self.s_test[self.branches])
self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.branches])))
self.w_train = self.s_train[self.weight_expr]
self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
self.w_test = self.s_test[self.weight_expr]
self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))
self.y_train = np.empty(len(self.x_train))
self.y_train[:len(self.s_train)] = 1
self.y_train[len(self.s_train):] = 0
self.y_test = np.empty(len(self.x_test))
self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0
logger.info("Writing to hdf5")
self._dump_to_hdf5()
def _dump_training_list(self):
s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
b_eventlist = pd.DataFrame(self.b_train[self.identifiers])
s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
s_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))
def _dump_to_hdf5(self, dataset_names=None):
if dataset_names is None:
dataset_names = self.dataset_names
for dataset_name in dataset_names:
filename = os.path.join(self.project_dir, dataset_name+".h5")
logger.info("Writing {} to {}".format(dataset_name, filename))
with h5py.File(filename, "w") as hf:
hf.create_dataset(dataset_name, data=getattr(self, dataset_name))
def _load_from_hdf5(self, dataset_names=None):
if dataset_names is None:
dataset_names = self.dataset_names
for dataset_name in dataset_names:
filename = os.path.join(self.project_dir, dataset_name+".h5")
logger.info("Trying to load {} from {}".format(dataset_name, filename))
with h5py.File(filename) as hf:
setattr(self, dataset_name, hf[dataset_name][:])
logger.info("Data loaded")
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
@property
def scores_train(self):
if self._scores_train is None:
self._load_from_hdf5(["_scores_train"])
return self._scores_train
@scores_train.setter
def scores_train(self, value):
self._scores_train = value
self._dump_to_hdf5(["_scores_train"])
@property
def scores_test(self):
if self._scores_test is None:
self._load_from_hdf5(["_scores_test"])
return self._scores_test
@scores_test.setter
def scores_test(self, value):
self._scores_test = value
logger.info("dump")
self._dump_to_hdf5(["_scores_test"])
@property
def scaler(self):
# create the scaler (and fit to training data) if not existent
if self._scaler is None:
filename = os.path.join(self.project_dir, "scaler.pkl")
try:
self._scaler = joblib.load(filename)
logger.info("Loaded existing scaler from {}".format(filename))
logger.info("Creating new {}".format(self.scaler_type))
if self.scaler_type == "StandardScaler":
self._scaler = StandardScaler()
elif self.scaler_type == "RobustScaler":
self._scaler = RobustScaler()
else:
raise ValueError("Scaler type {} unknown".format(self.scaler_type))
logger.info("Fitting {} to training data".format(self.scaler_type))
# i think this would refit to test data (and overwrite the parameters)
# probably we either want to fit only training data or training and test data together
# logger.info("Fitting StandardScaler to test data")
# self._scaler.fit(self.x_test)
joblib.dump(self._scaler, filename)
return self._scaler
@property
def history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
if self._history is None:
self._history = History()
with open(params_file) as f:
self._history.params = json.load(f)
with open(history_file) as f:
self._history.history = json.load(f)
return self._history
@history.setter
def history(self, value):
self._history = value
def _dump_history(self):
params_file = os.path.join(self.project_dir, "history_params.json")
history_file = os.path.join(self.project_dir, "history_history.json")
with open(params_file, "w") as of:
json.dump(self.history.params, of)
with open(history_file, "w") as of:
json.dump(self.history.history, of)
def _transform_data(self):
if not self.data_transformed:
# todo: what to do about the outliers? Where do they come from?
logger.debug("training data before transformation: {}".format(self.x_train))
logger.debug("minimum values: {}".format([np.min(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
logger.debug("maximum values: {}".format([np.max(self.x_train[:,i]) for i in range(self.x_train.shape[1])]))
self.x_train = self.scaler.transform(self.x_train)
logger.debug("training data after transformation: {}".format(self.x_train))
self.x_test = self.scaler.transform(self.x_test)
self.data_transformed = True
def _read_info(self, key, default):
filename = os.path.join(self.project_dir, "info.json")
if not os.path.exists(filename):
with open(filename, "w") as of:
json.dump({}, of)
with open(filename) as f:
info = json.load(f)
return info.get(key, default)
def _write_info(self, key, value):
filename = os.path.join(self.project_dir, "info.json")
with open(filename) as f:
info = json.load(f)
info[key] = value
with open(filename, "w") as of:
json.dump(info, of)
@property
def model(self):
"Simple MLP"
if self._model is None:
self._model = Sequential()
# first hidden layer
self._model.add(Dense(self.nodes, input_dim=len(self.branches), activation=self.activation_function))
# the other hidden layers
for layer_number in range(self.layers-1):
self._model.add(Dense(self.nodes, activation=self.activation_function))
# last layer is one neuron (binary classification)
self._model.add(Dense(1, activation='sigmoid'))
logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
Optimizer = getattr(keras.optimizers, self.optimizer)
optimizer = Optimizer(**self.optimizer_opts)
logger.info("Compile model")
self._model.compile(optimizer=optimizer,
loss='binary_crossentropy',
metrics=['accuracy'])
# dump to json for documentation
with open(os.path.join(self.project_dir, "model.json"), "w") as of:
of.write(self._model.to_json())
return self._model
@property
def class_weight(self):
if self._class_weight is None:
sumw_bkg = np.sum(self.w_train[self.y_train == 0])
sumw_sig = np.sum(self.w_train[self.y_train == 1])
self._class_weight = [(sumw_sig+sumw_bkg)/(2*sumw_bkg), (sumw_sig+sumw_bkg)/(2*sumw_sig)]
return self._class_weight
def load(self):
"Load all data needed for plotting and training"
if not self.data_loaded:
self._load_data()
if not self.data_transformed:
self._transform_data()
def shuffle_training_data(self):
rn_state = np.random.get_state()
np.random.shuffle(self.x_train)
np.random.set_state(rn_state)
np.random.shuffle(self.y_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train)
def train(self, epochs=10):
self.load()
for branch_index, branch in enumerate(self.branches):
self.plot_input(branch_index)
try:
self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
logger.info("Weights found and loaded")
logger.info("Continue training")
except IOError:
logger.info("No weights found, starting completely new training")
self.total_epochs = self._read_info("epochs", 0)
logger.info("Train model")
try:
self.history = History()
self.shuffle_training_data()
self.model.fit(self.x_train,
# the reshape might be unnescessary here
self.y_train.reshape(-1, 1),
epochs=epochs,
validation_split = self.validation_split,
class_weight=self.class_weight,
sample_weight=self.w_train,
shuffle=True,
batch_size=self.batch_size,
callbacks=[self.history])
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
print(self.history)
logger.info("Save history")
self._dump_history()
logger.info("Save weights")
self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))
self.total_epochs += epochs
self._write_info("epochs", self.total_epochs)
logger.info("Create/Update scores for ROC curve")
self.scores_test = self.model.predict(self.x_test)
self.scores_train = self.model.predict(self.x_train)
@property
def bkg_weights(self):
"""
class weights multiplied by event weights (for plotting)
TODO: find a better way to do this
"""
if self._bkg_weights is None:
logger.debug("Calculating background weights for plotting")
self._bkg_weights = np.empty(sum(self.y_train == 0))
self._bkg_weights.fill(self.class_weight[0])
self._bkg_weights *= self.w_train[self.y_train == 0]
logger.debug("Background weights: {}".format(self._bkg_weights))
return self._bkg_weights
@property
def sig_weights(self):
"""
class weights multiplied by event weights (for plotting)
TODO: find a better way to do this
"""
if self._sig_weights is None:
logger.debug("Calculating signal weights for plotting")
self._sig_weights = np.empty(sum(self.y_train == 1))
self._sig_weights.fill(self.class_weight[1])
self._sig_weights *= self.w_train[self.y_train == 1]
logger.debug("Signal weights: {}".format(self._sig_weights))
return self._sig_weights
def plot_input(self, var_index):
"plot a single input variable"
branch = self.branches[var_index]
fig, ax = plt.subplots()
bkg = self.x_train[:,var_index][self.y_train == 0]
sig = self.x_train[:,var_index][self.y_train == 1]
logger.debug("Plotting bkg (min={}, max={}) from {}".format(np.min(bkg), np.max(bkg), bkg))
logger.debug("Plotting sig (min={}, max={}) from {}".format(np.min(sig), np.max(sig), sig))
# calculate percentiles to get a heuristic for the range to be plotted
# (should in principle also be done with weights, but for now do it unweighted)
range_sig = np.percentile(sig, [1, 99])
range_bkg = np.percentile(sig, [1, 99])
plot_range = (min(range_sig[0], range_bkg[0]), max(range_sig[1], range_sig[1]))
logger.debug("Calculated range based on percentiles: {}".format(plot_range))
try:
ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights)
ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights)
except ValueError:
# weird, probably not always working workaround for a numpy bug
plot_range = (float("{:.2f}".format(plot_range[0])), float("{:.2f}".format(plot_range[1])))
logger.warn("Got a value error during plotting, maybe this is due to a numpy bug - changing range to {}".format(plot_range))
ax.hist(bkg, color="b", alpha=0.5, bins=50, range=plot_range, weights=self.bkg_weights)
ax.hist(sig, color="r", alpha=0.5, bins=50, range=plot_range, weights=self.sig_weights)
ax.set_xlabel(branch+" (transformed)")
plot_dir = os.path.join(self.project_dir, "plots")
if not os.path.exists(plot_dir):
os.mkdir(plot_dir)
fig.savefig(os.path.join(plot_dir, "var_{}.pdf".format(var_index)))
logger.info("Plot ROC curve")
fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test, sample_weight = self.w_test)
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.plot([0,1],[1,0], linestyle='--', color='black', label='Luck')
plt.title('Receiver operating characteristic')
plt.xlim(0,1)
plt.ylim(0,1)
plt.xticks(np.arange(0,1,0.1))
plt.yticks(np.arange(0,1,0.1))
plt.legend(loc='lower left', framealpha=1.0)
plt.text(0.21,0.02,"AUC: {}".format(str(roc_auc)), size=12)
plt.savefig(os.path.join(self.project_dir, "ROC.pdf"))
plt.clf()
def plot_loss(self):
logger.info("Plot losses")
plt.plot(self.history.history['loss'])
plt.plot(self.history.history['val_loss'])
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.savefig(os.path.join(self.project_dir, "losses.pdf"))
def plot_accuracy(self):
logger.info("Plot accuracy")
plt.plot(self.history.history['acc'])
plt.plot(self.history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.savefig(os.path.join(self.project_dir, "accuracy.pdf"))
logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)
#logging.getLogger("KerasROOTClassification").setLevel(logging.DEBUG)
filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
c = KerasROOTClassification("test3",
signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")],
bkg_trees = [(filename, "ttbar_NoSys"),
(filename, "wjets_Sherpa221_NoSys")
],
selection="lep1Pt<5000", # cut out a few very weird outliers
branches = ["met", "mt"],
weight_expr = "eventWeight*genWeight",
identifiers = ["DatasetNumber", "EventNumber"],
step_bkg = 100)
#c.load()
c.train(epochs=10)
c.plot_loss()
c.plot_accuracy()