#!/usr/bin/env python

import os
import json
import logging

logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())

from root_numpy import tree2array, rec2array
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import matplotlib.pyplot as plt

# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf
from keras import backend as K
num_cores = 1
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                        inter_op_parallelism_threads=num_cores,
                        allow_soft_placement=True,
                        device_count={'CPU': num_cores})
session = tf.Session(config=config)
K.set_session(session)

import ROOT


class KerasROOTClassification:

    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]

    def __init__(self, name,
                 signal_trees, bkg_trees, branches, weight_expr, identifiers,
                 layers=3, nodes=64, batch_size=128,
                 activation_function='relu',
                 out_dir="./outputs"):
        self.name = name
        self.signal_trees = signal_trees
        self.bkg_trees = bkg_trees
        self.branches = branches
        self.weight_expr = weight_expr
        self.identifiers = identifiers
        self.layers = layers
        self.nodes = nodes
        self.batch_size = batch_size
        self.activation_function = activation_function
        self.out_dir = out_dir

        self.project_dir = os.path.join(self.out_dir, name)

        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)
        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

        self.s_train = None
        self.b_train = None
        self.s_test = None
        self.b_test = None

        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

        self.s_eventlist_train = None
        self.b_eventlist_train = None

        self._scaler = None
        self._class_weight = None
        self._model = None

        self.scores_train = None
        self.scores_test = None

        # track the number of epochs this model has been trained
        self.total_epochs = 0

        self.data_loaded = False

    def _load_data(self):

        try:
            self._load_from_hdf5()
        except (KeyError, IOError):
            # KeyError: a dataset is missing from an existing file
            # IOError: the hdf5 files don't exist yet
            logger.info("Couldn't load all datasets - reading from ROOT trees")

            # Read signal and background trees into structured numpy arrays
            signal_chain = ROOT.TChain()
            bkg_chain = ROOT.TChain()
            for filename, treename in self.signal_trees:
                signal_chain.AddFile(filename, -1, treename)
            for filename, treename in self.bkg_trees:
                bkg_chain.AddFile(filename, -1, treename)
            # even events for training, odd events for testing
            self.s_train = tree2array(signal_chain,
                                      branches=self.branches+[self.weight_expr]+self.identifiers,
                                      start=0, step=2)
            self.b_train = tree2array(bkg_chain,
                                      branches=self.branches+[self.weight_expr]+self.identifiers,
                                      start=0, step=2)
            self.s_test = tree2array(signal_chain,
                                     branches=self.branches+[self.weight_expr],
                                     start=1, step=2)
            self.b_test = tree2array(bkg_chain,
                                     branches=self.branches+[self.weight_expr],
                                     start=1, step=2)

            self._dump_training_list()
            self.s_eventlist_train = self.s_train[self.identifiers]
            self.b_eventlist_train = self.b_train[self.identifiers]

            # now we don't need the identifiers anymore
            self.s_train = self.s_train[self.branches+[self.weight_expr]]
            self.b_train = self.b_train[self.branches+[self.weight_expr]]

            # create x (input), y (target) and w (weights) arrays
            # the first block will be signals, the second block backgrounds
            self.x_train = rec2array(self.s_train[self.branches])
            self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.branches])))
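
            # at this point x_train is a plain 2D float array with the signal
            # rows first and the background rows appended, i.e.
            #   x_train.shape == (n_signal_train + n_bkg_train, len(self.branches))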
            self.x_test = rec2array(self.s_test[self.branches])
            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.branches])))
            self.w_train = self.s_train[self.weight_expr]
            self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
            self.w_test = self.s_test[self.weight_expr]
            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))

            # target labels: 1 for signal, 0 for background
            self.y_train = np.empty(len(self.x_train))
            self.y_train[:len(self.s_train)] = 1
            self.y_train[len(self.s_train):] = 0
            self.y_test = np.empty(len(self.x_test))
            self.y_test[:len(self.s_test)] = 1
            self.y_test[len(self.s_test):] = 0

            logger.info("Writing to hdf5")
            self._dump_to_hdf5()

        self.data_loaded = True

    def _dump_training_list(self):
        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])

        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
        b_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))

    def _dump_to_hdf5(self):
        for dataset_name in self.dataset_names:
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "w") as hf:
                hf.create_dataset(dataset_name, data=getattr(self, dataset_name))

    def _load_from_hdf5(self):
        for dataset_name in self.dataset_names:
            filename = os.path.join(self.project_dir, dataset_name+".h5")
            logger.info("Trying to load {} from {}".format(dataset_name, filename))
            with h5py.File(filename, "r") as hf:
                setattr(self, dataset_name, hf[dataset_name][:])
        logger.info("Data loaded")

    @property
    def scaler(self):
        # create the scaler (and fit to training data) if not existent
        if self._scaler is None:
            filename = os.path.join(self.project_dir, "scaler.pkl")
            try:
                self._scaler = joblib.load(filename)
                logger.info("Loaded existing StandardScaler from {}".format(filename))
            except IOError:
                logger.info("Creating new StandardScaler")
                self._scaler = StandardScaler()
                logger.info("Fitting StandardScaler to training data")
                # fit on the training data only - fitting on the test data as
                # well would overwrite the training fit and leak information
                # about the test set
                self._scaler.fit(self.x_train)
                joblib.dump(self._scaler, filename)
        return self._scaler

    def _read_info(self, key, default):
        filename = os.path.join(self.project_dir, "info.json")
        if not os.path.exists(filename):
            with open(filename, "w") as of:
                json.dump({}, of)
        with open(filename) as f:
            info = json.load(f)
        return info.get(key, default)

    def _write_info(self, key, value):
        filename = os.path.join(self.project_dir, "info.json")
        with open(filename) as f:
            info = json.load(f)
        info[key] = value
        with open(filename, "w") as of:
            json.dump(info, of)

    @property
    def model(self):
        "Simple MLP"
        if self._model is None:
            self._model = Sequential()
            # first hidden layer
            self._model.add(Dense(self.nodes, input_dim=len(self.branches),
                                  activation=self.activation_function))
            # the other hidden layers
            for layer_number in range(self.layers-1):
                self._model.add(Dense(self.nodes, activation=self.activation_function))
            # last layer is one neuron (binary classification)
            self._model.add(Dense(1, activation='sigmoid'))
            logger.info("Compile model")
            self._model.compile(optimizer='SGD',
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            # dump to json for documentation
            with open(os.path.join(self.project_dir, "model.json"), "w") as of:
                of.write(self._model.to_json())
        return self._model

    @property
    def class_weight(self):
        if self._class_weight is None:
            sumw_bkg = np.sum(self.w_train[self.y_train == 0])
            sumw_sig = np.sum(self.w_train[self.y_train == 1])
            # balance the classes such that both contribute the same effective
            # sum of weights (keras expects a dict indexed by class label)
            self._class_weight = {0: (sumw_sig+sumw_bkg)/(2*sumw_bkg),
                                  1: (sumw_sig+sumw_bkg)/(2*sumw_sig)}
        return self._class_weight
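
    # Worked example for class_weight (hypothetical numbers): with
    # sumw_sig = 10.0 and sumw_bkg = 90.0 the property above yields
    #   {0: 100/180 ~ 0.56, 1: 100/20 = 5.0}
    # i.e. the rarer signal class is upweighted so that both classes
    # contribute equally to the loss.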

    def train(self, epochs=10):

        if not self.data_loaded:
            self._load_data()

        try:
            self.model.load_weights(os.path.join(self.project_dir, "weights.h5"))
            logger.info("Weights found and loaded")
            logger.info("Continue training")
        except IOError:
            logger.info("No weights found, starting completely new training")

        self.total_epochs = self._read_info("epochs", 0)

        logger.info("Train model")
        # scale the inputs - accessing the scaler property creates and fits
        # the StandardScaler on first use
        x_train_scaled = self.scaler.transform(self.x_train)
        self.model.fit(x_train_scaled,
                       self.y_train,
                       epochs=epochs,
                       class_weight=self.class_weight,
                       shuffle=True,
                       batch_size=self.batch_size)

        logger.info("Save weights")
        self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))

        self.total_epochs += epochs
        self._write_info("epochs", self.total_epochs)

        logger.info("Create scores for ROC curve")
        self.scores_test = self.model.predict(self.scaler.transform(self.x_test))
        self.scores_train = self.model.predict(x_train_scaled)

    def evaluate(self):
        pass

    def writeFriendTree(self):
        pass

    def plotROC(self):

        logger.info("Plot ROC curve")
        fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test,
                                        sample_weight=self.w_test)

        plt.grid(color='gray', linestyle='--', linewidth=1)
        plt.plot(fpr, tpr, label='NN')
        plt.plot([0, 1], [0, 1], linestyle='--', color='black', label='Luck')
        plt.xlabel("False positive rate (background efficiency)")
        plt.ylabel("True positive rate (signal efficiency)")
        plt.title('Receiver operating characteristic')
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.xticks(np.arange(0, 1.1, 0.1))
        plt.yticks(np.arange(0, 1.1, 0.1))
        plt.legend(loc='lower left', framealpha=1.0)
        plt.savefig(os.path.join(self.project_dir, "ROC.pdf"))
        plt.clf()

    def plotScore(self):
        pass


if __name__ == "__main__":

    logging.basicConfig()
    logging.getLogger("KerasROOTClassification").setLevel(logging.INFO)

    filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"

    c = KerasROOTClassification("test",
                                signal_trees=[(filename, "GG_oneStep_1705_1105_505_NoSys")],
                                bkg_trees=[(filename, "ttbar_NoSys"),
                                           (filename, "wjets_Sherpa221_NoSys")],
                                branches=["met", "mt"],
                                weight_expr="eventWeight*genWeight",
                                identifiers=["DatasetNumber", "EventNumber"])

    c.train(epochs=1)
    c.plotROC()
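
# A minimal sketch (not part of the class) for re-using a trained classifier
# elsewhere, assuming the default out_dir and the project name "test" from the
# example above - adjust the paths if you used different settings:
#
#   from keras.models import model_from_json
#   from sklearn.externals import joblib
#   with open("outputs/test/model.json") as f:
#       model = model_from_json(f.read())
#   model.load_weights("outputs/test/weights.h5")
#   scaler = joblib.load("outputs/test/scaler.pkl")
#   scores = model.predict(scaler.transform(x))  # x: (n_events, n_branches)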