#!/usr/bin/env python
import os
import json
import logging

logger = logging.getLogger("KerasROOTClassification")
logger.addHandler(logging.NullHandler())

from root_numpy import tree2array, rec2array
import numpy as np
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json

# configure number of cores
# this doesn't seem to work, but at least with these settings keras only uses 4 processes
import tensorflow as tf
from keras import backend as K
num_cores = 1
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                        inter_op_parallelism_threads=num_cores,
                        allow_soft_placement=True,
                        device_count={'CPU': num_cores})
session = tf.Session(config=config)
K.set_session(session)

import ROOT


class KerasROOTClassification:

    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]

    def __init__(self, name, signal_trees, bkg_trees, branches, weight_expr, identifiers,
                 layers=3, nodes=64, batch_size=128, activation_function='relu',
                 out_dir="./outputs"):
        self.name = name
        self.signal_trees = signal_trees
        self.bkg_trees = bkg_trees
        self.branches = branches
        self.weight_expr = weight_expr
        self.identifiers = identifiers
        self.layers = layers
        self.nodes = nodes
        self.batch_size = batch_size
        self.activation_function = activation_function
        self.out_dir = out_dir

        self.project_dir = os.path.join(self.out_dir, name)

        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)
        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

        self.s_train = None
        self.b_train = None
        self.s_test = None
        self.b_test = None

        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

        self.s_eventlist_train = None
        self.b_eventlist_train = None

        self._scaler = None
        self._class_weight = None
        self._model = None

        # track the number of epochs this model has been trained
        self.total_epochs = 0

        self.data_loaded = False

    def _load_data(self):

        try:
            self._load_from_hdf5()
        except KeyError:
            logger.info("Couldn't load all datasets - reading from ROOT trees")

            # Read signal and background trees into structured numpy arrays
            signal_chain = ROOT.TChain()
            bkg_chain = ROOT.TChain()
            for filename, treename in self.signal_trees:
                signal_chain.AddFile(filename, -1, treename)
            for filename, treename in self.bkg_trees:
                bkg_chain.AddFile(filename, -1, treename)

            self.s_train = tree2array(signal_chain,
                                      branches=self.branches+[self.weight_expr]+self.identifiers,
                                      start=0, step=2)
            self.b_train = tree2array(bkg_chain,
                                      branches=self.branches+[self.weight_expr]+self.identifiers,
                                      start=0, step=2)
            self.s_test = tree2array(signal_chain,
                                     branches=self.branches+[self.weight_expr],
                                     start=1, step=2)
            self.b_test = tree2array(bkg_chain,
                                     branches=self.branches+[self.weight_expr],
                                     start=1, step=2)

            self._dump_training_list()
            self.s_eventlist_train = self.s_train[self.identifiers]
            self.b_eventlist_train = self.b_train[self.identifiers]

            # now we don't need the identifiers anymore
            self.s_train = self.s_train[self.branches+[self.weight_expr]]
            self.b_train = self.b_train[self.branches+[self.weight_expr]]

            # create x (input), y (target) and w (weights) arrays
            # the first block will be signal, the second block background
            self.x_train = rec2array(self.s_train[self.branches])
            self.x_train = np.concatenate((self.x_train, rec2array(self.b_train[self.branches])))
            self.x_test = rec2array(self.s_test[self.branches])
            self.x_test = np.concatenate((self.x_test, rec2array(self.b_test[self.branches])))
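            # Note (added for clarity): the train/test split above is deterministic --
            # even tree entries (start=0, step=2) become the training arrays, odd
            # entries (start=1, step=2) the test arrays, so re-reading the same trees
            # reproduces exactly the same split.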
            self.w_train = self.s_train[self.weight_expr]
            self.w_train = np.concatenate((self.w_train, self.b_train[self.weight_expr]))
            self.w_test = self.s_test[self.weight_expr]
            self.w_test = np.concatenate((self.w_test, self.b_test[self.weight_expr]))

            self.y_train = np.empty(len(self.x_train))
            self.y_train[:len(self.s_train)] = 1
            self.y_train[len(self.s_train):] = 0
            self.y_test = np.empty(len(self.x_test))
            self.y_test[:len(self.s_test)] = 1
            self.y_test[len(self.s_test):] = 0

            logger.info("Writing to hdf5")
            self._dump_to_hdf5()

        self.data_loaded = True

    def _dump_training_list(self):
        s_eventlist = pd.DataFrame(self.s_train[self.identifiers])
        b_eventlist = pd.DataFrame(self.b_train[self.identifiers])
        s_eventlist.to_csv(os.path.join(self.project_dir, "s_eventlist_train.csv"))
        # fixed: the background event list is written from b_eventlist (previously
        # s_eventlist was written to both files)
        b_eventlist.to_csv(os.path.join(self.project_dir, "b_eventlist_train.csv"))

    def _dump_to_hdf5(self):
        for dataset_name in self.dataset_names:
            with h5py.File(os.path.join(self.project_dir, dataset_name+".h5"), "w") as hf:
                hf.create_dataset(dataset_name, data=getattr(self, dataset_name))

    def _load_from_hdf5(self):
        for dataset_name in self.dataset_names:
            filename = os.path.join(self.project_dir, dataset_name+".h5")
            logger.info("Trying to load {} from {}".format(dataset_name, filename))
            with h5py.File(filename) as hf:
                setattr(self, dataset_name, hf[dataset_name][:])
        logger.info("Data loaded")

    @property
    def scaler(self):
        # create the scaler (and fit to training data) if not existent
        if self._scaler is None:
            filename = os.path.join(self.project_dir, "scaler.pkl")
            try:
                self._scaler = joblib.load(filename)
                logger.info("Loaded existing StandardScaler from {}".format(filename))
            except IOError:
                logger.info("Creating new StandardScaler")
                self._scaler = StandardScaler()
                logger.info("Fitting StandardScaler to training data")
                self._scaler.fit(self.x_train)
                joblib.dump(self._scaler, filename)
        return self._scaler

    def _read_info(self, key, default):
        filename = os.path.join(self.project_dir, "info.json")
        if not os.path.exists(filename):
            with open(filename, "w") as of:
                json.dump({}, of)
        with open(filename) as f:
            info = json.load(f)
        return info.get(key, default)

    def _write_info(self, key, value):
        filename = os.path.join(self.project_dir, "info.json")
        with open(filename) as f:
            info = json.load(f)
        info[key] = value
        with open(filename, "w") as of:
            json.dump(info, of)

    @property
    def model(self):
        "Simple MLP"
        if self._model is None:
            self._model = Sequential()
            # first hidden layer
            self._model.add(Dense(self.nodes, input_dim=len(self.branches),
                                  activation=self.activation_function))
            # the other hidden layers
            for layer_number in range(self.layers-1):
                self._model.add(Dense(self.nodes, activation=self.activation_function))
            # last layer is one neuron (binary classification)
            self._model.add(Dense(1, activation='sigmoid'))
            self._model.compile(optimizer='SGD',
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            # dump to json for documentation
            with open(os.path.join(self.project_dir, "model.json"), "w") as of:
                of.write(self._model.to_json())
        return self._model

    @property
    def class_weight(self):
        if self._class_weight is None:
            sumw_bkg = np.sum(self.w_train[self.y_train == 0])
            sumw_sig = np.sum(self.w_train[self.y_train == 1])
            # Keras expects class_weight as a dict {class index: weight};
            # scale both classes to the same total effective weight
            self._class_weight = {0: (sumw_sig+sumw_bkg)/(2*sumw_bkg),
                                  1: (sumw_sig+sumw_bkg)/(2*sumw_sig)}
        return self._class_weight
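    # Worked example (for illustration only): with a total signal weight
    # sumw_sig = 100 and a total background weight sumw_bkg = 900, class_weight
    # evaluates to roughly {0: 0.56, 1: 5.0}, so signal and background each
    # contribute half of the total effective weight to the loss.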
logger.info("Continue training") except IOError: logger.info("No weights found, starting completely new training") self.total_epochs = self._read_info("epochs", 0) self.model.fit(self.x_train, self.y_train, epochs=epochs, class_weight=self.class_weight, shuffle=True, batch_size=self.batch_size) self.model.save_weights(os.path.join(self.project_dir, "weights.h5")) self.total_epochs += epochs self._write_info("epochs", self.total_epochs) def evaluate(self): pass def writeFriendTree(self): pass def plotROC(self): pass def plotScore(self): pass if __name__ == "__main__": logging.basicConfig() logging.getLogger("KerasROOTClassification").setLevel(logging.INFO) filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root" c = KerasROOTClassification("test", signal_trees = [(filename, "GG_oneStep_1705_1105_505_NoSys")], bkg_trees = [(filename, "ttbar_NoSys"), (filename, "wjets_Sherpa221_NoSys") ], branches = ["met", "mt"], weight_expr = "eventWeight*genWeight", identifiers = ["DatasetNumber", "EventNumber"]) c.train(epochs=1)