
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source and target projects:
  • Eric.Schanet/KerasROOTClassification
  • Nikolai.Hartmann/KerasROOTClassification
Commits on Source (4)
@@ -4,3 +4,4 @@ run.py
*.swp
*.pyc
*.pdf
+*.txt
#!/usr/bin/env python
""" Class that keeps track of all genomes trained so far, and their scores.
Among other things, ensures that genomes are unique.
"""
import random
import logging
from genome import Genome
class AllGenomes(object):
"""Store all genomes
"""
def __init__(self, firstgenome):
"""Initialize
"""
self.population = []
self.population.append(firstgenome)
    def add_genome(self, genome):
        """Add the genome to our population, unless its hash is already present.
        """
        for existing in self.population:
            if genome.hash == existing.hash:
logging.info("add_genome() ERROR: hash clash - duplicate genome")
return False
self.population.append(genome)
return True
    def set_accuracy(self, genome):
        """Set the accuracy of the matching genome in our population.
        """
        for existing in self.population:
            if genome.hash == existing.hash:
                existing.accuracy = genome.accuracy
return
logging.info("set_accuracy() ERROR: Genome not found")
    def is_duplicate(self, genome):
        """Check whether a genome with the same hash is already in the population.
        """
        for existing in self.population:
            if genome.hash == existing.hash:
return True
return False
    def print_all_genomes(self):
        """Print out every genome in the population.
        """
for genome in self.population:
genome.print_genome_ma()
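A minimal usage sketch for AllGenomes (not part of the diff; the DummyGenome stand-in is hypothetical, since AllGenomes only relies on a .hash attribute and a print_genome_ma() method):

# Hypothetical stand-in for Genome; AllGenomes only needs .hash and .print_genome_ma().
from allgenomes import AllGenomes

class DummyGenome(object):
    def __init__(self, params):
        self.params = params
        self.hash = str(sorted(params.items()))  # any stable, content-based key works
        self.accuracy = 0.0
    def print_genome_ma(self):
        print(self.params, self.accuracy)

master = AllGenomes(DummyGenome({'nb_layers': 2}))
print(master.add_genome(DummyGenome({'nb_layers': 3})))   # True: new hash
print(master.add_genome(DummyGenome({'nb_layers': 3})))   # False: duplicate (logged at INFO level)
print(master.is_duplicate(DummyGenome({'nb_layers': 2}))) # True: same hash as the first genome
master.print_all_genomes()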
#!/usr/bin/env python
"""
Class that holds a genetic algorithm for evolving a network.
Inspiration:
http://lethain.com/genetic-algorithms-cool-name-damn-simple/
"""
from __future__ import print_function
import random
import logging
import copy
from functools import reduce
from operator import add
from genome import Genome
from idgen import IDgen
from allgenomes import AllGenomes
class Evolver(object):
"""Class that implements genetic algorithm."""
def __init__(self, all_possible_genes, retain=0.15, random_select=0.1, mutate_chance=0.3):
"""Create an optimizer.
Args:
all_possible_genes (dict): Possible genome parameters
retain (float): Percentage of population to retain after
each generation
random_select (float): Probability of a rejected genome
remaining in the population
mutate_chance (float): Probability a genome will be
randomly mutated
"""
self.all_possible_genes = all_possible_genes
self.retain = retain
self.random_select = random_select
self.mutate_chance = mutate_chance
#set the ID gen
self.ids = IDgen()
def create_population(self, count):
"""Create a population of random networks.
Args:
count (int): Number of networks to generate, aka the
size of the population
Returns:
            (list): Population of genome objects
"""
pop = []
i = 0
while i < count:
# Initialize a new genome.
genome = Genome( self.all_possible_genes, {}, self.ids.get_next_ID(), 0, 0, self.ids.get_Gen() )
# Set it to random parameters.
genome.set_genes_random()
if i == 0:
#this is where we will store all genomes
self.master = AllGenomes( genome )
else:
# Make sure it is unique....
while self.master.is_duplicate( genome ):
genome.mutate_one_gene()
# Add the genome to our population.
pop.append(genome)
# and add to the master list
if i > 0:
self.master.add_genome(genome)
i += 1
#self.master.print_all_genomes()
#exit()
return pop
@staticmethod
def fitness(genome):
"""Return the accuracy, which is our fitness function."""
return genome.accuracy
def grade(self, pop):
"""Find average fitness for a population.
Args:
pop (list): The population of networks/genome
Returns:
(float): The average accuracy of the population
"""
summed = reduce(add, (self.fitness(genome) for genome in pop))
return summed / float((len(pop)))
def breed(self, mom, dad):
"""Make two children from parental genes.
Args:
            mom (Genome): A parent genome
            dad (Genome): The other parent genome
        Returns:
            (list): Two genome objects
"""
children = []
#where do we recombine? 0, 1, 2, 3, 4... N?
#with four genes, there are three choices for the recombination
# ___ * ___ * ___ * ___
#0 -> no recombination, and N == length of dictionary -> no recombination
#0 and 4 just (re)create more copies of the parents
#so the range is always 1 to len(all_possible_genes) - 1
pcl = len(self.all_possible_genes)
recomb_loc = random.randint(1,pcl - 1)
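        # Worked example (hypothetical numbers): with pcl == 4 genes and
        # recomb_loc == 2, child1 takes keys[0:2] from mom and keys[2:4] from dad,
        # while child2 takes keys[0:2] from dad and keys[2:4] from mom.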
#for _ in range(2): #make _two_ children - could also make more
child1 = {}
child2 = {}
        # enforce a defined genome order by sorting the keys
        # keys = ['nb_neurons', 'nb_layers', 'activation', 'optimizer']
        keys = sorted(self.all_possible_genes)  # sorted to avoid unintentional randomization
#*** CORE RECOMBINATION CODE ****
for x in range(0, pcl):
if x < recomb_loc:
child1[keys[x]] = mom.geneparam[keys[x]]
child2[keys[x]] = dad.geneparam[keys[x]]
else:
child1[keys[x]] = dad.geneparam[keys[x]]
child2[keys[x]] = mom.geneparam[keys[x]]
# Initialize a new genome
# Set its parameters to those just determined
# they both have the same mom and dad
genome1 = Genome( self.all_possible_genes, child1, self.ids.get_next_ID(), mom.u_ID, dad.u_ID, self.ids.get_Gen() )
genome2 = Genome( self.all_possible_genes, child2, self.ids.get_next_ID(), mom.u_ID, dad.u_ID, self.ids.get_Gen() )
#at this point, there is zero guarantee that the genome is actually unique
# Randomly mutate one gene
if self.mutate_chance > random.random():
genome1.mutate_one_gene()
if self.mutate_chance > random.random():
genome2.mutate_one_gene()
#do we have a unique child or are we just retraining one we already have anyway?
while self.master.is_duplicate(genome1):
genome1.mutate_one_gene()
self.master.add_genome(genome1)
while self.master.is_duplicate(genome2):
genome2.mutate_one_gene()
self.master.add_genome(genome2)
children.append(genome1)
children.append(genome2)
return children
def evolve(self, pop):
"""Evolve a population of genomes.
Args:
            pop (list): A list of Genome objects
        Returns:
            (list): The evolved population of genomes
"""
#increase generation
self.ids.increase_Gen()
# Get scores for each genome
graded = [(self.fitness(genome), genome) for genome in pop]
#and use those scores to fill in the master list
for genome in pop:
self.master.set_accuracy(genome)
# Sort on the scores.
graded = [x[1] for x in sorted(graded, key=lambda x: x[0], reverse=True)]
# Get the number we want to keep unchanged for the next cycle.
retain_length = int(len(graded)*self.retain)
# In this first step, we keep the 'top' X percent (as defined in self.retain)
# We will not change them, except we will update the generation
new_generation = graded[:retain_length]
# For the lower scoring ones, randomly keep some anyway.
# This is wasteful, since we _know_ these are bad, so why keep rescoring them without modification?
# At least we should mutate them
for genome in graded[retain_length:]:
if self.random_select > random.random():
gtc = copy.deepcopy(genome)
while self.master.is_duplicate(gtc):
gtc.mutate_one_gene()
gtc.set_generation( self.ids.get_Gen() )
new_generation.append(gtc)
self.master.add_genome(gtc)
# Now find out how many spots we have left to fill.
ng_length = len(new_generation)
desired_length = len(pop) - ng_length
children = []
# Add children, which are bred from pairs of remaining (i.e. very high or lower scoring) genomes.
while len(children) < desired_length:
            # Get a random mom and dad; random.sample guarantees two distinct
            # indices, and range(ng_length) lets the last genome be picked too.
            parents = random.sample(range(ng_length), k=2)
i_male = parents[0]
i_female = parents[1]
male = new_generation[i_male]
female = new_generation[i_female]
# Recombine and mutate
babies = self.breed(male, female)
# the babies are guaranteed to be novel
# Add the children one at a time.
for baby in babies:
# Don't grow larger than desired length.
#if len(children) < desired_length:
children.append(baby)
new_generation.extend(children)
return new_generation
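A standalone sketch of the single-point crossover that Evolver.breed() performs, shown on plain dicts (the gene names and values below are illustrative, not the actual search space):

import random

mom = {'activation': 'relu', 'lr': 0.01, 'momentum': 0.9, 'nb_layers': 3}
dad = {'activation': 'tanh', 'lr': 0.10, 'momentum': 0.5, 'nb_layers': 6}

keys = sorted(mom)  # fixed gene order, as in breed()
# 0 and len(keys) would only clone the parents, hence the range 1..len(keys)-1
recomb_loc = random.randint(1, len(keys) - 1)

child1 = {k: (mom if i < recomb_loc else dad)[k] for i, k in enumerate(keys)}
child2 = {k: (dad if i < recomb_loc else mom)[k] for i, k in enumerate(keys)}
print(recomb_loc, child1, child2)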
#!/usr/bin/env python
"""The genome to be evolved."""
import random
import logging
import hashlib
import copy
from train import train_and_score
class Genome(object):
"""
Represents one genome and all relevant utility functions (add, mutate, etc.).
"""
    def __init__(self, all_possible_genes=None, geneparam=None, u_ID=0, mom_ID=0, dad_ID=0, gen=0):
"""Initialize a genome.
Args:
all_possible_genes (dict): Parameters for the genome, includes:
gene_nb_neurons (list): [64, 128, 256]
gene_nb_layers (list): [1, 2, 3, 4]
gene_activation (list): ['relu', 'elu']
gene_optimizer (list): ['rmsprop', 'adam']
"""
self.accuracy = 0.0
self.all_possible_genes = all_possible_genes
        self.geneparam = geneparam if geneparam is not None else {}  # (dict): actual genome parameters; avoids a shared mutable default
self.u_ID = u_ID
self.parents = [mom_ID, dad_ID]
self.generation = gen
#hash only makes sense when we have specified the genes
if not geneparam:
self.hash = 0
else:
self.update_hash()
def update_hash(self):
"""
        Refresh each genome's unique hash - needs to run after any genome changes.
"""
# + str(self.geneparam['optimizer'])
genh = str(self.geneparam['nb_neurons']) + self.geneparam['activation'] \
+ str(self.geneparam['nb_layers']) \
+ str(self.geneparam['lr']) \
+ str(self.geneparam['decay']) \
+ str(self.geneparam['momentum'])
self.hash = hashlib.md5(genh.encode("UTF-8")).hexdigest()
self.accuracy = 0.0
def set_genes_random(self):
"""Create a random genome."""
#print("set_genes_random")
self.parents = [0,0] #very sad - no parents :(
for key in self.all_possible_genes:
self.geneparam[key] = random.choice(self.all_possible_genes[key])
self.update_hash()
    def mutate_one_gene(self):
        """Randomly mutate one gene in the genome, in place.
        The new value is guaranteed to differ from the current one.
        """
# Which gene shall we mutate? Choose one of N possible keys/genes.
gene_to_mutate = random.choice( list(self.all_possible_genes.keys()) )
# And then let's mutate one of the genes.
# Make sure that this actually creates mutation
current_value = self.geneparam[gene_to_mutate]
possible_choices = copy.deepcopy(self.all_possible_genes[gene_to_mutate])
possible_choices.remove(current_value)
self.geneparam[gene_to_mutate] = random.choice( possible_choices )
self.update_hash()
    def set_generation(self, generation):
        """Needed when a genome is passed on from one generation to the next:
        the ID stays the same, but the generation is increased."""
self.generation = generation
#logging.info("Setting Generation to %d" % self.generation)
    def set_genes_to(self, geneparam, mom_ID, dad_ID):
        """Set genome properties; used when breeding kids.
        Args:
            geneparam (dict): The genome parameters
            mom_ID (int): unique ID of the first parent
            dad_ID (int): unique ID of the second parent
        """
self.parents = [mom_ID, dad_ID]
self.geneparam = geneparam
self.update_hash()
    def train(self):
        """Train the genome and record the accuracy."""
if self.accuracy == 0.0: #don't bother retraining ones we already trained
self.accuracy = train_and_score(self.geneparam)
def print_genome(self):
"""Print out a genome."""
logging.info(self.geneparam)
logging.info("Acc: %.2f%%" % (self.accuracy * 100))
logging.info("UniID: %d" % self.u_ID)
logging.info("Mom and Dad: %d %d" % (self.parents[0], self.parents[1]))
logging.info("Gen: %d" % self.generation)
logging.info("Hash: %s" % self.hash)
def print_genome_ma(self):
"""Print out a genome."""
logging.info(self.geneparam)
logging.info("Acc: %.2f%% UniID: %d Mom and Dad: %d %d Gen: %d" % (self.accuracy * 100, self.u_ID, self.parents[0], self.parents[1], self.generation))
logging.info("Hash: %s" % self.hash)
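The hash that drives every uniqueness check above is just the MD5 digest of the concatenated gene values. A self-contained sketch of what update_hash() computes (the parameter values are illustrative):

import hashlib

geneparam = {'nb_neurons': 64, 'activation': 'relu', 'nb_layers': 2,
             'lr': 0.01, 'decay': 1e-4, 'momentum': 0.9}
genh = (str(geneparam['nb_neurons']) + geneparam['activation']
        + str(geneparam['nb_layers']) + str(geneparam['lr'])
        + str(geneparam['decay']) + str(geneparam['momentum']))
print(hashlib.md5(genh.encode("UTF-8")).hexdigest())  # identical params give an identical hash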
#!/usr/bin/env python
"""Provide unique genome IDs."""
import logging
class IDgen():
"""Generate unique IDs.
"""
def __init__(self):
"""Keep track of IDs.
"""
self.currentID = 0
self.currentGen = 1
    def get_next_ID(self):
        """Return the next unique genome ID."""
        self.currentID += 1
        return self.currentID

    def increase_Gen(self):
        """Advance the generation counter."""
        self.currentGen += 1

    def get_Gen(self):
        """Return the current generation."""
        return self.currentGen
#!/usr/bin/env python
"""Entry point to evolving the neural network. Start here."""
from __future__ import print_function
from evolver import Evolver
from tqdm import tqdm
import logging
import sys
# Setup logging.
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO,
filename='log.txt'
)
def train_genomes(genomes):
"""Train each genome.
Args:
        genomes (list): Current population of genomes
"""
    logging.info("***train_genomes(genomes)***")
pbar = tqdm(total=len(genomes))
for genome in genomes:
genome.train()
pbar.update(1)
pbar.close()
def get_average_accuracy(genomes):
"""Get the average accuracy for a group of networks/genomes.
Args:
networks (list): List of networks/genomes
Returns:
float: The average accuracy of a population of networks/genomes.
"""
total_accuracy = 0
for genome in genomes:
total_accuracy += genome.accuracy
return total_accuracy / len(genomes)
def generate(generations, population, all_possible_genes):
"""Generate a network with the genetic algorithm.
Args:
generations (int): Number of times to evolve the population
population (int): Number of networks in each generation
all_possible_genes (dict): Parameter choices for networks
"""
logging.info("***generate(generations, population, all_possible_genes)***")
evolver = Evolver(all_possible_genes)
genomes = evolver.create_population(population)
# Evolve the generation.
for i in range( generations ):
logging.info("***Now in generation %d of %d***" % (i + 1, generations))
print_genomes(genomes)
# Train and get accuracy for networks/genomes.
train_genomes(genomes)
# Get the average accuracy for this generation.
average_accuracy = get_average_accuracy(genomes)
# Print out the average accuracy each generation.
logging.info("Generation average: %.2f%%" % (average_accuracy * 100))
logging.info('-'*80) #-----------
# Evolve, except on the last iteration.
if i != generations - 1:
# Evolve!
genomes = evolver.evolve(genomes)
# Sort our final population according to performance.
genomes = sorted(genomes, key=lambda x: x.accuracy, reverse=True)
# Print out the top 5 networks/genomes.
print_genomes(genomes[:5])
#save_path = saver.save(sess, '/output/model.ckpt')
#print("Model saved in file: %s" % save_path)
def print_genomes(genomes):
"""Print a list of genomes.
Args:
genomes (list): The population of networks/genomes
"""
logging.info('-'*80)
for genome in genomes:
genome.print_genome()
def main():
"""Evolve a genome."""
population = 20 # Number of networks/genomes in each generation.
#we only need to train the new ones....
generations = 2 # Number of times to evolve the population.
all_possible_genes = {
'nb_neurons': [8, 16, 32, 64, 128, 256, 512, 1024],
'nb_layers': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'activation': ['relu', 'elu', 'tanh', 'sigmoid', 'hard_sigmoid','softplus','linear'],
#'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad','adadelta', 'adamax', 'nadam']
'lr': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0, 10.0, 100.0],
'decay': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
'momentum': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
print("***Evolving for %d generations with population size = %d***" % (generations, population))
generate(generations, population, all_possible_genes)
if __name__ == '__main__':
main()
#!/usr/bin/env python
import logging
from optimizer import Optimizer
from tqdm import tqdm
# Setup logging.
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.DEBUG,
filename='log.txt'
)
def train_networks(networks):
"""Train each network.
Args:
networks (list): Current population of networks
"""
pbar = tqdm(total=len(networks))
for network in networks:
network.train()
pbar.update(1)
pbar.close()
def get_average_accuracy(networks):
"""Get the average accuracy for a group of networks.
Args:
networks (list): List of networks
Returns:
float: The average accuracy of a population of networks.
"""
total_accuracy = 0
for network in networks:
total_accuracy += network.accuracy
return total_accuracy / len(networks)
def generate(generations, population, nn_param_choices):
"""Generate a network with the genetic algorithm.
Args:
        generations (int): Number of times to evolve the population
population (int): Number of networks in each generation
nn_param_choices (dict): Parameter choices for networks
"""
optimizer = Optimizer(nn_param_choices)
networks = optimizer.create_population(population)
# Evolve the generation.
for i in range(generations):
logging.info("***Doing generation %d of %d***" %
(i + 1, generations))
# Train and get accuracy for networks.
train_networks(networks)
# Get the average accuracy for this generation.
average_accuracy = get_average_accuracy(networks)
# Print out the average accuracy each generation.
logging.info("Generation average: %.2f%%" % (average_accuracy * 100))
logging.info('-'*80)
# Evolve, except on the last iteration.
if i != generations - 1:
# Do the evolution.
networks = optimizer.evolve(networks)
# Sort our final population.
networks = sorted(networks, key=lambda x: x.accuracy, reverse=True)
# Print out the top 5 networks.
print_networks(networks[:5])
def print_networks(networks):
"""Print a list of networks.
Args:
networks (list): The population of networks
"""
logging.info('-'*80)
for network in networks:
network.print_network()
def main():
"""Evolve a network."""
generations = 7 # Number of times to evolve the population.
population = 5 # Number of networks in each generation.
nn_param_choices = {
'nb_neurons': [8, 16, 32, 64, 128, 256, 512, 768, 1024],
'nb_layers': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
#'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
# 'adadelta', 'adamax', 'nadam'],
#'optimizer_opts': {'lr': [0.1, 0.5, 1.0, 10.0, 100.0],
# 'decay': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
# 'momentum': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
# 0.8, 0.9, 1.0]},
'lr': [0.1, 0.5, 1.0, 10.0, 100.0],
'decay': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
'momentum': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
logging.info("***Evolving %d generations with population %d***" %
(generations, population))
generate(generations, population, nn_param_choices)
if __name__ == '__main__':
main()
#!/usr/bin/env python
"""Class that represents the network to be evolved."""
import random
import logging
from train import train_and_score
class Network(object):
"""Represent a network and let us operate on it.
Currently only works for an MLP.
"""
def __init__(self, nn_param_choices=None):
"""Initialize our network.
Args:
nn_param_choices (dict): Parameters for the network, includes:
nb_neurons (list): [64, 128, 256]
nb_layers (list): [1, 2, 3, 4]
activation (list): ['relu', 'elu']
optimizer (list): ['rmsprop', 'adam']
                optimizer_opts (dict of lists): {'lr': [0.5, ...], 'decay': [...], 'momentum': [...]}
"""
self.accuracy = 0.
self.nn_param_choices = nn_param_choices
self.network = {} # (dic): represents MLP network parameters
def create_random(self):
"""Create a random network."""
for key in self.nn_param_choices:
self.network[key] = random.choice(self.nn_param_choices[key])
def create_set(self, network):
"""Set network properties.
Args:
network (dict): The network parameters
"""
self.network = network
    def train(self):
        """Train the network and record the accuracy."""
if self.accuracy == 0.:
self.accuracy = train_and_score(self.network)
def print_network(self):
"""Print out a network."""
logging.info(self.network)
logging.info("Network accuracy: %.2f%%" % (self.accuracy * 100))
#!/usr/bin/env python
"""
Class that holds a genetic algorithm for evolving a network.
Credit:
    A lot of this code was originally inspired by:
http://lethain.com/genetic-algorithms-cool-name-damn-simple/
"""
from functools import reduce
from operator import add
import random
from network import Network
class Optimizer(object):
"""Class that implements genetic algorithm for MLP optimization."""
def __init__(self, nn_param_choices, retain=0.4,
random_select=0.1, mutate_chance=0.2):
"""Create an optimizer.
Args:
            nn_param_choices (dict): Possible network parameters
retain (float): Percentage of population to retain after
each generation
random_select (float): Probability of a rejected network
remaining in the population
mutate_chance (float): Probability a network will be
randomly mutated
"""
self.mutate_chance = mutate_chance
self.random_select = random_select
self.retain = retain
self.nn_param_choices = nn_param_choices
def create_population(self, count):
"""Create a population of random networks.
Args:
count (int): Number of networks to generate, aka the
size of the population
Returns:
(list): Population of network objects
"""
pop = []
for _ in range(0, count):
# Create a random network.
network = Network(self.nn_param_choices)
network.create_random()
# Add the network to our population.
pop.append(network)
return pop
@staticmethod
def fitness(network):
"""Return the accuracy, which is our fitness function."""
return network.accuracy
def grade(self, pop):
"""Find average fitness for a population.
Args:
pop (list): The population of networks
Returns:
(float): The average accuracy of the population
"""
summed = reduce(add, (self.fitness(network) for network in pop))
return summed / float((len(pop)))
    def breed(self, mother, father):
        """Make two children from parts of their parents.
Args:
mother (dict): Network parameters
father (dict): Network parameters
Returns:
(list): Two network objects
"""
children = []
for _ in range(2):
child = {}
# Loop through the parameters and pick params for the kid.
for param in self.nn_param_choices:
child[param] = random.choice(
[mother.network[param], father.network[param]]
)
# Now create a network object.
network = Network(self.nn_param_choices)
network.create_set(child)
# Randomly mutate some of the children.
if self.mutate_chance > random.random():
network = self.mutate(network)
children.append(network)
return children
def mutate(self, network):
"""Randomly mutate one part of the network.
Args:
network (dict): The network parameters to mutate
Returns:
(Network): A randomly mutated network object
"""
# Choose a random key.
mutation = random.choice(list(self.nn_param_choices.keys()))
# Mutate one of the params.
network.network[mutation] = random.choice(self.nn_param_choices[mutation])
return network
def evolve(self, pop):
"""Evolve a population of networks.
Args:
pop (list): A list of network parameters
Returns:
(list): The evolved population of networks
"""
# Get scores for each network.
graded = [(self.fitness(network), network) for network in pop]
# Sort on the scores.
graded = [x[1] for x in sorted(graded, key=lambda x: x[0], reverse=True)]
# Get the number we want to keep for the next gen.
retain_length = int(len(graded)*self.retain)
# The parents are every network we want to keep.
parents = graded[:retain_length]
# For those we aren't keeping, randomly keep some anyway.
for individual in graded[retain_length:]:
if self.random_select > random.random():
parents.append(individual)
# Now find out how many spots we have left to fill.
parents_length = len(parents)
desired_length = len(pop) - parents_length
children = []
# Add children, which are bred from two remaining networks.
while len(children) < desired_length:
# Get a random mom and dad.
male = random.randint(0, parents_length-1)
female = random.randint(0, parents_length-1)
# Assuming they aren't the same network...
if male != female:
male = parents[male]
female = parents[female]
# Breed them.
babies = self.breed(male, female)
# Add the children one at a time.
for baby in babies:
# Don't grow larger than desired length.
if len(children) < desired_length:
children.append(baby)
parents.extend(children)
return parents
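A quick numeric illustration of the selection step in Optimizer.evolve(), using the default retain=0.4 on a population of 5 (the accuracies are made up):

accuracies = [0.81, 0.76, 0.64, 0.52, 0.40]  # graded population, best first
retain_length = int(len(accuracies) * 0.4)   # 2 genomes kept unconditionally as parents
print(accuracies[:retain_length])            # [0.81, 0.76]
# each of the remaining 3 still survives with probability random_select=0.1,
# and the open slots are refilled with children from breed()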
#!/usr/bin/env python
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
import toolkit
from toolkit import KerasROOTClassification
def init_model(geneparam):
nb_layers = geneparam['nb_layers']
nb_neurons = geneparam['nb_neurons']
activation = geneparam['activation']
optimizer = geneparam['optimizer']
#lr = network['lr']
#decay = network['decay']
#momentum = network['momentum']
filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
c = KerasROOTClassification("",
signal_trees = [(filename, "GG_oneStep_1545_1265_985_NoSys")],
bkg_trees = [(filename, "ttbar_NoSys"),
(filename, "wjets_Sherpa221_NoSys"),
(filename, "zjets_Sherpa221_NoSys"),
(filename, "diboson_Sherpa221_NoSys"),
(filename, "ttv_NoSys"),
(filename, "singletop_NoSys")
],
dumping_enabled=False,
optimizer=optimizer,
layers=nb_layers,
nodes=nb_neurons,
activation_function=activation,
# optimizer_opts=dict(lr=lr, decay=decay,
# momentum=momentum),
earlystopping_opts=dict(monitor='val_loss',
min_delta=0, patience=2, verbose=0, mode='auto'),
# optimizer="Adam",
selection="lep1Pt<5000", # cut out a few very weird outliers
branches = ["met", "mt"],
weight_expr = "eventWeight*genWeight",
identifiers = ["DatasetNumber", "EventNumber"],
step_bkg = 100)
return c
def train_and_score(geneparam):
model = init_model(geneparam)
model.train(epochs=20)
score = model.score
return score[1] # 1 is accuracy. 0 is loss.
#!/usr/bin/env python
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
import toolkit
from toolkit import KerasROOTClassification
def init_model(geneparam):
nb_layers = geneparam['nb_layers']
nb_neurons = geneparam['nb_neurons']
activation = geneparam['activation']
#optimizer = geneparam['optimizer']
lr = geneparam['lr']
decay = geneparam['decay']
momentum = geneparam['momentum']
filename = "/project/etp4/nhartmann/trees/allTrees_m1.8_NoSys.root"
c = KerasROOTClassification("",
signal_trees = [(filename, "GG_oneStep_1545_1265_985_NoSys")],
bkg_trees = [(filename, "ttbar_NoSys"),
(filename, "wjets_Sherpa221_NoSys"),
(filename, "zjets_Sherpa221_NoSys"),
(filename, "diboson_Sherpa221_NoSys"),
(filename, "ttv_NoSys"),
(filename, "singletop_NoSys")
],
branches = ["jet1Pt", "jet1Phi==-999?0:jet1Phi", "jet1Eta==-999?0:jet1Eta",
"jet2Pt", "jet2Phi==-999?0:jet2Phi", "jet2Eta==-999?0:jet2Eta",
"jet3Pt", "jet3Phi==-999?0:jet3Phi", "jet3Eta==-999?0:jet3Eta",
"jet4Pt", "jet4Phi==-999?0:jet4Phi", "jet4Eta==-999?0:jet4Eta",
"jet5Pt", "jet5Phi==-999?0:jet5Phi", "jet5Eta==-999?0:jet5Eta",
"jet6Pt", "jet6Phi==-999?0:jet6Phi", "jet6Eta==-999?0:jet6Eta",
"jet7Pt", "jet7Phi==-999?0:jet7Phi", "jet7Eta==-999?0:jet7Eta",
"jet8Pt", "jet8Phi==-999?0:jet8Phi", "jet8Eta==-999?0:jet8Eta",
"lep1Pt", "lep1Phi", "lep1Eta", "nJet30",
"met", "met_Phi"],
dumping_enabled=False,
optimizer="SGD",
layers=nb_layers,
nodes=nb_neurons,
activation_function=activation,
optimizer_opts=dict(lr=lr, decay=decay,
momentum=momentum),
earlystopping_opts=dict(monitor='val_loss',
min_delta=0, patience=2, verbose=0, mode='auto'),
selection="lep1Pt<5000", # cut out a few very weird outliers
weight_expr = "eventWeight*genWeight",
identifiers = ["DatasetNumber", "EventNumber"],
step_bkg = 100)
return c
def train_and_score(geneparam):
model = init_model(geneparam)
model.train(epochs=20)
model.evaluate()
score = model.score
return score[1] # 1 is accuracy. 0 is loss.
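For reference, a geneparam dict as the evolver would hand it to train_and_score() above, matching the gene pool defined in main.py (the values are picked for illustration):

geneparam = {
    'nb_layers': 3,
    'nb_neurons': 128,
    'activation': 'relu',
    'lr': 0.01,
    'decay': 1e-4,
    'momentum': 0.9,
}
# accuracy = train_and_score(geneparam)  # would train on the ROOT trees and return score[1]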
@@ -17,8 +17,9 @@ from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
+from keras import backend as K
from keras.models import Sequential
-from keras.layers import Dense
+from keras.layers import Dense, Dropout
from keras.models import model_from_json
from keras.callbacks import History, EarlyStopping
from keras.optimizers import SGD
@@ -44,19 +45,21 @@ class KerasROOTClassification(object):
# Datasets that are stored to (and dynamically loaded from) hdf5
-    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "scores_train", "scores_test"]
+    dataset_names = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test", "pred_train", "pred_test"]
# Datasets that are retrieved from ROOT trees the first time
dataset_names_tree = ["x_train", "x_test", "y_train", "y_test", "w_train", "w_test"]
def __init__(self, name, *args, **kwargs):
self._init_from_args(name, *args, **kwargs)
-        with open(os.path.join(self.project_dir, "options.json"), "w") as of:
-            json.dump(dict(args=args, kwargs=kwargs), of)
+        if self.dumping_enabled:
+            with open(os.path.join(self.project_dir, "options.json"), "w") as of:
+                json.dump(dict(args=args, kwargs=kwargs), of)
def _init_from_args(self, name,
signal_trees, bkg_trees, branches, weight_expr, identifiers,
+                        dumping_enabled=True,
selection=None,
layers=3,
nodes=64,
@@ -72,6 +75,7 @@ class KerasROOTClassification(object):
earlystopping_opts=None):
self.name = name
+        self.dumping_enabled = dumping_enabled
self.signal_trees = signal_trees
self.bkg_trees = bkg_trees
self.branches = branches
@@ -114,8 +118,9 @@ class KerasROOTClassification(object):
self._y_test = None
self._w_train = None
self._w_test = None
-        self._scores_train = None
-        self._scores_test = None
+        self.pred_train = None
+        self.pred_test = None
+        self.score = None
self.s_eventlist_train = None
self.b_eventlist_train = None
@@ -169,8 +174,8 @@ class KerasROOTClassification(object):
branches=self.branches+[self.weight_expr],
selection=self.selection,
start=1, step=self.step_bkg)
-        self._dump_training_list()
+        if self.dumping_enabled:
+            self._dump_training_list()
self.s_eventlist_train = self.s_train[self.identifiers]
self.b_eventlist_train = self.b_train[self.identifiers]
@@ -196,7 +201,8 @@ class KerasROOTClassification(object):
self.y_test[:len(self.s_test)] = 1
self.y_test[len(self.s_test):] = 0
-        self._dump_to_hdf5(*self.dataset_names_tree)
+        if self.dumping_enabled:
+            self._dump_to_hdf5(*self.dataset_names_tree)
self.data_loaded = True
@@ -261,7 +267,8 @@ class KerasROOTClassification(object):
# probably we either want to fit only training data or training and test data together
# logger.info("Fitting StandardScaler to test data")
# self._scaler.fit(self.x_test)
-        joblib.dump(self._scaler, filename)
+        if self.dumping_enabled:
+            joblib.dump(self._scaler, filename)
return self._scaler
@@ -337,6 +344,7 @@ class KerasROOTClassification(object):
# the other hidden layers
for layer_number in range(self.layers-1):
self._model.add(Dense(self.nodes, activation=self.activation_function))
+            self._model.add(Dropout(0.2)) # hard-coded dropout for each layer
# last layer is one neuron (binary classification)
self._model.add(Dense(1, activation='sigmoid'))
logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
@@ -354,8 +362,9 @@ class KerasROOTClassification(object):
logger.info("No weights found, starting completely new model")
# dump to json for documentation
-        with open(os.path.join(self.project_dir, "model.json"), "w") as of:
-            of.write(self._model.to_json())
+        if self.dumping_enabled:
+            with open(os.path.join(self.project_dir, "model.json"), "w") as of:
+                of.write(self._model.to_json())
return self._model
@@ -386,19 +395,20 @@ class KerasROOTClassification(object):
np.random.shuffle(self.y_train)
np.random.set_state(rn_state)
np.random.shuffle(self.w_train)
-        if self._scores_test is not None:
+        if self.pred_test is not None:
            np.random.set_state(rn_state)
-            np.random.shuffle(self._scores_test)
+            np.random.shuffle(self.pred_test)
def train(self, epochs=10):
self.load()
-        for branch_index, branch in enumerate(self.branches):
-            self.plot_input(branch_index)
+        if self.dumping_enabled:
+            for branch_index, branch in enumerate(self.branches):
+                self.plot_input(branch_index)

        self.total_epochs = self._read_info("epochs", 0)
logger.info("Train model")
try:
@@ -418,25 +428,38 @@ class KerasROOTClassification(object):
except KeyboardInterrupt:
logger.info("Interrupt training - continue with rest")
-        logger.info("Save history")
-        self._dump_history()
+        if self.dumping_enabled:
+            logger.info("Save history")
+            self._dump_history()

        logger.info("Save weights")
        self.model.save_weights(os.path.join(self.project_dir, "weights.h5"))

        self.total_epochs += epochs
        self._write_info("epochs", self.total_epochs)
-        logger.info("Create/Update scores for ROC curve")
-        self.scores_test = self.model.predict(self.x_test)
-        self.scores_train = self.model.predict(self.x_train)
+        logger.info("Create/Update predictions for ROC curve")
+        self.pred_test = self.model.predict(self.x_test)
+        self.pred_train = self.model.predict(self.x_train)

-        self._dump_to_hdf5("scores_train", "scores_test")
+        if self.dumping_enabled:
+            self._dump_to_hdf5("pred_train", "pred_test")
def evaluate(self):
-        pass
+        logger.info("Get test loss and metrics of the model")
+        self.score = self.model.evaluate(self.x_test, self.y_test, verbose=0,
+                                         sample_weight=self.w_test)  # weights must come from the test set, not w_train
+        print('Test loss:', self.score[0])
+        print('Test accuracy:', self.score[1])
+        # we do not care about keeping any of this in memory -
+        # we just need to know the final scores and the architecture
+        K.clear_session()
def write_friend_tree(self):
pass
@@ -521,7 +544,7 @@ class KerasROOTClassification(object):
def plot_ROC(self):
logger.info("Plot ROC curve")
-        fpr, tpr, threshold = roc_curve(self.y_test, self.scores_test, sample_weight=self.w_test)
+        fpr, tpr, threshold = roc_curve(self.y_test, self.pred_test, sample_weight=self.w_test)
fpr = 1.0 - fpr
roc_auc = auc(tpr, fpr)
@@ -571,7 +594,10 @@ class KerasROOTClassification(object):
def create_getter(dataset_name):
def getx(self):
if getattr(self, "_"+dataset_name) is None:
-            self._load_from_hdf5(dataset_name)
+            try:
+                self._load_from_hdf5(dataset_name)
+            except KeyError:
+                logger.info("KeyError")
return getattr(self, "_"+dataset_name)
return getx
@@ -580,11 +606,12 @@ def create_setter(dataset_name):
setattr(self, "_"+dataset_name, value)
return setx
'''
# define getters and setters for all datasets
for dataset_name in KerasROOTClassification.dataset_names:
setattr(KerasROOTClassification, dataset_name, property(create_getter(dataset_name),
create_setter(dataset_name)))
'''
if __name__ == "__main__":
......