Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
K
KerasROOTClassification
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Nikolai.Hartmann
KerasROOTClassification
Commits
f14c65cd
Commit
f14c65cd
authored
6 years ago
by
Nikolai
Browse files
Options
Downloads
Plain Diff
Merge branch 'dev-rnn'
parents
c3855055
d92d5fbc
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
toolkit.py
+221
-35
221 additions, 35 deletions
toolkit.py
utils.py
+14
-2
14 additions, 2 deletions
utils.py
with
235 additions
and
37 deletions
toolkit.py
+
221
−
35
View file @
f14c65cd
#!/usr/bin/env python
#!/usr/bin/env python
__all__
=
[
"
ClassificationProject
"
,
"
ClassificationProjectDataFrame
"
]
__all__
=
[
"
ClassificationProject
"
,
"
ClassificationProjectDataFrame
"
,
"
ClassificationProjectRNN
"
]
from
sys
import
version_info
from
sys
import
version_info
...
@@ -31,9 +31,8 @@ import h5py
...
@@ -31,9 +31,8 @@ import h5py
from
sklearn.preprocessing
import
StandardScaler
,
RobustScaler
from
sklearn.preprocessing
import
StandardScaler
,
RobustScaler
from
sklearn.externals
import
joblib
from
sklearn.externals
import
joblib
from
sklearn.metrics
import
roc_curve
,
auc
from
sklearn.metrics
import
roc_curve
,
auc
from
keras.models
import
Sequential
from
keras.models
import
Sequential
,
Model
,
model_from_json
from
keras.layers
import
Dense
,
Dropout
from
keras.layers
import
Dense
,
Dropout
,
Input
,
Masking
,
GRU
,
concatenate
from
keras.models
import
model_from_json
from
keras.callbacks
import
History
,
EarlyStopping
,
CSVLogger
,
ModelCheckpoint
,
TensorBoard
from
keras.callbacks
import
History
,
EarlyStopping
,
CSVLogger
,
ModelCheckpoint
,
TensorBoard
from
keras.optimizers
import
SGD
from
keras.optimizers
import
SGD
import
keras.optimizers
import
keras.optimizers
...
@@ -578,9 +577,12 @@ class ClassificationProject(object):
...
@@ -578,9 +577,12 @@ class ClassificationProject(object):
def
_transform_data
(
self
):
def
_transform_data
(
self
):
if
not
self
.
data_transformed
:
if
not
self
.
data_transformed
:
# todo: what to do about the outliers? Where do they come from?
# todo: what to do about the outliers? Where do they come from?
logger
.
debug
(
"
training data before transformation: {}
"
.
format
(
self
.
x_train
))
if
logger
.
level
<=
logging
.
DEBUG
:
logger
.
debug
(
"
minimum values: {}
"
.
format
([
np
.
min
(
self
.
x_train
[:,
i
])
for
i
in
range
(
self
.
x_train
.
shape
[
1
])]))
logger
.
debug
(
"
training data before transformation: {}
"
.
format
(
self
.
x_train
))
logger
.
debug
(
"
maximum values: {}
"
.
format
([
np
.
max
(
self
.
x_train
[:,
i
])
for
i
in
range
(
self
.
x_train
.
shape
[
1
])]))
logger
.
debug
(
"
minimum values: {}
"
.
format
([
np
.
min
(
self
.
x_train
[:,
i
][
~
np
.
isnan
(
self
.
x_train
[:,
i
])])
for
i
in
range
(
self
.
x_train
.
shape
[
1
])]))
logger
.
debug
(
"
maximum values: {}
"
.
format
([
np
.
max
(
self
.
x_train
[:,
i
][
~
np
.
isnan
(
self
.
x_train
[:,
i
])])
for
i
in
range
(
self
.
x_train
.
shape
[
1
])]))
orig_copy_setting
=
self
.
scaler
.
copy
orig_copy_setting
=
self
.
scaler
.
copy
self
.
scaler
.
copy
=
False
self
.
scaler
.
copy
=
False
self
.
x_train
=
self
.
scaler
.
transform
(
self
.
x_train
)
self
.
x_train
=
self
.
scaler
.
transform
(
self
.
x_train
)
...
@@ -646,37 +648,41 @@ class ClassificationProject(object):
...
@@ -646,37 +648,41 @@ class ClassificationProject(object):
# last layer is one neuron (binary classification)
# last layer is one neuron (binary classification)
self
.
_model
.
add
(
Dense
(
1
,
activation
=
self
.
activation_function_output
))
self
.
_model
.
add
(
Dense
(
1
,
activation
=
self
.
activation_function_output
))
logger
.
info
(
"
Using {}(**{}) as Optimizer
"
.
format
(
self
.
optimizer
,
self
.
optimizer_opts
))
self
.
_compile_or_load_model
()
Optimizer
=
getattr
(
keras
.
optimizers
,
self
.
optimizer
)
optimizer
=
Optimizer
(
**
self
.
optimizer_opts
)
logger
.
info
(
"
Compile model
"
)
rn_state
=
np
.
random
.
get_state
()
np
.
random
.
seed
(
self
.
random_seed
)
self
.
_model
.
compile
(
optimizer
=
optimizer
,
loss
=
self
.
loss
,
weighted_metrics
=
[
'
accuracy
'
]
)
np
.
random
.
set_state
(
rn_state
)
if
os
.
path
.
exists
(
os
.
path
.
join
(
self
.
project_dir
,
"
weights.h5
"
)):
return
self
.
_model
if
self
.
is_training
:
continue_training
=
self
.
query_yn
(
"
Found previously trained weights -
"
"
continue training (choosing N will restart)? (Y/N)
"
)
else
:
continue_training
=
True
if
continue_training
:
self
.
model
.
load_weights
(
os
.
path
.
join
(
self
.
project_dir
,
"
weights.h5
"
))
logger
.
info
(
"
Found and loaded previously trained weights
"
)
else
:
logger
.
info
(
"
Starting completely new model
"
)
else
:
logger
.
info
(
"
No weights found, starting completely new model
"
)
# dump to json for documentation
with
open
(
os
.
path
.
join
(
self
.
project_dir
,
"
model.json
"
),
"
w
"
)
as
of
:
of
.
write
(
self
.
_model
.
to_json
())
return
self
.
_model
def _compile_or_load_model(self):
    """
    Compile ``self._model`` and optionally restore previously trained weights.

    The optimizer class is looked up by name (``self.optimizer``) in
    ``keras.optimizers`` and instantiated with ``self.optimizer_opts``.
    If ``weights.h5`` exists in ``self.project_dir`` the weights are
    loaded - while training the user is asked first whether to continue
    from them. Finally the model architecture is written to
    ``model.json`` in the project directory.
    """
    logger.info("Using {}(**{}) as Optimizer".format(self.optimizer, self.optimizer_opts))
    optimizer_class = getattr(keras.optimizers, self.optimizer)
    optimizer_instance = optimizer_class(**self.optimizer_opts)

    logger.info("Compile model")
    # compile with numpy's global RNG seeded by self.random_seed, then
    # put the previous RNG state back
    saved_rng_state = np.random.get_state()
    np.random.seed(self.random_seed)
    self._model.compile(optimizer=optimizer_instance,
                        loss=self.loss,
                        weighted_metrics=['accuracy'])
    np.random.set_state(saved_rng_state)

    weights_file = os.path.join(self.project_dir, "weights.h5")
    if not os.path.exists(weights_file):
        logger.info("No weights found, starting completely new model")
    else:
        # only ask when we are about to train - otherwise always load
        if self.is_training:
            resume = self.query_yn("Found previously trained weights - "
                                   "continue training (choosing N will restart)? (Y/N) ")
        else:
            resume = True
        if resume:
            self.model.load_weights(weights_file)
            logger.info("Found and loaded previously trained weights")
        else:
            logger.info("Starting completely new model")

    # dump to json for documentation
    model_json_file = os.path.join(self.project_dir, "model.json")
    with open(model_json_file, "w") as of:
        of.write(self._model.to_json())
@property
@property
...
@@ -1348,6 +1354,186 @@ class ClassificationProjectDataFrame(ClassificationProject):
...
@@ -1348,6 +1354,186 @@ class ClassificationProjectDataFrame(ClassificationProject):
self
.
_transform_data
()
self
.
_transform_data
()
class ClassificationProjectRNN(ClassificationProject):

    """
    A little wrapper to use recurrent units for things like jet collections

    The flat input array is split into one 3D input per group of recurrent
    fields (each fed through Masking + GRU) plus one 2D input holding all
    remaining ("flat") fields. The recurrent outputs and the flat input are
    merged and followed by the usual stack of dense layers.
    """

    def __init__(self, name,
                 recurrent_field_names=None,
                 rnn_layer_nodes=32,
                 mask_value=-999,
                 **kwargs):
        """
        :param name: passed on to the ClassificationProject constructor
        :param recurrent_field_names: list of groups, each group being a
            list of equally long lists of field names (one inner list per
            object in the sequence), for example::

                [["jet1Pt", "jet1Eta", "jet1Phi"],
                 ["jet2Pt", "jet2Eta", "jet2Phi"],
                 ["jet3Pt", "jet3Eta", "jet3Phi"]],
                [["lep1Pt", "lep1Eta", "lep1Phi", "lep1flav"],
                 ["lep2Pt", "lep2Eta", "lep2Phi", "lep2flav"]],

        :param rnn_layer_nodes: number of units of each GRU layer
        :param mask_value: value marking missing entries (given to the
            Masking layers and excluded from the scaling)
        :param kwargs: passed on to the ClassificationProject constructor
        :raises ValueError: if the inner lists of a group differ in length
        :raises NotImplementedError: if the scaler type is not
            "WeightedRobustScaler"
        """
        super(ClassificationProjectRNN, self).__init__(name, **kwargs)

        self.recurrent_field_names = recurrent_field_names
        if self.recurrent_field_names is None:
            self.recurrent_field_names = []
        self.rnn_layer_nodes = rnn_layer_nodes
        self.mask_value = mask_value

        # convert the field names to arrays of indices into self.fields
        # (one array of shape (1, n_objects, n_features) per group)
        self.recurrent_field_idx = []
        for field_name_list in self.recurrent_field_names:
            try:
                field_names = np.array([field_name_list])
            except ValueError:
                # NumPy >= 1.24 refuses to create ragged arrays
                field_names = None
            # older NumPy creates an object array for ragged input;
            # compare against the builtin since np.object was removed
            if field_names is None or field_names.dtype == object:
                raise ValueError(
                    "Invalid entry for recurrent fields: {} - "
                    "please ensure that the length for all elements in the list is equal"
                    .format(field_name_list)
                )
            field_idx = (
                np.array([self.fields.index(field_name)
                          for field_name in field_names.reshape(-1)])
                .reshape(field_names.shape)
            )
            self.recurrent_field_idx.append(field_idx)

        # everything that is not part of a recurrent group is a flat field
        recurrent_names = set(
            self.fields[idx]
            for field_idx in self.recurrent_field_idx
            for idx in field_idx.reshape(-1)
        )
        self.flat_fields = [field for field in self.fields
                            if field not in recurrent_names]

        if self.scaler_type != "WeightedRobustScaler":
            raise NotImplementedError(
                "Invalid scaler '{}' - only WeightedRobustScaler is currently supported for RNN"
                .format(self.scaler_type)
            )


    def _transform_data(self):
        """
        Transform x_train/x_test with the parent implementation.

        Entries equal to mask_value are set to NaN before the
        transformation and set back to mask_value afterwards
        (presumably so that the scaler leaves them alone).
        """
        self.x_train[self.x_train == self.mask_value] = np.nan
        self.x_test[self.x_test == self.mask_value] = np.nan
        super(ClassificationProjectRNN, self)._transform_data()
        self.x_train[np.isnan(self.x_train)] = self.mask_value
        self.x_test[np.isnan(self.x_test)] = self.mask_value


    @property
    def model(self):
        "The keras model - built, compiled and cached on first access"
        if self._model is None:
            # following the setup from the tutorial:
            # https://github.com/YaleATLAS/CERNDeepLearningTutorial
            rnn_inputs = []
            rnn_channels = []
            for field_idx in self.recurrent_field_idx:
                chan_inp = Input(field_idx.shape[1:])
                channel = Masking(mask_value=self.mask_value)(chan_inp)
                channel = GRU(self.rnn_layer_nodes)(channel)
                # TODO: configure dropout for recurrent layers
                #channel = Dropout(0.3)(channel)
                rnn_inputs.append(chan_inp)
                rnn_channels.append(channel)
            flat_input = Input((len(self.flat_fields),))
            if self.dropout_input is not None:
                flat_channel = Dropout(rate=self.dropout_input)(flat_input)
            else:
                flat_channel = flat_input
            channels = rnn_channels+[flat_channel]
            if len(channels) > 1:
                combined = concatenate(channels)
            else:
                # no recurrent fields configured - there is nothing to
                # merge and keras' merge layers may reject a single input
                combined = channels[0]
            for node_count, dropout_fraction in zip(self.nodes, self.dropout):
                combined = Dense(node_count, activation=self.activation_function)(combined)
                if (dropout_fraction is not None) and (dropout_fraction > 0):
                    combined = Dropout(rate=dropout_fraction)(combined)
            # last layer is one neuron (binary classification)
            combined = Dense(1, activation=self.activation_function_output)(combined)
            self._model = Model(inputs=rnn_inputs+[flat_input], outputs=combined)
            self._compile_or_load_model()
        return self._model


    def train(self, epochs=10):
        """
        Train the model on generated batches and dump the history.

        A KeyboardInterrupt stops the training, but the history is
        still saved afterwards.

        :param epochs: number of epochs passed to fit_generator
        """
        self.load()

        for branch_index in range(len(self.fields)):
            self.plot_input(branch_index)

        try:
            self.shuffle_training_data() # needed here too, in order to get correct validation data
            self.is_training = True
            logger.info("Training on batches for RNN")
            # one epoch is one pass over a permutation in yield_batch,
            # including the last (possibly smaller) batch
            steps_per_epoch = int(np.ceil(len(self.training_data[0])/float(self.batch_size)))
            # note: the batches have class_weight already applied
            self.model.fit_generator(self.yield_batch(),
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=epochs,
                                     validation_data=self.class_weighted_validation_data,
                                     callbacks=self.callbacks_list)
        except KeyboardInterrupt:
            logger.info("Interrupt training - continue with rest")
        finally:
            # also reset the flag when training got interrupted
            self.is_training = False

        logger.info("Save history")
        self._dump_history()


    def get_input_list(self, x):
        """
        Format the input starting from flat ntuple

        :param x: 2D array with one column per entry of self.fields
        :returns: list with one array of shape
            (n_events, n_objects, n_features) per recurrent group,
            followed by the 2D array of the flat fields
        """
        x_input = []
        for field_idx in self.recurrent_field_idx:
            x_recurrent = x[:,field_idx.reshape(-1)].reshape(-1, *field_idx.shape[1:])
            x_input.append(x_recurrent)
        x_flat = x[:,[self.fields.index(field_name) for field_name in self.flat_fields]]
        x_input.append(x_flat)
        return x_input


    def yield_batch(self):
        """
        Endlessly generate (inputs, targets, weights) training batches.

        The training data is traversed in a new random order for each
        pass. The weights are already multiplied by the class weight of
        the corresponding target.
        """
        x_train, y_train, w_train = self.training_data
        batch_size = int(self.batch_size)
        class_weight = np.array(self.class_weight)
        while True:
            shuffled_idx = np.random.permutation(len(x_train))
            for start in range(0, len(shuffled_idx), batch_size):
                batch_idx = shuffled_idx[start:start+batch_size]
                x_batch = x_train[batch_idx]
                y_batch = y_train[batch_idx]
                w_batch = w_train[batch_idx]
                x_input = self.get_input_list(x_batch)
                yield (x_input,
                       y_batch,
                       w_batch*class_weight[y_batch.astype(int)])


    @property
    def class_weighted_validation_data(self):
        "class weighted validation data. Attention: Shuffle training data before using this!"
        x_val, y_val, w_val = super(ClassificationProjectRNN, self).class_weighted_validation_data
        x_val_input = self.get_input_list(x_val)
        return x_val_input, y_val, w_val


    def evaluate_train_test(self, do_train=True, do_test=True, batch_size=10000):
        """
        Calculate the scores for the training and/or test sample.

        The data is reloaded first. The scores are stored in
        scores_train/scores_test and dumped via _dump_to_hdf5.

        :param do_train: evaluate the training sample
        :param do_test: evaluate the test sample
        :param batch_size: number of events per predict call
        """
        logger.info("Reloading (and re-transforming) unshuffled training data")
        self.load(reload=True)

        def eval_score(data_name):
            logger.info("Create/Update scores for {} sample".format(data_name))
            x_data = getattr(self, "x_"+data_name)
            n_events = len(x_data)
            scores = np.empty(n_events)
            setattr(self, "scores_"+data_name, scores)
            # predict in chunks to limit the size of the input lists
            for start in range(0, n_events, batch_size):
                stop = start+batch_size
                scores[start:stop] = self.model.predict(self.get_input_list(x_data[start:stop])).reshape(-1)
            self._dump_to_hdf5("scores_"+data_name)

        if do_test:
            eval_score("test")
        if do_train:
            eval_score("train")


    def evaluate(self, x_eval):
        """
        Return the model output for untransformed input.

        :param x_eval: array-like with one column per entry of
            self.fields; it is copied, so the argument stays unchanged
        """
        logger.debug("Evaluate score for {}".format(x_eval))
        x_eval = np.array(x_eval) # copy
        # keep masked entries out of the scaling
        x_eval[x_eval==self.mask_value] = np.nan
        x_eval = self.scaler.transform(x_eval)
        x_eval[np.isnan(x_eval)] = self.mask_value
        logger.debug("Evaluate for transformed array: {}".format(x_eval))
        return self.model.predict(self.get_input_list(x_eval))
if
__name__
==
"
__main__
"
:
if
__name__
==
"
__main__
"
:
logging
.
basicConfig
()
logging
.
basicConfig
()
...
...
This diff is collapsed.
Click to expand it.
utils.py
+
14
−
2
View file @
f14c65cd
...
@@ -134,13 +134,25 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
...
@@ -134,13 +134,25 @@ def weighted_quantile(values, quantiles, sample_weight=None, values_sorted=False
class
WeightedRobustScaler
(
RobustScaler
):
class
WeightedRobustScaler
(
RobustScaler
):
def fit(self, X, y=None, weights=None):
    """
    Fit center and scale, optionally from weighted quantiles.

    :param X: 2D array, one column per feature. NaN entries are left
        out of the weighted quantile calculation.
    :param y: only passed on to RobustScaler.fit
    :param weights: optional per-row weights. If given, ``center_`` is
        set to the weighted median and ``scale_`` to the weighted
        interquartile range (0.75 minus 0.25 quantile) of each column.
    :returns: self
    """
    if not np.isnan(X).any():
        # these checks don't work for nan values
        super(WeightedRobustScaler, self).fit(X, y)
    if weights is None:
        # NOTE(review): if X contains nan and no weights are given
        # nothing is fitted here - confirm that this is intended
        return self
    weights = np.asarray(weights)
    wqs = []
    for i in range(X.shape[1]):
        column = X[:,i]
        valid = ~np.isnan(column)
        # drop the weights of the nan rows as well, so values and
        # weights stay aligned
        wqs.append(weighted_quantile(column[valid], [0.25, 0.5, 0.75],
                                     sample_weight=weights[valid]))
    wqs = np.array(wqs)
    self.center_ = wqs[:,1]
    self.scale_ = wqs[:,2]-wqs[:,0]
    self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
    return self
def transform(self, X):
    """
    Center and scale X, leaving NaN entries untouched.

    If X contains NaN the calculation is done directly with
    ``center_`` and ``scale_``; otherwise RobustScaler.transform is
    used.

    :param X: 2D array, one column per feature
    :returns: the transformed array. In the NaN case this is X itself
        (modified in place) unless the ``copy`` attribute of the
        scaler is true.
    """
    if np.isnan(X).any():
        # we'd like to ignore nan values, so lets calculate without further checks
        if getattr(self, "copy", False):
            # honour the copy setting like the non-nan branch does
            X = np.array(X)
        if getattr(self, "with_centering", True):
            X -= self.center_
        if getattr(self, "with_scaling", True):
            X /= self.scale_
        return X
    else:
        return super(WeightedRobustScaler, self).transform(X)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment