From 04e548a0e05625d22542c8e9b930c38b21cd4a31 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Wed, 8 Jun 2022 13:45:06 +0200
Subject: [PATCH 01/10] Fix name in field access

---
 enstools/feature/identification/threshold/identification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 5644de6..b707f49 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -38,8 +38,8 @@ class DoubleThresholdIdentification(IdentificationStrategy):
 
     def precompute(self, dataset: xr.Dataset, **kwargs):
         assert self.field in dataset
-        dataset['inner_thresh_mask'] = xr.zeros_like(dataset['E'], dtype=int)
-        dataset['outer_thresh_mask'] = xr.zeros_like(dataset['E'], dtype=int)
+        dataset['inner_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
+        dataset['outer_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
 
         return dataset
 
-- 
GitLab


From 709aaf4478adfd27f33c1b247590fddf66d960d3 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Wed, 8 Jun 2022 13:45:48 +0200
Subject: [PATCH 02/10] Fix inner/outer mask usage

---
 .../identification/threshold/identification.py     | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index b707f49..8a541c4 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -49,9 +49,10 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         if self.processing_mode == "2d":
             field = field.reshape((1,) + field.shape)
 
-        # TODO give operator to routine
-        inner_thresh_mask = np.ma.masked_where(self.compare(field, self.inner_threshold),field).mask
-        outer_thresh_mask = np.ma.masked_where(self.compare(field, self.outer_threshold),field).mask
+        _is_inner = self.compare(field, self.inner_threshold)
+        _is_outer = self.compare(field, self.outer_threshold)
+        inner_thresh_mask = np.ma.getmaskarray(np.ma.masked_where(_is_inner, field))
+        outer_thresh_mask = np.ma.getmaskarray(np.ma.masked_where(_is_outer, field))
         ids, outer_features, inner_features = detection_double_thresh(
             feature_id=0,
             inner_thresh_mask=inner_thresh_mask,
@@ -87,11 +88,14 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             mask_inner[inner_features == id_] = 1
             if self.processing_mode == "2d":
                 mask = mask.reshape(mask.shape[1:])
+                mask_outer = mask_outer.reshape(mask_outer.shape[1:])
+                mask_inner = mask_inner.reshape(mask_inner.shape[1:])
 
             mask_to_proto(mask, properties.mask, compress=self.compress)
             obj_list.append(obj)
-        dataset["inner_thresh_mask"].values = dataset["inner_thresh_mask"].values + mask_inner
-        dataset["outer_thresh_mask"].values = dataset["outer_thresh_mask"].values + mask_outer
+
+            dataset["inner_thresh_mask"].values = dataset["inner_thresh_mask"].values + mask_inner
+            dataset["outer_thresh_mask"].values = dataset["outer_thresh_mask"].values + mask_outer
 
         # return the dataset (can be changed here), and the list of objects
         return dataset, obj_list
-- 
GitLab


From 6b490699e8b9338bc85dd2aead9bb12f6d66d7bd Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Wed, 8 Jun 2022 18:40:49 +0200
Subject: [PATCH 03/10] Add internal documentation

---
 .../threshold/identification.py               | 71 +++++++++++--------
 .../identification/threshold/processing.py    |  7 ++
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 8a541c4..476845b 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -13,42 +13,62 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         self.field = field
         self.outer_threshold = outer_threshold
         self.inner_threshold = inner_threshold
-        # Save comparison operator name and get associated comparison function
-        self.comparison_operator = comparison_operator
+        # Save normalized comparison operator name and store associated
+        # comparison function
         if comparison_operator in { ">=", "ge" }:
+            self.comparison_operator = ">="
             self.compare = operator.ge
         elif comparison_operator in { ">", "gt" }:
+            self.comparison_operator = ">"
             self.compare = operator.gt
         elif comparison_operator in { "<=", "le" }:
+            self.comparison_operator = "<="
             self.compare = operator.le
         elif comparison_operator in { "<", "lt" }:
+            self.comparison_operator = "<"
             self.compare = operator.lt
         else:
             raise ValueError(f"unknown comparison operator '{comparison_operator}'")
         # Check that thresholds are properly sorted for the given comparison
         # operator. Always allow equality to make single threshold
         # identification possible.
-        assert self.inner_threshold == self.outer_threshold or self.compare(self.inner_threshold, self.outer_threshold)
+        assert (self.inner_threshold == self.outer_threshold
+                or self.compare(self.inner_threshold, self.outer_threshold)), \
+                "threshold values incompatible with ordering, check inner and outer values"
         # Compression of object mask array in protobuf objects
         self.compress = compress
         # Specify processing mode 2d or 3d: In 2d, identification will be performed on 2d levels,
         #   in 3d the identification will be performed per 3d block
-        assert processing_mode in { "2d", "3d" }
+        assert processing_mode in { "2d", "3d" }, "choose processing mode from '2d' or '3d'"
         self.processing_mode = processing_mode
 
+    def __repr__(self):
+        return "\n".join([
+            "DoubleThresholdIdentification(",
+            f"    field='{self.field}',",
+            f"    outer_threshold={self.outer_threshold},",
+            f"    inner_threshold={self.inner_threshold},",
+            f"    comparison_operator='{self.comparison_operator}',",
+            f"    processing_mode='{self.processing_mode}',",
+            f"    compress=" + ("True" if self.compress else "False"),
+            ")"
+        ])
+
     def precompute(self, dataset: xr.Dataset, **kwargs):
-        assert self.field in dataset
+        assert self.field in dataset, f"required field '{self.field}' does not exist"
         dataset['inner_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
         dataset['outer_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
-
         return dataset
 
     def identify(self, dataset: xr.Dataset, **kwargs):
+        # Field used for detection
         field = dataset[self.field].values
-
+        # Handle 2D processing as special case of 3D processing
         if self.processing_mode == "2d":
             field = field.reshape((1,) + field.shape)
-
+        # Create masks for outer and inner fields based on the given comparison
+        # operator and threshold values and pass to the feature extraction
+        # routine
         _is_inner = self.compare(field, self.inner_threshold)
         _is_outer = self.compare(field, self.outer_threshold)
         inner_thresh_mask = np.ma.getmaskarray(np.ma.masked_where(_is_inner, field))
@@ -59,26 +79,23 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             outer_thresh_mask=outer_thresh_mask,
             entire_globe=True
         )
-
-        # Let's say you detected 5 objects:
+        # Process each feature
         obj_list = []
-
         for id_ in ids:
             # Feature 0 is the "no-feature feature"
             if id_ == 0: continue
-
-            # get an instance of a new object, can pass an ID or set in manually afterwards
+            # Create a new protobuf object to encapsulate the feature data
             obj = self.get_new_object()
-            # set some ID to it
-            obj.id = id_ # TODO Needs offset? What happens when executed in parallel?
-
-            # get properties of object and populate them (like defined in template.proto)
+            # TODO use an offset or some other method to avoid duplicate ids
+            # TODO when executed in parallel
+            obj.id = id_
+            # Store attributes of the feature specific to the detection
+            # technique in the object properties
             properties = obj.properties
             properties.outer_threshold = self.outer_threshold
-
             properties.inner_threshold = self.inner_threshold
-            properties.comparison_operator = ">" # TODO
-
+            properties.comparison_operator = self.comparison_operator
+            # Create feature-specfic mask
             mask_outer = np.zeros_like(field, dtype=np.int8)
             mask_inner = np.zeros_like(field, dtype=np.int8)
             mask = np.zeros_like(field, dtype=np.int8)
@@ -90,22 +107,18 @@ class DoubleThresholdIdentification(IdentificationStrategy):
                 mask = mask.reshape(mask.shape[1:])
                 mask_outer = mask_outer.reshape(mask_outer.shape[1:])
                 mask_inner = mask_inner.reshape(mask_inner.shape[1:])
-
+            # Store feature-specific mask in object properties
             mask_to_proto(mask, properties.mask, compress=self.compress)
+            # Collect protobuf feature representations in output object list
             obj_list.append(obj)
-
+            # Add the features to the full masks of the dataset
             dataset["inner_thresh_mask"].values = dataset["inner_thresh_mask"].values + mask_inner
             dataset["outer_thresh_mask"].values = dataset["outer_thresh_mask"].values + mask_outer
-
-        # return the dataset (can be changed here), and the list of objects
+        # Return the updated dataset and the list of objects
         return dataset, obj_list
 
     def postprocess(self, dataset: xr.Dataset, pb2_desc, **kwargs):
-        # Here post process: called once after processing the whole dataset.
-        # If needed, the dataset or objects can be changed here.
-
-        # objects are already in finished protobuf description, cf. the JSON output.
-        # pb2_desc.sets is list of trackable sets, each one contains the timesteps list, where each entry is a valid time and contains a list of objects.
+        # Nothing to do here for now
         return dataset, pb2_desc
 
 
diff --git a/enstools/feature/identification/threshold/processing.py b/enstools/feature/identification/threshold/processing.py
index b4e18bd..c8f68eb 100644
--- a/enstools/feature/identification/threshold/processing.py
+++ b/enstools/feature/identification/threshold/processing.py
@@ -3,6 +3,11 @@ import numpy as np
 from numba import njit
 
 
+# Protobuf does not have a native multidimensional array data type but to store
+# individual feature masks we need to serialize the int8 numpy array used for
+# the masks. A MaskArray protobuf type is defined in threshold.proto and can be
+# written and read with these routines:
+
 def mask_to_proto(arr, proto_obj, compress=False):
     proto_obj.shape.extend(arr.shape)
     data = np.require(arr, dtype=np.int8).tobytes()
@@ -18,6 +23,8 @@ def proto_to_mask(proto_obj):
     return np.frombuffer(data, dtype=np.int8).reshape(proto_obj.shape)
 
 
+# Feature extration
+
 @njit
 def detection_double_thresh(feature_id, inner_thresh_mask, outer_thresh_mask, entire_globe):
 
-- 
GitLab


From 276426e302e0de8bdfb534819d9220450e4fe5a3 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Wed, 8 Jun 2022 16:11:15 +0200
Subject: [PATCH 04/10] Add documentation to threshold-based methods

---
 .../identification/threshold/__init__.py      |  2 +
 .../threshold/identification.py               | 52 +++++++++++++++++++
 .../identification/threshold/processing.py    | 22 ++++++++
 3 files changed, 76 insertions(+)

diff --git a/enstools/feature/identification/threshold/__init__.py b/enstools/feature/identification/threshold/__init__.py
index 14311c2..37e9ac1 100644
--- a/enstools/feature/identification/threshold/__init__.py
+++ b/enstools/feature/identification/threshold/__init__.py
@@ -1 +1,3 @@
 from .identification import  DoubleThresholdIdentification,SingleThresholdIdentification
+from .processing import proto_to_mask
+
diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 476845b..4a6fb49 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -7,6 +7,35 @@ import xarray as xr
 
 
 class DoubleThresholdIdentification(IdentificationStrategy):
+    """Identification technique based on two fixed thresholds.
+
+    Parameters
+    ----------
+    field : str
+        Name in the input dataset of the field in which to detect features.
+    outer_threshold : number
+        Value of the outer detection threshold.
+    inner_threshold : number
+        Value of the inner detection threshold.
+    comparison_operator : ">=" | ">" | "<" | "<="
+        Comparison operator used to determine which regions are inside of the
+        thresholds. Use ">=" or ">" to obtain features that lie above (i.e.
+        exceed) the thresholds and "<=" and "<" to obtain features that lie
+        below.
+    processing_mode : "2d" | "3d"
+        Detect features on individual vertical levels ("2d") or across vertical
+        levels ("3d").
+    compress : boolean
+        Toggle zlib compression of feature object masks in protobuf output.
+        Highly recommended to reduce the output size.
+
+
+    A feature must have at least one inner region. A feature may contain
+    multiple inner regions if they lie inside the same outer region.
+
+    Additional fields added to the input dataset during processing:
+    `inner_thresh_mask`, `outer_thresh_mask`.
+    """
 
     def __init__(self, field, outer_threshold, inner_threshold, comparison_operator,
                  processing_mode="2d", compress=True, **kwargs):
@@ -55,12 +84,14 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         ])
 
     def precompute(self, dataset: xr.Dataset, **kwargs):
+        """""" # Should not be called directly by users, hide in docs
         assert self.field in dataset, f"required field '{self.field}' does not exist"
         dataset['inner_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
         dataset['outer_thresh_mask'] = xr.zeros_like(dataset[self.field], dtype=int)
         return dataset
 
     def identify(self, dataset: xr.Dataset, **kwargs):
+        """""" # Should not be called directly by users, hide in docs
         # Field used for detection
         field = dataset[self.field].values
         # Handle 2D processing as special case of 3D processing
@@ -118,11 +149,32 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         return dataset, obj_list
 
     def postprocess(self, dataset: xr.Dataset, pb2_desc, **kwargs):
+        """""" # Should not be called directly by users, hide in docs
         # Nothing to do here for now
         return dataset, pb2_desc
 
 
 class SingleThresholdIdentification(DoubleThresholdIdentification):
+    """Identification technique based on a fixed threshold.
+
+    Parameters
+    ----------
+    field : str
+        Name in the input dataset of the field in which to detect features.
+    threshold : number
+        Value of the detection threshold.
+    comparison_operator : ">=" | ">" | "<" | "<="
+        See :py:class:`DoubleThresholdIdentification`.
+    *args
+        Given to :py:class:`DoubleThresholdIdentification`.
+    **kwargs
+        Given to :py:class:`DoubleThresholdIdentification`.
+
+
+    Provided for convenience, :py:class:`SingleThresholdIdentification` objects
+    are :py:class:`DoubleThresholdIdentification` instances configured to use
+    the same inner and outer threshold.
+    """
 
     def __init__(self, field, threshold, comparison_operator, *args, **kwargs):
         super().__init__(field, threshold, threshold, comparison_operator, *args, **kwargs)
diff --git a/enstools/feature/identification/threshold/processing.py b/enstools/feature/identification/threshold/processing.py
index c8f68eb..bd63026 100644
--- a/enstools/feature/identification/threshold/processing.py
+++ b/enstools/feature/identification/threshold/processing.py
@@ -16,7 +16,29 @@ def mask_to_proto(arr, proto_obj, compress=False):
     proto_obj.data = data
     proto_obj.zlib = compress
 
+
 def proto_to_mask(proto_obj):
+    """Convert a protobuf-serialized object mask to a numpy array.
+
+    Parameters
+    ----------
+    proto_obj
+        Serialized object mask.
+
+    Returns
+    -------
+    array of int8
+
+
+    A feature obtained with :py:class:`DoubleThresholdIdentification` carries
+    an object mask serialized to the protobuf format used for the detection
+    output. This function converts the serialized output to a numpy array,
+    which is 0-valued outside of the feature, 1-valued inside the region
+    between the outer an inner threshold, 2-valued inside the inner threshold.
+
+    >>> from enstools.feature.identification.threshold import proto_to_mask
+    >>> arr = proto_to_mask(obj.properties.mask)
+    """
     data = proto_obj.data
     if proto_obj.zlib:
         data = zlib.decompress(data)
-- 
GitLab


From c4a005f0d42f449991bfdb8024912bd1e3bda41a Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 10:40:32 +0200
Subject: [PATCH 05/10] Remove processing_mode argument (is handled by
 pipeline)

---
 .../identification/threshold/identification.py        | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 4a6fb49..a4706ea 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -22,9 +22,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         thresholds. Use ">=" or ">" to obtain features that lie above (i.e.
         exceed) the thresholds and "<=" and "<" to obtain features that lie
         below.
-    processing_mode : "2d" | "3d"
-        Detect features on individual vertical levels ("2d") or across vertical
-        levels ("3d").
     compress : boolean
         Toggle zlib compression of feature object masks in protobuf output.
         Highly recommended to reduce the output size.
@@ -37,8 +34,7 @@ class DoubleThresholdIdentification(IdentificationStrategy):
     `inner_thresh_mask`, `outer_thresh_mask`.
     """
 
-    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator,
-                 processing_mode="2d", compress=True, **kwargs):
+    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator, compress=True):
         self.field = field
         self.outer_threshold = outer_threshold
         self.inner_threshold = inner_threshold
@@ -66,10 +62,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
                 "threshold values incompatible with ordering, check inner and outer values"
         # Compression of object mask array in protobuf objects
         self.compress = compress
-        # Specify processing mode 2d or 3d: In 2d, identification will be performed on 2d levels,
-        #   in 3d the identification will be performed per 3d block
-        assert processing_mode in { "2d", "3d" }, "choose processing mode from '2d' or '3d'"
-        self.processing_mode = processing_mode
 
     def __repr__(self):
         return "\n".join([
@@ -78,7 +70,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             f"    outer_threshold={self.outer_threshold},",
             f"    inner_threshold={self.inner_threshold},",
             f"    comparison_operator='{self.comparison_operator}',",
-            f"    processing_mode='{self.processing_mode}',",
             f"    compress=" + ("True" if self.compress else "False"),
             ")"
         ])
-- 
GitLab


From 8041f6dc9df714ce026cf043b94a8254f4a2545c Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 10:47:51 +0200
Subject: [PATCH 06/10] Remove individual object masks stored in protobuf

---
 .../identification/threshold/__init__.py      |  3 +-
 .../threshold/identification.py               | 18 ++------
 .../identification/threshold/processing.py    | 43 -------------------
 .../identification/threshold/threshold.proto  | 13 ++----
 4 files changed, 7 insertions(+), 70 deletions(-)

diff --git a/enstools/feature/identification/threshold/__init__.py b/enstools/feature/identification/threshold/__init__.py
index 37e9ac1..11af4de 100644
--- a/enstools/feature/identification/threshold/__init__.py
+++ b/enstools/feature/identification/threshold/__init__.py
@@ -1,3 +1,2 @@
-from .identification import  DoubleThresholdIdentification,SingleThresholdIdentification
-from .processing import proto_to_mask
+from .identification import DoubleThresholdIdentification, SingleThresholdIdentification
 
diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index a4706ea..0aa8c85 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -1,5 +1,5 @@
 from ..identification import IdentificationStrategy
-from .processing import mask_to_proto, detection_double_thresh
+from .processing import detection_double_thresh
 
 import operator
 import numpy as np
@@ -22,9 +22,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         thresholds. Use ">=" or ">" to obtain features that lie above (i.e.
         exceed) the thresholds and "<=" and "<" to obtain features that lie
         below.
-    compress : boolean
-        Toggle zlib compression of feature object masks in protobuf output.
-        Highly recommended to reduce the output size.
 
 
     A feature must have at least one inner region. A feature may contain
@@ -34,7 +31,7 @@ class DoubleThresholdIdentification(IdentificationStrategy):
     `inner_thresh_mask`, `outer_thresh_mask`.
     """
 
-    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator, compress=True):
+    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator):
         self.field = field
         self.outer_threshold = outer_threshold
         self.inner_threshold = inner_threshold
@@ -60,8 +57,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         assert (self.inner_threshold == self.outer_threshold
                 or self.compare(self.inner_threshold, self.outer_threshold)), \
                 "threshold values incompatible with ordering, check inner and outer values"
-        # Compression of object mask array in protobuf objects
-        self.compress = compress
 
     def __repr__(self):
         return "\n".join([
@@ -69,8 +64,7 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             f"    field='{self.field}',",
             f"    outer_threshold={self.outer_threshold},",
             f"    inner_threshold={self.inner_threshold},",
-            f"    comparison_operator='{self.comparison_operator}',",
-            f"    compress=" + ("True" if self.compress else "False"),
+            f"    comparison_operator='{self.comparison_operator}'",
             ")"
         ])
 
@@ -120,17 +114,11 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             # Create feature-specfic mask
             mask_outer = np.zeros_like(field, dtype=np.int8)
             mask_inner = np.zeros_like(field, dtype=np.int8)
-            mask = np.zeros_like(field, dtype=np.int8)
-            mask[outer_features == id_] = 1
-            mask[inner_features == id_] = 2
             mask_outer[outer_features == id_] = 1
             mask_inner[inner_features == id_] = 1
             if self.processing_mode == "2d":
-                mask = mask.reshape(mask.shape[1:])
                 mask_outer = mask_outer.reshape(mask_outer.shape[1:])
                 mask_inner = mask_inner.reshape(mask_inner.shape[1:])
-            # Store feature-specific mask in object properties
-            mask_to_proto(mask, properties.mask, compress=self.compress)
             # Collect protobuf feature representations in output object list
             obj_list.append(obj)
             # Add the features to the full masks of the dataset
diff --git a/enstools/feature/identification/threshold/processing.py b/enstools/feature/identification/threshold/processing.py
index bd63026..75d0f46 100644
--- a/enstools/feature/identification/threshold/processing.py
+++ b/enstools/feature/identification/threshold/processing.py
@@ -1,50 +1,7 @@
-import zlib
 import numpy as np
 from numba import njit
 
 
-# Protobuf does not have a native multidimensional array data type but to store
-# individual feature masks we need to serialize the int8 numpy array used for
-# the masks. A MaskArray protobuf type is defined in threshold.proto and can be
-# written and read with these routines:
-
-def mask_to_proto(arr, proto_obj, compress=False):
-    proto_obj.shape.extend(arr.shape)
-    data = np.require(arr, dtype=np.int8).tobytes()
-    if compress:
-        data = zlib.compress(data)
-    proto_obj.data = data
-    proto_obj.zlib = compress
-
-
-def proto_to_mask(proto_obj):
-    """Convert a protobuf-serialized object mask to a numpy array.
-
-    Parameters
-    ----------
-    proto_obj
-        Serialized object mask.
-
-    Returns
-    -------
-    array of int8
-
-
-    A feature obtained with :py:class:`DoubleThresholdIdentification` carries
-    an object mask serialized to the protobuf format used for the detection
-    output. This function converts the serialized output to a numpy array,
-    which is 0-valued outside of the feature, 1-valued inside the region
-    between the outer an inner threshold, 2-valued inside the inner threshold.
-
-    >>> from enstools.feature.identification.threshold import proto_to_mask
-    >>> arr = proto_to_mask(obj.properties.mask)
-    """
-    data = proto_obj.data
-    if proto_obj.zlib:
-        data = zlib.decompress(data)
-    return np.frombuffer(data, dtype=np.int8).reshape(proto_obj.shape)
-
-
 # Feature extration
 
 @njit
diff --git a/enstools/feature/identification/threshold/threshold.proto b/enstools/feature/identification/threshold/threshold.proto
index 56ae52f..3c3ec83 100644
--- a/enstools/feature/identification/threshold/threshold.proto
+++ b/enstools/feature/identification/threshold/threshold.proto
@@ -4,16 +4,9 @@ syntax = "proto2";
  * The "Properties" message is required and contains all the properties of an object.
  */
 
-message MaskArray {
-    repeated int32 shape = 1;
-    required bytes data = 2;
-    required bool zlib = 3;
-}
-
 message Properties {
-    required MaskArray mask = 1;
-    required float outer_threshold = 2;
-    required float inner_threshold = 3;
-    required string comparison_operator = 4;
+    required float outer_threshold = 1;
+    required float inner_threshold = 2;
+    required string comparison_operator = 3;
 }
 
-- 
GitLab


From a0f03ccc3fa04d8bb259cbec4518f78a323e33d7 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 10:55:23 +0200
Subject: [PATCH 07/10] Set proper object ids in inner_thresh_mask and
 outer_thresh_mask

---
 .../threshold/identification.py               | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 0aa8c85..6d1ff14 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -111,19 +111,18 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             properties.outer_threshold = self.outer_threshold
             properties.inner_threshold = self.inner_threshold
             properties.comparison_operator = self.comparison_operator
-            # Create feature-specfic mask
-            mask_outer = np.zeros_like(field, dtype=np.int8)
-            mask_inner = np.zeros_like(field, dtype=np.int8)
-            mask_outer[outer_features == id_] = 1
-            mask_inner[inner_features == id_] = 1
-            if self.processing_mode == "2d":
-                mask_outer = mask_outer.reshape(mask_outer.shape[1:])
-                mask_inner = mask_inner.reshape(mask_inner.shape[1:])
             # Collect protobuf feature representations in output object list
             obj_list.append(obj)
+            # Create feature-specfic mask
+            inner_mask = inner_features == id_
+            outer_mask = outer_features == id_
+            if self.processing_mode == "2d":
+                outer_mask = outer_mask.reshape(outer_mask.shape[1:])
+                inner_mask = inner_mask.reshape(inner_mask.shape[1:])
             # Add the features to the full masks of the dataset
-            dataset["inner_thresh_mask"].values = dataset["inner_thresh_mask"].values + mask_inner
-            dataset["outer_thresh_mask"].values = dataset["outer_thresh_mask"].values + mask_outer
+            dataset["inner_thresh_mask"].values[inner_mask] = id_
+            dataset["outer_thresh_mask"].values[outer_mask] = id_
+
         # Return the updated dataset and the list of objects
         return dataset, obj_list
 
-- 
GitLab


From 478ad692e7177e802863e374b2b003e1b1853ff3 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 12:57:20 +0200
Subject: [PATCH 08/10] Generate unique ids in postprocessing

---
 .../threshold/identification.py               | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index 6d1ff14..f3d2e55 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -1,3 +1,6 @@
+from enstools.misc import get_time_dim, get_ensemble_dim # TODO relative?
+from enstools.feature.util.enstools_utils import get_vertical_dim, get_init_time_dim
+from enstools.feature.util.data_utils import SplitDimension
 from ..identification import IdentificationStrategy
 from .processing import detection_double_thresh
 
@@ -102,8 +105,6 @@ class DoubleThresholdIdentification(IdentificationStrategy):
             if id_ == 0: continue
             # Create a new protobuf object to encapsulate the feature data
             obj = self.get_new_object()
-            # TODO use an offset or some other method to avoid duplicate ids
-            # TODO when executed in parallel
             obj.id = id_
             # Store attributes of the feature specific to the detection
             # technique in the object properties
@@ -128,7 +129,37 @@ class DoubleThresholdIdentification(IdentificationStrategy):
 
     def postprocess(self, dataset: xr.Dataset, pb2_desc, **kwargs):
         """""" # Should not be called directly by users, hide in docs
-        # Nothing to do here for now
+        dims_map = {}
+        if (ens_dim := get_ensemble_dim(dataset)) is not None:
+            dims_map[ens_dim] = SplitDimension.SplitDimensionDim.ENSEMBLE_MEMBER
+        if (init_dim := get_init_time_dim(dataset)) is not None:
+            dims_map[init_dim] = SplitDimension.SplitDimensionDim.INIT_TIME
+        if self.processing_mode == "2d" and (level_dim := get_vertical_dim(dataset)) is not None:
+            dims_map[level_dim] = SplitDimension.SplitDimensionDim.LEVEL
+            
+        time_dim = get_time_dim(dataset)
+
+        offset = 0
+
+        for obj_set in pb2_desc.sets:
+            selection = { dim: getattr(obj_set, pb_name) for dim, pb_name in dims_map.items() }
+            
+            for timestep in obj_set.timesteps:
+                new_offset = offset
+            
+                selection[time_dim] = getattr(timestep, SplitDimension.SplitDimensionDim.VALID_TIME)
+                field = dataset.sel(selection)
+                
+                field["inner_thresh_mask"] += offset
+                field["outer_thresh_mask"] += offset
+                
+                for obj in timestep.objects:
+                    obj.id = obj.id + offset
+                    if obj.id > new_offset:
+                        new_offset = obj.id
+            
+                offset = new_offset
+
         return dataset, pb2_desc
 
 
-- 
GitLab


From 6cac4e8aac3be6e8560e736b1ddf0fd93a2eee38 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 14:53:27 +0200
Subject: [PATCH 09/10] Clean up postprocessing

---
 .../threshold/identification.py               | 69 ++++++++++---------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index f3d2e55..c3bb9b2 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -34,7 +34,7 @@ class DoubleThresholdIdentification(IdentificationStrategy):
     `inner_thresh_mask`, `outer_thresh_mask`.
     """
 
-    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator):
+    def __init__(self, field, outer_threshold, inner_threshold, comparison_operator, unique_ids=True):
         self.field = field
         self.outer_threshold = outer_threshold
         self.inner_threshold = inner_threshold
@@ -60,6 +60,7 @@ class DoubleThresholdIdentification(IdentificationStrategy):
         assert (self.inner_threshold == self.outer_threshold
                 or self.compare(self.inner_threshold, self.outer_threshold)), \
                 "threshold values incompatible with ordering, check inner and outer values"
+        self.unique_ids = unique_ids
 
     def __repr__(self):
         return "\n".join([
@@ -129,37 +130,41 @@ class DoubleThresholdIdentification(IdentificationStrategy):
 
     def postprocess(self, dataset: xr.Dataset, pb2_desc, **kwargs):
         """""" # Should not be called directly by users, hide in docs
-        dims_map = {}
-        if (ens_dim := get_ensemble_dim(dataset)) is not None:
-            dims_map[ens_dim] = SplitDimension.SplitDimensionDim.ENSEMBLE_MEMBER
-        if (init_dim := get_init_time_dim(dataset)) is not None:
-            dims_map[init_dim] = SplitDimension.SplitDimensionDim.INIT_TIME
-        if self.processing_mode == "2d" and (level_dim := get_vertical_dim(dataset)) is not None:
-            dims_map[level_dim] = SplitDimension.SplitDimensionDim.LEVEL
-            
-        time_dim = get_time_dim(dataset)
-
-        offset = 0
-
-        for obj_set in pb2_desc.sets:
-            selection = { dim: getattr(obj_set, pb_name) for dim, pb_name in dims_map.items() }
-            
-            for timestep in obj_set.timesteps:
-                new_offset = offset
-            
-                selection[time_dim] = getattr(timestep, SplitDimension.SplitDimensionDim.VALID_TIME)
-                field = dataset.sel(selection)
-                
-                field["inner_thresh_mask"] += offset
-                field["outer_thresh_mask"] += offset
-                
-                for obj in timestep.objects:
-                    obj.id = obj.id + offset
-                    if obj.id > new_offset:
-                        new_offset = obj.id
-            
-                offset = new_offset
-
+        if self.unique_ids:
+            dataarray = dataset[self.field]
+            # Names of additional dimensions are pre-specified. Create a mapping
+            # that connects dimension names in the dataset with the pre-specified
+            # names used for the objects. Only consider dimensions that actually
+            # appear in the dataset.
+            dims_map = {}
+            if (ens_dim := get_ensemble_dim(dataarray)) is not None:
+                dims_map[ens_dim] = SplitDimension.SplitDimensionDim.ENSEMBLE_MEMBER
+            if (init_dim := get_init_time_dim(dataarray)) is not None:
+                dims_map[init_dim] = SplitDimension.SplitDimensionDim.INIT_TIME
+            if self.processing_mode == "2d" and (level_dim := get_vertical_dim(dataarray)) is not None:
+                dims_map[level_dim] = SplitDimension.SplitDimensionDim.LEVEL
+            # Valid time is a special dimension and has its own level in the object
+            # collection
+            time_dim = get_time_dim(dataarray)
+            # Generate unique ids by applying offsets to the existing ids
+            offset = 0
+            # Iterate over dimensions other than valid time (these are flattened)
+            for obj_set in pb2_desc.sets:
+                selection = { dim: getattr(obj_set, pb_name) for dim, pb_name in dims_map.items() }
+                # Iterate over the valid times
+                for timestep in obj_set.timesteps:
+                    selection[time_dim] = getattr(timestep, SplitDimension.SplitDimensionDim.VALID_TIME)
+                    field = dataset.sel(selection)
+                    # Increase ids by offset wherever an object was found
+                    field["inner_thresh_mask"].data[field["inner_thresh_mask"].data != 0] += offset
+                    field["outer_thresh_mask"].data[field["outer_thresh_mask"].data != 0] += offset
+                    # Change ids of objects and generate new offset
+                    new_offset = offset
+                    for obj in timestep.objects:
+                        obj.id = obj.id + offset
+                        if obj.id > new_offset:
+                            new_offset = obj.id
+                    offset = new_offset
         return dataset, pb2_desc
 
 
-- 
GitLab


From ed617a96d79159d6b021b23fc954640f63ccbcd4 Mon Sep 17 00:00:00 2001
From: Christopher Polster <cpolster@uni-mainz.de>
Date: Tue, 14 Jun 2022 15:44:40 +0200
Subject: [PATCH 10/10] Allow IdentificationTechnique to provide protobuf
 reference

---
 .../threshold/identification.py               |  3 +++
 enstools/feature/pipeline.py                  | 19 +++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/enstools/feature/identification/threshold/identification.py b/enstools/feature/identification/threshold/identification.py
index c3bb9b2..642cb1f 100644
--- a/enstools/feature/identification/threshold/identification.py
+++ b/enstools/feature/identification/threshold/identification.py
@@ -2,6 +2,7 @@ from enstools.misc import get_time_dim, get_ensemble_dim # TODO relative?
 from enstools.feature.util.enstools_utils import get_vertical_dim, get_init_time_dim
 from enstools.feature.util.data_utils import SplitDimension
 from ..identification import IdentificationStrategy
+from .._proto_gen import threshold_pb2
 from .processing import detection_double_thresh
 
 import operator
@@ -34,6 +35,8 @@ class DoubleThresholdIdentification(IdentificationStrategy):
     `inner_thresh_mask`, `outer_thresh_mask`.
     """
 
+    pb_reference = threshold_pb2
+
     def __init__(self, field, outer_threshold, inner_threshold, comparison_operator, unique_ids=True):
         self.field = field
         self.outer_threshold = outer_threshold
diff --git a/enstools/feature/pipeline.py b/enstools/feature/pipeline.py
index d60f958..c64a239 100644
--- a/enstools/feature/pipeline.py
+++ b/enstools/feature/pipeline.py
@@ -19,7 +19,15 @@ class FeaturePipeline:
         3D, per 3D block.
     """
 
-    def __init__(self, proto_ref, processing_mode='2d'):
+    def __init__(self, proto_ref=None, processing_mode='2d'):
+        """
+        Specify processing mode 2d or 3d: In 2d, identification and tracking will be performed on 2d levels, in 3d per 3d block.
+
+        Parameters
+        ----------
+        proto_ref
+        processing_mode
+        """
         self.id_tech = None
         self.tr_tech = None
 
@@ -42,8 +50,15 @@ class FeaturePipeline:
             The identification strategy to use in the pipeline.
         """
         self.id_tech = strategy
-        self.id_tech.pb_reference = self.pb_reference
         self.id_tech.processing_mode = self.processing_mode
+        if self.pb_reference is None:
+            try:
+                self.pb_reference = self.id_tech.pb_reference
+            except AttributeError:
+                raise Exception("a protobuf reference must be provided by either "
+                                "the feature pipeline or the identification technique/strategy")
+        else:
+            self.id_tech.pb_reference = self.pb_reference
 
     def set_tracking_strategy(self, strategy: TrackingStrategy):
         """
-- 
GitLab