Compare revisions: w2w/enstools-compression, 2023.5 → 2023.6

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (31)
Showing with 712 additions and 93 deletions
......@@ -17,8 +17,12 @@ from typing import Tuple, Callable
import numpy as np
import xarray
import enstools.encoding.chunk_size
from enstools.compression.emulators import DefaultEmulator
from enstools.compression.errors import ConditionsNotFulfilledError
from enstools.compression.slicing import MultiDimensionalSliceCollection
from enstools.encoding.api import VariableEncoding
from enstools.encoding.dataset_encoding import find_chunk_sizes, convert_to_bytes
from enstools.encoding.rules import COMPRESSION_SPECIFICATION_SEPARATOR
from .analysis_options import AnalysisOptions
from .analyzer_utils import get_metrics, get_parameter_range, bisection_method
......@@ -33,23 +37,34 @@ COUNTER = 0
def find_direct_relation(parameter_range, function_to_nullify):
    """Return whether the nullified function has a direct relation between the parameter and the nullified value."""
    min_val, max_val = parameter_range
    first_percentile = min_val + (max_val - min_val) / 100
    last_percentile = min_val + 99 * (max_val - min_val) / 100

    eval_first_percentile = function_to_nullify(first_percentile)
    eval_last_percentile = function_to_nullify(last_percentile)
    return eval_last_percentile > eval_first_percentile


def get_one_slice(data_array: xarray.DataArray, chunk_size: str = "100KB"):
    chunk_memory_size = convert_to_bytes(chunk_size)
    chunk_sizes = find_chunk_sizes(data_array, chunk_memory_size)
    chunk_sizes = [chunk_sizes[dim] for dim in data_array.dims]
    multi_dimensional_slice = MultiDimensionalSliceCollection(shape=data_array.shape, chunk_sizes=chunk_sizes)

    big_chunk_size = max(s.size for s in multi_dimensional_slice.objects.ravel())
    big_chunks = [s for s in multi_dimensional_slice.objects.ravel() if s.size == big_chunk_size]

    return {dim: size for dim, size in zip(data_array.dims, big_chunks[0].slices)}
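# A minimal illustration of find_direct_relation with toy inputs (the lambdas
# below are hypothetical, not part of enstools): an increasing function has a
# direct relation, a decreasing one does not.
assert find_direct_relation((0.0, 10.0), lambda x: x - 5.0) is True
assert find_direct_relation((0.0, 10.0), lambda x: 5.0 - x) is False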
def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -> Tuple[str, dict]:
"""
Find the compression specification corresponding to a certain data array and a given set of compression options.
"""
# In case there is a time dimension, select the last element.
# Accumulated variables (like total precipitation) are mostly zero at the first time step,
# so using the last time step gives a more representative sample.
if "time" in data_array.dims:
data_array = data_array.isel(time=-1)
slices = get_one_slice(data_array,
chunk_size=enstools.encoding.chunk_size.analysis_chunk_size)
data_array = data_array.isel(**slices)
# Check if the array contains any nan
contains_nan = np.isnan(data_array.values).any()
if contains_nan:
......@@ -64,11 +79,6 @@ def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -
# Define parameter range
parameter_range = get_parameter_range(data_array, options)
# If the aim is a specific compression ratio, the parameter range needs to be reversed
# because the relation between compression ratio and quality is inverse.
# if COMPRESSION_RATIO_LABEL in options.thresholds:
# parameter_range = tuple(reversed(parameter_range))
# Ignore warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
......@@ -80,6 +90,8 @@ def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -
fun=function_to_nullify,
direct_relation=direct_relation)
if not constrain(parameter):
raise ConditionsNotFulfilledError("Condition not fulfilled!")
# Compute metrics
# When aiming for a compression ratio some other metrics need to be provided too.
if COMPRESSION_RATIO_LABEL not in options.thresholds:
......
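# Hypothetical usage sketch for analyze_data_array (toy data; the threshold
# key mirrors the "correlation_I:5" constraint syntax used elsewhere in this
# project, but the exact keys and values here are assumptions):
toy = xarray.DataArray(np.random.rand(4, 32, 32), dims=["time", "lat", "lon"])
encoding, metrics = analyze_data_array(
    data_array=toy,
    options=AnalysisOptions("zfp", "rate", thresholds={"correlation_I": 5.0}),
)
print(encoding, metrics[COMPRESSION_RATIO_LABEL])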
......@@ -18,6 +18,7 @@ from enstools.compression.compressor import drop_variables
from enstools.io import read
from .analysis_options import AnalysisOptions, AnalysisParameters
from .analyze_data_array import analyze_data_array, ANALYSIS_DIAGNOSTIC_METRICS, COMPRESSION_RATIO_LABEL
from ..errors import ConditionsNotFulfilledError
logger = logging.getLogger("enstools.compression.analysis")
......@@ -67,7 +68,8 @@ def select_optimal_encoding_based_on_compression_ratio(encodings: dict, metrics:
for variable in variables:
best_compression_ratio = 0
for combination in combinations:
if variable in metrics[combination] and \
        metrics[combination][variable][COMPRESSION_RATIO_LABEL] > best_compression_ratio:
best_compression_ratio = metrics[combination][variable][COMPRESSION_RATIO_LABEL]
best_combination[variable] = combination
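# Shape of the inputs assumed by the selection above (keys and numbers are
# placeholders): metrics maps each compressor/mode combination to a
# per-variable dictionary, and a variable may be missing when its analysis
# did not fulfil the conditions.
metrics = {
    ("sz", "abs"): {"temperature": {"compression_ratio": 12.3}},
    ("zfp", "rate"): {},  # analysis failed for all variables
}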
......@@ -93,7 +95,7 @@ def select_optimal_encoding_based_on_quality_metrics(encodings: dict, metrics: d
best_metrics = {met: -1.0 for met in ANALYSIS_DIAGNOSTIC_METRICS}
for combination in combinations:
for metric in ANALYSIS_DIAGNOSTIC_METRICS:
if variable in metrics[combination] and metric in metrics[combination][variable]:
if metrics[combination][variable][metric] > best_metrics[metric]:
best_metrics[metric] = metrics[combination][variable][metric]
best_combination[variable] = combination
......@@ -145,18 +147,21 @@ def find_encodings_for_all_combinations(dataset: xarray.Dataset, options: Analys
combination_metrics[var] = {COMPRESSION_RATIO_LABEL: 1.0}
continue
try:
    variable_encoding, variable_metrics = analyze_data_array(
        data_array=dataset[var],
        options=AnalysisOptions(compressor, mode, thresholds=options.thresholds)
    )
    combination_encoding[var] = variable_encoding
    combination_metrics[var] = variable_metrics
    # (dataset, variable_name, thresholds, compressor_name, mode)
    logger.debug("%s %s CR:%.1f",
                 var,
                 variable_encoding,
                 variable_metrics[COMPRESSION_RATIO_LABEL],
                 )
except ConditionsNotFulfilledError:
    ...
encodings[combination] = combination_encoding
metrics[combination] = combination_metrics
......@@ -240,7 +245,12 @@ def analyze_dataset(dataset: xarray.Dataset,
dataset = dataset.fillna(fill_na)
options = AnalysisOptions(compressor=compressor, mode=mode, constrains=constrains)
encodings, metrics = find_optimal_encoding(dataset, options)
if not encodings:
    raise ConditionsNotFulfilledError(
        "It was not possible to find a combination that fulfills the constraints provided"
    )
return encodings, metrics
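# Sketch of the new failure mode ("my_dataset" and the constraint string are
# placeholders): callers can now distinguish "no encoding fulfils the
# constraints" from other errors.
try:
    encodings, metrics = analyze_dataset(dataset=my_dataset, constrains="correlation_I:5")
except ConditionsNotFulfilledError:
    encodings = None  # e.g. fall back to lossless compression instead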
def save_encoding(encoding: dict, output_file: Union[Path, str, None] = None, file_format: str = "yaml"):
......
......@@ -6,6 +6,7 @@ import logging
from typing import List, Dict
import xarray
import numpy as np
from enstools.compression.analyzer.analysis_options import AnalysisOptions
from enstools.compression.metrics import DataArrayMetrics
......@@ -23,7 +24,8 @@ def get_metrics(reference_data: xarray.DataArray, recovered_data: xarray.DataArr
:return: a dictionary with the requested metrics
"""
metrics = DataArrayMetrics(reference_data, recovered_data)
# TODO: Is the average the proper thing to use here?
return {metric: float(np.average(metrics[metric])) for metric in metric_names if metric != "compression_ratio"}
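# The metric values may be array-valued (e.g. one value per time step or
# ensemble member; an assumption here), hence the reduction with np.average:
per_member_correlation = np.array([0.90, 0.95, 0.99])  # hypothetical values
print(float(np.average(per_member_correlation)))  # -> 0.9466...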
def check_compression_ratio(compression_ratio: float, thresholds: dict):
......@@ -148,6 +150,7 @@ def continuous_bisection_method(parameter_range: tuple,
retry_repeated=5,
threshold=0.1,
direct_relation=True,
results=None,
):
"""
Recursively refine a parameter range by evaluating the parameter that lies in the middle of the range.
......@@ -168,8 +171,13 @@ def continuous_bisection_method(parameter_range: tuple,
:param direct_relation: If True, the relation between the function and the parameter is direct, that is,
increasing the parameter will increase the function value. If False, the relation is
inverse, that is, increasing the parameter will decrease the function value.
:param results: A dictionary that stores the function values for each parameter that has been evaluated.
:return: The best value of the parameter that meets the exit conditions.
"""
if results is None:
results = {}
# Get start and end from parameter range
start, end = parameter_range
......@@ -184,24 +192,23 @@ def continuous_bisection_method(parameter_range: tuple,
# TODO: use logging and a debug mode to print this kind of thing
logging.debug("start=%.2e,end=%.2e value_at_middle=%f", start, end, float(value_at_middle))
# Save result
results[middle] = value_at_middle

# In case an exit condition is reached (accuracy threshold fulfilled, maximum depth reached,
# or a repeated value with no retries left), return the best of the evaluated parameters.
if 0.0 <= value_at_middle < threshold or depth >= max_depth or \
        (value_at_middle == last_value and retry_repeated == 0):
    positive_results = {k: v for k, v in results.items() if v > 0}
    if positive_results:
        return min(positive_results, key=positive_results.get)
    else:
        return middle

# If the value is the same as in the last try, we can retry a few times
if value_at_middle == last_value:
    retry_repeated -= 1

# Otherwise, set a new parameter range and call the function again
if comparison(value_at_middle, direct_relation=direct_relation):
new_start, new_end = start, middle
......@@ -216,6 +223,7 @@ def continuous_bisection_method(parameter_range: tuple,
retry_repeated=retry_repeated,
threshold=threshold,
direct_relation=direct_relation,
results=results
)
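# Stand-alone sketch (under assumptions, not the project function) of the
# result-caching behaviour introduced above: every evaluation is recorded in
# `results`, and on exit the parameter whose positive value is closest to
# zero (the tightest one that still fulfils the thresholds) is returned.
# A decreasing function (inverse relation) is assumed for the toy run below.
def bisect_with_cache(fun, start, end, depth=10, results=None):
    if results is None:
        results = {}
    middle = (start + end) / 2
    results[middle] = fun(middle)
    if depth == 0:
        positive = {k: v for k, v in results.items() if v > 0}
        return min(positive, key=positive.get) if positive else middle
    if results[middle] > 0:
        # thresholds fulfilled: push the parameter further to tighten it
        return bisect_with_cache(fun, middle, end, depth - 1, results)
    return bisect_with_cache(fun, start, middle, depth - 1, results)

print(bisect_with_cache(lambda x: 4.0 - x, 0.0, 10.0))  # converges to 4.0 from below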
......@@ -228,6 +236,7 @@ def discrete_bisection_method(parameters_list: list,
retry_repeated=5,
threshold=0.1,
direct_relation=True,
results=None,
):
"""
Apply the bisection method on a set of discrete parameters.
......@@ -243,11 +252,16 @@ def discrete_bisection_method(parameters_list: list,
:param threshold: The threshold for the method exit condition.
:param direct_relation: Boolean indicating whether the relation between the parameter and the function value is
direct or inverse.
:param results: A dictionary that stores the function values for each parameter that has been evaluated.
:return: The parameter value that satisfies the constrain function.
:raises: Exception if the maximum depth is reached.
"""
if results is None:
results = {}
middle_index = len(parameters_list) // 2
middle = parameters_list[middle_index]
......@@ -260,13 +274,20 @@ def discrete_bisection_method(parameters_list: list,
middle,
float(value_at_middle))
results[middle] = value_at_middle
# If the value at the middle is positive (all thresholds are fulfilled) we can return the parameter at the middle,
# otherwise select the safer one.
parameter_to_return = middle if value_at_middle > 0.0 else parameters_list[-1]

results[middle] = value_at_middle

# In case an exit condition is reached (accuracy threshold fulfilled, maximum depth reached,
# or a repeated value with no retries left), return the best of the evaluated parameters.
if 0.0 <= value_at_middle < threshold or depth >= max_depth or \
        (value_at_middle == last_value and retry_repeated == 0):
    positive_results = {k: v for k, v in results.items() if v > 0}
    if positive_results:
        return min(positive_results, key=positive_results.get)
    else:
        return middle

# If the value is the same as in the last try, we can retry a few times
if value_at_middle == last_value:
    if retry_repeated == 0:
        return parameter_to_return
    retry_repeated -= 1

# Otherwise, set a new parameter range and call the function again
if comparison(value_at_middle, direct_relation=direct_relation):
......@@ -293,4 +311,5 @@ def discrete_bisection_method(parameters_list: list,
retry_repeated=retry_repeated,
threshold=threshold,
direct_relation=direct_relation,
results=results,
)
......@@ -86,6 +86,8 @@ class FilterEmulator(Emulator):
encoding = dict(self.compression)
if "chunksizes" in encoding:
encoding["chunks"] = encoding.pop("chunksizes")
else:
    # Fall back to a single chunk covering the whole array when no chunk sizes were specified.
    encoding["chunks"] = uncompressed_data.shape
# Initialize file object
with io.BytesIO() as bio:
......
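# Rough sketch of the emulation pattern used above (assumptions: gzip stands
# in for the SZ/ZFP HDF5 filters, and chunks=data.shape mirrors the
# single-chunk fallback): write the array to an in-memory HDF5 file, then
# compare the compressed size with the raw size.
import io
import h5py
import numpy as np

data = np.random.rand(64, 64).astype("float32")
with io.BytesIO() as bio:
    with h5py.File(bio, "w") as hdf5_file:
        hdf5_file.create_dataset("data", data=data, chunks=data.shape, compression="gzip")
    compressed_size = bio.getbuffer().nbytes
print(f"compression ratio ~ {data.nbytes / compressed_size:.2f}")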
from enstools.core.errors import EnstoolsError
class ConditionsNotFulfilledError(EnstoolsError):
...
\ No newline at end of file
import xarray
from enstools.compression.analyzer.analyzer import analyze_dataset
import enstools.compression.xr_accessor # noqa
dataset_names = [
"air_temperature",
# "air_temperature_gradient",
# "basin_mask",
# "rasm",
# "ROMS_example",
# "tiny",
# "era5-2mt-2019-03-uk.grib",
# "eraint_uvz",
# "ersstv5"
]
def main():
results = {}
failed_datasets = []
for dataset_name in dataset_names:
try:
with xarray.tutorial.open_dataset(dataset_name) as dataset:
encoding, metrics = analyze_dataset(dataset=dataset)
results[dataset_name] = (encoding, metrics)
dataset.to_netcdf(f"reference_{dataset_name}.nc")
dataset.to_compressed_netcdf(f"compressed_{dataset_name}.nc", compression=encoding)
except ValueError:
failed_datasets.append(dataset_name)
print(results)
print(failed_datasets)
if __name__ == "__main__":
main()
import matplotlib.pyplot as plt
import streamlit as st
import enstools.compression.xr_accessor # noqa
from enstools.encoding.errors import InvalidCompressionSpecification
default_parameters = {
"sz": {
"abs": 1,
"rel": 0.001,
"pw_rel": 0.001,
},
"sz3": {
"abs": 1,
"rel": 0.001,
},
"zfp": {
"accuracy": 1,
"rate": 3.2,
"precision": 14,
}
}
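# The playground assembles enstools compression specification strings of the
# form "lossy,<compressor>,<mode>,<parameter>" from these defaults, e.g.:
compressor, mode = "sz", "abs"
spec = f"lossy,{compressor},{mode},{default_parameters[compressor][mode]}"  # -> "lossy,sz,abs,1"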
def advanced_section(data, slice_selection):
if data.dataset is not None:
# st.markdown("# Compression")
with st.expander("Compression Specifications"):
specification_mode = st.radio(label="", options=["Options", "String"], horizontal=True)
# specification, options = st.tabs(["String", "Options"])
if specification_mode == "String":
compression_specification = st.text_input(label="Compression", value="lossy,sz,abs,1")
elif specification_mode == "Options":
col1_, col2_, col3_ = st.columns(3)
with col1_:
compressor = st.selectbox(label="Compressor", options=["sz", "sz3", "zfp"])
if compressor == "sz":
mode_options = ["abs", "rel", "pw_rel"]
elif compressor == "sz3":
mode_options = ["abs", "rel"]
elif compressor == "zfp":
mode_options = ["accuracy", "rate", "precision"]
else:
mode_options = []
with col2_:
mode = st.selectbox(label="Mode", options=mode_options)
with col3_:
parameter = st.text_input(label="Parameter", value=default_parameters[compressor][mode])
compression_specification = f"lossy,{compressor},{mode},{parameter}"
st.markdown(f"**Compression Specification:** {compression_specification}")
if compression_specification:
try:
data.compress(compression_specification)
except InvalidCompressionSpecification:
st.warning("Invalid compression specification!")
st.markdown(
"Check [the compression specification format](https://enstools-encoding.readthedocs.io/en/latest/CompressionSpecificationFormat.html)")
if data.compressed_da is not None:
st.markdown(f"**Compression Ratio**: {data.compressed_da.attrs['compression_ratio']}")
\ No newline at end of file
import streamlit as st
import enstools.compression.xr_accessor # noqa
def analysis_section(data, slice_selection):
if data.dataset is not None:
# st.markdown("# Compression")
col1, col2 = st.columns(2)
with col1:
constrains = st.text_input(label="Constraint", value="correlation_I:5,ssim_I:3")
options = {
"sz": ["abs", "rel", "pw_rel"],
"sz3": ["abs", "rel"],
"zfp": ["accuracy", "rate", "precision"],
}
all_options = [f"{compressor}-{mode}" for compressor in options for mode in options[compressor]]
with col2:
cases = st.multiselect(label="Compressor and mode", options=all_options)
if data.reference_da is None:
return
if not cases:
return
st.markdown("# Results:")
n_cols = 4
cols = st.columns(n_cols)
all_results = {}
for idx, case in enumerate(cases):
with cols[idx % n_cols]:
compressor, mode = case.split("-")
encoding, metrics = data.reference_da.compression.analyze(
constrains=constrains,
compressor=compressor,
compression_mode=mode
)
parameter = encoding.split(",")[-1]
compression_ratio = metrics["compression_ratio"]
all_results[case] = compression_ratio
st.markdown(f"## {compressor},{mode}:\n\n"
f"**Compression Ratio:** {compression_ratio:.2f}x\n\n"
f"**Parameter:** {parameter}\n\n"
f"**Specification String:**")
st.code(encoding)
st.markdown(f"___")
# st.markdown(encoding)
# st.markdown(metrics)
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import xarray
import enstools.compression.xr_accessor # noqa
from .data_source import DataContainer
def get_compression_ratio(data_array: xarray.DataArray, relative_tolerance: float, mode: str) -> float:
    compressed = data_array.compression(f"lossy,sz,{mode},{relative_tolerance}", in_place=False)
    return float(compressed.attrs["compression_ratio"])
def invert_function(function):
# Define its derivative
f_prime = function.deriv()
# Define the function for which we want to find the root
def func(x, y_val):
return function(x) - y_val
def newtons_method(y_val, epsilon=1e-7, max_iterations=100):
x = -2 # np.log10(0.01)
print(f"{y_val=}")
for _ in range(max_iterations):
x_new = x - func(x, y_val) / f_prime(x)
if abs(x - x_new) < epsilon:
return x_new
x = x_new
print(x_new)
return None
return newtons_method
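# Worked example of the inversion above with a simple, assumed polynomial:
# f(x) = x + 2, so the returned Newton iteration should recover x = y - 2.
f = np.poly1d([1.0, 2.0])
f_inverse = invert_function(f)
assert abs(f_inverse(3.0) - 1.0) < 1e-6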
def create_parameter_from_compression_ratio(data: DataContainer, mode: str):
train_x = np.logspace(-12, -.5, 15)
train_y = [get_compression_ratio(data.reference_da, parameter, mode=mode) for parameter in train_x]
parameter_range = min(train_y), min(100., max(train_y))
x_log = np.log10(train_x)
y_log = np.log10(train_y)
coeff = np.polyfit(x_log, y_log, 10)
# Create a polynomial function from the coefficients
f = np.poly1d(coeff)
f_inverse = invert_function(f)
def function_to_return(compression_ratio: float) -> float:
return 10 ** f_inverse(np.log10(compression_ratio))
return parameter_range, function_to_return
def basic_section(data: DataContainer, slice_selection):
mode = st.selectbox(label="Mode", options=["rel", "pw_rel"])
parameter_range, get_parameter = create_parameter_from_compression_ratio(data, mode=mode)
_min, _max = parameter_range
options = [_min + (_max - _min) * _x for _x in np.logspace(-2, 0)]
options = [f"{op:.2f}" for op in options]
compression_ratio = st.select_slider(label="Compression Ratio", options=options)
compression_ratio = float(compression_ratio)
parameter = get_parameter(compression_ratio)
with st.spinner():
data.compress(f"lossy,sz,rel,{parameter}")
import io
from typing import Optional
import pandas as pd
import streamlit as st
import xarray as xr
class DataContainer:
def __init__(self, dataset: Optional[xr.Dataset] = None):
self.dataset = dataset
self.reference_da = None
self.compressed_da = None
def set_dataset(self, dataset):
self.dataset = dataset
self.reference_da = None
self.compressed_da = None
def select_variable(self, variable):
self.reference_da = self.dataset[variable]
@classmethod
def from_tutorial_data(cls, dataset_name: str = "air_temperature"):
return cls(dataset=xr.tutorial.open_dataset(dataset_name))
@property
def time_steps(self):
if self.reference_da is not None:
if "time" in self.reference_da.dims:
try:
return pd.to_datetime(self.reference_da.time.values)
except TypeError:
return self.reference_da.time.values
print(self.reference_da)
def compress(self, compression):
self.compressed_da = self.reference_da.compression(compression)
def __hash__(self):
return hash(self.reference_da.name)
@st.cache_resource
def create_data():
return DataContainer.from_tutorial_data()
def select_dataset(data):
st.title("Select Dataset")
data_source = st.radio(label="Data source", options=["Tutorial Dataset", "Custom Dataset"])
col1, col2 = st.columns(2)
if data_source == "Tutorial Dataset":
tutorial_dataset_options = [
"air_temperature",
"air_temperature_gradient",
# "basin_mask", # Different coordinates
# "rasm", # Has nan
"ROMS_example",
"tiny",
# "era5-2mt-2019-03-uk.grib",
"eraint_uvz",
"ersstv5"
]
with col1:
dataset_name = st.selectbox(label="Dataset", options=tutorial_dataset_options)
dataset = xr.tutorial.open_dataset(dataset_name)
data.set_dataset(dataset)
elif data_source == "Custom Dataset":
my_file = st.file_uploader(label="Your file")
data.set_dataset(None)
if my_file:
my_virtual_file = io.BytesIO(my_file.read())
my_dataset = xr.open_dataset(my_virtual_file)
st.text("Custom dataset loaded!")
data.set_dataset(my_dataset)
if data.dataset is not None:
with col2:
variable = st.selectbox(label="Variable", options=data.dataset.data_vars)
if variable:
data.select_variable(variable)
def select_slice(data):
st.title("Select Slice")
slice_selection = {}
if data.reference_da is not None and data.reference_da.dims:
tabs = st.tabs(tabs=data.reference_da.dims)
for idx, dimension in enumerate(data.reference_da.dims):
with tabs[idx]:
if str(dimension) == "time":
if len(data.reference_da.time) > 1:
slice_selection[dimension] = st.select_slider(label=dimension,
options=data.reference_da[dimension].values,
)
else:
slice_selection[dimension] = data.reference_da.time.values[0]
else:
_min = float(data.reference_da[dimension].values[0])
_max = float(data.reference_da[dimension].values[-1])
if _max - _min < 1000:
slice_selection[dimension] = st.slider(label=dimension,
min_value=_min,
max_value=_max,
value=(_min, _max),
)
# if st.button("Clear Cache"):
# st.cache_resource.clear()
return slice_selection
import streamlit as st
import matplotlib.pyplot as plt
def plot_comparison(data, slice_selection):
col1, col2, col3 = st.columns(3)
new_slice = {}
for key, values in slice_selection.items():
if isinstance(values, tuple):
start, stop = values
if start != stop:
new_slice[key] = slice(start, stop)
else:
new_slice[key] = start
else:
new_slice[key] = values
slice_selection = new_slice
if data.reference_da is not None:
slice_selection = {key: (value if key != "lat" else slice(value.stop, value.start)) for key, value in
slice_selection.items()}
only_slices = {key: value for key, value in slice_selection.items() if isinstance(value, slice)}
non_slices = {key: value for key, value in slice_selection.items() if not isinstance(value, slice)}
if only_slices:
reference_slice = data.reference_da.sel(**only_slices)
else:
reference_slice = data.reference_da
if non_slices:
reference_slice = reference_slice.sel(**non_slices, method="nearest")
try:
plt.figure()
reference_slice.plot()
fig1 = plt.gcf()
with col1:
st.pyplot(fig1)
except TypeError:
pass
if data.compressed_da is not None:
plt.figure()
if only_slices:
compressed_slice = data.compressed_da.sel(**only_slices)
else:
compressed_slice = data.compressed_da
if non_slices:
compressed_slice = compressed_slice.sel(**non_slices, method="nearest")
try:
compressed_slice.plot()
fig2 = plt.gcf()
with col2:
st.pyplot(fig2)
except TypeError:
pass
diff = data.compressed_da - data.reference_da
plt.figure()
if only_slices:
diff_slice = diff.sel(**only_slices)
else:
diff_slice = diff
if non_slices:
diff_slice = diff_slice.sel(**non_slices, method="nearest")
try:
diff_slice.plot()
fig3 = plt.gcf()
with col3:
st.pyplot(fig3)
except TypeError:
pass
else:
st.text("Compress the data to show the plot!")
import streamlit as st
from component.data_source import create_data, select_dataset, select_slice
from component.basic_section import basic_section
from component.advanced_section import advanced_section
from component.analysis_section import analysis_section
from component.plotter import plot_comparison
st.set_page_config(layout="wide", initial_sidebar_state="collapsed")
data = create_data()
def setup_main_frame():
st.title("Welcome to the :green[enstools-compression] playground!")
st.markdown("Find more information our [**GitHub repository**](https://github.com/wavestoweather/enstools-compression)"
" or in our [**documentation**](https://enstools-compression.readthedocs.io)")
with st.sidebar:
select_dataset(data)
slice_selection = select_slice(data)
st.markdown("---")
if data.reference_da is None:
st.markdown("## :point_left: Please :red[select a dataset] using the left sidebar!")
return
options = ["Compression", "Advanced Compression", "Analysis"]
basic, advanced, analysis = st.tabs(options)
with basic:
basic_section(data=data, slice_selection=slice_selection)
with st.spinner():
try:
plot_comparison(data=data, slice_selection=slice_selection)
except TypeError as err:
st.warning(err)
with advanced:
advanced_section(data=data, slice_selection=slice_selection)
with st.spinner():
try:
plot_comparison(data=data, slice_selection=slice_selection)
except TypeError as err:
st.warning(err)
with analysis:
analysis_section(data=data, slice_selection=slice_selection)
setup_main_frame()
# Install additional dependencies to run the examples.
pip install -e .[examples]
\ No newline at end of file
......@@ -2,52 +2,35 @@
set -e
function usage {
    echo "arguments:"
    echo "-w  warnings are errors"
    exit 1
}

# parse the command line
excluded_files=""
extra_arguments=""
while getopts "w" opt; do
    case $opt in
        w)
            echo "WARNING: warnings are treated like errors for debugging."
            extra_arguments="-W error"
            ;;
        *)
            usage
            ;;
    esac
done

if [[ ! -z $excluded_files ]]; then
    ignore_option="--ignore=$excluded_files"
fi

# create a virtual environment and install all dependencies
if [[ ! -d venv ]]; then
    python3 -m venv --prompt enstools-compression venv
    source venv/bin/activate
    pip install -U pip
    pip install -e .[test]
fi
source venv/bin/activate
......
......@@ -40,11 +40,20 @@ setup(name="enstools-compression",
install_requires=[
"enstools>=2023.1",
"enstools-encoding>=2023.1",
"enstools-encoding>=2023.6",
"zfpy",
"hdf5plugin>=4.0.0",
"hdf5plugin>=4.1.3",
"netCDF4",
],
extras_require={
'examples': ['pooch'],
'test': [
"pytest",
"pytest-mock",
"pooch",
],
},
entry_points={
'console_scripts': [
'enstools-compression=enstools.compression.cli:main'
......
......@@ -85,6 +85,19 @@ class TestAnalyzer(TestClass):
compressor="zfp",
mode="rate",
)
def test_rmse(self):
from enstools.compression.api import analyze_files
input_tempdir = self.input_directory_path
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for ds in datasets:
input_path = input_tempdir / ds
analyze_files(file_paths=[input_path],
constrains="normalized_root_mean_square_error:1e-5",
# Keep the analysis to a single compressor and mode to speed up tests
compressor="zfp",
mode="rate",
)
def test_wrong_constrain(self):
from enstools.compression.api import analyze_files
......@@ -115,7 +128,7 @@ class TestAnalyzer(TestClass):
# Check that the analysis using a custom metric defined with a plugin works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
constrains = f"{custom_metric_name}:3"
constrains = f"{custom_metric_name}:1e-5"
for ds in datasets:
input_path = input_tempdir / ds
......@@ -154,7 +167,7 @@ class TestAnalyzer(TestClass):
# Check that the analysis using a custom metric defined with a plugin works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
constrains = f"{custom_metric_name}:3"
constrains = f"{custom_metric_name}:1e-5"
for ds in datasets:
input_path = input_tempdir / ds
......
......@@ -102,7 +102,7 @@ class TestCommandLineInterface(TestClass):
with open(plugin_path, "w") as f:
f.write(function_code)
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:4", "--plugins",
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:1e-5", "--plugins",
str(plugin_path), "-c", "zfp"]
mocker.patch("sys.argv", commands)
enstools.compression.cli.main()
......@@ -131,7 +131,7 @@ class TestCommandLineInterface(TestClass):
with open(plugin_path, "w") as f:
f.write(function_code)
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:4", "--plugins", str(plugin_path),
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:1e-5", "--plugins", str(plugin_path),
"-c", "sz"]
mocker.patch("sys.argv", commands)
enstools.compression.cli.main()
......
import numpy as np
from utils import wrapper, TestClass
folders = None
......@@ -13,13 +15,93 @@ class TestSZ(TestClass):
analyze_files(file_paths=input_path, compressor="sz")
def test_compress_sz_pw_rel(self):
compression = "lossy,sz,pw_rel,0.1"
compression = "lossy,sz,pw_rel,0.001"
wrapper(self, compression=compression)
def test_consistency_sz_pw_rel(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.001
compression = f"lossy,sz,pw_rel,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
# Point-wise relative error check: |compressed - original| <= tolerance * |original|
diff = compressed_da - data_array
assert (np.abs(diff.values) <= np.abs(data_array.values) * tolerance).all()
def test_compress_sz_abs(self):
compression = "lossy,sz,abs,0.01"
wrapper(self, compression=compression)
def test_consistency_sz_abs(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.01
compression = f"lossy,sz,abs,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
diff = compressed_da - data_array
assert (np.abs(diff.values) < tolerance).all()
def test_compress_sz_rel(self):
compression = "lossy,sz,rel,0.001"
wrapper(self, compression=compression)
def test_consistency_sz_rel(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.01
compression = f"lossy,sz,rel,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
abs_tolerance = float(data_array.max() - data_array.min()) * tolerance
diff = compressed_da - data_array
assert (np.abs(diff.values) < abs_tolerance).all()