Compare revisions: w2w/enstools-compression, 2023.5 → 2023.6

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (31)
Showing with 712 additions and 93 deletions
......@@ -17,8 +17,12 @@ from typing import Tuple, Callable
import numpy as np
import xarray
import enstools.encoding.chunk_size
from enstools.compression.emulators import DefaultEmulator
from enstools.compression.errors import ConditionsNotFulfilledError
from enstools.compression.slicing import MultiDimensionalSliceCollection
from enstools.encoding.api import VariableEncoding
from enstools.encoding.dataset_encoding import find_chunk_sizes, convert_to_bytes
from enstools.encoding.rules import COMPRESSION_SPECIFICATION_SEPARATOR
from .analysis_options import AnalysisOptions
from .analyzer_utils import get_metrics, get_parameter_range, bisection_method
......@@ -33,23 +37,34 @@ COUNTER = 0
def find_direct_relation(parameter_range, function_to_nullify):
    """Return whether the nullified function has a direct relation between the parameter and the nullified value."""
    min_val, max_val = parameter_range
    first_percentile = min_val + (max_val - min_val) / 100
    last_percentile = min_val + 99 * (max_val - min_val) / 100

    eval_first_percentile = function_to_nullify(first_percentile)
    eval_last_percentile = function_to_nullify(last_percentile)
    return eval_last_percentile > eval_first_percentile


def get_one_slice(data_array: xarray.DataArray, chunk_size: str = "100KB"):
    chunk_memory_size = convert_to_bytes(chunk_size)
    chunk_sizes = find_chunk_sizes(data_array, chunk_memory_size)
    chunk_sizes = [chunk_sizes[dim] for dim in data_array.dims]
    multi_dimensional_slice = MultiDimensionalSliceCollection(shape=data_array.shape, chunk_sizes=chunk_sizes)

    big_chunk_size = max(s.size for s in multi_dimensional_slice.objects.ravel())
    big_chunks = [s for s in multi_dimensional_slice.objects.ravel() if s.size == big_chunk_size]

    return {dim: size for dim, size in zip(data_array.dims, big_chunks[0].slices)}
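# A minimal illustration of find_direct_relation with toy inputs (the lambdas
# below are hypothetical, not part of enstools): an increasing function has a
# direct relation, a decreasing one does not.
assert find_direct_relation((0.0, 10.0), lambda x: x - 5.0) is True
assert find_direct_relation((0.0, 10.0), lambda x: 5.0 - x) is False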
def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -> Tuple[str, dict]:
"""
Find the compression specification corresponding to a certain data array and a given set of compression options.
"""
# In case there is a time dimension, select the last element.
# Accumulated variables (like total precipitation) are mostly zero at the first time step,
# so using the last time step gives a more representative sample.
if "time" in data_array.dims:
data_array = data_array.isel(time=-1)
slices = get_one_slice(data_array,
chunk_size=enstools.encoding.chunk_size.analysis_chunk_size)
data_array = data_array.isel(**slices)
# Check if the array contains any nan
contains_nan = np.isnan(data_array.values).any()
if contains_nan:
......@@ -64,11 +79,6 @@ def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -
# Define parameter range
parameter_range = get_parameter_range(data_array, options)
# If the aim is a specific compression ratio, the parameter range needs to be reversed
# because the relation between compression ratio and quality is inverse.
# if COMPRESSION_RATIO_LABEL in options.thresholds:
# parameter_range = tuple(reversed(parameter_range))
# Ignore warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
......@@ -80,6 +90,8 @@ def analyze_data_array(data_array: xarray.DataArray, options: AnalysisOptions) -
fun=function_to_nullify,
direct_relation=direct_relation)
if not constrain(parameter):
raise ConditionsNotFulfilledError("Condition not fulfilled!")
# Compute metrics
# When aiming for a compression ratio some other metrics need to be provided too.
if COMPRESSION_RATIO_LABEL not in options.thresholds:
......
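# Hypothetical usage sketch for analyze_data_array (toy data; the threshold
# key mirrors the "correlation_I:5" constraint syntax used elsewhere in this
# project, but the exact keys and values here are assumptions):
toy = xarray.DataArray(np.random.rand(4, 32, 32), dims=["time", "lat", "lon"])
encoding, metrics = analyze_data_array(
    data_array=toy,
    options=AnalysisOptions("zfp", "rate", thresholds={"correlation_I": 5.0}),
)
print(encoding, metrics[COMPRESSION_RATIO_LABEL])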
......@@ -18,6 +18,7 @@ from enstools.compression.compressor import drop_variables
from enstools.io import read
from .analysis_options import AnalysisOptions, AnalysisParameters
from .analyze_data_array import analyze_data_array, ANALYSIS_DIAGNOSTIC_METRICS, COMPRESSION_RATIO_LABEL
from ..errors import ConditionsNotFulfilledError
logger = logging.getLogger("enstools.compression.analysis")
......@@ -67,7 +68,8 @@ def select_optimal_encoding_based_on_compression_ratio(encodings: dict, metrics:
for variable in variables:
best_compression_ratio = 0
for combination in combinations:
if variable in metrics[combination] and \
        metrics[combination][variable][COMPRESSION_RATIO_LABEL] > best_compression_ratio:
best_compression_ratio = metrics[combination][variable][COMPRESSION_RATIO_LABEL]
best_combination[variable] = combination
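# Shape of the inputs assumed by the selection above (keys and numbers are
# placeholders): metrics maps each compressor/mode combination to a
# per-variable dictionary, and a variable may be missing when its analysis
# did not fulfil the conditions.
metrics = {
    ("sz", "abs"): {"temperature": {"compression_ratio": 12.3}},
    ("zfp", "rate"): {},  # analysis failed for all variables
}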
......@@ -93,7 +95,7 @@ def select_optimal_encoding_based_on_quality_metrics(encodings: dict, metrics: d
best_metrics = {met: -1.0 for met in ANALYSIS_DIAGNOSTIC_METRICS}
for combination in combinations:
for metric in ANALYSIS_DIAGNOSTIC_METRICS:
if variable in metrics[combination] and metric in metrics[combination][variable]:
if metrics[combination][variable][metric] > best_metrics[metric]:
best_metrics[metric] = metrics[combination][variable][metric]
best_combination[variable] = combination
......@@ -145,18 +147,21 @@ def find_encodings_for_all_combinations(dataset: xarray.Dataset, options: Analys
combination_metrics[var] = {COMPRESSION_RATIO_LABEL: 1.0}
continue
try:
    variable_encoding, variable_metrics = analyze_data_array(
        data_array=dataset[var],
        options=AnalysisOptions(compressor, mode, thresholds=options.thresholds)
    )
    combination_encoding[var] = variable_encoding
    combination_metrics[var] = variable_metrics
    # (dataset, variable_name, thresholds, compressor_name, mode)
    logger.debug("%s %s CR:%.1f",
                 var,
                 variable_encoding,
                 variable_metrics[COMPRESSION_RATIO_LABEL],
                 )
except ConditionsNotFulfilledError:
    ...
encodings[combination] = combination_encoding
metrics[combination] = combination_metrics
......@@ -240,7 +245,12 @@ def analyze_dataset(dataset: xarray.Dataset,
dataset = dataset.fillna(fill_na)
options = AnalysisOptions(compressor=compressor, mode=mode, constrains=constrains)
encodings, metrics = find_optimal_encoding(dataset, options)
if not encodings:
    raise ConditionsNotFulfilledError(
        "It was not possible to find a combination that fulfills the constraints provided"
    )
return encodings, metrics
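# Sketch of the new failure mode ("my_dataset" and the constraint string are
# placeholders): callers can now distinguish "no encoding fulfils the
# constraints" from other errors.
try:
    encodings, metrics = analyze_dataset(dataset=my_dataset, constrains="correlation_I:5")
except ConditionsNotFulfilledError:
    encodings = None  # e.g. fall back to lossless compression instead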
def save_encoding(encoding: dict, output_file: Union[Path, str, None] = None, file_format: str = "yaml"):
......
......@@ -6,6 +6,7 @@ import logging
from typing import List, Dict
import xarray
import numpy as np
from enstools.compression.analyzer.analysis_options import AnalysisOptions
from enstools.compression.metrics import DataArrayMetrics
......@@ -23,7 +24,8 @@ def get_metrics(reference_data: xarray.DataArray, recovered_data: xarray.DataArr
:return: a dictionary with the requested metrics
"""
metrics = DataArrayMetrics(reference_data, recovered_data)
# TODO: Is the average the proper thing to use here?
return {metric: float(np.average(metrics[metric])) for metric in metric_names if metric != "compression_ratio"}
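# The metric values may be array-valued (e.g. one value per time step or
# ensemble member; an assumption here), hence the reduction with np.average:
per_member_correlation = np.array([0.90, 0.95, 0.99])  # hypothetical values
print(float(np.average(per_member_correlation)))  # -> 0.9466...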
def check_compression_ratio(compression_ratio: float, thresholds: dict):
......@@ -148,6 +150,7 @@ def continuous_bisection_method(parameter_range: tuple,
retry_repeated=5,
threshold=0.1,
direct_relation=True,
results=None,
):
"""
Recursively refine a parameter range by evaluating the parameter that lies in the middle of the range.
......@@ -168,8 +171,13 @@ def continuous_bisection_method(parameter_range: tuple,
:param direct_relation: If True, the relation between the function and the parameter is direct, that is,
increasing the parameter will increase the function value. If False, the relation is
inverse, that is, increasing the parameter will decrease the function value.
:param results: A dictionary that stores the function values for each parameter that has been evaluated.
:return: The best value of the parameter that meets the exit conditions.
"""
if results is None:
results = {}
# Get start and end from parameter range
start, end = parameter_range
......@@ -184,24 +192,23 @@ def continuous_bisection_method(parameter_range: tuple,
# TODO: use logging and a debug mode to print this kind of thing
logging.debug("start=%.2e,end=%.2e value_at_middle=%f", start, end, float(value_at_middle))
# Save result
results[middle] = value_at_middle

# In case an exit condition is reached (accuracy threshold fulfilled, maximum depth reached,
# or a repeated value with no retries left), return the best of the evaluated parameters.
if 0.0 <= value_at_middle < threshold or depth >= max_depth or \
        (value_at_middle == last_value and retry_repeated == 0):
    positive_results = {k: v for k, v in results.items() if v > 0}
    if positive_results:
        return min(positive_results, key=positive_results.get)
    else:
        return middle

# If the value is the same as in the last try, we can retry a few times
if value_at_middle == last_value:
    retry_repeated -= 1

# Otherwise, set a new parameter range and call the function again
if comparison(value_at_middle, direct_relation=direct_relation):
new_start, new_end = start, middle
......@@ -216,6 +223,7 @@ def continuous_bisection_method(parameter_range: tuple,
retry_repeated=retry_repeated,
threshold=threshold,
direct_relation=direct_relation,
results=results
)
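# Stand-alone sketch (under assumptions, not the project function) of the
# result-caching behaviour introduced above: every evaluation is recorded in
# `results`, and on exit the parameter whose positive value is closest to
# zero (the tightest one that still fulfils the thresholds) is returned.
# A decreasing function (inverse relation) is assumed for the toy run below.
def bisect_with_cache(fun, start, end, depth=10, results=None):
    if results is None:
        results = {}
    middle = (start + end) / 2
    results[middle] = fun(middle)
    if depth == 0:
        positive = {k: v for k, v in results.items() if v > 0}
        return min(positive, key=positive.get) if positive else middle
    if results[middle] > 0:
        # thresholds fulfilled: push the parameter further to tighten it
        return bisect_with_cache(fun, middle, end, depth - 1, results)
    return bisect_with_cache(fun, start, middle, depth - 1, results)

print(bisect_with_cache(lambda x: 4.0 - x, 0.0, 10.0))  # converges to 4.0 from below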
......@@ -228,6 +236,7 @@ def discrete_bisection_method(parameters_list: list,
retry_repeated=5,
threshold=0.1,
direct_relation=True,
results=None,
):
"""
Apply the bisection method on a set of discrete parameters.
......@@ -243,11 +252,16 @@ def discrete_bisection_method(parameters_list: list,
:param threshold: The threshold for the method exit condition.
:param direct_relation: Boolean indicating whether the relation between the parameter and the function value is
direct or inverse.
:param results: A dictionary that stores the function values for each parameter that has been evaluated.
:return: The parameter value that satisfies the constrain function.
:raises: Exception if the maximum depth is reached.
"""
if results is None:
results = {}
middle_index = len(parameters_list) // 2
middle = parameters_list[middle_index]
......@@ -260,13 +274,20 @@ def discrete_bisection_method(parameters_list: list,
middle,
float(value_at_middle))
results[middle] = value_at_middle
# If the value at the middle is positive (all thresholds are fulfilled) we can return the parameter at the middle,
# otherwise select the safer one.
parameter_to_return = middle if value_at_middle > 0.0 else parameters_list[-1]

results[middle] = value_at_middle

# In case an exit condition is reached (accuracy threshold fulfilled, maximum depth reached,
# or a repeated value with no retries left), return the best of the evaluated parameters.
if 0.0 <= value_at_middle < threshold or depth >= max_depth or \
        (value_at_middle == last_value and retry_repeated == 0):
    positive_results = {k: v for k, v in results.items() if v > 0}
    if positive_results:
        return min(positive_results, key=positive_results.get)
    else:
        return middle

# If the value is the same as in the last try, we can retry a few times
if value_at_middle == last_value:
    if retry_repeated == 0:
        return parameter_to_return
    retry_repeated -= 1

# Otherwise, set a new parameter range and call the function again
if comparison(value_at_middle, direct_relation=direct_relation):
......@@ -293,4 +311,5 @@ def discrete_bisection_method(parameters_list: list,
retry_repeated=retry_repeated,
threshold=threshold,
direct_relation=direct_relation,
results=results,
)
......@@ -86,6 +86,8 @@ class FilterEmulator(Emulator):
encoding = dict(self.compression)
if "chunksizes" in encoding:
encoding["chunks"] = encoding.pop("chunksizes")
else:
    # Fall back to a single chunk covering the whole array when no chunk sizes were specified.
    encoding["chunks"] = uncompressed_data.shape
# Initialize file object
with io.BytesIO() as bio:
......
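# Rough sketch of the emulation pattern used above (assumptions: gzip stands
# in for the SZ/ZFP HDF5 filters, and chunks=data.shape mirrors the
# single-chunk fallback): write the array to an in-memory HDF5 file, then
# compare the compressed size with the raw size.
import io
import h5py
import numpy as np

data = np.random.rand(64, 64).astype("float32")
with io.BytesIO() as bio:
    with h5py.File(bio, "w") as hdf5_file:
        hdf5_file.create_dataset("data", data=data, chunks=data.shape, compression="gzip")
    compressed_size = bio.getbuffer().nbytes
print(f"compression ratio ~ {data.nbytes / compressed_size:.2f}")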
from enstools.core.errors import EnstoolsError
class ConditionsNotFulfilledError(EnstoolsError):
...
\ No newline at end of file
import xarray
from enstools.compression.analyzer.analyzer import analyze_dataset
import enstools.compression.xr_accessor # noqa
dataset_names = [
"air_temperature",
# "air_temperature_gradient",
# "basin_mask",
# "rasm",
# "ROMS_example",
# "tiny",
# "era5-2mt-2019-03-uk.grib",
# "eraint_uvz",
# "ersstv5"
]
def main():
results = {}
failed_datasets = []
for dataset_name in dataset_names:
try:
with xarray.tutorial.open_dataset(dataset_name) as dataset:
encoding, metrics = analyze_dataset(dataset=dataset)
results[dataset_name] = (encoding, metrics)
dataset.to_netcdf(f"reference_{dataset_name}.nc")
dataset.to_compressed_netcdf(f"compressed_{dataset_name}.nc", compression=encoding)
except ValueError:
failed_datasets.append(dataset_name)
print(results)
print(failed_datasets)
if __name__ == "__main__":
main()
import matplotlib.pyplot as plt
import streamlit as st
import enstools.compression.xr_accessor # noqa
from enstools.encoding.errors import InvalidCompressionSpecification
default_parameters = {
"sz": {
"abs": 1,
"rel": 0.001,
"pw_rel": 0.001,
},
"sz3": {
"abs": 1,
"rel": 0.001,
},
"zfp": {
"accuracy": 1,
"rate": 3.2,
"precision": 14,
}
}
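# The playground assembles enstools compression specification strings of the
# form "lossy,<compressor>,<mode>,<parameter>" from these defaults, e.g.:
compressor, mode = "sz", "abs"
spec = f"lossy,{compressor},{mode},{default_parameters[compressor][mode]}"  # -> "lossy,sz,abs,1"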
def advanced_section(data, slice_selection):
if data.dataset is not None:
# st.markdown("# Compression")
with st.expander("Compression Specifications"):
specification_mode = st.radio(label="", options=["Options", "String"], horizontal=True)
# specification, options = st.tabs(["String", "Options"])
if specification_mode == "String":
compression_specification = st.text_input(label="Compression", value="lossy,sz,abs,1")
elif specification_mode == "Options":
col1_, col2_, col3_ = st.columns(3)
with col1_:
compressor = st.selectbox(label="Compressor", options=["sz", "sz3", "zfp"])
if compressor == "sz":
mode_options = ["abs", "rel", "pw_rel"]
elif compressor == "sz3":
mode_options = ["abs", "rel"]
elif compressor == "zfp":
mode_options = ["accuracy", "rate", "precision"]
else:
mode_options = []
with col2_:
mode = st.selectbox(label="Mode", options=mode_options)
with col3_:
parameter = st.text_input(label="Parameter", value=default_parameters[compressor][mode])
compression_specification = f"lossy,{compressor},{mode},{parameter}"
st.markdown(f"**Compression Specification:** {compression_specification}")
if compression_specification:
try:
data.compress(compression_specification)
except InvalidCompressionSpecification:
st.warning("Invalid compression specification!")
st.markdown(
"Check [the compression specification format](https://enstools-encoding.readthedocs.io/en/latest/CompressionSpecificationFormat.html)")
if data.compressed_da is not None:
st.markdown(f"**Compression Ratio**: {data.compressed_da.attrs['compression_ratio']}")
\ No newline at end of file
import streamlit as st
import enstools.compression.xr_accessor # noqa
def analysis_section(data, slice_selection):
if data.dataset is not None:
# st.markdown("# Compression")
col1, col2 = st.columns(2)
with col1:
constrains = st.text_input(label="Constraint", value="correlation_I:5,ssim_I:3")
options = {
"sz": ["abs", "rel", "pw_rel"],
"sz3": ["abs", "rel"],
"zfp": ["accuracy", "rate", "precision"],
}
all_options = [f"{compressor}-{mode}" for compressor in options for mode in options[compressor]]
with col2:
cases = st.multiselect(label="Compressor and mode", options=all_options)
if data.reference_da is None:
return
if not cases:
return
st.markdown("# Results:")
n_cols = 4
cols = st.columns(n_cols)
all_results = {}
for idx, case in enumerate(cases):
with cols[idx % n_cols]:
compressor, mode = case.split("-")
encoding, metrics = data.reference_da.compression.analyze(
constrains=constrains,
compressor=compressor,
compression_mode=mode
)
parameter = encoding.split(",")[-1]
compression_ratio = metrics["compression_ratio"]
all_results[case] = compression_ratio
st.markdown(f"## {compressor},{mode}:\n\n"
f"**Compression Ratio:** {compression_ratio:.2f}x\n\n"
f"**Parameter:** {parameter}\n\n"
f"**Specification String:**")
st.code(encoding)
st.markdown(f"___")
# st.markdown(encoding)
# st.markdown(metrics)
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import xarray
import enstools.compression.xr_accessor # noqa
from .data_source import DataContainer
def get_compression_ratio(data_array: xarray.DataArray, relative_tolerance: float, mode: str) -> float:
    compressed = data_array.compression(f"lossy,sz,{mode},{relative_tolerance}", in_place=False)
    return float(compressed.attrs["compression_ratio"])
def invert_function(function):
# Define its derivative
f_prime = function.deriv()
# Define the function for which we want to find the root
def func(x, y_val):
return function(x) - y_val
def newtons_method(y_val, epsilon=1e-7, max_iterations=100):
x = -2 # np.log10(0.01)
print(f"{y_val=}")
for _ in range(max_iterations):
x_new = x - func(x, y_val) / f_prime(x)
if abs(x - x_new) < epsilon:
return x_new
x = x_new
print(x_new)
return None
return newtons_method
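# Worked example of the inversion above with a simple, assumed polynomial:
# f(x) = x + 2, so the returned Newton iteration should recover x = y - 2.
f = np.poly1d([1.0, 2.0])
f_inverse = invert_function(f)
assert abs(f_inverse(3.0) - 1.0) < 1e-6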
def create_parameter_from_compression_ratio(data: DataContainer, mode: str):
train_x = np.logspace(-12, -.5, 15)
train_y = [get_compression_ratio(data.reference_da, parameter, mode=mode) for parameter in train_x]
parameter_range = min(train_y), min(100., max(train_y))
x_log = np.log10(train_x)
y_log = np.log10(train_y)
coeff = np.polyfit(x_log, y_log, 10)
# Create a polynomial function from the coefficients
f = np.poly1d(coeff)
f_inverse = invert_function(f)
def function_to_return(compression_ratio: float) -> float:
return 10 ** f_inverse(np.log10(compression_ratio))
return parameter_range, function_to_return
def basic_section(data: DataContainer, slice_selection):
mode = st.selectbox(label="Mode", options=["rel", "pw_rel"])
parameter_range, get_parameter = create_parameter_from_compression_ratio(data, mode=mode)
_min, _max = parameter_range
options = [_min + (_max - _min) * _x for _x in np.logspace(-2, 0)]
options = [f"{op:.2f}" for op in options]
compression_ratio = st.select_slider(label="Compression Ratio", options=options)
compression_ratio = float(compression_ratio)
parameter = get_parameter(compression_ratio)
with st.spinner():
data.compress(f"lossy,sz,rel,{parameter}")
import io
from typing import Optional
import pandas as pd
import streamlit as st
import xarray as xr
class DataContainer:
def __init__(self, dataset: Optional[xr.Dataset] = None):
self.dataset = dataset
self.reference_da = None
self.compressed_da = None
def set_dataset(self, dataset):
self.dataset = dataset
self.reference_da = None
self.compressed_da = None
def select_variable(self, variable):
self.reference_da = self.dataset[variable]
@classmethod
def from_tutorial_data(cls, dataset_name: str = "air_temperature"):
return cls(dataset=xr.tutorial.open_dataset(dataset_name))
@property
def time_steps(self):
if self.reference_da is not None:
if "time" in self.reference_da.dims:
try:
return pd.to_datetime(self.reference_da.time.values)
except TypeError:
return self.reference_da.time.values
print(self.reference_da)
def compress(self, compression):
self.compressed_da = self.reference_da.compression(compression)
def __hash__(self):
return hash(self.reference_da.name)
@st.cache_resource
def create_data():
return DataContainer.from_tutorial_data()
def select_dataset(data):
st.title("Select Dataset")
data_source = st.radio(label="Data source", options=["Tutorial Dataset", "Custom Dataset"])
col1, col2 = st.columns(2)
if data_source == "Tutorial Dataset":
tutorial_dataset_options = [
"air_temperature",
"air_temperature_gradient",
# "basin_mask", # Different coordinates
# "rasm", # Has nan
"ROMS_example",
"tiny",
# "era5-2mt-2019-03-uk.grib",
"eraint_uvz",
"ersstv5"
]
with col1:
dataset_name = st.selectbox(label="Dataset", options=tutorial_dataset_options)
dataset = xr.tutorial.open_dataset(dataset_name)
data.set_dataset(dataset)
elif data_source == "Custom Dataset":
my_file = st.file_uploader(label="Your file")
data.set_dataset(None)
if my_file:
my_virtual_file = io.BytesIO(my_file.read())
my_dataset = xr.open_dataset(my_virtual_file)
st.text("Custom dataset loaded!")
data.set_dataset(my_dataset)
if data.dataset is not None:
with col2:
variable = st.selectbox(label="Variable", options=data.dataset.data_vars)
if variable:
data.select_variable(variable)
def select_slice(data):
st.title("Select Slice")
slice_selection = {}
if data.reference_da is not None and data.reference_da.dims:
tabs = st.tabs(tabs=data.reference_da.dims)
for idx, dimension in enumerate(data.reference_da.dims):
with tabs[idx]:
if str(dimension) == "time":
if len(data.reference_da.time) > 1:
slice_selection[dimension] = st.select_slider(label=dimension,
options=data.reference_da[dimension].values,
)
else:
slice_selection[dimension] = data.reference_da.time.values[0]
else:
_min = float(data.reference_da[dimension].values[0])
_max = float(data.reference_da[dimension].values[-1])
if _max - _min < 1000:
slice_selection[dimension] = st.slider(label=dimension,
min_value=_min,
max_value=_max,
value=(_min, _max),
)
# if st.button("Clear Cache"):
# st.cache_resource.clear()
return slice_selection
import streamlit as st
import matplotlib.pyplot as plt
def plot_comparison(data, slice_selection):
col1, col2, col3 = st.columns(3)
new_slice = {}
for key, values in slice_selection.items():
if isinstance(values, tuple):
start, stop = values
if start != stop:
new_slice[key] = slice(start, stop)
else:
new_slice[key] = start
else:
new_slice[key] = values
slice_selection = new_slice
if data.reference_da is not None:
slice_selection = {key: (value if key != "lat" else slice(value.stop, value.start)) for key, value in
slice_selection.items()}
only_slices = {key: value for key, value in slice_selection.items() if isinstance(value, slice)}
non_slices = {key: value for key, value in slice_selection.items() if not isinstance(value, slice)}
if only_slices:
reference_slice = data.reference_da.sel(**only_slices)
else:
reference_slice = data.reference_da
if non_slices:
reference_slice = reference_slice.sel(**non_slices, method="nearest")
try:
plt.figure()
reference_slice.plot()
fig1 = plt.gcf()
with col1:
st.pyplot(fig1)
except TypeError:
pass
if data.compressed_da is not None:
plt.figure()
if only_slices:
compressed_slice = data.compressed_da.sel(**only_slices)
else:
compressed_slice = data.compressed_da
if non_slices:
compressed_slice = compressed_slice.sel(**non_slices, method="nearest")
try:
compressed_slice.plot()
fig2 = plt.gcf()
with col2:
st.pyplot(fig2)
except TypeError:
pass
diff = data.compressed_da - data.reference_da
plt.figure()
if only_slices:
diff_slice = diff.sel(**only_slices)
else:
diff_slice = diff
if non_slices:
diff_slice = diff_slice.sel(**non_slices, method="nearest")
try:
diff_slice.plot()
fig3 = plt.gcf()
with col3:
st.pyplot(fig3)
except TypeError:
pass
else:
st.text("Compress the data to show the plot!")
import streamlit as st
from component.data_source import create_data, select_dataset, select_slice
from component.basic_section import basic_section
from component.advanced_section import advanced_section
from component.analysis_section import analysis_section
from component.plotter import plot_comparison
st.set_page_config(layout="wide", initial_sidebar_state="collapsed")
data = create_data()
def setup_main_frame():
st.title("Welcome to the :green[enstools-compression] playground!")
st.markdown("Find more information our [**GitHub repository**](https://github.com/wavestoweather/enstools-compression)"
" or in our [**documentation**](https://enstools-compression.readthedocs.io)")
with st.sidebar:
select_dataset(data)
slice_selection = select_slice(data)
st.markdown("---")
if data.reference_da is None:
st.markdown("## :point_left: Please :red[select a dataset] using the left sidebar!")
return
options = ["Compression", "Advanced Compression", "Analysis"]
basic, advanced, analysis = st.tabs(options)
with basic:
basic_section(data=data, slice_selection=slice_selection)
with st.spinner():
try:
plot_comparison(data=data, slice_selection=slice_selection)
except TypeError as err:
st.warning(err)
with advanced:
advanced_section(data=data, slice_selection=slice_selection)
with st.spinner():
try:
plot_comparison(data=data, slice_selection=slice_selection)
except TypeError as err:
st.warning(err)
with analysis:
analysis_section(data=data, slice_selection=slice_selection)
setup_main_frame()
# Install additional dependencies to run the examples.
pip install -e .[examples]
\ No newline at end of file
......@@ -2,52 +2,35 @@
set -e
function usage {
    echo "arguments:"
    echo "-w  warnings are errors"
    exit 1
}

# parse the command line
excluded_files=""
extra_arguments=""
while getopts "w" opt; do
    case $opt in
        w)
            echo "WARNING: warnings are treated like errors for debugging."
            extra_arguments="-W error"
            ;;
        *)
            usage
            ;;
    esac
done

if [[ ! -z $excluded_files ]]; then
    ignore_option="--ignore=$excluded_files"
fi

# create a virtual environment and install all dependencies
if [[ ! -d venv ]]; then
    python3 -m venv --prompt enstools-compression venv
    source venv/bin/activate
    pip install -U pip
    pip install -e .[test]
fi
source venv/bin/activate
......
......@@ -40,11 +40,20 @@ setup(name="enstools-compression",
install_requires=[
"enstools>=2023.1",
"enstools-encoding>=2023.1",
"enstools-encoding>=2023.6",
"zfpy",
"hdf5plugin>=4.0.0",
"hdf5plugin>=4.1.3",
"netCDF4",
],
extras_require={
'examples': ['pooch'],
'test': [
"pytest",
"pytest-mock",
"pooch",
],
},
entry_points={
'console_scripts': [
'enstools-compression=enstools.compression.cli:main'
......
......@@ -85,6 +85,19 @@ class TestAnalyzer(TestClass):
compressor="zfp",
mode="rate",
)
def test_rmse(self):
from enstools.compression.api import analyze_files
input_tempdir = self.input_directory_path
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for ds in datasets:
input_path = input_tempdir / ds
analyze_files(file_paths=[input_path],
constrains="normalized_root_mean_square_error:1e-5",
# Keep the analysis to a single compressor and mode to speed up tests
compressor="zfp",
mode="rate",
)
def test_wrong_constrain(self):
from enstools.compression.api import analyze_files
......@@ -115,7 +128,7 @@ class TestAnalyzer(TestClass):
# Check that the analysis using a custom metric defined with a plugin works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
constrains = f"{custom_metric_name}:3"
constrains = f"{custom_metric_name}:1e-5"
for ds in datasets:
input_path = input_tempdir / ds
......@@ -154,7 +167,7 @@ class TestAnalyzer(TestClass):
# Check that the analysis using a custom metric defined with a plugin works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
constrains = f"{custom_metric_name}:3"
constrains = f"{custom_metric_name}:1e-5"
for ds in datasets:
input_path = input_tempdir / ds
......
......@@ -102,7 +102,7 @@ class TestCommandLineInterface(TestClass):
with open(plugin_path, "w") as f:
f.write(function_code)
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:4", "--plugins",
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:1e-5", "--plugins",
str(plugin_path), "-c", "zfp"]
mocker.patch("sys.argv", commands)
enstools.compression.cli.main()
......@@ -131,7 +131,7 @@ class TestCommandLineInterface(TestClass):
with open(plugin_path, "w") as f:
f.write(function_code)
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:4", "--plugins", str(plugin_path),
commands = ["_", "analyze", str(file_path), "--constrains", f"{plugin_name}:1e-5", "--plugins", str(plugin_path),
"-c", "sz"]
mocker.patch("sys.argv", commands)
enstools.compression.cli.main()
......
import numpy as np
from utils import wrapper, TestClass
folders = None
......@@ -13,13 +15,93 @@ class TestSZ(TestClass):
analyze_files(file_paths=input_path, compressor="sz")
def test_compress_sz_pw_rel(self):
compression = "lossy,sz,pw_rel,0.1"
compression = "lossy,sz,pw_rel,0.001"
wrapper(self, compression=compression)
def test_consistency_sz_pw_rel(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.001
compression = f"lossy,sz,pw_rel,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
# Point-wise relative error check: |compressed - original| <= tolerance * |original|
diff = compressed_da - data_array
assert (np.abs(diff.values) <= np.abs(data_array.values) * tolerance).all()
def test_compress_sz_abs(self):
compression = "lossy,sz,abs,0.01"
wrapper(self, compression=compression)
def test_consistency_sz_abs(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.01
compression = f"lossy,sz,abs,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
diff = compressed_da - data_array
assert (np.abs(diff.values) < tolerance).all()
def test_compress_sz_rel(self):
compression = "lossy,sz,rel,0.001"
wrapper(self, compression=compression)
def test_consistency_sz_rel(self):
import enstools.compression.api
from enstools.encoding.api import VariableEncoding
import enstools.io
tolerance = 0.01
compression = f"lossy,sz,rel,{tolerance}"
# Check that the compression without specifying compression parameters works
datasets = ["dataset_%iD.nc" % dimension for dimension in range(1, 4)]
for dataset_name in datasets:
input_path = self.input_directory_path / dataset_name
# Check that the output file can be loaded
with enstools.io.read(input_path) as ds:
for var in ds.data_vars:
data_array = ds[var]
encoding = VariableEncoding(specification=compression)
compressed_da, _ = enstools.compression.api.emulate_compression_on_data_array(
data_array=data_array,
compression_specification=encoding,
in_place=False,
)
abs_tolerance = float(data_array.max() - data_array.min()) * tolerance
diff = compressed_da - data_array
assert (np.abs(diff.values) < abs_tolerance).all()