diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c4d80d94d1308bfad3f44ff85a3625908c826817..659904c956ad255cd0dc6b6924840e6318ceaea3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,23 +3,20 @@ stages: - deploy_test - test_install - deploy_prod + - prod_install test_docker: stage: test - image: ubuntu:rolling - tags: + image: python:3.10 + tags: - docker.meteo.physik.lmu.de - before_script: - - apt update - - export DEBIAN_FRONTEND=noninteractive - - apt install -yq git python3 python3-pip python3-venv script: ./run_tests.sh rules: - if: '$CI_COMMIT_TAG == null' deploy-to-testpypi: stage: deploy_test - image: python:3.8 + image: python:3.10 tags: - docker.meteo.physik.lmu.de only: @@ -31,22 +28,20 @@ deploy-to-testpypi: install-from-testpypi: stage: test_install - image: python:3.8 + image: python:3.10 tags: - docker.meteo.physik.lmu.de only: - tags needs: ["deploy-to-testpypi"] script: - - pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ enstools-encoding - artifacts: - when: on_failure - paths: - - "*.log" + - pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ enstools-encoding --break-system-packages + - python3 -c "from enstools.encoding import api" + deploy-to-pypi: stage: deploy_prod - image: python:3.8 + image: python:3.10 only: - tags tags: @@ -55,4 +50,16 @@ deploy-to-pypi: script: - pip install twine - python setup.py sdist bdist_wheel - - twine upload -u "__token__" -p "$PYPI_PASSWORD" --skip-existing dist/* \ No newline at end of file + - twine upload -u "__token__" -p "$PYPI_PASSWORD" --skip-existing dist/* + +install-from-pypi: + stage: prod_install + image: python:3.10 + tags: + - docker.meteo.physik.lmu.de + only: + - tags + needs: ["deploy-to-pypi"] + script: + - pip install enstools-encoding --break-system-packages + - python3 -c "from enstools.encoding import api" \ No newline at end of file diff --git a/VERSION b/VERSION index 
16df2afe305549a19b9cc3123aea9c2eff151ddb..18c2e0fe3573b0689cf5bbb6a261fa00176f7230 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2023.4.1 \ No newline at end of file +2023.5 \ No newline at end of file diff --git a/enstools/encoding/api.py b/enstools/encoding/api.py index 907f685b8114d1fa508e55143821afaf6d32b642..603012dc5176b6864f88d34cb72f6b8e8bd7b390 100644 --- a/enstools/encoding/api.py +++ b/enstools/encoding/api.py @@ -1,3 +1,10 @@ +""" +Application Programming Interface + +Access point to have access to all utilities defined in enstools-encoding +""" + +# pylint: disable= unused-import from .definitions import lossy_compressors, lossy_compression_modes, lossy_compressors_and_modes, lossless_backends from .variable_encoding import VariableEncoding, LossyEncoding, LosslessEncoding, NullEncoding, Encoding from .dataset_encoding import DatasetEncoding diff --git a/enstools/encoding/dataset_encoding.py b/enstools/encoding/dataset_encoding.py index 2485c1b80fe0e99d2ab655bccd86bbc99df6d36f..769b0b5e7570097ed15d012f2431f6f2db16e678 100644 --- a/enstools/encoding/dataset_encoding.py +++ b/enstools/encoding/dataset_encoding.py @@ -1,20 +1,54 @@ +""" +This module provides utility functions and a class for handling compression specifications +and chunking in xarray Datasets. + +Functions: +- convert_size(size_bytes): Converts a given size in bytes to a human-readable format. +- convert_to_bytes(size_string): Converts a size string (e.g., '5MB') to the number of bytes. +- compression_dictionary_to_string(compression_dictionary): Converts a dictionary with + compression entries to a single-line specification string. +- parse_full_specification(spec): Parses a full compression specification and returns a + dictionary of variable encodings. +- is_a_valid_dataset_compression_specification(specification): Checks if a compression + specification is valid for a dataset. 
+- find_chunk_sizes(data_array, chunk_size): Determines chunk sizes for each dimension of a + data array based on a desired chunk size. + +Class: +- DatasetEncoding: Encapsulates compression specification parameters for a full dataset. + Provides methods to generate encodings and add metadata. + +""" + import os +import re from copy import deepcopy from pathlib import Path from typing import Hashable, Union, Dict +import math +import numpy as np import xarray import yaml -import numpy as np from . import rules from .errors import InvalidCompressionSpecification from .variable_encoding import _Mapping, parse_variable_specification, Encoding, \ NullEncoding +from .rules import VARIABLE_SEPARATOR, VARIABLE_NAME_SEPARATOR, \ + DATA_DEFAULT_LABEL, DATA_DEFAULT_VALUE, COORD_LABEL, COORD_DEFAULT_VALUE def convert_size(size_bytes): - import math + """ + Converts a given size in bytes to a human-readable format. + + Args: + size_bytes (int): Size in bytes. + + Returns: + str: Size in human-readable format (e.g., '5MB'). + """ if size_bytes < 0: prefix = "-" size_bytes = -size_bytes @@ -23,11 +57,12 @@ def convert_size(size_bytes): if size_bytes == 0: return "0B" - size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - s = round(size_bytes / p, 2) - return f"{prefix}{s}{size_name[i]}" + + size_units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + magnitude = int(math.floor(math.log(size_bytes, 1024))) + factor = math.pow(1024, magnitude) + size = round(size_bytes / factor, 2) + return f"{prefix}{size}{size_units[magnitude]}" def convert_to_bytes(size_string): @@ -40,7 +75,7 @@ def convert_to_bytes(size_string): Returns: int: The number of bytes. 
""" - import re + size_string = size_string.upper() digits = re.match(r'\d+(?:\.\d+)?', size_string) # matches digits and optionally a dot followed by more digits if digits: @@ -65,23 +100,35 @@ def compression_dictionary_to_string(compression_dictionary: Dict[str, str]) -> def parse_full_specification(spec: Union[str, None]) -> Dict[str, Encoding]: - from enstools.encoding.rules import VARIABLE_SEPARATOR, VARIABLE_NAME_SEPARATOR, \ - DATA_DEFAULT_LABEL, DATA_DEFAULT_VALUE, COORD_LABEL, COORD_DEFAULT_VALUE + """ + Parses a full compression specification and returns a dictionary of variable encodings. + + Args: + spec (Union[str, None]): The full compression specification as a string or None. + + Returns: + Dict[str, Encoding]: A dictionary mapping variable names to their corresponding encodings. + + Raises: + InvalidCompressionSpecification: If a variable has multiple definitions in the specification. + + """ + result = {} if spec is None: spec = "None" parts = spec.split(VARIABLE_SEPARATOR) - for p in parts: + for part in parts: # For each part, check if there's a variable name. # If there's a variable name, split the name and the specification - if VARIABLE_NAME_SEPARATOR in p: - var_name, var_spec = p.split(VARIABLE_NAME_SEPARATOR) + if VARIABLE_NAME_SEPARATOR in part: + var_name, var_spec = part.split(VARIABLE_NAME_SEPARATOR) # Otherwise, it corresponds to the default. else: var_name = DATA_DEFAULT_LABEL - var_spec = p + var_spec = part # If the variable name was already in the dictionary, raise an error. if var_name in result: @@ -125,6 +172,20 @@ class DatasetEncoding(_Mapping): @staticmethod def get_a_single_compression_string(compression: Union[str, Dict[str, str], Path, None]) -> Union[str, None]: + """ + Converts the compression parameter into a single compression specification string. + + Args: + compression (Union[str, Dict[str, str], Path, None]): The compression parameter, + which can be a string, a dictionary, a file path, or None. 
+ + Returns: + Union[str, None]: The single compression specification string or None. + + Raises: + InvalidCompressionSpecification: If the compression argument is not a valid type. + + """ # The compression parameter can be a string or a dictionary. # In case it is a string, it can be directly a compression specification or a yaml file. @@ -136,21 +197,28 @@ class DatasetEncoding(_Mapping): # Just to make sure that we have all the mandatory fields (default, coordinates), we will convert # the input dictionary to a single specification string and convert it back. return compression_dictionary_to_string(compression) - elif isinstance(compression, Path): - with compression.open("r") as stream: + if isinstance(compression, Path): + with compression.open("r", encoding="utf-8") as stream: dict_of_strings = yaml.safe_load(stream) return compression_dictionary_to_string(dict_of_strings) - elif isinstance(compression, str): + if isinstance(compression, str): # Convert the single string in a dictionary with an entry for each specified variable plus the defaults # for data and coordinates return compression - elif compression is None: + if compression is None: return None - else: - raise InvalidCompressionSpecification( + + raise InvalidCompressionSpecification( f"The argument 'compression' should be a string, a dictionary or a Path. It is {type(compression)!r}-") def encoding(self): + """ + Generate the encoding dictionary for all variables in the dataset. + + Returns: + dict: A dictionary mapping variable names to their corresponding encodings. 
+ + """ # Get the defaults data_default = self.variable_encodings[rules.DATA_DEFAULT_LABEL] coordinates_default = self.variable_encodings[rules.COORD_LABEL] @@ -185,12 +253,12 @@ class DatasetEncoding(_Mapping): # Loop over all the variables for variable in self.dataset.data_vars: - da = self.dataset[variable] - type_size = da.dtype.itemsize + data_array = self.dataset[variable] + type_size = data_array.dtype.itemsize optimal_chunk_size = chunk_memory_size / type_size - chunk_sizes = find_chunk_sizes(data_array=da, chunk_size=optimal_chunk_size) - chunk_sizes = tuple(chunk_sizes[d] for d in da.dims) + chunk_sizes = find_chunk_sizes(data_array=data_array, chunk_size=optimal_chunk_size) + chunk_sizes = tuple(chunk_sizes[d] for d in data_array.dims) encodings[variable].set_chunk_sizes(chunk_sizes) @property @@ -209,6 +277,21 @@ class DatasetEncoding(_Mapping): def is_a_valid_dataset_compression_specification(specification): + """ + Checks if a compression specification is valid for a dataset. + + Args: + specification: The compression specification to be validated. + + Returns: + bool: True if the specification is valid, False otherwise. + + Note: + - The function attempts to parse the specification using the `parse_full_specification` function. + - If the specification is successfully parsed without raising an exception, it is considered valid. + - If an `InvalidCompressionSpecification` exception is raised during parsing, + the specification is considered invalid. + """ try: _ = parse_full_specification(specification) return True @@ -217,7 +300,16 @@ def is_a_valid_dataset_compression_specification(specification): def find_chunk_sizes(data_array, chunk_size): - import math + """ + Determines the chunk sizes for each dimension of a data array based on a desired chunk size. + + Args: + data_array: The data array for which chunk sizes are determined. + chunk_size: The desired chunk size in terms of the number of elements. 
+ + Returns: + dict: A dictionary mapping each dimension to its corresponding chunk size. + """ total_points = np.prod(data_array.shape) num_chunks = max(1, int(total_points // chunk_size)) chunk_sizes = {} diff --git a/enstools/encoding/definitions.py b/enstools/encoding/definitions.py index e7f4b19caa83401b014b18108d5cc583784afd9c..f9af5d34d7ed3b68d459eba6b746133d9aab31c5 100644 --- a/enstools/encoding/definitions.py +++ b/enstools/encoding/definitions.py @@ -1,6 +1,9 @@ +""" +This module provides configurations and mappings related to lossy and lossless compressors. +""" +import hdf5plugin  # Dictionary of implemented lossy compressors and their respective modes and ranges. -# TODO: Would that be better to keep the definition of the available compressors and methods in a configuration file? lossy_compressors_and_modes = { "sz": { "abs": {"range": [0, float('inf')], "type": float}, @@ -21,11 +24,10 @@ lossy_compressors_and_modes = { } # Create a list with the lossy compressors -lossy_compressors = [c for c in lossy_compressors_and_modes] +lossy_compressors = list(lossy_compressors_and_modes) # Create a dictionary containing the available compression modes for each lossy compressor -lossy_compression_modes = {c: [k for k in lossy_compressors_and_modes[c]] for c in lossy_compressors} - +lossy_compression_modes = {c: list(lossy_compressors_and_modes[c]) for c in lossy_compressors} # List of available BLOSC backends for lossless compression lossless_backends = ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] @@ -40,7 +42,7 @@ sz_mode_map = { } # Mapping between compressor names and hdf5plugin classes -import hdf5plugin + compressor_map = { "zfp": hdf5plugin.Zfp, "sz": hdf5plugin.SZ, diff --git a/enstools/encoding/errors.py b/enstools/encoding/errors.py index d2ac549a7353e90ec5a54d36c6b97088ea7c240f..ed0acb79bb93c667614d642510662698ecc2eff7 100644 --- a/enstools/encoding/errors.py +++ b/enstools/encoding/errors.py @@ -1,6 +1,16 @@ +""" +This module provides 
exceptions related to compression in Enstools. + +""" + + class EnstoolsCompressionError(Exception): - ... + """ + Base exception class for Enstools compression-related errors. + """ class InvalidCompressionSpecification(EnstoolsCompressionError): - ... + """ + Exception raised for an invalid compression specification. + """ diff --git a/enstools/encoding/variable_encoding.py b/enstools/encoding/variable_encoding.py index 9691946ba61637432764fe29a062d1cde18c02c8..17737f579f26487bab018118ceb78c0a0ac02220 100644 --- a/enstools/encoding/variable_encoding.py +++ b/enstools/encoding/variable_encoding.py @@ -1,14 +1,21 @@ +""" +This module provides utility classes and functions for compression in Enstools. -from .definitions import lossy_compressors_and_modes -import logging +This module defines various encodings and exceptions for compression, as well as utility functions. + +""" + + +import logging from typing import Mapping, Union import hdf5plugin from enstools.encoding import rules, definitions from enstools.encoding.errors import InvalidCompressionSpecification -from .rules import LOSSLESS_DEFAULT_BACKEND, LOSSLESS_DEFAULT_COMPRESSION_LEVEL +from .definitions import lossy_compressors_and_modes +from .rules import LOSSLESS_DEFAULT_BACKEND, LOSSLESS_DEFAULT_COMPRESSION_LEVEL, COMPRESSION_SPECIFICATION_SEPARATOR # Change logging level for the hdf5plugin to avoid unnecessary warnings loggers = {name: logging.getLogger(name) for name in logging.root.manager.loggerDict} @@ -40,19 +47,51 @@ class _Mapping(Mapping): class Encoding(_Mapping): + """ + Base case for encoding representation. + """ def check_validity(self) -> bool: - ... + """ + Checks the validity of the encoding. + + Returns: + - bool: True if the encoding is valid. + + """ def to_string(self) -> str: - ... + """ + Returns the encoding specification as a string. + + Returns: + - str: The encoding specification as a string. + """ def encoding(self) -> Mapping: - ... 
+ """ + Returns the mapping of encoding parameters. + Returns: + - Mapping: The mapping of encoding parameters. + + """ def description(self) -> str: - ... + """ + Returns a description of the encoding. + + Returns: + - str: A description of the encoding. + + """ def __repr__(self): + """ + Returns a string representation of the Encoding object. + + Returns: + - str: A string representation of the Encoding object. + + """ return f"{self.__class__.__name__}({self.to_string()})" def set_chunk_sizes(self, chunk_sizes: tuple) -> None: @@ -156,10 +195,11 @@ class LosslessEncoding(Encoding): def check_validity(self) -> bool: if self.backend not in definitions.lossless_backends: raise InvalidCompressionSpecification(f"Backend {self.backend!r} is not a valid backend.") - elif not (1 <= self.compression_level <= 9): + + if not 1 <= self.compression_level <= 9: raise InvalidCompressionSpecification(f"Compression level {self.compression_level} must be within 1 and 9.") - else: - return True + + return True def to_string(self) -> str: return rules.COMPRESSION_SPECIFICATION_SEPARATOR.join(["lossless", self.backend, str(self.compression_level)]) @@ -176,6 +216,9 @@ class LosslessEncoding(Encoding): class LossyEncoding(Encoding): + """ + Encoding subclass for lossy compression. + """ def __init__(self, compressor: str, mode: str, parameter: Union[float, int]): super().__init__() self.compressor = compressor @@ -189,6 +232,16 @@ class LossyEncoding(Encoding): self._kwargs = dict(self.encoding()) def check_validity(self): + """ + Checks the validity of the compressor, mode, and parameter. + + Raises: + - InvalidCompressionSpecification: If the compressor, mode, or parameter is invalid or out of range. + + Returns: + - bool: True if the compressor, mode, and parameter are valid. 
+ + """ # Check compressor validity if self.compressor not in definitions.lossy_compressors_and_modes: raise InvalidCompressionSpecification(f"Invalid compressor {self.compressor}") @@ -203,7 +256,7 @@ class LossyEncoding(Encoding): if not isinstance(self.parameter, mode_type): try: self.parameter = mode_type(self.parameter) - except TypeError: + except TypeError as err: raise InvalidCompressionSpecification(f"Invalid parameter type {self.parameter!r}") # Check range if self.parameter <= mode_range[0] or self.parameter >= mode_range[1]: @@ -211,20 +264,47 @@ class LossyEncoding(Encoding): return True def to_string(self) -> str: + """ + Returns the encoding specification as a string. + + Returns: + - str: The encoding specification as a string. + + """ return rules.COMPRESSION_SPECIFICATION_SEPARATOR.join( ["lossy", self.compressor, self.mode, str(self.parameter)]) def encoding(self) -> Mapping: + """ + Returns the mapping of encoding parameters. + Returns: + - Mapping: The mapping of encoding parameters. + + """ mode = definitions.sz_mode_map[self.mode] if self.mode in definitions.sz_mode_map else self.mode arguments = {mode: self.parameter} return definitions.compressor_map[self.compressor](**arguments) def description(self) -> str: + """ + Returns a description of the lossy encoding. + + Returns: + - str: A description of the lossy encoding. + + """ return f"Lossy compressed using the HDF5 filters with specification: {self.to_string()} " \ f"(Using {self.compressor!r} with mode {self.mode!r} and parameter {self.parameter})" def __repr__(self): + """ + Returns a string representation of the LossyEncoding object. + + Returns: + - str: A string representation of the LossyEncoding object. 
+ + """ return f"{self.__class__.__name__}(compressor={self.compressor}, mode={self.mode}, parameter={self.parameter})" @@ -243,7 +323,6 @@ def parse_variable_specification(var_spec: str) -> Encoding: if var_spec in (None, "None", "none"): return NullEncoding() - from enstools.encoding.rules import COMPRESSION_SPECIFICATION_SEPARATOR # Split the specification in the different parts. var_spec_parts = var_spec.split(COMPRESSION_SPECIFICATION_SEPARATOR) # Treatment for lossless @@ -252,7 +331,7 @@ def parse_variable_specification(var_spec: str) -> Encoding: compression_level = int(var_spec_parts[2]) if len(var_spec_parts) > 2 else None return LosslessEncoding(backend, compression_level) # Treatment for lossy - elif var_spec_parts[0] == "lossy": + if var_spec_parts[0] == "lossy": # Lossy specifications must have 4 elements (lossy,compressor,mode,parameter) if len(var_spec_parts) != 4: raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}") @@ -273,9 +352,9 @@ def parse_variable_specification(var_spec: str) -> Encoding: except ValueError: raise InvalidCompressionSpecification(f"Could not cast {specification!r} to type {specification_type!r}") return LossyEncoding(compressor, mode, specification) - else: - # In case its not lossy nor lossless, raise an exception. - raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}") + + # In case its not lossy nor lossless, raise an exception. 
+ raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}") def get_variable_encoding( @@ -301,9 +380,9 @@ def get_variable_encoding( "Only one of the options can be used to create an Encoding" if specification: return parse_variable_specification(specification) - elif compressor: + if compressor: return LossyEncoding(compressor=compressor, mode=mode, parameter=parameter) - elif backend: + if backend: if compression_level is None: compression_level = LOSSLESS_DEFAULT_COMPRESSION_LEVEL return LosslessEncoding(backend=backend, compression_level=compression_level)