From 98d69771608fc7044ebb7a173004a3ec4dffad04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oriol=20Tint=C3=B3?= <oriol.tinto@lmu.de>
Date: Thu, 11 May 2023 16:57:59 +0200
Subject: [PATCH] Clean code following pylint comments.

---
 enstools/encoding/api.py               |   7 ++
 enstools/encoding/dataset_encoding.py  | 142 ++++++++++++++++++++-----
 enstools/encoding/definitions.py       |  12 ++-
 enstools/encoding/errors.py            |  14 ++-
 enstools/encoding/variable_encoding.py | 124 ++++++++++++++++-----
 5 files changed, 238 insertions(+), 61 deletions(-)

diff --git a/enstools/encoding/api.py b/enstools/encoding/api.py
index 907f685..603012d 100644
--- a/enstools/encoding/api.py
+++ b/enstools/encoding/api.py
@@ -1,3 +1,10 @@
+"""
+Application Programming Interface
+
+Access point to all the utilities defined in enstools-encoding.
+"""
+
+# pylint: disable= unused-import
 from .definitions import lossy_compressors, lossy_compression_modes, lossy_compressors_and_modes, lossless_backends
 from .variable_encoding import VariableEncoding, LossyEncoding, LosslessEncoding, NullEncoding, Encoding
 from .dataset_encoding import DatasetEncoding
diff --git a/enstools/encoding/dataset_encoding.py b/enstools/encoding/dataset_encoding.py
index 2485c1b..769b0b5 100644
--- a/enstools/encoding/dataset_encoding.py
+++ b/enstools/encoding/dataset_encoding.py
@@ -1,20 +1,54 @@
+"""
+This module provides utility functions and a class for handling compression specifications
+and chunking in xarray Datasets.
+
+Functions:
+- convert_size(size_bytes): Converts a given size in bytes to a human-readable format.
+- convert_to_bytes(size_string): Converts a size string (e.g., '5MB') to the number of bytes.
+- compression_dictionary_to_string(compression_dictionary): Converts a dictionary with
+  compression entries to a single-line specification string.
+- parse_full_specification(spec): Parses a full compression specification and returns a
+  dictionary of variable encodings.
+- is_a_valid_dataset_compression_specification(specification): Checks if a compression
+  specification is valid for a dataset.
+- find_chunk_sizes(data_array, chunk_size): Determines chunk sizes for each dimension of a
+  data array based on a desired chunk size.
+
+Class:
+- DatasetEncoding: Encapsulates compression specification parameters for a full dataset.
+  Provides methods to generate encodings and add metadata.
+
+"""
+
 import os
+import re
 from copy import deepcopy
 from pathlib import Path
 from typing import Hashable, Union, Dict
+import math
 
+import numpy as np
 import xarray
 import yaml
-import numpy as np
 
 from . import rules
 from .errors import InvalidCompressionSpecification
 from .variable_encoding import _Mapping, parse_variable_specification, Encoding, \
     NullEncoding
+from .rules import VARIABLE_SEPARATOR, VARIABLE_NAME_SEPARATOR, \
+    DATA_DEFAULT_LABEL, DATA_DEFAULT_VALUE, COORD_LABEL, COORD_DEFAULT_VALUE
 
 
 def convert_size(size_bytes):
-    import math
+    """
+    Converts a given size in bytes to a human-readable format.
+
+    Args:
+        size_bytes (int): Size in bytes.
+
+    Returns:
+        str: Size in human-readable format (e.g., '5MB').
+    """
     if size_bytes < 0:
         prefix = "-"
         size_bytes = -size_bytes
@@ -23,11 +57,12 @@ def convert_size(size_bytes):
 
     if size_bytes == 0:
         return "0B"
-    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
-    i = int(math.floor(math.log(size_bytes, 1024)))
-    p = math.pow(1024, i)
-    s = round(size_bytes / p, 2)
-    return f"{prefix}{s}{size_name[i]}"
+
+    size_units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    magnitude = int(math.floor(math.log(size_bytes, 1024)))
+    factor = math.pow(1024, magnitude)
+    size = round(size_bytes / factor, 2)
+    return f"{prefix}{size}{size_units[magnitude]}"
 
 
 def convert_to_bytes(size_string):
@@ -40,7 +75,7 @@ def convert_to_bytes(size_string):
     Returns:
         int: The number of bytes.
     """
-    import re
+
     size_string = size_string.upper()
     digits = re.match(r'\d+(?:\.\d+)?', size_string)  # matches digits and optionally a dot followed by more digits
     if digits:
@@ -65,23 +100,35 @@ def compression_dictionary_to_string(compression_dictionary: Dict[str, str]) ->
 
 
 def parse_full_specification(spec: Union[str, None]) -> Dict[str, Encoding]:
-    from enstools.encoding.rules import VARIABLE_SEPARATOR, VARIABLE_NAME_SEPARATOR, \
-        DATA_DEFAULT_LABEL, DATA_DEFAULT_VALUE, COORD_LABEL, COORD_DEFAULT_VALUE
+    """
+    Parses a full compression specification and returns a dictionary of variable encodings.
+
+    Args:
+        spec (Union[str, None]): The full compression specification as a string or None.
+
+    Returns:
+        Dict[str, Encoding]: A dictionary mapping variable names to their corresponding encodings.
+
+    Raises:
+        InvalidCompressionSpecification: If a variable has multiple definitions in the specification.
+
+    """
+
     result = {}
 
     if spec is None:
         spec = "None"
 
     parts = spec.split(VARIABLE_SEPARATOR)
-    for p in parts:
+    for part in parts:
         # For each part, check if there's a variable name.
         # If there's a variable name, split the name and the specification
-        if VARIABLE_NAME_SEPARATOR in p:
-            var_name, var_spec = p.split(VARIABLE_NAME_SEPARATOR)
+        if VARIABLE_NAME_SEPARATOR in part:
+            var_name, var_spec = part.split(VARIABLE_NAME_SEPARATOR)
        # Otherwise, it corresponds to the default.
         else:
             var_name = DATA_DEFAULT_LABEL
-            var_spec = p
+            var_spec = part
 
         # If the variable name was already in the dictionary, raise an error.
         if var_name in result:
@@ -125,6 +172,20 @@ class DatasetEncoding(_Mapping):
 
     @staticmethod
     def get_a_single_compression_string(compression: Union[str, Dict[str, str], Path, None]) -> Union[str, None]:
+        """
+        Converts the compression parameter into a single compression specification string.
+
+        Args:
+            compression (Union[str, Dict[str, str], Path, None]): The compression parameter,
+                which can be a string, a dictionary, a file path, or None.
+
+        Returns:
+            Union[str, None]: The single compression specification string or None.
+
+        Raises:
+            InvalidCompressionSpecification: If the compression argument is not a valid type.
+
+        """
         # The compression parameter can be a string or a dictionary.
         # In case it is a string, it can be directly a compression specification or a yaml file.
 
@@ -136,21 +197,28 @@ class DatasetEncoding(_Mapping):
             # Just to make sure that we have all the mandatory fields (default, coordinates), we will convert
             # the input dictionary to a single specification string and convert it back.
             return compression_dictionary_to_string(compression)
-        elif isinstance(compression, Path):
-            with compression.open("r") as stream:
+        if isinstance(compression, Path):
+            with compression.open("r", encoding="utf-8") as stream:
                 dict_of_strings = yaml.safe_load(stream)
             return compression_dictionary_to_string(dict_of_strings)
-        elif isinstance(compression, str):
+        if isinstance(compression, str):
             # Convert the single string in a dictionary with an entry for each specified variable plus the defaults
             # for data and coordinates
             return compression
-        elif compression is None:
+        if compression is None:
             return None
-        else:
-            raise InvalidCompressionSpecification(
+
+        raise InvalidCompressionSpecification(
                 f"The argument 'compression' should be a string, a dictionary or a Path. It is {type(compression)!r}-")
 
     def encoding(self):
+        """
+        Generate the encoding dictionary for all variables in the dataset.
+
+        Returns:
+            dict: A dictionary mapping variable names to their corresponding encodings.
+
+        """
         # Get the defaults
         data_default = self.variable_encodings[rules.DATA_DEFAULT_LABEL]
         coordinates_default = self.variable_encodings[rules.COORD_LABEL]
@@ -185,12 +253,12 @@ class DatasetEncoding(_Mapping):
 
         # Loop over all the variables
         for variable in self.dataset.data_vars:
-            da = self.dataset[variable]
-            type_size = da.dtype.itemsize
+            data_array = self.dataset[variable]
+            type_size = data_array.dtype.itemsize
 
             optimal_chunk_size = chunk_memory_size / type_size
-            chunk_sizes = find_chunk_sizes(data_array=da, chunk_size=optimal_chunk_size)
-            chunk_sizes = tuple(chunk_sizes[d] for d in da.dims)
+            chunk_sizes = find_chunk_sizes(data_array=data_array, chunk_size=optimal_chunk_size)
+            chunk_sizes = tuple(chunk_sizes[d] for d in data_array.dims)
             encodings[variable].set_chunk_sizes(chunk_sizes)
 
     @property
@@ -209,6 +277,21 @@ class DatasetEncoding(_Mapping):
 
 
 def is_a_valid_dataset_compression_specification(specification):
+    """
+    Checks if a compression specification is valid for a dataset.
+
+    Args:
+        specification: The compression specification to be validated.
+
+    Returns:
+        bool: True if the specification is valid, False otherwise.
+
+    Note:
+        - The function attempts to parse the specification using the `parse_full_specification` function.
+        - If the specification is successfully parsed without raising an exception, it is considered valid.
+        - If an `InvalidCompressionSpecification` exception is raised during parsing,
+          the specification is considered invalid.
+    """
     try:
         _ = parse_full_specification(specification)
         return True
@@ -217,7 +300,16 @@ def is_a_valid_dataset_compression_specification(specification):
 
 
 def find_chunk_sizes(data_array, chunk_size):
-    import math
+    """
+    Determines the chunk sizes for each dimension of a data array based on a desired chunk size.
+
+    Args:
+        data_array: The data array for which chunk sizes are determined.
+        chunk_size: The desired chunk size in terms of the number of elements.
+
+    Returns:
+        dict: A dictionary mapping each dimension to its corresponding chunk size.
+    """
     total_points = np.prod(data_array.shape)
     num_chunks = max(1, int(total_points // chunk_size))
     chunk_sizes = {}
diff --git a/enstools/encoding/definitions.py b/enstools/encoding/definitions.py
index e7f4b19..f9af5d3 100644
--- a/enstools/encoding/definitions.py
+++ b/enstools/encoding/definitions.py
@@ -1,6 +1,9 @@
+"""
+This module provides configurations and mappings related to lossy and lossless compressors.
+""" +import hdf5plugin # Dictionary of implemented lossy compressors and their respective modes and ranges. -# TODO: Would that be better to keep the definition of the available compressors and methods in a configuration file? lossy_compressors_and_modes = { "sz": { "abs": {"range": [0, float('inf')], "type": float}, @@ -21,11 +24,10 @@ lossy_compressors_and_modes = { } # Create a list with the lossy compressors -lossy_compressors = [c for c in lossy_compressors_and_modes] +lossy_compressors = list(lossy_compressors_and_modes) # Create a dictionary containing the available compression modes for each lossy compressor -lossy_compression_modes = {c: [k for k in lossy_compressors_and_modes[c]] for c in lossy_compressors} - +lossy_compression_modes = {c: list(lossy_compressors_and_modes) for c in lossy_compressors} # List of available BLOSC backends for lossless compression lossless_backends = ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] @@ -40,7 +42,7 @@ sz_mode_map = { } # Mapping between compressor names and hdf5plugin classes -import hdf5plugin + compressor_map = { "zfp": hdf5plugin.Zfp, "sz": hdf5plugin.SZ, diff --git a/enstools/encoding/errors.py b/enstools/encoding/errors.py index d2ac549..ed0acb7 100644 --- a/enstools/encoding/errors.py +++ b/enstools/encoding/errors.py @@ -1,6 +1,16 @@ +""" +This module provides exceptions related to compression in Enstools. + +""" + + class EnstoolsCompressionError(Exception): - ... + """ + Base exception class for Enstools compression-related errors. + """ class InvalidCompressionSpecification(EnstoolsCompressionError): - ... + """ + Exception raised for an invalid compression specification. + """ diff --git a/enstools/encoding/variable_encoding.py b/enstools/encoding/variable_encoding.py index 9691946..45d87b2 100644 --- a/enstools/encoding/variable_encoding.py +++ b/enstools/encoding/variable_encoding.py @@ -1,14 +1,21 @@ +""" +This module provides utility classes and functions for compression in Enstools. -from .definitions import lossy_compressors_and_modes -import logging +This module defines various encodings and exceptions for compression, as well as utility functions. + +""" + + +import logging from typing import Mapping, Union import hdf5plugin from enstools.encoding import rules, definitions from enstools.encoding.errors import InvalidCompressionSpecification -from .rules import LOSSLESS_DEFAULT_BACKEND, LOSSLESS_DEFAULT_COMPRESSION_LEVEL +from .definitions import lossy_compressors_and_modes +from .rules import LOSSLESS_DEFAULT_BACKEND, LOSSLESS_DEFAULT_COMPRESSION_LEVEL, COMPRESSION_SPECIFICATION_SEPARATOR # Change logging level for the hdf5plugin to avoid unnecessary warnings loggers = {name: logging.getLogger(name) for name in logging.root.manager.loggerDict} @@ -40,33 +47,52 @@ class _Mapping(Mapping): class Encoding(_Mapping): + """ + Base case for encoding representation. + """ def check_validity(self) -> bool: - ... + """ + Checks the validity of the encoding. + + Returns: + - bool: True if the encoding is valid. + + """ def to_string(self) -> str: - ... + """ + Returns the encoding specification as a string. + + Returns: + - str: The encoding specification as a string. + """ def encoding(self) -> Mapping: - ... + """ + Returns the mapping of encoding parameters. + Returns: + - Mapping: The mapping of encoding parameters. + + """ def description(self) -> str: - ... + """ + Returns a description of the encoding. + + Returns: + - str: A description of the encoding. 
+
+        """
 
     def __repr__(self):
-        return f"{self.__class__.__name__}({self.to_string()})"
-
-    def set_chunk_sizes(self, chunk_sizes: tuple) -> None:
         """
-        Method to add chunksizes into the encoding dictionary.
-        Parameters
-        ----------
-        chunk_sizes
+        Returns a string representation of the Encoding object.
 
-        Returns
-        -------
+        Returns:
+        - str: A string representation of the Encoding object.
 
-        """
-        self._kwargs["chunksizes"] = chunk_sizes
+        """
+        return f"{self.__class__.__name__}({self.to_string()})"
 
     def set_chunk_sizes(self, chunk_sizes: tuple) -> None:
         """
@@ -156,10 +182,11 @@ class LosslessEncoding(Encoding):
     def check_validity(self) -> bool:
         if self.backend not in definitions.lossless_backends:
             raise InvalidCompressionSpecification(f"Backend {self.backend!r} is not a valid backend.")
-        elif not (1 <= self.compression_level <= 9):
+
+        if not 1 <= self.compression_level <= 9:
             raise InvalidCompressionSpecification(f"Compression level {self.compression_level} must be within 1 and 9.")
-        else:
-            return True
+
+        return True
 
     def to_string(self) -> str:
         return rules.COMPRESSION_SPECIFICATION_SEPARATOR.join(["lossless", self.backend, str(self.compression_level)])
@@ -176,6 +203,9 @@ class LosslessEncoding(Encoding):
 
 
 class LossyEncoding(Encoding):
+    """
+    Encoding subclass for lossy compression.
+    """
     def __init__(self, compressor: str, mode: str, parameter: Union[float, int]):
         super().__init__()
         self.compressor = compressor
@@ -189,6 +219,16 @@ class LossyEncoding(Encoding):
         self._kwargs = dict(self.encoding())
 
     def check_validity(self):
+        """
+        Checks the validity of the compressor, mode, and parameter.
+
+        Raises:
+        - InvalidCompressionSpecification: If the compressor, mode, or parameter is invalid or out of range.
+
+        Returns:
+        - bool: True if the compressor, mode, and parameter are valid.
+
+        """
         # Check compressor validity
         if self.compressor not in definitions.lossy_compressors_and_modes:
             raise InvalidCompressionSpecification(f"Invalid compressor {self.compressor}")
@@ -203,7 +243,7 @@
         if not isinstance(self.parameter, mode_type):
             try:
                 self.parameter = mode_type(self.parameter)
-            except TypeError:
-                raise InvalidCompressionSpecification(f"Invalid parameter type {self.parameter!r}")
+            except TypeError as err:
+                raise InvalidCompressionSpecification(f"Invalid parameter type {self.parameter!r}") from err
         # Check range
         if self.parameter <= mode_range[0] or self.parameter >= mode_range[1]:
@@ -211,20 +251,47 @@
         return True
 
     def to_string(self) -> str:
+        """
+        Returns the encoding specification as a string.
+
+        Returns:
+        - str: The encoding specification as a string.
+
+        """
         return rules.COMPRESSION_SPECIFICATION_SEPARATOR.join(
             ["lossy", self.compressor, self.mode, str(self.parameter)])
 
     def encoding(self) -> Mapping:
+        """
+        Returns the mapping of encoding parameters.
+        Returns:
+        - Mapping: The mapping of encoding parameters.
+
+        """
         mode = definitions.sz_mode_map[self.mode] if self.mode in definitions.sz_mode_map else self.mode
         arguments = {mode: self.parameter}
 
         return definitions.compressor_map[self.compressor](**arguments)
 
     def description(self) -> str:
+        """
+        Returns a description of the lossy encoding.
+
+        Returns:
+        - str: A description of the lossy encoding.
+
+        """
         return f"Lossy compressed using the HDF5 filters with specification: {self.to_string()} " \
                f"(Using {self.compressor!r} with mode {self.mode!r} and parameter {self.parameter})"
 
     def __repr__(self):
+        """
+        Returns a string representation of the LossyEncoding object.
+
+        Returns:
+        - str: A string representation of the LossyEncoding object.
+
+        """
         return f"{self.__class__.__name__}(compressor={self.compressor}, mode={self.mode}, parameter={self.parameter})"
 
 
@@ -243,7 +310,6 @@ def parse_variable_specification(var_spec: str) -> Encoding:
     if var_spec in (None, "None", "none"):
         return NullEncoding()
 
-    from enstools.encoding.rules import COMPRESSION_SPECIFICATION_SEPARATOR
     # Split the specification in the different parts.
     var_spec_parts = var_spec.split(COMPRESSION_SPECIFICATION_SEPARATOR)
     # Treatment for lossless
@@ -252,7 +318,7 @@ def parse_variable_specification(var_spec: str) -> Encoding:
         compression_level = int(var_spec_parts[2]) if len(var_spec_parts) > 2 else None
         return LosslessEncoding(backend, compression_level)
     # Treatment for lossy
-    elif var_spec_parts[0] == "lossy":
+    if var_spec_parts[0] == "lossy":
         # Lossy specifications must have 4 elements (lossy,compressor,mode,parameter)
         if len(var_spec_parts) != 4:
             raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}")
@@ -273,9 +339,9 @@ def parse_variable_specification(var_spec: str) -> Encoding:
         except ValueError:
             raise InvalidCompressionSpecification(f"Could not cast {specification!r} to type {specification_type!r}")
         return LossyEncoding(compressor, mode, specification)
-    else:
-        # In case its not lossy nor lossless, raise an exception.
-        raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}")
+
+    # In case it's neither lossy nor lossless, raise an exception.
+    raise InvalidCompressionSpecification(f"Invalid specification {var_spec!r}")
 
 
 def get_variable_encoding(
@@ -301,9 +367,9 @@ def get_variable_encoding(
         "Only one of the options can be used to create an Encoding"
     if specification:
         return parse_variable_specification(specification)
-    elif compressor:
+    if compressor:
         return LossyEncoding(compressor=compressor, mode=mode, parameter=parameter)
-    elif backend:
+    if backend:
         if compression_level is None:
             compression_level = LOSSLESS_DEFAULT_COMPRESSION_LEVEL
         return LosslessEncoding(backend=backend, compression_level=compression_level)
-- 
GitLab
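
A brief usage sketch of the classes touched above (not part of the patch). It assumes enstools-encoding
and its hdf5plugin dependency are installed, and it only relies on constructors and methods visible in
this diff ("zlib" from lossless_backends, "sz"/"abs" from lossy_compressors_and_modes, and the keyword
arguments used by get_variable_encoding). The DatasetEncoding call in the comments is an assumption,
since its __init__ is not shown in this patch.

    from enstools.encoding.api import LosslessEncoding, LossyEncoding

    # Lossless encoding: backend must be one of lossless_backends, level within 1..9.
    lossless = LosslessEncoding(backend="zlib", compression_level=5)

    # Lossy encoding: SZ with an absolute error bound, validated against lossy_compressors_and_modes.
    lossy = LossyEncoding(compressor="sz", mode="abs", parameter=0.01)

    # Both objects expose the specification-string form consumed by parse_variable_specification.
    print(lossless.to_string())
    print(lossy.to_string())

    # Assumed whole-dataset usage (constructor signature not shown in this patch):
    # encodings = DatasetEncoding(dataset, compression="lossless").encoding()
    # dataset.to_netcdf("out.nc", encoding=encodings)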