diff --git a/VERSION b/VERSION index 18c2e0fe3573b0689cf5bbb6a261fa00176f7230..e7b67ba75119906ec900da7e329d0604c3e169f6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2023.5 \ No newline at end of file +2023.6 \ No newline at end of file diff --git a/enstools/encoding/chunk_size.py b/enstools/encoding/chunk_size.py new file mode 100644 index 0000000000000000000000000000000000000000..611fe6d85fdacb38b9d870607f54da6b216a7450 --- /dev/null +++ b/enstools/encoding/chunk_size.py @@ -0,0 +1,16 @@ +# Using a module file to store the desired chunk size. + +# Chunk size used to save files to disk +chunk_size = "10MB" + +# Chunk size used for the analysis +analysis_chunk_size = "100KB" +def change_chunk_size(new_chunk_size: str): + global chunk_size + chunk_size = new_chunk_size + +# This variable will be used in many places. +# One can modify this variable directly assigning a new value +# enstools.encoding.chunk_size.chunk_size = "1MB" +# or using the function change_chunk_size("1MB") + diff --git a/enstools/encoding/dataset_encoding.py b/enstools/encoding/dataset_encoding.py index 769b0b5e7570097ed15d012f2431f6f2db16e678..0ccf0a351e447b7bfd36872a0f74fb832dc8e6ba 100644 --- a/enstools/encoding/dataset_encoding.py +++ b/enstools/encoding/dataset_encoding.py @@ -31,6 +31,7 @@ import numpy as np import xarray import yaml +import enstools.encoding.chunk_size from . import rules from .errors import InvalidCompressionSpecification from .variable_encoding import _Mapping, parse_variable_specification, Encoding, \ @@ -236,7 +237,10 @@ class DatasetEncoding(_Mapping): all_encodings = {**coordinate_encodings, **data_variable_encodings} # Need to specify chunk size, otherwise it breaks down. - self.chunk(encodings=all_encodings) + self.chunk( + encodings=all_encodings, + chunk_memory_size=enstools.encoding.chunk_size.chunk_size, + ) return all_encodings @@ -315,12 +319,19 @@ def find_chunk_sizes(data_array, chunk_size): chunk_sizes = {} chunk_number = {} - # Sort dimensions by size - dims = sorted(data_array.dims, key=lambda x: data_array[x].shape) + # Sort dimensions such that 'time' is always first and rest by size + dims = sorted(data_array.dims, key=lambda x: (x != 'time', data_array[x].shape)) + pending_num_chunks = num_chunks for dim in dims: - chunk_sizes[dim] = max(1, int(data_array[dim].size // pending_num_chunks)) - chunk_number[dim] = data_array[dim].size // chunk_sizes[dim] + if dim == 'time' or pending_num_chunks > 1: + chunk_sizes[dim] = max(1, int(data_array[dim].size // pending_num_chunks)) + chunk_number[dim] = data_array[dim].size // chunk_sizes[dim] - pending_num_chunks = math.ceil(pending_num_chunks / chunk_number[dim]) + pending_num_chunks = math.ceil(pending_num_chunks / chunk_number[dim]) + else: + # If we have already chunked in the 'time' dimension and pending_num_chunks <= 1, + # then keep the whole dimension together in one chunk + chunk_sizes[dim] = data_array[dim].size return chunk_sizes + diff --git a/enstools/encoding/variable_encoding.py b/enstools/encoding/variable_encoding.py index 17737f579f26487bab018118ceb78c0a0ac02220..2d8246dcf0d217461d25806d93a759683f202c9b 100644 --- a/enstools/encoding/variable_encoding.py +++ b/enstools/encoding/variable_encoding.py @@ -107,19 +107,6 @@ class Encoding(_Mapping): """ self._kwargs["chunksizes"] = chunk_sizes - def set_chunk_sizes(self, chunk_sizes: tuple) -> None: - """ - Method to add chunksizes into the encoding dictionary. - Parameters - ---------- - chunk_sizes - - Returns - ------- - - """ - self._kwargs["chunksizes"] = chunk_sizes - class VariableEncoding(_Mapping): """