Skip to content
Snippets Groups Projects
Commit f8c47fb9 authored by Oriol.Tinto's avatar Oriol.Tinto
Browse files

Merge branch 'development' into 'main'

Chunking

See merge request !11
parents 3c0812f4 bec5e45b
No related branches found
No related tags found
1 merge request: !11 "Chunking"
Pipeline #19033 passed
2023.5
\ No newline at end of file
2023.6
\ No newline at end of file
# Using a module file to store the desired chunk size.
# Chunk size used to save files to disk
chunk_size = "10MB"
# Chunk size used for the analysis
analysis_chunk_size = "100KB"


def change_chunk_size(new_chunk_size: str) -> None:
    """
    Replace the module-level ``chunk_size`` used when saving files to disk.

    Parameters
    ----------
    new_chunk_size :
        Human-readable size string (e.g. ``"1MB"``); consumers elsewhere
        read this module attribute to decide on-disk chunking.

    Returns
    -------
    None
    """
    # Rebind the module-level global so every later reader of
    # enstools.encoding.chunk_size.chunk_size sees the new value.
    global chunk_size
    chunk_size = new_chunk_size
# This variable will be used in many places.
# One can modify this variable directly assigning a new value
# enstools.encoding.chunk_size.chunk_size = "1MB"
# or using the function change_chunk_size("1MB")
......@@ -31,6 +31,7 @@ import numpy as np
import xarray
import yaml
import enstools.encoding.chunk_size
from . import rules
from .errors import InvalidCompressionSpecification
from .variable_encoding import _Mapping, parse_variable_specification, Encoding, \
......@@ -236,7 +237,10 @@ class DatasetEncoding(_Mapping):
all_encodings = {**coordinate_encodings, **data_variable_encodings}
# Need to specify chunk size, otherwise it breaks down.
self.chunk(encodings=all_encodings)
self.chunk(
encodings=all_encodings,
chunk_memory_size=enstools.encoding.chunk_size.chunk_size,
)
return all_encodings
......@@ -315,12 +319,19 @@ def find_chunk_sizes(data_array, chunk_size):
chunk_sizes = {}
chunk_number = {}
# Sort dimensions by size
dims = sorted(data_array.dims, key=lambda x: data_array[x].shape)
# Sort dimensions such that 'time' is always first and rest by size
dims = sorted(data_array.dims, key=lambda x: (x != 'time', data_array[x].shape))
pending_num_chunks = num_chunks
for dim in dims:
chunk_sizes[dim] = max(1, int(data_array[dim].size // pending_num_chunks))
chunk_number[dim] = data_array[dim].size // chunk_sizes[dim]
if dim == 'time' or pending_num_chunks > 1:
chunk_sizes[dim] = max(1, int(data_array[dim].size // pending_num_chunks))
chunk_number[dim] = data_array[dim].size // chunk_sizes[dim]
pending_num_chunks = math.ceil(pending_num_chunks / chunk_number[dim])
pending_num_chunks = math.ceil(pending_num_chunks / chunk_number[dim])
else:
# If we have already chunked in the 'time' dimension and pending_num_chunks <= 1,
# then keep the whole dimension together in one chunk
chunk_sizes[dim] = data_array[dim].size
return chunk_sizes
......@@ -107,19 +107,6 @@ class Encoding(_Mapping):
"""
self._kwargs["chunksizes"] = chunk_sizes
def set_chunk_sizes(self, chunk_sizes: tuple) -> None:
    """
    Store the given chunk sizes in the underlying encoding keyword mapping.

    Parameters
    ----------
    chunk_sizes
        Tuple with one chunk length per dimension.

    Returns
    -------
    None
    """
    # Merge into the kwargs mapping under the netCDF/h5py keyword name.
    self._kwargs.update(chunksizes=chunk_sizes)
class VariableEncoding(_Mapping):
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment