# Source code for hsr4hci.hdf

"""
Methods for dealing with HDF files.

Parts of the code in the module are based on:
https://codereview.stackexchange.com/a/121308
"""

# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------

from pathlib import Path
from typing import Any, Union

import h5py
import numpy as np


# -----------------------------------------------------------------------------
# CONSTANT DEFINITIONS
# -----------------------------------------------------------------------------

# Define an (incomplete) tuple of types that h5py supports for reading and
# writing. Note that each type is mapped to a native numpy type by h5py.
# The tuple is used as the second argument of `isinstance()` to validate
# data before it is written to an HDF file.
H5PY_SUPPORTED_TYPES = (
    bool,
    bytes,
    complex,
    float,
    int,
    np.generic,
    np.ndarray,
    str,
)


# -----------------------------------------------------------------------------
# FUNCTION DEFINITIONS
# -----------------------------------------------------------------------------

def save_data_to_hdf(
    hdf_file: h5py.File,
    location: str,
    name: str,
    data: Any,
    overwrite: bool = True,
) -> None:
    """
    Auxiliary function to write data to an open HDF file that provides
    automatic overwriting (which requires deleting and re-creating data
    sets that already exist).

    Args:
        hdf_file: An open HDF file (in write mode).
        location: The path (`"group_1/group_2/.../group_n"`) at which
            to create the new data set in the HDF file. Can be empty,
            in which case the data set is created at the root of the
            file.
        name: The name of the data set.
        data: The data to be written to the data set.
        overwrite: Whether to overwrite a data set of the same name
            that already exists in the given location.

    Raises:
        TypeError: If the type of `data` is not one of the types in
            `H5PY_SUPPORTED_TYPES`.
        KeyError: If the data set already exists and `overwrite` is
            `False`.
    """

    # Ensure that we only try to save supported types
    if not isinstance(data, H5PY_SUPPORTED_TYPES):
        raise TypeError(f'Type "{type(data)}" not supported by HDF format!')

    # Construct the full path of the data set first, so that the existence
    # check below operates on exactly the path that is going to be written.
    # Checking `location` and `name` separately does not work for an empty
    # `location`: the empty string is not a valid HDF5 path, so an existing
    # data set at the root of the file would never be detected.
    full_path = location.strip('/') + '/' + name.strip('/')

    # Check if the data set already exists
    if full_path in hdf_file:

        # If overwrite is False, we refuse to touch the existing data set
        if not overwrite:
            raise KeyError(f'Data set with name "{name}" already exists!')

        # Otherwise, we delete the data set and create it again below
        # (there is no direct overwrite)
        del hdf_file[full_path]

    # Finally, we store the data. Groups are automatically created as
    # needed by h5py.
    hdf_file.create_dataset(name=full_path, data=data)
def save_dict_to_hdf(
    dictionary: dict,
    file_path: Union[Path, str],
    mode: str = 'a',
    prefix: str = '',
) -> None:
    """
    Write the contents of ``dictionary`` to the HDF file at ``file_path``.

    Nested dictionaries are stored as (nested) groups, so that the
    structure of the HDF file mirrors the structure of ``dictionary``.

    Args:
        dictionary: A (possibly nested) dictionary to be saved.
        file_path: The path to the target file (including name and
            file ending).
        mode: The mode (i.e., `"w"` or `"a"`) that is used when opening
            the HDF file for writing.
        prefix: Prefix to use when writing to the HDF file. This can be
            used, for example, to write the dictionary into its own
            group inside the HDF file.
    """

    # Normalize the target location to a Path object
    target = Path(file_path)

    # The context manager closes the file again once everything is written;
    # the actual work is delegated to the recursive helper function
    with h5py.File(target, mode=mode) as hdf_file:
        recursively_save_dict_contents_to_group(
            dictionary=dictionary,
            prefix=prefix,
            hdf_object=hdf_file,
        )
def recursively_save_dict_contents_to_group(
    hdf_object: Union[h5py.File, h5py.Group], prefix: str, dictionary: dict
) -> None:
    """
    Auxiliary function for recursively looping over the contents of a
    dictionary and saving them to an HDF file.

    Nested dictionaries are stored as groups (including empty ones, so
    that an empty dictionary is not silently dropped); all other values
    are stored as data sets. Anything that already exists at the target
    path of a data set is deleted and re-created.

    Args:
        hdf_object: Either an open HDF file, or a group inside such
            a file.
        prefix: Path to the location inside the HDF file; e.g., the
            name of a group, or a path (for nested groups).
        dictionary: The dictionary to be saved at the given location.

    Raises:
        TypeError: If a value in `dictionary` is neither a `dict` nor
            one of the types in `H5PY_SUPPORTED_TYPES`.
    """

    # Loop over the given dictionary
    for key, item in dictionary.items():

        # Define the path where the current key should be stored
        path = f'{prefix}/{key}'

        # If the current `item` is a dict, we have to create a group in the
        # file and call this method recursively on `item`
        if isinstance(item, dict):

            # If something that is not a group (e.g., a data set) already
            # lives at this path, remove it: this mirrors the overwrite
            # behavior for data sets below
            if path in hdf_object:
                if not isinstance(hdf_object[path], h5py.Group):
                    del hdf_object[path]

            # Create the group explicitly. Without this, an empty dict
            # would not leave any trace in the file, because groups are
            # otherwise only created implicitly when a data set is written.
            hdf_object.require_group(path)

            recursively_save_dict_contents_to_group(
                hdf_object=hdf_object, prefix=path, dictionary=item
            )

        # If the current `item` contains data, create a dataset to store them.
        # If the data set already exists, delete it (overwriting existing data
        # sets is not possible otherwise).
        elif isinstance(item, H5PY_SUPPORTED_TYPES):
            if path in hdf_object:
                del hdf_object[path]
            hdf_object.create_dataset(name=path, data=item)

        # If the type of `item` is not supported, raise a TypeError
        else:
            raise TypeError(f'Unsupported type {type(item)} for {path}!')
def load_dict_from_hdf(file_path: Union[Path, str]) -> dict:
    """
    Read an HDF file into a (possibly nested) dictionary whose layout
    mirrors the internal structure (groups, subgroups, ...) of the file.

    Args:
        file_path: The path to the target HDF file.

    Returns:
        A ``dict`` containing the contents of the specified HDF file.
    """

    # Open the file (read-only) and collect everything below the root
    # group; the values are read while the file is still open
    with h5py.File(Path(file_path), 'r') as hdf_file:
        contents = recursively_load_dict_contents_from_group(
            path='/',
            hdf_object=hdf_file,
        )

    return contents
def recursively_load_dict_contents_from_group(
    hdf_object: Union[h5py.File, h5py.Group], path: str = ''
) -> dict:
    """
    Auxiliary function that walks through everything stored below
    ``hdf_object[path]`` and collects it in a dictionary.

    Data sets are read into memory (byte strings are decoded as UTF-8);
    groups become nested dictionaries.

    Args:
        hdf_object: A HDF object; either an HDF file (root) or a group.
        path: The path to the ``hdf_object`` in the actual HDF file.

    Returns:
        The contents of ``hdf_object[path]`` as a dictionary.
    """

    contents = {}

    # Visit every member of the group (or root) found at `path`
    for name, member in hdf_object[path].items():

        # Data sets: read the full value; a `bytes` result is turned
        # into a regular string
        if isinstance(member, h5py.Dataset):
            loaded = member[()]
            contents[name] = (
                loaded.decode('utf-8') if isinstance(loaded, bytes) else loaded
            )
            continue

        # Groups: descend one level and store the result as a nested dict
        if isinstance(member, h5py.Group):
            contents[name] = recursively_load_dict_contents_from_group(
                hdf_object=hdf_object,
                path=f'{path}/{name}',
            )

    return contents
def create_hdf_dir(experiment_dir: Path, create_on_work: bool = False) -> Path:
    """
    Create the directory in which the HDF result files of an HSR
    experiment can be stored, and return the Path to that directory.

    .. attention::
        Unless you are working on the MPI-IS cluster in Tübingen, you
        **always** want to use ``create_on_work=False``!

    .. admonition:: Background

        Where the directory has to live depends on the machine on which
        this code is running. Locally, it is simply created inside the
        respective ``experiment_dir``. On the MPI-IS cluster, however,
        the (large) HDF files are supposed to be stored on ``/work``,
        with a symlink connecting them to the rest of the
        ``experiment_dir``.

    Args:
        experiment_dir: The Path to the experiment directory for which
            we are going to create a `hdf` results directory.
        create_on_work: If `True`, the HDF directory is created on
            ``/work`` and a symlink is created in ``experiment_dir``.
            If `False`, the HDF directory is created directly in
            ``experiment_dir``.

    Returns:
        The Path to the ``hdf`` directory for the ``experiment_dir``.
    """

    # In both cases, this is the location that is returned to the caller
    home_hdf_dir = experiment_dir / 'hdf'

    if create_on_work:  # pragma: no cover

        # Mirror the structure of the experiment directory on /work
        mirrored_dir = Path(
            experiment_dir.as_posix().replace('/home/', '/work/')
        )
        mirrored_dir.mkdir(parents=True, exist_ok=True)

        # Create the actual HDF directory on /work
        work_hdf_dir = mirrored_dir / 'hdf'
        work_hdf_dir.mkdir(exist_ok=True)

        # Link the HDF directory on /work into the experiment directory;
        # if something already exists at that location, leave it as it is
        try:
            home_hdf_dir.symlink_to(work_hdf_dir, target_is_directory=True)
        except FileExistsError:  # pragma: no cover
            pass

    else:

        # The local case is simple: create the directory in place
        home_hdf_dir.mkdir(exist_ok=True)

    # Note: We do *not* ensure that the directory is empty here, because this
    # function may be called by multiple parallel jobs, and we do not want
    # these jobs to delete each other's results ...
    return home_hdf_dir