# Source code for hsr4hci.hdf

"""
Methods for dealing with HDF files.

Parts of the code in this module are based on:
https://codereview.stackexchange.com/a/121308
"""

# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------

from pathlib import Path
from typing import Any, Union

import h5py
import numpy as np


# -----------------------------------------------------------------------------
# CONSTANT DEFINITIONS
# -----------------------------------------------------------------------------

# Define an (incomplete) list of types that h5py supports for reading and
# writing. Note that each type is mapped to a native numpy type by h5py.
# The tuple is passed as-is to `isinstance()` by the save functions in this
# module, which raise a `TypeError` for any value that does not match.
H5PY_SUPPORTED_TYPES = (
    bool,
    bytes,
    complex,
    float,
    int,
    np.generic,
    np.ndarray,
    str,
)


# -----------------------------------------------------------------------------
# FUNCTION DEFINITIONS
# -----------------------------------------------------------------------------

def save_data_to_hdf(
    hdf_file: h5py.File,
    location: str,
    name: str,
    data: Any,
    overwrite: bool = True,
) -> None:
    """
    Auxiliary function to write data to an open HDF file that provides
    automatic overwriting (which requires deleting and re-creating data
    sets that already exist).

    Args:
        hdf_file: An open HDF file (in write mode).
        location: The path (`"group_1/group_2/.../group_n"`) at which to
            create the new data set in the HDF file. Can be empty, in
            which case the data set is created at the root of the file.
        name: The name of the data set. Must not be empty.
        data: The data to be written to the data set.
        overwrite: Whether to overwrite a data set of the same name
            that already exists in the given location.

    Raises:
        TypeError: If the type of `data` is not one of the types listed
            in `H5PY_SUPPORTED_TYPES`.
        ValueError: If `name` is empty (or consists only of slashes).
        KeyError: If the data set already exists and `overwrite` is
            `False`.
    """

    # Ensure that we only try to save supported types
    if not isinstance(data, H5PY_SUPPORTED_TYPES):
        raise TypeError(f'Type "{type(data)}" not supported by HDF format!')

    # Without a name, the full path would point at the `location` group
    # itself, and overwriting would delete that entire group
    dataset_name = name.strip('/')
    if not dataset_name:
        raise ValueError('The name of the data set must not be empty!')

    # Build the full path once and use it for the existence check, for the
    # deletion, and for the creation of the data set. Checking `location`
    # and `name` separately does not work for an empty `location`, because
    # h5py never reports an empty name as being contained in a file.
    full_path = location.strip('/') + '/' + dataset_name

    # Check if the data set already exists
    if full_path in hdf_file:

        # Without permission to overwrite, we raise an error
        if not overwrite:
            raise KeyError(f'Data set with name "{name}" already exists!')

        # Otherwise, we delete the data set and create it again below
        # (there is no direct overwrite)
        del hdf_file[full_path]

    # Finally, we store the data. Groups are automatically created as
    # needed by h5py.
    hdf_file.create_dataset(name=full_path, data=data)


def save_dict_to_hdf(
    dictionary: dict,
    file_path: Union[Path, str],
    mode: str = 'a',
    prefix: str = '',
) -> None:
    """
    Write a (possibly nested) ``dictionary`` to the HDF file located at
    ``file_path``. Nested dictionaries are written by recursing into
    them, with their keys appended to the path inside the HDF file.

    Args:
        dictionary: The (possibly nested) dictionary to be written.
        file_path: Location of the HDF file (including its name and
            file ending); either a `Path` or a string.
        mode: The mode (i.e., `"w"` or `"a"`) that is passed on to
            `h5py.File` when opening the file.
        prefix: A path inside the HDF file that is prepended to all
            keys; e.g., the name of a group into which the contents
            of the dictionary should be written.
    """

    # Accept both strings and Paths by normalizing to a Path first
    target = Path(file_path)

    # The context manager closes the file again once everything has been
    # written; the actual work is delegated to the recursive helper
    with h5py.File(target, mode=mode) as hdf_file:
        recursively_save_dict_contents_to_group(hdf_file, prefix, dictionary)


def recursively_save_dict_contents_to_group(
    hdf_object: Union[h5py.File, h5py.Group], prefix: str, dictionary: dict
) -> None:
    """
    Auxiliary function for recursively looping over the contents of a
    dictionary and saving them to an HDF file.

    Every nested dictionary is stored as a group (even if it is empty,
    so that it is not lost when the file is loaded again), and every
    other value is stored as a data set. Existing data sets are replaced.

    Args:
        hdf_object: Either an open HDF file, or a group inside such
            a file.
        prefix: Path to the location inside the HDF file; e.g., the
            name of a group, or a path (for nested groups).
        dictionary: The dictionary to be saved at the given location.

    Raises:
        TypeError: If a value is neither a `dict` nor one of the types
            listed in `H5PY_SUPPORTED_TYPES`. h5py also raises a
            `TypeError` if a nested dictionary is to be stored at a
            path that already holds something other than a group.
    """

    # Loop over the given dictionary
    for key, item in dictionary.items():

        # Define the path where the current key should be stored
        path = f'{prefix}/{key}'

        # If the current `item` is a dict, it is stored as a group. We
        # create the group explicitly instead of relying on h5py to create
        # it implicitly with the first data set: otherwise, an empty dict
        # would leave no trace in the file at all.
        if isinstance(item, dict):
            hdf_object.require_group(path)
            recursively_save_dict_contents_to_group(
                hdf_object=hdf_object, prefix=path, dictionary=item
            )

        # If the current `item` contains data, create a dataset to store them.
        # If the data set already exists, delete it (overwriting existing data
        # sets is not possible otherwise).
        elif isinstance(item, H5PY_SUPPORTED_TYPES):
            if path in hdf_object:
                del hdf_object[path]
            hdf_object.create_dataset(name=path, data=item)

        # If the type of `item` is not supported, raise a TypeError
        else:
            raise TypeError(f'Unsupported type {type(item)} for {path}!')


def load_dict_from_hdf(file_path: Union[Path, str]) -> dict:
    """
    Read an HDF file and return its contents as a (nested) dictionary
    that mirrors the structure (groups, subgroups, ...) of the file.

    Args:
        file_path: Location of the HDF file to read; either a `Path`
            or a string.

    Returns:
        A ``dict`` holding the contents of the HDF file.
    """

    # Open the file read-only (strings are converted to a Path first) and
    # collect everything below the root group before the file is closed
    with h5py.File(Path(file_path), 'r') as hdf_file:
        contents = recursively_load_dict_contents_from_group(hdf_file, '/')

    return contents


def recursively_load_dict_contents_from_group(
    hdf_object: Union[h5py.File, h5py.Group], path: str = ''
) -> dict:
    """
    Auxiliary function for recursively looping over the contents of a
    given ``hdf_object`` and loading them into a dictionary.

    Data sets are loaded as values (values of type `bytes` are decoded
    as UTF-8 strings) and groups are loaded as nested dictionaries.
    Items that are neither a data set nor a group are skipped.

    Args:
        hdf_object: A HDF object; either an HDF file (root) or a group.
        path: The path of the group to load, relative to ``hdf_object``.
            If empty (default), ``hdf_object`` itself is loaded.

    Returns:
        The contents of ``hdf_object[path]`` (or of ``hdf_object``, if
        ``path`` is empty) as a dictionary.
    """

    # An empty name cannot be looked up in an HDF object, so an empty
    # `path` is taken to refer to the `hdf_object` itself
    group = hdf_object[path] if path else hdf_object

    # Initialize the output dict
    results = {}

    # Loop over the contents of the group (or root)
    for key, item in group.items():

        # If the current item is a dataset, load its value. h5py will
        # automatically convert it to a numpy type.
        if isinstance(item, h5py.Dataset):
            value = item[()]
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            results[key] = value

        # If the current item is a group, we recursively call this method
        # on it. We pass the group object itself, so that no path strings
        # need to be assembled.
        elif isinstance(item, h5py.Group):
            results[key] = recursively_load_dict_contents_from_group(
                hdf_object=item
            )

    return results


def create_hdf_dir(experiment_dir: Path, create_on_work: bool = False) -> Path:
    """
    Make sure that a directory named ``hdf`` (for storing the HDF result
    files of an HSR experiment) exists for the given experiment, and
    return the Path to it.

    .. attention::
        Unless you are working on the MPI-IS cluster in Tübingen, you
        **always** want to use ``create_on_work=False``!


    .. admonition:: Background

        Where the directory physically lives depends on the machine
        that runs this code. Locally, it is simply a sub-directory of
        the ``experiment_dir``.
        On the MPI-IS cluster, however, the (large) HDF files should
        be stored on ``/work``. In this case, the directory is created
        there, and the ``experiment_dir`` only receives a symlink that
        points to it.

    Args:
        experiment_dir: The Path to the experiment directory for which
            the `hdf` results directory is created.
        create_on_work: If `True`, the HDF directory is created on
            ``/work`` and a symlink is created in ``experiment_dir``.
            If `False`, the HDF directory is created directly in
            ``experiment_dir``.

    Returns:
        The Path to the ``hdf`` directory for the ``experiment_dir``.
    """

    # In both cases, this is the location that is returned to the caller
    hdf_dir = experiment_dir / 'hdf'

    # On the cluster, the real directory lives on /work, and the location
    # in the experiment directory is only a symlink to it
    if create_on_work:  # pragma: no cover

        # Mirror the structure of the experiment directory on /work
        mirrored_dir = Path(
            experiment_dir.as_posix().replace('/home/', '/work/')
        )
        mirrored_dir.mkdir(exist_ok=True, parents=True)

        # Create the actual HDF directory inside the mirrored directory
        link_target = mirrored_dir / 'hdf'
        link_target.mkdir(exist_ok=True)

        # Link it into the experiment directory. If something already
        # exists at that location, it is left untouched.
        try:
            hdf_dir.symlink_to(link_target, target_is_directory=True)
        except FileExistsError:  # pragma: no cover
            pass

    # Locally, the directory is created directly in the experiment directory
    else:
        hdf_dir.mkdir(exist_ok=True)

    # Note: The directory is deliberately *not* emptied here: this function
    # may be called by multiple parallel jobs, which must not delete each
    # other's results ...

    return hdf_dir