# Source code for hsr4hci.splitting
"""
Methods for performing train / test splits.
"""
# -----------------------------------------------------------------------------
# IMPORTS
# -----------------------------------------------------------------------------
from typing import Iterator, Tuple
import numpy as np
# -----------------------------------------------------------------------------
# CLASS DEFINITIONS
# -----------------------------------------------------------------------------
class AlternatingSplit:
    """
    Alternating split cross-validator.

    Provides train / test indices to split data in train / test sets.
    The split is performed in an "alternating" way:

    Assume that ``n_splits=3``. In this case, the samples / data points
    are labeled: `A B C A B C A B C ...` Each split holds out exactly
    one of the labels as the test (or hold-out) set and uses all the
    remaining points for training. The splits are generated in order
    of the held-out label, that is, the first split holds out `A`
    (indices `0, 3, 6, ...`), the second split holds out `B` (indices
    `1, 4, 7, ...`), and the final split holds out `C`.

    This splitting scheme is useful for HCI / ADI data, because it means
    that the effective field rotation in all splits is the same (using
    standard $k$-fold splitting would---for $k=2$---cut the field
    rotation in the training data in half).

    .. note::
        The syntax and usage is closely based on similar ``sklearn``
        classes such as, e.g., :class:`sklearn.model_selection.KFold`.

    .. note::
        If ``n_splits`` is larger than the number of samples, the
        test sets of the surplus splits are empty.
    """

    def __init__(self, n_splits: int) -> None:
        """
        Set up the cross-validator.

        Args:
            n_splits: Number of splits to generate; must be at least 1.
                For ``n_splits=1``, no real split is performed (see
                :meth:`split`).

        Raises:
            AssertionError: If ``n_splits`` is smaller than 1.
        """

        # Sanity check: we cannot have less than 1 split.
        # This is raised explicitly (instead of using an ``assert``
        # statement) so that the check is not stripped when Python runs
        # with ``-O``; the exception type is kept as ``AssertionError``
        # so that existing callers keep working.
        if not n_splits >= 1:
            raise AssertionError('n_splits must be a positive integer!')

        self.n_splits = n_splits

    def split(self, X: np.ndarray) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        """
        Generate indices to split data into training and test set.

        Args:
            X: A 2D numpy array of shape `(n_samples, n_features)` that
                contains the training data. Only the size of the first
                axis is used.

        Yields:
            A 2-tuple consisting of

            * ``train_idx``: A 1D numpy array containing the training
              set indices for that split.
            * ``test_idx``: A 1D numpy array containing the testing set
              indices for that split.

            For ``n_splits=1``, a single tuple is yielded in which both
            entries contain *all* indices.
        """

        # Get the number of samples (= number of rows in X)
        n_samples = X.shape[0]

        # Initialize the array of indices that gets split into train / test
        indices = np.arange(n_samples)

        # If n_splits = 1, we do not need to split. Instead, we simply return
        # the indices right away such that train_idx == test_idx. (This is for
        # compatibility reasons in cases where we do not really want to split
        # the data into training and test.)
        if self.n_splits == 1:
            yield indices, indices
            return

        # Otherwise, generate indices for alternating splitting scheme:
        # the i-th split holds out every `n_splits`-th sample, starting at
        # offset i, and trains on everything else.
        for i in range(self.n_splits):
            test_idx = indices[i :: self.n_splits]
            # `indices` holds unique values, so `assume_unique=True` is
            # safe here and lets numpy skip the de-duplication step.
            train_idx = np.setdiff1d(indices, test_idx, assume_unique=True)
            yield train_idx, test_idx