Source code for sktools.model_selection

import numpy as np
from sklearn.utils import check_array
from sklearn.model_selection import GridSearchCV


class BootstrapFold:
    """Cross-validation splitter that creates folds by bootstrapping.

    For each fold a bootstrap sample (row indices drawn with replacement) is
    used as training data. The test data is the rest of the data, i.e. the
    rows that are not in the bootstrap sample.

    With ``size_fraction=1`` the average size of the test data is ``1/e``
    (about 36.8%) of the total data.

    Parameters
    ----------
    n_bootstraps : int, default=10
        Number of folds of the cross-validation setting.
    size_fraction : float, default=1
        Size of each bootstrap sample, as a fraction of the number of rows.
        The lower, the bigger the test set.
    random_state : int, random generator or None, default=None
        Source of randomness. ``None`` uses NumPy's global random state (so
        ``np.random.seed`` is honoured), an int seeds a fresh
        ``numpy.random.RandomState`` on every call to :meth:`split`, and any
        object exposing a ``choice`` method (``RandomState``, ``Generator``)
        is used as is.

    Examples
    --------
    Test lengths vary from run to run because the sampling is random.

    >>> import numpy as np
    >>> from sktools.model_selection import BootstrapFold
    >>> X = np.array([
    ...     np.random.randint(1, 3, 1000),
    ...     np.random.randint(0, 2, 1000)]
    ... ).T
    >>> cv = BootstrapFold(10, size_fraction=1)
    >>> for train_index, test_index in cv.split(X):
    ...     print(f"Train length: {len(train_index)} Test length: {len(test_index)}")
    Train length: 1000 Test length: 393
    Train length: 1000 Test length: 367
    Train length: 1000 Test length: 372
    Train length: 1000 Test length: 377
    Train length: 1000 Test length: 361
    Train length: 1000 Test length: 356
    Train length: 1000 Test length: 366
    Train length: 1000 Test length: 369
    Train length: 1000 Test length: 390
    Train length: 1000 Test length: 365

    References
    ----------
    .. [1] Out of sample data for bootstrap sample, from
       https://stats.stackexchange.com/questions/88980/
    """

    def __init__(self, n_bootstraps=10, size_fraction=1, random_state=None):
        self.n_bootstraps = n_bootstraps
        self.size_fraction = size_fraction
        self.random_state = random_state

    @staticmethod
    def _num_rows(X):
        """Return the number of rows of ``X`` without converting its contents."""
        shape = getattr(X, "shape", None)
        if shape is None:
            # Plain sequences (lists, tuples, ...) only need to be sized.
            return len(X)
        if len(shape) == 0:
            raise ValueError("X must have at least one dimension, got a scalar.")
        return int(shape[0])

    def _resolve_rng(self):
        """Return an object whose ``choice`` method draws the bootstrap sample."""
        seed = self.random_state
        if seed is None:
            # Module-level functions share NumPy's global state, which keeps
            # results identical for callers that rely on ``np.random.seed``.
            return np.random
        if hasattr(seed, "choice"):
            return seed
        return np.random.RandomState(seed)

    def split(self, X, y=None, groups=None):
        """Generate train/test row indices for every bootstrap fold.

        Only the number of rows of ``X`` is used; its values are neither
        converted nor validated, so data with missing values or non-numeric
        columns can be split.

        Parameters
        ----------
        X : array-like
            Data to split on. Must expose ``shape`` or ``len``.
        y : object, default=None
            Always ignored, exists for compatibility.
        groups : object, default=None
            Always ignored, exists for compatibility.

        Yields
        ------
        train_idx : numpy.ndarray
            Row indices drawn with replacement (duplicates are expected).
        test_idx : list of int
            Sorted row indices that do not appear in ``train_idx``.

        Raises
        ------
        ValueError
            If ``X`` has no rows or if ``size_fraction`` leads to an empty
            bootstrap sample.
        """
        n_samples = self._num_rows(X)
        if n_samples < 1:
            raise ValueError("Cannot bootstrap an input with 0 rows.")

        sample_size = int(round(self.size_fraction * n_samples))
        if sample_size < 1:
            raise ValueError(
                f"size_fraction={self.size_fraction!r} gives an empty bootstrap "
                f"sample for {n_samples} rows."
            )

        rng = self._resolve_rng()
        all_rows = np.arange(n_samples)

        for _ in range(self.n_bootstraps):
            train_idx = rng.choice(n_samples, sample_size)
            # setdiff1d returns the sorted, unique out-of-sample rows.
            test_idx = np.setdiff1d(all_rows, train_idx).tolist()
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_bootstraps