Source code for sktools.model_selection

import numpy as np
from sklearn.utils import check_array
from sklearn.model_selection import GridSearchCV


class BootstrapFold:
    """Create cross-validation folds by bootstrapping.

    For each fold a bootstrap sample (row indices drawn with replacement)
    is used as training data. The test data is the rest of the data, i.e.
    every row that is not in the bootstrap sample (the "out-of-bag" rows).

    When ``size_fraction`` is 1, the average size of the test data is
    about 1/e of the total data.

    Parameters
    ----------
    n_bootstraps: int
        Number of folds of our cross-validation setting.
    size_fraction: float
        Fraction of the data being sampled for training. The lower, the
        bigger the test set.
    random_state: None, int or numpy.random.RandomState
        Source of randomness used to draw the bootstrap samples.
        ``None`` (default) draws from NumPy's global random state, so
        results can only be reproduced through ``np.random.seed``. An int
        seeds a fresh generator on every call to ``split``, giving the same
        folds each time. A ``RandomState`` instance is used as is.

    Example
    -------
    Test lengths vary from run to run; the output below is illustrative.

    >>> import numpy as np
    >>> from sktools.model_selection import BootstrapFold
    >>> X = np.array([
    ...     np.random.randint(1, 3, 1000),
    ...     np.random.randint(0, 2, 1000)]
    ... ).T
    >>> cv = BootstrapFold(10, size_fraction=1)
    >>> for train_index, test_index in cv.split(X):  # doctest: +SKIP
    ...     print(f"Train length: {len(train_index)} Test length: {len(test_index)}")
    Train length: 1000 Test length: 393
    Train length: 1000 Test length: 367
    Train length: 1000 Test length: 372
    Train length: 1000 Test length: 377
    Train length: 1000 Test length: 361
    Train length: 1000 Test length: 356
    Train length: 1000 Test length: 366
    Train length: 1000 Test length: 369
    Train length: 1000 Test length: 390
    Train length: 1000 Test length: 365

    References
    ----------
    .. [1] Out of sample data for bootstrap sample, from
       https://stats.stackexchange.com/questions/88980/
    """

    def __init__(self, n_bootstraps=10, size_fraction=1, random_state=None):
        self.n_bootstraps = n_bootstraps
        self.size_fraction = size_fraction
        self.random_state = random_state

    def _get_rng(self):
        """Return the object whose ``choice`` method draws the samples."""
        seed = self.random_state
        if seed is None:
            # Keep the historical behaviour: use NumPy's global state.
            return np.random
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def _iter_indices(self, n_samples):
        """Yield ``(train_idx, test_idx)`` pairs for ``n_samples`` rows.

        ``train_idx`` is an array of ``round(size_fraction * n_samples)``
        row indices drawn with replacement; ``test_idx`` is the ascending
        list of the row indices that were not drawn.
        """
        rng = self._get_rng()
        # Loop-invariant values are computed once, not once per fold.
        all_idx = np.arange(n_samples)
        sample_size = int(round(self.size_fraction * n_samples))

        for _ in range(self.n_bootstraps):
            train_idx = rng.choice(all_idx, sample_size)
            in_bag = np.zeros(n_samples, dtype=bool)
            in_bag[train_idx] = True
            # Out-of-bag rows, as a list of plain ints in ascending order.
            test_idx = np.flatnonzero(~in_bag).tolist()
            yield train_idx, test_idx

    def split(self, X, y=None, groups=None):
        """Generate the train/test indices of every bootstrap fold.

        Parameters
        ----------
        X: array-like
            Array to split on; validated with ``check_array``.
        y: object
            Always ignored, exists for compatibility.
        groups: object
            Always ignored, exists for compatibility.

        Yields
        ------
        train_idx: numpy.ndarray
            Bootstrapped row indices (may contain repeats).
        test_idx: list of int
            Row indices absent from ``train_idx``, in ascending order.
        """
        X = check_array(X)
        yield from self._iter_indices(X.shape[0])

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds, i.e. ``n_bootstraps``.

        All arguments are ignored and exist for compatibility.
        """
        return self.n_bootstraps