Source code for sktools.preprocessing

# Identifiers of the authors of this module.
__author__ = ["david26694", "cmougan"]

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import check_classification_targets
import scipy


class CyclicFeaturizer(BaseEstimator, TransformerMixin):
    """Cyclic featurizer

    Given some numeric columns, applies sine and cosine transformations to
    obtain cyclic features. This is specially suited to month of the year,
    day of the week, day of the month, hour of the day, etc., where the plain
    numeric representation doesn't work very well.

    Parameters
    ----------
    cols : list
        Columns to be encoded using sine and cosine transformations.
        Should be numeric columns.
    period_mapping : dict, default=None
        Keys should be the names in ``cols`` and values should be tuples
        ``(minimum, maximum)`` of the values each column can take. When
        ``None``, the bounds are learned from the data passed to ``fit``.

    Example
    -------
    >>> from sktools import CyclicFeaturizer
    >>> import pandas as pd
    >>> df = pd.DataFrame(
    ...     {
    ...         "posted_at": pd.date_range(
    ...             start="1/1/2018", periods=365 * 3, freq="d"
    ...         ),
    ...         "created_at": pd.date_range(
    ...             start="1/1/2018", periods=365 * 3, freq="h"
    ...         ),
    ...     }
    ... )
    >>> df["month_posted"] = df.posted_at.dt.month
    >>> df["hour_created"] = df.created_at.dt.hour
    >>> transformed_df = CyclicFeaturizer(
    ...     cols=["month_posted", "hour_created"]
    ... ).fit_transform(df)
    """

    def __init__(self, cols, period_mapping=None):
        self.cols = cols
        self.period_mapping = period_mapping

    def fit(self, X, y=None):
        """Validate the given period mapping, or learn one from ``X``.

        Parameters
        ----------
        X : indexable by column name (e.g. a pandas DataFrame)
            Data holding every column listed in ``cols``.
        y : ignored
            Accepted only so the transformer can be used as a step of a
            supervised pipeline, which forwards ``y`` to every ``fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``period_mapping`` was provided and its keys differ from
            ``cols``.
        """
        # If the mapping is given there is nothing to learn, only to validate.
        if self.period_mapping is not None:
            if set(self.cols) != set(self.period_mapping.keys()):
                raise ValueError("Keys of period_mapping are not the same as cols")
            return self

        # Learn the observed (min, max) of each column to determine periods.
        # NOTE: the result is stored on ``period_mapping`` itself, so a later
        # call to ``fit`` takes the branch above and reuses these bounds.
        self.period_mapping = {
            col: (X[col].min(), X[col].max()) for col in self.cols
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``<col>_sin`` / ``<col>_cos`` added.

        Parameters
        ----------
        X : object with ``.copy()`` and column indexing (e.g. a DataFrame)
            Data holding every column listed in ``cols``.

        Returns
        -------
        Copy of ``X`` with two extra columns per entry of ``cols``; the
        input itself is not modified.
        """
        X = X.copy()

        for col in self.cols:
            min_col, max_col = self.period_mapping[col]

            # The period counts both endpoints:
            # 24 hours -> 23 - 0 + 1
            # 365 days -> 365 - 1 + 1
            period = max_col - min_col + 1

            # Same angle feeds both projections; compute it once.
            angle = 2 * (X[col] - min_col) * np.pi / period
            X[f"{col}_sin"] = np.sin(angle)
            X[f"{col}_cos"] = np.cos(angle)

        return X
class GradientBoostingFeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Feature generator from a gradient boosting.

    Gradient boosting decision trees are a powerful and very convenient way to
    implement non-linear and tuple transformations. We treat each individual
    tree as a categorical feature that takes as value the index of the leaf an
    instance ends up falling in and then perform one hot encoding for these
    features.

    Parameters
    ----------
    stack_to_X : bool, default = True
        Generates leaves features using the fitted self.gbm and saves them
        in R. If `stack_to_X is True` then `.transform` returns the original
        features with 'R' appended as columns. If `stack_to_X is False` then
        `.transform` returns only the leaves features from 'R'.
    add_probs : bool, default = False
        If `add_probs is True` the output of the fitted model is appended to
        the created features: the `predict_proba` columns for
        classification, or the `predict` value as a single column for
        regression. If `add_probs is False` the created features are binary.
    regression : bool, default = False
        If True a `GradientBoostingRegressor` is used, otherwise a
        `GradientBoostingClassifier`.
    **kwargs
        Keyword arguments forwarded to the underlying gradient boosting
        estimator.

    Example
    -------
    >>> from sktools import GradientBoostingFeatureGenerator
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification()
    >>> mf = GradientBoostingFeatureGenerator()
    >>> mf.fit(X, y)
    >>> mf.transform(X)

    References
    ----------
    .. [1] Practical Lessons from Predicting Clicks on Ads at Facebook, from
       https://research.fb.com/wp-content/uploads/2016/11/practical-lessons-from-predicting-clicks-on-ads-at-facebook.pdf

    .. [2] Feature Generation with Gradient Boosted Decision Trees,
       Towards Data Science, Carlos Mougan
    """

    def __init__(
        self, stack_to_X=True, add_probs=False, regression=False, **kwargs,
    ):
        # Deciding whether to append features or simply return generated
        # features
        self.stack_to_X = stack_to_X
        self.add_probs = add_probs
        self.regression = regression

        if self.regression:
            # Key arguments for the gradient boosting regressor
            self.gbm = GradientBoostingRegressor(**kwargs)
        else:
            # Key arguments for the gradient boosting classifier
            self.gbm = GradientBoostingClassifier(**kwargs)

    def _get_leaves(self, X):
        """Return the leaf index reached in every tree, as a 2-D array."""
        X_leaves = self.gbm.apply(X)

        # The regressor yields (n_samples, n_estimators) while the classifier
        # yields (n_samples, n_estimators, n_classes). Flattening everything
        # after the first axis handles both, including multiclass targets
        # where the last axis is larger than 1.
        return X_leaves.reshape(X_leaves.shape[0], -1)

    def _predict_probs(self, X):
        """Return the model output as a 2-D array ready to be stacked."""
        if self.regression:
            # ``predict`` is 1-D; make it a column so it can be hstack-ed
            # next to the 2-D leaf features.
            return self.gbm.predict(X).reshape(-1, 1)

        return self.gbm.predict_proba(X)

    def _decode_leaves(self, X):
        """One hot encode leaf indices into a dense ``numpy.ndarray``."""
        # ``toarray`` gives a plain ndarray; ``todense`` would give the
        # discouraged ``numpy.matrix`` type.
        return self.encoder.transform(X).toarray()

    def fit(self, X, y):
        """Fit the gradient boosting model and the leaf one hot encoder.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Target. Checked to be a classification target unless
            ``regression`` is set.

        Returns
        -------
        self
        """
        if not self.regression:
            # Check that is a classification target
            check_classification_targets(y)

        self.gbm.fit(X, y)

        # The encoder learns the set of leaves reached by the training data.
        self.encoder = OneHotEncoder(categories="auto")
        self.encoder.fit(self._get_leaves(X))

        return self

    def transform(self, X):
        """Generate the leaf features for ``X``.

        R contains the matrix with the encoded leaves. The shape depends
        upon the parameters. When ``add_probs`` is set, the model output
        (class probabilities, or the predicted value for regression) is
        appended to R as extra columns.

        Returns
        -------
        numpy.ndarray
            ``X`` with R appended as columns if ``stack_to_X`` is set,
            otherwise R alone.
        """
        R = self._decode_leaves(self._get_leaves(X))

        if self.add_probs:
            R = np.hstack((R, self._predict_probs(X)))

        return np.hstack((X, R)) if self.stack_to_X else R