# Source code for sktools.quantilegroups
"""Grouped Quantile Featurizer"""
__author__ = "david26694"
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
class GroupedQuantileTransformer(BaseEstimator, TransformerMixin):
    """
    Computes the group quantile of a numeric feature with respect to a categorical feature.

    For instance, if each datum is an apartment, and we have both the price and the city,
    this feature tries to model how expensive is an apartment in its city. The most
    expensive apartment in the city will score 1, and the cheapest will score 0.

    It is equivalent to what is done in:
    https://stackoverflow.com/questions/33899369/ranking-order-per-group-in-pandas

    Parameters
    ----------
    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    handle_missing: str
        With 'value' (the default), rows whose category is null or was not seen
        during ``fit`` are scored against the distribution of the whole
        training column. With any other value those rows get NaN.
    n_quantiles: int
        number of quantiles per category.
    subsample: int
        Maximum number of samples used to estimate the quantiles for computational efficiency.
    random_state: Any
        Determines random number generation for subsampling and smoothing noise.
        Please see ``subsample`` for more details. Pass an int for reproducible
        results across multiple function calls.
        See :term:`Glossary <random_state>`
    copy: bool
        Set to False to add the new columns to the input frame in place instead
        of working on a copy. The value is also forwarded to the underlying
        ``QuantileTransformer`` instances.

    Example
    -------
    >>> from sktools import GroupedQuantileTransformer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...     {
    ...         "price": [1, 2, 3, 3, 2, 10, 0],
    ...         "city": ["a", "a", "a", "b", "b", None, None],
    ...     }
    ... )
    >>> featurizer = GroupedQuantileTransformer(feature_mapping={"price": "city"})
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'price_quantile_city'], dtype='object')
    """

    def __init__(
        self,
        feature_mapping,
        handle_missing="value",
        n_quantiles=1000,
        subsample=int(1e5),
        random_state=None,
        copy=True,
    ):
        # Fitted state, kept for backward compatibility:
        # {group: {category: QuantileTransformer}}.
        self.transformer_dict = {}
        # Fitted state actually used by transform, keyed by the numeric column
        # so that two columns sharing one group do not overwrite each other.
        self._feature_transformers = {}
        self.feature_mapping = feature_mapping
        self.transformers = None
        self.n_quantiles = n_quantiles
        self.subsample = subsample
        self.random_state = random_state
        self.copy = copy
        self.handle_missing = handle_missing

    def _new_transformer(self):
        """Build an unfitted QuantileTransformer configured from this estimator."""
        return QuantileTransformer(
            n_quantiles=self.n_quantiles,
            subsample=self.subsample,
            random_state=self.random_state,
            copy=self.copy,
        )

    def fit(self, X, y=None):
        """
        Fit one quantile transformer per (numeric column, category) pair.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        # Start from a clean slate so that refitting never mixes old and new state.
        self.transformer_dict = {}
        self._feature_transformers = {}

        for col, group in self.feature_mapping.items():
            fitted = {}

            for category in X[group].unique():
                # Null categories (None, NaN, NA) select no rows through ``==``,
                # so they cannot get a transformer of their own; they are
                # served by the global fallback below.
                if pd.isnull(category):
                    continue
                rows = X.loc[X[group] == category, [col]]
                fitted[category] = self._new_transformer().fit(rows)

            # Fallback for null / unseen categories: the whole distribution.
            if self.handle_missing == "value":
                fitted[np.nan] = self._new_transformer().fit(X.loc[:, [col]])

            self._feature_transformers[col] = fitted
            # Legacy layout; if several columns share a group the last one wins
            # here, which is why transform reads ``_feature_transformers``.
            self.transformer_dict[group] = fitted

        return self

    def transform(self, X):
        """
        Append a ``{col}_quantile_{group}`` column for every mapped feature.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` (or ``X`` itself when ``copy=False``) with the new
            quantile columns. Rows with a null or unseen category are scored by
            the global transformer when it was fitted
            (``handle_missing="value"``) and are NaN otherwise.
        """
        if self.copy:
            X = X.copy()

        for col, group in self.feature_mapping.items():
            transform_feature_name = f"{col}_quantile_{group}"
            fitted = self._feature_transformers[col]

            # Split by null-ness of the key rather than looking up ``np.nan``
            # directly: a NaN dict key is only found by object identity, which
            # may not hold after the estimator has been pickled and reloaded.
            known = {key: tr for key, tr in fitted.items() if not pd.isnull(key)}
            fallback = next(
                (tr for key, tr in fitted.items() if pd.isnull(key)), None
            )

            groups = X[group]
            quantiles = np.full(X.shape[0], np.nan)

            # Regular categories: use the transformer fitted on that category.
            for category in groups.unique():
                if pd.isnull(category) or category not in known:
                    continue
                mask = (groups == category).to_numpy()
                scored = known[category].transform(X.loc[mask, [col]])
                quantiles[mask] = np.asarray(scored).ravel()

            # Null and unseen categories, handled once for all of them.
            other = ~groups.isin(list(known)).to_numpy()
            if fallback is not None and other.any():
                scored = fallback.transform(X.loc[other, [col]])
                quantiles[other] = np.asarray(scored).ravel()

            X[transform_feature_name] = quantiles

        return X
class PercentileGroupFeaturizer(BaseEstimator, TransformerMixin):
    """
    Creates features establishing a relationship between a numeric and a categorical feature,
    by using a given percentile of the numeric feature in each category.

    For instance, if each datum is an apartment, and we have both the price and the city,
    if we use the percentile 50 the features model how expensive is an apartment
    with respect to the median in the city.

    Parameters
    ----------
    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    percentile: int
        percentile used to compute features
    create_features: bool
        If false, it just computes percentiles by category
    handle_missing: str
        How rows with a null category are treated. Options are 'return_nan' and
        'value', defaults to 'value', which uses the global quantile.
    handle_unknown: str
        How rows with a category not seen during ``fit`` are treated. Options
        are 'return_nan' and 'value', defaults to 'value', which uses the
        global quantile.

    Example
    -------
    >>> from sktools import PercentileGroupFeaturizer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...     {
    ...         "price": [1, 2, 3, 3, 2, 10, 0],
    ...         "city": ["a", "a", "a", "b", "b", None, None],
    ...     }
    ... )
    >>> featurizer = PercentileGroupFeaturizer(
    ...     feature_mapping={"price": "city"}
    ... )
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'p50_price_city', 'diff_p50_price_city',
           'relu_diff_p50_price_city', 'ratio_p50_price_city'],
          dtype='object')
    """

    def __init__(
        self,
        feature_mapping,
        percentile=50,
        create_features=True,
        handle_missing="value",
        handle_unknown="value",
    ):
        self.feature_mapping = feature_mapping
        self.percentile = percentile
        self.create_features = create_features
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        # {col: DataFrame[group, p<percentile>_<col>_<group>]}, filled by fit.
        self.saved_percentiles = {}
        # {col: percentile over the whole training column}, filled by fit.
        self._global_percentiles = {}

    def fit(self, X, y=None):
        """
        Compute the per-group and global percentile of every mapped column.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        quantile = self.percentile / 100
        self.saved_percentiles = {}
        self._global_percentiles = {}

        for col, group in self.feature_mapping.items():
            pctl_col_name = f"p{self.percentile}_{col}_{group}"

            # Percentile by group (null groups are dropped by groupby).
            pctl_df = (
                X.groupby(group)[col]
                .quantile(quantile)
                .reset_index()
                .rename(columns={col: pctl_col_name})
            )

            # Always remember the global percentile so that handle_unknown can
            # use it regardless of how handle_missing is configured.
            global_pctl = X[col].quantile(quantile)
            self._global_percentiles[col] = global_pctl

            # Regular handle missing -> add global percentile as the null group
            if self.handle_missing == "value":
                global_pctl_df = pd.DataFrame(
                    {group: np.nan, pctl_col_name: [global_pctl]}
                )
                pctl_df = pd.concat([pctl_df, global_pctl_df])

            self.saved_percentiles[col] = pctl_df

        return self

    def _reference_values(self, X, col, group, value_col):
        """Return, row by row, the fitted percentile that applies to ``X[group]``."""
        fitted = self.saved_percentiles[col]
        known = fitted[fitted[group].notnull()]
        lookup = dict(zip(known[group], known[value_col]))

        groups = X[group]
        is_missing = groups.isnull()
        is_unknown = ~is_missing & ~groups.isin(list(lookup))

        # Casting to object keeps ``map`` from returning a categorical result
        # when the group column has a categorical dtype.
        values = groups.astype(object).map(lookup).astype(float)

        global_value = self._global_percentiles[col]
        if self.handle_missing == "value":
            values = values.mask(is_missing, global_value)
        if self.handle_unknown == "value":
            values = values.mask(is_unknown, global_value)
        return values

    def transform(self, X):
        """
        Append the group percentile and, optionally, features derived from it.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``, with its index preserved, plus for each mapped
            feature the column ``p<percentile>_<col>_<group>`` and, when
            ``create_features`` is true, the ``diff_``, ``relu_diff_`` and
            ``ratio_`` columns computed against it.
        """
        X = X.copy()

        for col, group in self.feature_mapping.items():
            pctl_col_name = f"p{self.percentile}_{col}_{group}"

            # Positional assignment: values are already in row order, and this
            # avoids any re-alignment on a non-unique index.
            X[pctl_col_name] = self._reference_values(
                X, col, group, pctl_col_name
            ).to_numpy()

            # Then trivially create features
            if self.create_features:
                diff_name = f"diff_p{self.percentile}_{col}_{group}"
                relu_diff_name = f"relu_diff_p{self.percentile}_{col}_{group}"
                ratio_name = f"ratio_p{self.percentile}_{col}_{group}"

                X[diff_name] = X[col] - X[pctl_col_name]
                X[relu_diff_name] = X[diff_name].clip(0, np.inf)
                X[ratio_name] = X[col] / X[pctl_col_name]

        return X
class MeanGroupFeaturizer(BaseEstimator, TransformerMixin):
    """
    Creates features establishing a relationship between a numeric and a categorical feature,
    by using the mean of the numeric feature in each category.

    For instance, if each datum is an apartment, and we have both the price and the city,
    the features model how expensive is an apartment with respect to the mean in the city.

    Parameters
    ----------
    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    create_features: bool
        If false, it just computes means by category
    handle_missing: str
        How rows with a null category are treated. Options are 'return_nan' and
        'value', defaults to 'value', which uses the global mean.
    handle_unknown: str
        How rows with a category not seen during ``fit`` are treated. Options
        are 'return_nan' and 'value', defaults to 'value', which uses the
        global mean.

    Example
    -------
    >>> from sktools import MeanGroupFeaturizer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...     {
    ...         "price": [1, 2, 3, 3, 2, 10, 0],
    ...         "city": ["a", "a", "a", "b", "b", None, None],
    ...     }
    ... )
    >>> featurizer = MeanGroupFeaturizer(
    ...     feature_mapping={"price": "city"}
    ... )
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'mean_price_city', 'diff_mean_price_city',
           'relu_diff_mean_price_city', 'ratio_mean_price_city'],
          dtype='object')
    """

    def __init__(
        self,
        feature_mapping,
        create_features=True,
        handle_missing="value",
        handle_unknown="value",
    ):
        self.feature_mapping = feature_mapping
        self.create_features = create_features
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        # {col: DataFrame[group, mean_<col>_<group>]}, filled by fit.
        self.saved_mean = {}
        # {col: mean over the whole training column}, filled by fit.
        self._global_means = {}

    def fit(self, X, y=None):
        """
        Compute the per-group and global mean of every mapped column.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        self.saved_mean = {}
        self._global_means = {}

        for col, group in self.feature_mapping.items():
            mean_col_name = f"mean_{col}_{group}"

            # Mean by group (null groups are dropped by groupby).
            mean_df = (
                X.groupby(group)[col]
                .mean()
                .reset_index()
                .rename(columns={col: mean_col_name})
            )

            # Always remember the global mean so that handle_unknown can use
            # it regardless of how handle_missing is configured.
            global_mean = X[col].mean()
            self._global_means[col] = global_mean

            # Regular handle missing -> add global mean as the null group
            if self.handle_missing == "value":
                global_mean_df = pd.DataFrame(
                    {group: np.nan, mean_col_name: [global_mean]}
                )
                mean_df = pd.concat([mean_df, global_mean_df])

            self.saved_mean[col] = mean_df

        return self

    def _reference_values(self, X, col, group, value_col):
        """Return, row by row, the fitted mean that applies to ``X[group]``."""
        fitted = self.saved_mean[col]
        known = fitted[fitted[group].notnull()]
        lookup = dict(zip(known[group], known[value_col]))

        groups = X[group]
        is_missing = groups.isnull()
        is_unknown = ~is_missing & ~groups.isin(list(lookup))

        # Casting to object keeps ``map`` from returning a categorical result
        # when the group column has a categorical dtype.
        values = groups.astype(object).map(lookup).astype(float)

        global_value = self._global_means[col]
        if self.handle_missing == "value":
            values = values.mask(is_missing, global_value)
        if self.handle_unknown == "value":
            values = values.mask(is_unknown, global_value)
        return values

    def transform(self, X):
        """
        Append the group mean and, optionally, features derived from it.

        Parameters
        ----------
        X: pandas.DataFrame
            Frame containing every column named in ``feature_mapping``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``, with its index preserved, plus for each mapped
            feature the column ``mean_<col>_<group>`` and, when
            ``create_features`` is true, the ``diff_``, ``relu_diff_`` and
            ``ratio_`` columns computed against it.
        """
        X = X.copy()

        for col, group in self.feature_mapping.items():
            mean_col_name = f"mean_{col}_{group}"

            # Positional assignment: values are already in row order, and this
            # avoids any re-alignment on a non-unique index.
            X[mean_col_name] = self._reference_values(
                X, col, group, mean_col_name
            ).to_numpy()

            # Then trivially create features
            if self.create_features:
                diff_name = f"diff_mean_{col}_{group}"
                relu_diff_name = f"relu_diff_mean_{col}_{group}"
                ratio_name = f"ratio_mean_{col}_{group}"

                X[diff_name] = X[col] - X[mean_col_name]
                X[relu_diff_name] = X[diff_name].clip(0, np.inf)
                X[ratio_name] = X[col] / X[mean_col_name]

        return X