Source code for sktools.quantilegroups

"""Grouped quantile, percentile and mean featurizers.

Defines three scikit-learn style transformers that relate a numeric column to
a categorical (group) column:

* ``GroupedQuantileTransformer`` - quantile of the value within its group.
* ``PercentileGroupFeaturizer`` - comparison against a per-group percentile.
* ``MeanGroupFeaturizer`` - comparison against the per-group mean.
"""

__author__ = "david26694"


import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer


class GroupedQuantileTransformer(BaseEstimator, TransformerMixin):
    """
    Computes the group quantile of a numeric feature with respect to a categorical feature.

    For instance, if each datum is an apartment, and we have both the price and the city,
    this feature tries to model how expensive is an apartment in its city. The most
    expensive apartment in the city will score 1, and the cheapest will score 0.

    It is equivalent to what is done in:
    https://stackoverflow.com/questions/33899369/ranking-order-per-group-in-pandas

    Parameters
    ----------

    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    handle_missing: str
        With 'value' (the default) an extra transformer is fitted on the whole
        column and used for rows whose group is null or was not seen during fit.
        With any other value (e.g. 'return_nan') those rows get NaN.
    n_quantiles: int
        number of quantiles per category.
    subsample: int
        Maximum number of samples used to estimate the quantiles for computational efficiency.
    random_state: Any
        Determines random number generation for subsampling and smoothing noise. Please see ``subsample`` for more details.
        Pass an int for reproducible results across multiple function calls. See :term:`Glossary <random_state>`
    copy: bool
        Set to False to perform inplace transformation and avoid a copy (if the input is already a numpy array).

    Attributes
    ----------

    transformers: dict or None
        ``{numeric column: {category: fitted QuantileTransformer}}``; None until
        ``fit`` is called. The fallback transformer is stored under ``np.nan``.
    transformer_dict: dict
        ``{group column: {category: fitted QuantileTransformer}}``. When several
        numeric columns share a group column, this holds the transformers of the
        last one fitted; ``transformers`` keeps them all.

    Example
    -------
    >>> from sktools import GroupedQuantileTransformer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...         {
    ...             "price": [1, 2, 3, 3, 2, 10, 0],
    ...             "city": ["a", "a", "a", "b", "b", None, None],
    ...         }
    ...     )
    >>> featurizer = GroupedQuantileTransformer(feature_mapping={"price": "city"})
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'price_quantile_city'], dtype='object')

    """

    def __init__(
        self,
        feature_mapping,
        handle_missing="value",
        n_quantiles=1000,
        subsample=int(1e5),
        random_state=None,
        copy=True,
    ):
        self.transformer_dict = {}
        self.feature_mapping = feature_mapping
        self.transformers = None
        self.n_quantiles = n_quantiles
        self.subsample = subsample
        self.random_state = random_state
        self.copy = copy
        self.handle_missing = handle_missing

    def _new_quantile_transformer(self):
        """Build an unfitted QuantileTransformer from this estimator's settings."""
        return QuantileTransformer(
            n_quantiles=self.n_quantiles,
            subsample=self.subsample,
            random_state=self.random_state,
            copy=self.copy,
        )

    def _fitted_transformers(self, col, group):
        """Return the ``{category: transformer}`` mapping fitted for ``col``.

        Falls back to the group-keyed ``transformer_dict`` when the per-column
        mapping is not available (``transformers`` is still None or lacks
        ``col``).
        """
        if self.transformers and col in self.transformers:
            return self.transformers[col]
        return self.transformer_dict[group]

    def fit(self, X, y=None):
        """Fit one QuantileTransformer per (numeric column, category) pair.

        Parameters
        ----------
        X: pandas.DataFrame
            Must contain every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        # Start from a clean state so a refit never mixes in stale transformers
        self.transformer_dict = {}
        self.transformers = {}

        for col, group in self.feature_mapping.items():

            per_category = {}

            for category in X[group].unique():
                # Null categories (None *or* NaN) never match ``==``, so a
                # dedicated transformer would be fitted on an empty selection.
                if pd.isnull(category):
                    continue

                transformer = self._new_quantile_transformer()
                transformer.fit(X.loc[X[group] == category, [col]])
                per_category[category] = transformer

            # Fallback for null / unseen groups -> use the whole distribution
            if self.handle_missing == "value":
                default_transformer = self._new_quantile_transformer()
                default_transformer.fit(X.loc[:, [col]])
                per_category[np.nan] = default_transformer

            # Keyed by column so two columns sharing a group do not collide
            self.transformers[col] = per_category
            self.transformer_dict[group] = per_category

        return self

    def transform(self, X):
        """Add a ``{col}_quantile_{group}`` column for every mapped pair.

        Rows whose group was seen during fit use that group's transformer.
        Rows whose group is null or unseen use the fallback transformer when
        one was fitted (``handle_missing == "value"``) and get NaN otherwise.

        Parameters
        ----------
        X: pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` with the new columns when ``copy`` is true,
            otherwise ``X`` itself, modified in place.
        """
        if self.copy:
            X = X.copy()

        for col, group in self.feature_mapping.items():

            fitted = self._fitted_transformers(col, group)
            known = {
                category: transformer
                for category, transformer in fitted.items()
                if not pd.isnull(category)
            }
            default_transformer = next(
                (
                    transformer
                    for category, transformer in fitted.items()
                    if pd.isnull(category)
                ),
                None,
            )

            transform_feature_name = f"{col}_quantile_{group}"
            X[transform_feature_name] = np.full(X.shape[0], np.nan)

            # Regular categories: use the transformer fitted on that category
            for category in X[group].unique():
                transformer = known.get(category)
                if transformer is None:
                    continue

                rows = X[group] == category
                # ravel: transform returns shape (n, 1), the target is 1-D
                X.loc[rows, transform_feature_name] = transformer.transform(
                    X.loc[rows, [col]]
                ).ravel()

            # New categories and nulls, handled once instead of per category
            other_rows = ~X[group].isin(set(known))
            if default_transformer is not None and other_rows.any():
                X.loc[other_rows, transform_feature_name] = (
                    default_transformer.transform(X.loc[other_rows, [col]]).ravel()
                )

        return X


class PercentileGroupFeaturizer(BaseEstimator, TransformerMixin):
    """
    Creates features establishing a relationship between a numeric and a categorical feature,
    by using a given percentile of the numeric feature in each category.

    For instance, if each datum is an apartment, and we have both the price and the city,
    if we use the percentile 50 the features model how expensive is an apartment
    with respect to the median in the city.

    Parameters
    ----------

    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    percentile: int
        percentile used to compute features
    create_features: bool
        If false, it just computes percentiles by category
    handle_missing: str
        options are 'return_nan' and 'value', defaults to 'value', which uses the
        global percentile for rows whose group is null.
    handle_unknown: str
        options are 'return_nan' and 'value', defaults to 'value', which uses the
        global percentile for rows whose group was not seen during fit. This
        applies independently of ``handle_missing``.

    Attributes
    ----------

    saved_percentiles: dict
        ``{numeric column: DataFrame}`` with one row per group seen during fit
        and, when ``handle_missing == 'value'``, an extra row whose group is NaN
        holding the global percentile.
    global_percentiles: dict
        ``{numeric column: global percentile}`` computed during fit.

    Example
    -------
    >>> from sktools import PercentileGroupFeaturizer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...         {
    ...             "price": [1, 2, 3, 3, 2, 10, 0],
    ...             "city": ["a", "a", "a", "b", "b", None, None],
    ...         }
    ...     )
    >>> featurizer = PercentileGroupFeaturizer(
    ...     feature_mapping={"price": "city"}
    ... )
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'p50_price_city', 'diff_p50_price_city',
           'relu_diff_p50_price_city', 'ratio_p50_price_city'],
          dtype='object')


    """

    def __init__(
        self,
        feature_mapping,
        percentile=50,
        create_features=True,
        handle_missing="value",
        handle_unknown="value",
    ):
        self.feature_mapping = feature_mapping
        self.percentile = percentile
        self.create_features = create_features
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.saved_percentiles = {}
        self.global_percentiles = {}

    def _global_percentile(self, col, group, pctl_col_name):
        """Return the global percentile fitted for ``col``, or None if unknown.

        Prefers ``global_percentiles``; otherwise falls back to the null-group
        row of the saved table, which only exists when the estimator was
        fitted with ``handle_missing == 'value'``.
        """
        stored = getattr(self, "global_percentiles", None) or {}
        if col in stored:
            return stored[col]

        lookup = self.saved_percentiles[col]
        fallback = lookup.loc[lookup[group].isnull(), pctl_col_name]
        return fallback.iloc[0] if len(fallback) else None

    def fit(self, X, y=None):
        """Compute the per-group and global percentile of every mapped column.

        Parameters
        ----------
        X: pandas.DataFrame
            Must contain every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        quantile = self.percentile / 100
        self.saved_percentiles = {}
        self.global_percentiles = {}

        for col, group in self.feature_mapping.items():

            pctl_col_name = f"p{self.percentile}_{col}_{group}"

            # Percentile by group (groupby drops null groups by default)
            pctl_df = (
                X.groupby(group)[col]
                .quantile(quantile)
                .rename(pctl_col_name)
                .reset_index()
            )

            # Always remembered so unknown groups can be imputed even when
            # missing groups are configured to return NaN
            global_pctl = X[col].quantile(quantile)
            self.global_percentiles[col] = global_pctl

            # Regular handle missing -> add global percentile to missing
            if self.handle_missing == "value":
                global_pctl_df = pd.DataFrame(
                    {group: np.nan, pctl_col_name: [global_pctl]}
                )

                pctl_df = pd.concat([pctl_df, global_pctl_df])

            self.saved_percentiles[col] = pctl_df

        return self

    def transform(self, X):
        """Attach the group percentile and, optionally, derived features.

        For every mapped pair a ``p{percentile}_{col}_{group}`` column is added.
        When ``create_features`` is true, ``diff_``, ``relu_diff_`` and
        ``ratio_`` columns comparing ``col`` to that percentile are added too.

        Parameters
        ----------
        X: pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame with the added columns and the same index as ``X``.
        """
        X = X.copy()

        for col, group in self.feature_mapping.items():

            pctl_col_name = f"p{self.percentile}_{col}_{group}"
            lookup = self.saved_percentiles[col]

            # merge returns a fresh RangeIndex; the lookup keys are unique so
            # the left join keeps row count and order and the index can be put
            # back as it was
            row_index = X.index
            X = X.merge(lookup, on=group, how="left")
            X.index = row_index

            global_pctl = self._global_percentile(col, group, pctl_col_name)

            if global_pctl is not None:
                missing_group = X[group].isnull()
                to_fill = pd.Series(False, index=X.index)

                # New categories: not null and not seen during fit
                if self.handle_unknown == "value":
                    to_fill |= ~missing_group & ~X[group].isin(lookup[group])

                # Null groups are normally already filled by the merge against
                # the null row of the lookup; this only covers leftovers
                if self.handle_missing == "value":
                    to_fill |= missing_group

                to_fill &= X[pctl_col_name].isnull()
                X.loc[to_fill, pctl_col_name] = global_pctl

            # Then trivially create features
            if self.create_features:
                diff_name = f"diff_p{self.percentile}_{col}_{group}"
                relu_diff_name = f"relu_diff_p{self.percentile}_{col}_{group}"
                ratio_name = f"ratio_p{self.percentile}_{col}_{group}"

                X[diff_name] = X[col] - X[pctl_col_name]
                X[relu_diff_name] = X[diff_name].clip(0, np.inf)
                X[ratio_name] = X[col] / X[pctl_col_name]

        return X


class MeanGroupFeaturizer(BaseEstimator, TransformerMixin):
    """
    Creates features establishing a relationship between a numeric and a categorical feature,
    by using the mean of the numeric feature in each category.

    For instance, if each datum is an apartment, and we have both the price and the city,
    the features model how expensive is an apartment with respect to the mean in the city.

    Parameters
    ----------

    feature_mapping: dict
        mapping from numeric variables to categories that want to be used as groups.
    create_features: bool
        If false, it just computes means by category
    handle_missing: str
        options are 'return_nan' and 'value', defaults to 'value', which uses the
        global mean for rows whose group is null.
    handle_unknown: str
        options are 'return_nan' and 'value', defaults to 'value', which uses the
        global mean for rows whose group was not seen during fit. This applies
        independently of ``handle_missing``.

    Attributes
    ----------

    saved_mean: dict
        ``{numeric column: DataFrame}`` with one row per group seen during fit
        and, when ``handle_missing == 'value'``, an extra row whose group is NaN
        holding the global mean.
    global_means: dict
        ``{numeric column: global mean}`` computed during fit.

    Example
    -------
    >>> from sktools import MeanGroupFeaturizer
    >>> import pandas as pd
    >>> X = pd.DataFrame(
    ...         {
    ...             "price": [1, 2, 3, 3, 2, 10, 0],
    ...             "city": ["a", "a", "a", "b", "b", None, None],
    ...         }
    ...     )
    >>> featurizer = MeanGroupFeaturizer(
    ...     feature_mapping={"price": "city"}
    ... )
    >>> print(featurizer.fit_transform(X).columns)
    Index(['price', 'city', 'mean_price_city', 'diff_mean_price_city',
           'relu_diff_mean_price_city', 'ratio_mean_price_city'],
          dtype='object')


    """

    def __init__(
        self,
        feature_mapping,
        create_features=True,
        handle_missing="value",
        handle_unknown="value",
    ):
        self.feature_mapping = feature_mapping
        self.create_features = create_features
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.saved_mean = {}
        self.global_means = {}

    def _global_mean(self, col, group, mean_col_name):
        """Return the global mean fitted for ``col``, or None if unknown.

        Prefers ``global_means``; otherwise falls back to the null-group row of
        the saved table, which only exists when the estimator was fitted with
        ``handle_missing == 'value'``.
        """
        stored = getattr(self, "global_means", None) or {}
        if col in stored:
            return stored[col]

        lookup = self.saved_mean[col]
        fallback = lookup.loc[lookup[group].isnull(), mean_col_name]
        return fallback.iloc[0] if len(fallback) else None

    def fit(self, X, y=None):
        """Compute the per-group and global mean of every mapped column.

        Parameters
        ----------
        X: pandas.DataFrame
            Must contain every column named in ``feature_mapping``.
        y: ignored

        Returns
        -------
        self
        """
        self.saved_mean = {}
        self.global_means = {}

        for col, group in self.feature_mapping.items():

            mean_col_name = f"mean_{col}_{group}"

            # Mean by group (groupby drops null groups by default)
            mean_df = (
                X.groupby(group)[col]
                .mean()
                .rename(mean_col_name)
                .reset_index()
            )

            # Always remembered so unknown groups can be imputed even when
            # missing groups are configured to return NaN
            global_mean = X[col].mean()
            self.global_means[col] = global_mean

            # Regular handle missing -> add global mean to missing
            if self.handle_missing == "value":
                global_mean_df = pd.DataFrame(
                    {group: np.nan, mean_col_name: [global_mean]}
                )

                mean_df = pd.concat([mean_df, global_mean_df])

            self.saved_mean[col] = mean_df

        return self

    def transform(self, X):
        """Attach the group mean and, optionally, derived features.

        For every mapped pair a ``mean_{col}_{group}`` column is added. When
        ``create_features`` is true, ``diff_mean_``, ``relu_diff_mean_`` and
        ``ratio_mean_`` columns comparing ``col`` to that mean are added too.

        Parameters
        ----------
        X: pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame with the added columns and the same index as ``X``.
        """
        X = X.copy()

        for col, group in self.feature_mapping.items():

            mean_col_name = f"mean_{col}_{group}"
            lookup = self.saved_mean[col]

            # merge returns a fresh RangeIndex; the lookup keys are unique so
            # the left join keeps row count and order and the index can be put
            # back as it was
            row_index = X.index
            X = X.merge(lookup, on=group, how="left")
            X.index = row_index

            global_mean = self._global_mean(col, group, mean_col_name)

            if global_mean is not None:
                missing_group = X[group].isnull()
                to_fill = pd.Series(False, index=X.index)

                # New categories: not null and not seen during fit
                if self.handle_unknown == "value":
                    to_fill |= ~missing_group & ~X[group].isin(lookup[group])

                # Null groups are normally already filled by the merge against
                # the null row of the lookup; this only covers leftovers
                if self.handle_missing == "value":
                    to_fill |= missing_group

                to_fill &= X[mean_col_name].isnull()
                X.loc[to_fill, mean_col_name] = global_mean

            # Then trivially create features
            if self.create_features:
                diff_name = f"diff_mean_{col}_{group}"
                relu_diff_name = f"relu_diff_mean_{col}_{group}"
                ratio_name = f"ratio_mean_{col}_{group}"

                X[diff_name] = X[col] - X[mean_col_name]
                X[relu_diff_name] = X[diff_name].clip(0, np.inf)
                X[ratio_name] = X[col] / X[mean_col_name]

        return X