Source code for shapash.explainer.smart_predictor

"""
Smart predictor module
"""
import copy

import pandas as pd

import shapash.explainer.smart_explainer
from shapash.decomposition.contributions import assign_contributions, rank_contributions
from shapash.manipulation.filters import (
    cap_contributions,
    combine_masks,
    cutoff_contributions,
    hide_contributions,
    sign_contributions,
)
from shapash.manipulation.mask import compute_masked_contributions, init_mask
from shapash.manipulation.select_lines import keep_right_contributions
from shapash.manipulation.summarize import create_grouped_features_values, group_contributions, summarize
from shapash.utils.check import (
    check_consistency_model_features,
    check_consistency_model_label,
    check_features_name,
    check_label_dict,
    check_mask_params,
    check_model,
    check_preprocessing,
    check_preprocessing_options,
    check_y,
)
from shapash.utils.columntransformer_backend import columntransformer
from shapash.utils.io import save_pickle
from shapash.utils.model import predict_proba
from shapash.utils.transform import adapt_contributions, apply_postprocessing, apply_preprocessing, preprocessing_tolist


class SmartPredictor:
    """
    The SmartPredictor class is a lighter object than the SmartExplainer object,
    with additional consistency checks.

    The SmartPredictor object is provided to deploy the summary of local
    explanations for operational needs.

    Switching from SmartExplainer to SmartPredictor allows users to reproduce
    the same results automatically on datasets with the right structure.

    SmartPredictor is designed to make new results understandable:
        - It checks the consistency of all parameters
        - It applies preprocessing and postprocessing
        - It computes model contributions
        - It makes predictions
        - It summarizes local explainability

    This class allows the user to automatically summarize the results of a model
    on new datasets (prediction, preprocessing and postprocessing linking,
    explainability).

    The SmartPredictor has several methods described below.

    Attributes
    ----------
    features_dict: dict
        Dictionary mapping technical feature names to domain names.
    model: model object
        Model used to compute predictions and estimate predict_proba.
    backend: str or backend object
        Backend (explainer) used to compute contributions.
    columns_dict: dict
        Dictionary mapping integer column numbers (in the same order as the
        training dataset) to technical feature names.
    features_types: dict
        Dictionary mapping each feature to its expected type.
    label_dict: dict (optional)
        Dictionary mapping integer labels to domain names
        (classification - target values).
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The preprocessing applied to the original data.
    postprocessing: dict (optional)
        Dictionary of postprocessing modifications to apply to the x_init dataframe.
    _case: string
        String that informs whether the model is used for a classification or a
        regression problem.
    _classes: list, None
        List of labels if the model is used for a classification problem,
        None otherwise.
    mask_params: dict (optional)
        Dictionary that specifies how to summarize the explainability.

    How to declare a new SmartPredictor object?

    Example
    -------
    >>> predictor = SmartPredictor(features_dict=my_features_dict,
    >>>                            model=my_model,
    >>>                            backend=my_backend,
    >>>                            columns_dict=my_columns_dict,
    >>>                            features_types=my_features_type_dict,
    >>>                            label_dict=my_label_dict,
    >>>                            preprocessing=my_preprocess,
    >>>                            postprocessing=my_postprocess)

    or the most common syntax

    >>> predictor = xpl.to_smartpredictor()

    xpl, explainer: object
        SmartExplainer instance to point to.
    """

    def __init__(
        self,
        features_dict,
        model,
        columns_dict,
        backend,
        features_types,
        label_dict=None,
        preprocessing=None,
        postprocessing=None,
        features_groups=None,
        mask_params=None,
    ):
        params_dict = [features_dict, features_types, label_dict, columns_dict, postprocessing]

        for params in params_dict:
            if (params is not None) and (not isinstance(params, dict)):
                raise ValueError(
                    """
                    {} must be a dict.
                    """.format(
                        str(params)
                    )
                )

        self.model = model
        self._case, self._classes = self.check_model()
        self.backend = backend
        self.preprocessing = preprocessing
        self.check_preprocessing()
        self.features_dict = features_dict
        self.features_types = features_types
        self.label_dict = label_dict
        self.check_label_dict()
        self.columns_dict = columns_dict
        self.mask_params = (
            mask_params
            if mask_params is not None
            else {"features_to_hide": None, "threshold": None, "positive": None, "max_contrib": None}
        )
        self.check_mask_params()
        self.postprocessing = postprocessing
        self.features_groups = features_groups
        list_preprocessing = preprocessing_tolist(self.preprocessing)
        check_consistency_model_features(
            self.features_dict,
            self.model,
            self.columns_dict,
            self.features_types,
            self.mask_params,
            self.preprocessing,
            self.postprocessing,
            list_preprocessing,
            self.features_groups,
        )
        check_consistency_model_label(self.columns_dict, self.label_dict)
        self._drop_option = check_preprocessing_options(columns_dict, features_dict, preprocessing, list_preprocessing)

    def check_model(self):
        """
        Check that the model has a predict_proba method, that its prediction is a
        one-column dataframe of integers or floats, and that the y_pred index
        matches the x_init index.

        Returns
        -------
        string:
            'regression' or 'classification' according to the attributes of the model
        """
        _case, _classes = check_model(self.model)
        return _case, _classes

    def check_preprocessing(self):
        """
        Check that all transformations of the preprocessing are supported.
        """
        return check_preprocessing(self.preprocessing)

    def check_label_dict(self):
        """
        Check that label_dict and the model _classes match.
        """
        if self._case != "regression":
            return check_label_dict(self.label_dict, self._case, self._classes)

    def check_mask_params(self):
        """
        Check that the mask_params given respect the expected format.
        """
        return check_mask_params(self.mask_params)
    def add_input(self, x=None, ypred=None, contributions=None):
        """
        The add_input method is the first step to add a dataset for prediction
        and explainability.

        add_input applies to the x parameter:
            - consistency checks
            - the preprocessing and postprocessing specified during initialisation
            - features reordering in the right order for the model

        If you don't specify ypred or contributions, add_input computes them.

        It is possible to leave a parameter unspecified if it has already been
        defined before. For example, the user may specify a ypred without
        reinitializing the dataset x already defined.
        If the user declares a new input x, all the stored parameters are cleaned.

        Example
        --------
        >>> predictor.add_input(x=xtest_df)
        >>> predictor.add_input(ypred=ytest_df)

        Parameters
        ----------
        x: dict, pandas.DataFrame (optional)
            Raw dataset used by the model to perform the prediction (not preprocessed).
        ypred: pandas.DataFrame (optional)
            User-specified prediction values.
        contributions: pandas.DataFrame (regression) or list (classification) (optional)
            Local contributions, aggregated if the preprocessing part requires it
            (e.g. one-hot encoding).
        """
        if x is not None:
            x = self.check_dataset_features(self.check_dataset_type(x))
            self.data = self.clean_data(x)
            self.data["x_postprocessed"] = self.apply_postprocessing()
            try:
                self.data["x_preprocessed"] = self.apply_preprocessing()
            except BaseException:
                raise ValueError(
                    """
                    Preprocessing has failed. The preprocessing specified or the dataset doesn't match.
                    """
                )
        else:
            if not hasattr(self, "data"):
                raise ValueError("No dataset x specified.")

        if ypred is not None:
            self.data["ypred_init"] = self.check_ypred(ypred)

        if contributions is not None:
            self.data["ypred"], self.data["contributions"] = self.compute_contributions(
                contributions=contributions, use_groups=False
            )
        else:
            self.data["ypred"], self.data["contributions"] = self.compute_contributions(use_groups=False)

        if self.features_groups is not None:
            self._add_groups_input()
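    # Hedged usage sketch (illustrative, not shipped code): x may be a single
    # observation passed as a dict (converted internally by convert_dict_dataset)
    # or a full DataFrame; ypred can then be added without resetting x.
    # `xtest_df`, `ytest_df` and the feature names are hypothetical.
    #
    # >>> predictor.add_input(x={"age": 30, "income": 2500.0})
    # >>> predictor.add_input(x=xtest_df)      # declaring a new x cleans stored data
    # >>> predictor.add_input(ypred=ytest_df)  # keeps the x defined above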
    def _add_groups_input(self):
        """
        Compute groups of features values and contributions the same way as the
        add_input method, and store them in the data_groups attribute.
        """
        self.data_groups = dict()
        self.data_groups["x_postprocessed"] = create_grouped_features_values(
            x_init=self.data["x_postprocessed"],
            x_encoded=self.data["x_preprocessed"],
            preprocessing=self.preprocessing,
            features_groups=self.features_groups,
            features_dict=self.features_dict,
            how="dict_of_values",
        )
        self.data_groups["ypred"] = self.data["ypred"]
        self.data_groups["contributions"] = group_contributions(
            contributions=self.data["contributions"], features_groups=self.features_groups
        )

    def check_dataset_type(self, x=None):
        """
        Check that the dataset x given respects the expected format.

        Parameters
        ----------
        x: dict, pandas.DataFrame (optional)
            Raw dataset used by the model to perform the prediction (not preprocessed).

        Returns
        -------
        x: pandas.DataFrame
            Raw dataset used by the model to perform the prediction (not preprocessed).
        """
        if not (type(x) in [pd.DataFrame, dict]):
            raise ValueError(
                """
                x must be a dict or a pandas.DataFrame.
                """
            )
        else:
            x = self.convert_dict_dataset(x)
        return x

    def convert_dict_dataset(self, x):
        """
        Convert x to a dataframe if the dataset specified is a dict.

        Parameters
        ----------
        x: dict
            Raw dataset used by the model to perform the prediction (not preprocessed).

        Returns
        -------
        x: pandas.DataFrame
            Raw dataset used by the model to perform the prediction (not preprocessed).
        """
        if type(x) == dict:
            if not all([column in self.features_types.keys() for column in x.keys()]):
                raise ValueError(
                    """
                    All features from dataset x must be in the features_types dict initialized.
                    """
                )
            try:
                x = pd.DataFrame.from_dict(x, orient="index").T
                for feature, type_feature in self.features_types.items():
                    x[feature] = x[feature].astype(type_feature)
            except BaseException:
                raise ValueError(
                    """
                    The structure of the given dict x isn't in the right format.
                    """
                )
        return x

    def check_dataset_features(self, x):
        """
        Check that the features of the dataset x have the expected types before
        using the preprocessing and the model.

        Parameters
        ----------
        x: pandas.DataFrame (optional)
            Raw dataset used by the model to perform the prediction (not preprocessed).
        """
        assert all(column in self.columns_dict.values() for column in x.columns)
        if not all([type(key) == int for key in self.columns_dict.keys()]):
            raise ValueError("columns_dict must have only integer keys for features order.")
        features_order = []
        for order in range(min(self.columns_dict.keys()), max(self.columns_dict.keys()) + 1):
            features_order.append(self.columns_dict[order])
        x = x[features_order]
        assert all(column in self.features_types.keys() for column in x.columns)
        if not all([str(x[feature].dtypes) == self.features_types[feature] for feature in x.columns]):
            raise ValueError(
                """
                Types of features in x don't match the expected ones in features_types.
                x input must be the initial dataset without preprocessing applied.
                """
            )
        return x

    def check_ypred(self, ypred=None):
        """
        Check that the given ypred has the right shape and expected values.

        Parameters
        ----------
        ypred: pandas.DataFrame (optional)
            User-specified prediction values.
        """
        return check_y(self.data["x"], ypred)

    def adapt_contributions(self, contributions):
        """
        If _case is "classification" and contributions is a np.array or a
        pd.DataFrame, this function transforms the contributions matrix into a
        list of 2 contributions matrices: the opposite contributions matrix and
        the contributions matrix.

        Parameters
        ----------
        contributions : pandas.DataFrame, np.ndarray or list

        Returns
        -------
        pandas.DataFrame, np.ndarray or list
            contributions object modified
        """
        return adapt_contributions(self._case, contributions)

    def check_contributions(self, contributions):
        """
        Check that contributions and the prediction set match in terms of shape
        and index.
        """
        if self._drop_option is not None:
            x = self.data["x"][self.data["x"].columns.difference(self._drop_option["features_to_drop"])]
        else:
            x = self.data["x"]
        if not self.backend.state.check_contributions(contributions, x, features_names=False):
            raise ValueError(
                """
                Prediction set and contributions should have exactly the same number of lines
                and number of columns. The order of the columns must be the same.
                Please check x, contributions and preprocessing arguments.
                """
            )

    def clean_data(self, x):
        """
        Reset the stored data when a new x is defined.

        Parameters
        ----------
        x: pandas.DataFrame
            Raw dataset used by the model to perform the prediction (not preprocessed).

        Returns
        -------
        dict of data stored
        """
        return {
            "x": x,
            "ypred_init": None,
            "ypred": None,
            "contributions": None,
            "x_preprocessed": None,
            "x_postprocessed": None,
        }
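    # Hedged illustration of the dict-to-DataFrame conversion performed by
    # convert_dict_dataset above: a single observation becomes a one-row frame,
    # then each column is cast to its declared type. Feature names and types
    # are hypothetical.
    #
    # >>> features_types = {"age": "int64", "income": "float64"}
    # >>> x = pd.DataFrame.from_dict({"age": 30, "income": 2500.0}, orient="index").T
    # >>> for feature, type_feature in features_types.items():
    # ...     x[feature] = x[feature].astype(type_feature)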
    def predict_proba(self):
        """
        The predict_proba method computes the probabilities predicted for each
        x row defined in add_input.

        Returns
        -------
        pandas.DataFrame
            A dataset with all probabilities of each label if there is no ypred data,
            or a dataset with ypred and the associated probability.

        Example
        --------
        >>> predictor.add_input(x=xtest_df)
        >>> predictor.predict_proba()
        """
        return predict_proba(self.model, self.data["x_preprocessed"], self._classes)
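    # Hedged sketch: for a binary classifier, the returned frame would typically
    # hold one probability column per class, indexed like x_preprocessed.
    # Column names and values below are purely illustrative.
    #
    # >>> predictor.add_input(x=xtest_df)
    # >>> predictor.predict_proba()
    #        class_0   class_1
    #     0  0.243584  0.756416
    #     1  0.371089  0.628911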
    def compute_contributions(self, contributions=None, use_groups=None):
        """
        The compute_contributions method computes the contributions associated
        with the ypred data specified. It needs ypred data specified in an
        add_input to display detail_contributions.

        Parameters
        -------
        contributions : object (optional)
            Local contributions, or list of local contributions.
        use_groups : bool (optional)
            Whether or not to compute groups of features contributions.

        Returns
        -------
        pandas.DataFrame
            Data with contributions associated with the ypred specified.
        pandas.DataFrame
            ypred data with the right probabilities associated.
        """
        use_groups = True if (use_groups is not False and self.features_groups is not None) else False

        if not hasattr(self, "data"):
            raise ValueError("add_input method must be called at least once.")
        if self.data["x"] is None:
            raise ValueError(
                """
                x must be specified in an add_input method to apply detail_contributions.
                """
            )
        if self.data["ypred_init"] is None:
            self.predict()

        if contributions is None:
            explain_data = self.backend.run_explainer(x=self.data["x_preprocessed"])
            contributions = self.backend.get_local_contributions(
                explain_data=explain_data, x=self.data["x_preprocessed"]
            )
        else:
            contributions = self.backend.format_and_aggregate_local_contributions(
                x=self.data["x_preprocessed"], contributions=contributions
            )
        self.check_contributions(contributions)
        proba_values = self.predict_proba() if self._case == "classification" else None
        y_pred, match_contrib = keep_right_contributions(
            self.data["ypred_init"], contributions, self._case, self._classes, self.label_dict, proba_values
        )
        if use_groups:
            match_contrib = group_contributions(match_contrib, features_groups=self.features_groups)

        return y_pred, match_contrib
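    # Hedged sketch: contributions may also be supplied precomputed (for example
    # a matrix of shap values aligned with the preprocessed columns); in that
    # case compute_contributions only formats, checks and matches them with
    # ypred instead of running the backend. `my_shap_values` is hypothetical.
    #
    # >>> y_pred, contribs = predictor.compute_contributions(contributions=my_shap_values)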
    def detail_contributions(self, contributions=None, use_groups=None):
        """
        The detail_contributions method associates the right contributions with
        the right predicted data (with ypred specified in add_input or computed
        automatically).

        Parameters
        -------
        contributions : object (optional)
            Local contributions, or list of local contributions.
        use_groups : bool (optional)
            Whether or not to compute groups of features contributions.

        Returns
        -------
        pandas.DataFrame
            A dataset with ypred and the right associated contributions.

        Example
        --------
        >>> predictor.add_input(x=xtest_df)
        >>> predictor.detail_contributions()
        """
        y_pred, detail_contrib = self.compute_contributions(contributions=contributions, use_groups=use_groups)
        return pd.concat([y_pred, detail_contrib], axis=1)
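    # Hedged sketch of the concatenated output: ypred (plus proba in the
    # classification case) followed by one contribution column per feature.
    # The exact column names depend on the model and dataset; the layout below
    # is illustrative only.
    #
    # >>> predictor.add_input(x=xtest_df)
    # >>> predictor.detail_contributions()
    #        ypred     proba       Sex    Pclass
    #     0      0  0.756416  0.322308  0.155069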
    def save(self, path):
        """
        The save method allows users to save a SmartPredictor object on disk
        as a pickle file.

        It can be useful: you don't have to recompile to display results later.
        The load_smartpredictor method allows to load a saved SmartPredictor
        object (see the example below).

        Parameters
        ----------
        path : str
            File path to store the pickle file

        Example
        --------
        >>> predictor.save('path_to_pkl/predictor.pkl')
        >>> from shapash.utils.load_smartpredictor import load_smartpredictor
        >>> predictor_load = load_smartpredictor('path_to_pkl/predictor.pkl')
        """
        save_pickle(self, path)
    def apply_preprocessing(self):
        """
        Apply the preprocessing to the new input dataset specified.
        """
        return apply_preprocessing(self.data["x"], self.model, self.preprocessing)

    def filter(self):
        """
        The filter method is an important method which summarizes the local
        explainability using the user-defined mask_params parameters which
        correspond to the use case.
        """
        mask = [init_mask(self.summary["contrib_sorted"], True)]
        if self.mask_params["features_to_hide"] is not None:
            mask.append(
                hide_contributions(
                    self.summary["var_dict"],
                    features_list=self.check_features_name(self.mask_params["features_to_hide"]),
                )
            )
        if self.mask_params["threshold"] is not None:
            mask.append(cap_contributions(self.summary["contrib_sorted"], threshold=self.mask_params["threshold"]))
        if self.mask_params["positive"] is not None:
            mask.append(sign_contributions(self.summary["contrib_sorted"], positive=self.mask_params["positive"]))
        self.mask = combine_masks(mask)
        if self.mask_params["max_contrib"] is not None:
            self.mask = cutoff_contributions(mask=self.mask, k=self.mask_params["max_contrib"])
        self.masked_contributions = compute_masked_contributions(self.summary["contrib_sorted"], self.mask)
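    # Hedged illustration of how filter() combines the masks: each active
    # mask_params entry yields one boolean mask over the sorted contributions,
    # combine_masks keeps a cell only where every mask agrees, and max_contrib
    # is applied last as a cutoff. filter() is invoked internally by summarize().
    #
    # >>> predictor.modify_mask(threshold=0.1, positive=True, max_contrib=3)
    # >>> summary_df = predictor.summarize()  # runs filter() with these params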
    def summarize(self, use_groups=None):
        """
        The summarize method displays the summary of local explainability.
        It can be configured with the modify_mask method to tailor the
        explainability to the use case.

        If the user doesn't use modify_mask, the summarize method uses the
        mask_params parameters specified during the initialisation of the
        SmartPredictor.

        In the classification case, the summarize method summarizes the
        explainability which corresponds to:
            - the predicted values specified by the user or automatically
              computed (with the add_input method)
            - the right probabilities from predict_proba associated with the
              right predicted values
            - the right contributions ranked and filtered as specified with the
              modify_mask method

        Parameters
        ----------
        use_groups : bool (optional)
            Whether or not to compute groups of features contributions.

        Returns
        -------
        pandas.DataFrame
            - selected explanation of each row for classification case

        Examples
        --------
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred     proba feature_1  value_1  contribution_1 feature_2  value_2  contribution_2
        0     0  0.756416       Sex      1.0        0.322308    Pclass      3.0        0.155069
        1     3  0.628911       Sex      2.0        0.585475    Pclass      1.0        0.370504
        2     0  0.543308       Sex      2.0       -0.486667    Pclass      3.0        0.255072

        >>> predictor.modify_mask(max_contrib=1)
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred     proba feature_1  value_1  contribution_1
        0     0  0.756416       Sex      1.0        0.322308
        1     3  0.628911       Sex      2.0        0.585475
        2     0  0.543308       Sex      2.0       -0.486667
        """
        # data is needed: add_input() method must be called at least once
        use_groups = True if (use_groups is not False and self.features_groups is not None) else False

        if not hasattr(self, "data"):
            raise ValueError("You have to specify dataset x and y_pred arguments. Please use add_input() method.")

        if use_groups is True:
            data = self.data_groups
        else:
            data = self.data

        if self._drop_option is not None:
            columns_to_keep = [
                x for x in self._drop_option["columns_dict_op"].values() if x in data["x_postprocessed"].columns
            ]
            if use_groups:
                columns_to_keep += list(self.features_groups.keys())
            x_preprocessed = data["x_postprocessed"][columns_to_keep]
        else:
            x_preprocessed = data["x_postprocessed"]

        columns_dict = {i: col for i, col in enumerate(x_preprocessed.columns)}
        features_dict = {k: v for k, v in self.features_dict.items() if k in x_preprocessed.columns}

        self.summary = assign_contributions(rank_contributions(data["contributions"], x_preprocessed))
        # Apply filter method with mask_params attributes parameters
        self.filter()

        # Summarize information
        data["summary"] = summarize(
            self.summary["contrib_sorted"],
            self.summary["var_dict"],
            self.summary["x_sorted"],
            self.mask,
            columns_dict,
            features_dict,
        )

        # Matching with y_pred
        return pd.concat([data["ypred"], data["summary"]], axis=1)
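    # Hedged sketch: when features_groups was provided at initialisation,
    # summarize aggregates grouped contributions by default; passing
    # use_groups=False falls back to individual features.
    #
    # >>> predictor.summarize()                  # grouped, if features_groups is set
    # >>> predictor.summarize(use_groups=False)  # always individual features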
    def modify_mask(self, features_to_hide=None, threshold=None, positive=None, max_contrib=None):
        """
        This method allows the user to modify the mask_params values.
        Each parameter is optional: modify_mask only changes the values
        specified in its parameters.

        This method has to be used to configure the summary displayed with the
        summarize method.

        Parameters
        ----------
        features_to_hide : list, optional (default: None)
            List of strings, containing features to hide.
        threshold : float, optional (default: None)
            Absolute threshold below which any contribution is hidden.
        positive: bool, optional (default: None)
            If True, hide negative values. If False, hide positive values.
            If None, hide nothing.
        max_contrib : int, optional (default: None)
            Maximum number of contributions to show.

        Examples
        --------
        >>> predictor.modify_mask(max_contrib=1)
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred     proba feature_1  value_1  contribution_1
        0     0  0.756416       Sex      1.0        0.322308
        1     3  0.628911       Sex      2.0        0.585475
        2     0  0.543308       Sex      2.0       -0.486667
        """
        attributes = {
            "features_to_hide": features_to_hide,
            "threshold": threshold,
            "positive": positive,
            "max_contrib": max_contrib,
        }
        for label, attribute in attributes.items():
            if attribute is not None:
                self.mask_params[label] = attribute
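    # Hedged sketch: successive calls accumulate, since only the keys passed
    # are overwritten; the threshold below therefore survives the second call.
    #
    # >>> predictor.modify_mask(threshold=0.05)
    # >>> predictor.modify_mask(max_contrib=2)  # threshold stays at 0.05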
    def predict(self):
        """
        The predict method computes the predicted values for each x row defined
        in add_input.

        Returns
        -------
        pandas.DataFrame
            A dataset with predicted values for each x row.

        Example
        --------
        >>> predictor.add_input(x=xtest_df)
        >>> predictor.predict()
        """
        if not hasattr(self, "data"):
            raise ValueError("add_input method must be called at least once.")
        if self.data["x_preprocessed"] is None:
            raise ValueError(
                """
                x must be specified in an add_input method to apply predict.
                """
            )
        if hasattr(self.model, "predict"):
            self.data["ypred_init"] = pd.DataFrame(
                self.model.predict(self.data["x_preprocessed"]),
                columns=["ypred"],
                index=self.data["x_preprocessed"].index,
            )
        else:
            raise ValueError("model has no predict method")
        return self.data["ypred_init"]
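    # Hedged sketch: predict stores its result in data["ypred_init"] under a
    # single "ypred" column, so a later detail_contributions or summarize call
    # reuses it instead of recomputing. Values below are illustrative.
    #
    # >>> predictor.add_input(x=xtest_df)
    # >>> predictor.predict()
    #        ypred
    #     0      0
    #     1      3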
    def apply_postprocessing(self):
        """
        Modify the x dataframe according to the postprocessing modifications,
        if any.

        Parameters
        ----------
        postprocessing: dict
            Dictionary of postprocessing modifications to apply to x.

        Returns
        -------
        pandas.DataFrame
            Returns x_init if postprocessing is empty, the modified dataframe
            otherwise.
        """
        if self.postprocessing:
            return apply_postprocessing(self.data["x"], self.postprocessing)
        else:
            return self.data["x"]

    def check_features_name(self, features):
        """
        Convert a list of feature names (strings) or feature ids into feature ids.
        Feature names can be part of columns_dict or features_dict.

        Parameters
        ----------
        features : list
            List of ints (column ids) or strings (business names)

        Returns
        -------
        list of ints
            Column ids compatible with var_dict
        """
        return check_features_name(self.columns_dict, self.features_dict, features)

    def to_smartexplainer(self):
        """
        Create a SmartExplainer object compiled with the data specified in the
        add_input method and the SmartPredictor attributes.
        """
        if not hasattr(self, "data"):
            raise ValueError("add_input method must be called at least once.")
        if self.data["x"] is None:
            raise ValueError(
                """
                x must be specified in an add_input method to apply the to_smartexplainer method.
                """
            )

        list_preprocessing = preprocessing_tolist(self.preprocessing)
        for enc in list_preprocessing:
            if str(type(enc)) in columntransformer:
                raise ValueError("SmartPredictor can't switch to SmartExplainer for ColumnTransformer preprocessing.")

        xpl = shapash.explainer.smart_explainer.SmartExplainer(
            model=self.model,
            backend=self.backend,
            preprocessing=self.preprocessing,
            postprocessing=self.postprocessing,
            features_groups=self.features_groups,
            features_dict=copy.deepcopy(self.features_dict),
            label_dict=copy.deepcopy(self.label_dict),
        )
        xpl.compile(x=copy.deepcopy(self.data["x_preprocessed"]), y_pred=copy.deepcopy(self.data["ypred_init"]))
        return xpl
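
# Hedged end-to-end sketch (illustrative, not shipped code): a typical
# deployment flow, assuming a SmartExplainer `xpl` was compiled during the
# modelling phase. The file path and variable names are hypothetical.
#
# >>> predictor = xpl.to_smartpredictor()
# >>> predictor.save('./predictor.pkl')
# >>> from shapash.utils.load_smartpredictor import load_smartpredictor
# >>> predictor = load_smartpredictor('./predictor.pkl')
# >>> predictor.add_input(x=new_requests_df)
# >>> predictor.modify_mask(max_contrib=3)
# >>> summary_df = predictor.summarize()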