Source code for shapash.explainer.smart_explainer

"""
Smart explainer module
"""
import copy
import logging
import shutil
import tempfile

import numpy as np
import pandas as pd

import shapash.explainer.smart_predictor
from shapash.backend import BaseBackend, get_backend_cls_from_name
from shapash.backend.shap_backend import get_shap_interaction_values
from shapash.manipulation.select_lines import keep_right_contributions
from shapash.manipulation.summarize import create_grouped_features_values
from shapash.report import check_report_requirements
from shapash.style.style_utils import colors_loading, select_palette
from shapash.utils.check import (
    check_additional_data,
    check_features_name,
    check_label_dict,
    check_model,
    check_postprocessing,
    check_y,
)
from shapash.utils.explanation_metrics import find_neighbors, get_distance, get_min_nb_features, shap_neighbors
from shapash.utils.io import load_pickle, save_pickle
from shapash.utils.model import predict, predict_error, predict_proba
from shapash.utils.threading import CustomThread
from shapash.utils.transform import apply_postprocessing, handle_categorical_missing, inverse_transform
from shapash.utils.utils import get_host_name
from shapash.webapp.smart_app import SmartApp

from .smart_plotter import SmartPlotter

logging.basicConfig(level=logging.INFO)


class SmartExplainer: """ The SmartExplainer class is the main object of the Shapash library. It allows Data Scientists to perform many operations to make the results more understandable: linking encoders, models, predictions, label dict and datasets. SmartExplainer users have several methods which are described below. Parameters ---------- model : model object model used for the consistency check. The model object can also be used by some methods to compute predict and predict_proba values backend : str or shapash.backend object (default: 'shap') Select which computation method to use in order to compute contributions and feature importance. Possible values are 'shap' or 'lime'. Default is 'shap'. It is also possible to pass a backend class inherited from shapash.backend.BaseBackend. preprocessing : category_encoders, ColumnTransformer, list, dict, optional (default: None) --> Different types of preprocessing are available: - A single category_encoders encoder (OrdinalEncoder/OneHotEncoder/BaseNEncoder/BinaryEncoder/TargetEncoder) - A single ColumnTransformer with scikit-learn encoding or category_encoders transformers - A list of multiple category_encoders with optional (dict, list of dict) - A list with a single ColumnTransformer with optional (dict, list of dict) - A dict - A list of dict postprocessing : dict, optional (default: None) Dictionary of postprocessing modifications to apply to the x_init dataframe. Dictionary with feature names as keys (or column numbers, or domain labels referencing feature names), which modifies the dataset feature by feature. --> Different types of postprocessing are available, but the syntax is the following: one key per feature, 5 different types of modifications: >>> { 'feature1' : { 'type' : 'prefix', 'rule' : 'age: ' }, 'feature2' : { 'type' : 'suffix', 'rule' : '$/week ' }, 'feature3' : { 'type' : 'transcoding', 'rule': { 'code1' : 'single', 'code2' : 'married'}}, 'feature4' : { 'type' : 'regex' , 'rule': { 'in' : 'AND', 'out' : ' & ' }}, 'feature5' : { 'type' : 'case' , 'rule': 'lower' } } Only one transformation per feature is possible. features_groups : dict, optional (default: None) Dictionary containing features that should be grouped together. This option allows computing and displaying the contributions and importance of each group of features. Features that are grouped together will still be displayed in the webapp when clicking on a group. >>> { 'feature_group_1' : ['feature3', 'feature7', 'feature24'], 'feature_group_2' : ['feature1', 'feature12'], } features_dict: dict Dictionary mapping technical feature names to domain names. label_dict: dict Dictionary mapping integer labels to domain names (classification - target values). title_story: str (default: None) The default title is empty. You can specify a custom title which can be used in the webapp, or by other methods palette_name : str Name of the palette used for the colors of the report (refer to the style folder). colors_dict : dict Dictionary containing all the color palettes. You can use this parameter to change any color of the graphs. **backend_kwargs : dict Keyword parameters to be passed to the backend. Attributes ---------- data: dict Data dictionary has 3 entries. 
Each key returns a pd.DataFrame (regression) or a list of pd.DataFrame (classification - the length of the list is equal to the number of labels). All pd.DataFrames have the same shape (n_samples, n_features). For the regression case, the data can be regarded as a single array of shape (n_samples, n_features, 3). data['contrib_sorted']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification) Contains local contributions of the prediction set, with a common line index. Columns are 'contrib_1', 'contrib_2', ... and contain the top contributions for each line from left to right. In multi-class problems, this is a list of contributions, one for each class. data['var_dict']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification) Must contain only ints. It gives, for each line, the list of the most important features regarding the local decomposition. In order to save space, columns are denoted by integers, the conversion being done with the columns_dict member. In multi-class problems, this is a list of dataframes, one for each class. data['x_sorted']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification) It gives, for each line, the list of the most important feature values regarding the local decomposition. These values can only be understood with respect to data['var_dict'] backend_name: str backend name if the backend passed is a string x_encoded: pandas.DataFrame preprocessed dataset used by the model to perform the prediction. x_init: pandas.DataFrame x_encoded dataset with inverse transformation and eventual postprocessing modifications. x_contrib_plot: pandas.DataFrame x_encoded dataset with inverse transformation, without postprocessing, used for contribution_plot. y_pred: pandas.DataFrame User-specified prediction values. contributions: pandas.DataFrame (regression) or list (classification) local contributions aggregated if the preprocessing part requires it (e.g. one-hot encoding). features_dict: dict Dictionary mapping technical feature names to domain names. inv_features_dict: dict Inverse features_dict mapping. label_dict: dict Dictionary mapping integer labels to domain names (classification - target values). inv_label_dict: dict Inverse label_dict mapping. columns_dict: dict Dictionary mapping integer column number to technical feature names. plot: object Helper object containing all plotting functions (Bridge pattern). model: model object model used to check the different target values and to estimate predict and predict_proba values features_desc: dict Dictionary referencing the number of distinct feature values in x_init features_imp: pandas.Series (regression) or list (classification) Features importance values local_neighbors: dict Dictionary of values to be displayed on the local_neighbors plot. The key is "norm_shap" (normalized contribution values of the instance and its neighbors) features_stability: dict Dictionary of arrays to be displayed on the stability plot. The keys are "amplitude" (average contribution values for the selected instances) and "stability" (stability metric across the neighborhood) preprocessing : category_encoders, ColumnTransformer, list or dict The preprocessing applied to the original data. postprocessing : dict Dictionary of postprocessing modifications to apply to the x_init dataframe. 
y_target : pandas.Series or pandas.DataFrame, optional (default: None) Target values Example -------- >>> xpl = SmartExplainer(model, features_dict=featd,label_dict=labeld) >>> xpl.compile(x=x_encoded, y_target=y) >>> xpl.plot.features_importance() """ def __init__( self, model, backend="shap", preprocessing=None, postprocessing=None, features_groups=None, features_dict=None, label_dict=None, title_story: str = None, palette_name=None, colors_dict=None, **backend_kwargs, ): if features_dict is not None and not isinstance(features_dict, dict): raise ValueError( """ features_dict must be a dict """ ) if label_dict is not None and isinstance(label_dict, dict) is False: raise ValueError( """ label_dict must be a dict """ ) self.model = model self.preprocessing = preprocessing self.backend_name = None if isinstance(backend, str): self.backend_name = backend elif isinstance(backend, BaseBackend): self.backend = backend if backend.preprocessing is None and self.preprocessing is not None: self.backend.preprocessing = self.preprocessing else: raise NotImplementedError(f"Unknown backend : {backend}") self.backend_kwargs = backend_kwargs self.features_dict = dict() if features_dict is None else copy.deepcopy(features_dict) self.label_dict = label_dict self.plot = SmartPlotter(self) self.title_story = title_story if title_story is not None else "" self.palette_name = palette_name if palette_name else "default" self.colors_dict = copy.deepcopy(select_palette(colors_loading(), self.palette_name)) if colors_dict is not None: self.colors_dict.update(colors_dict) self.plot.define_style_attributes(colors_dict=self.colors_dict) self._case, self._classes = check_model(self.model) self.postprocessing = postprocessing self.check_label_dict() if self.label_dict: self.inv_label_dict = {v: k for k, v in self.label_dict.items()} self.features_groups = features_groups self.local_neighbors = None self.features_stability = None self.features_compacity = None self.contributions = None self.explain_data = None self.features_imp = None
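Usage sketch (not part of the library source): a minimal, self-contained way to build a SmartExplainer on a toy regression problem. The dataset, model and dictionary contents below are illustrative assumptions only.

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from shapash.explainer.smart_explainer import SmartExplainer

# Toy data and model (assumptions for the sketch)
data = load_diabetes(as_frame=True)
X, y = data.data, data.target
regressor = RandomForestRegressor(n_estimators=50).fit(X, y)

xpl = SmartExplainer(
    model=regressor,
    backend="shap",                              # or "lime", or a BaseBackend subclass
    features_dict={"bmi": "Body mass index"},    # rename only the features you want
    title_story="Diabetes progression - explainability",
)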
def compile( self, x, contributions=None, y_pred=None, proba_values=None, y_target=None, additional_data=None, additional_features_dict=None, ): """ The compile method is the first step to understand the model and its predictions. It performs the sorting of contributions, the reverse preprocessing steps and all the calculations necessary for a quick display of plots and an efficient summary of the explanation. This step can last a few moments with large datasets. Parameters ---------- x : pandas.DataFrame Prediction set. IMPORTANT: x should be the preprocessed prediction set: Shapash can apply the model to it, and uses the inverse transformation of the preprocessing to recover the values seen by the end user. contributions : pandas.DataFrame, np.ndarray or list single or multiple contributions (multi-class) to handle. If pandas.DataFrame, the index and columns should be shared with the prediction set. If np.ndarray, the index and columns will be generated from the x dataset y_pred : pandas.Series or pandas.DataFrame, optional (default: None) Prediction values (1 column only). The index must be identical to the index of x_init. This is an interesting parameter for more explicit outputs. Shapash lets users provide their own predictions, as they may wish to set their own threshold (classification) proba_values : pandas.Series or pandas.DataFrame, optional (default: None) Probability values (1 column only). The index must be identical to the index of x_init. This is an interesting parameter for more explicit outputs. Shapash lets users provide their own probability values y_target : pandas.Series or pandas.DataFrame, optional (default: None) Target values (1 column only). The index must be identical to the index of x_init. This is an interesting parameter for outputs on prediction additional_data : pandas.DataFrame, optional (default: None) Additional dataset of features outside the model. The index must be identical to the index of x_init. This is an interesting parameter for visualisation and filtering in the Shapash SmartApp. additional_features_dict : dict Dictionary mapping technical feature names to domain names for additional data. 
Example -------- >>> xpl.compile(x=x_test) """ if isinstance(self.backend_name, str): backend_cls = get_backend_cls_from_name(self.backend_name) self.backend = backend_cls( model=self.model, preprocessing=self.preprocessing, masker=x, **self.backend_kwargs ) self.x_encoded = handle_categorical_missing(x) x_init = inverse_transform(self.x_encoded, self.preprocessing) self.x_init = handle_categorical_missing(x_init) self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred") if (self.y_pred is None) and (hasattr(self.model, "predict")): self.predict() self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values") if (self._case == "classification") and (self.proba_values is None) and (hasattr(self.model, "predict_proba")): self.predict_proba() self.y_target = check_y(self.x_init, y_target, y_name="y_target") self.prediction_error = predict_error(self.y_target, self.y_pred, self._case) self._get_contributions_from_backend_or_user(x, contributions) self.check_contributions() self.columns_dict = {i: col for i, col in enumerate(self.x_init.columns)} self.check_features_dict() self.inv_features_dict = {v: k for k, v in self.features_dict.items()} self._apply_all_postprocessing_modifications() self.data = self.state.assign_contributions(self.state.rank_contributions(self.contributions, self.x_init)) self.features_desc = dict(self.x_init.nunique()) if self.features_groups is not None: self._compile_features_groups(self.features_groups) self.additional_features_dict = ( dict() if additional_features_dict is None else self._compile_additional_features_dict(additional_features_dict) ) self.additional_data = self._compile_additional_data(additional_data)
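Continuing the hedged sketch above (not part of the source): compiling on the same frame used for training keeps the example short; in practice you would pass a held-out, preprocessed prediction set.

xpl.compile(
    x=X,                 # preprocessed prediction set (here: the toy frame from the sketch)
    y_target=y,          # enables prediction-error displays
)
# Precomputed contributions can be passed instead of letting the backend run:
# xpl.compile(x=X, contributions=my_precomputed_contributions)   # hypothetical array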
def _get_contributions_from_backend_or_user(self, x, contributions): # Computing contributions using backend if contributions is None: self.explain_data = self.backend.run_explainer(x=x) self.contributions = self.backend.get_local_contributions(x=x, explain_data=self.explain_data) else: self.explain_data = contributions self.contributions = self.backend.format_and_aggregate_local_contributions( x=x, contributions=contributions, ) self.state = self.backend.state def _apply_all_postprocessing_modifications(self): postprocessing = self.modify_postprocessing(self.postprocessing) check_postprocessing(self.x_init, postprocessing) self.postprocessing_modifications = self.check_postprocessing_modif_strings(postprocessing) self.postprocessing = postprocessing if self.postprocessing_modifications: self.x_contrib_plot = copy.deepcopy(self.x_init) self.x_init = self.apply_postprocessing(postprocessing) def _compile_features_groups(self, features_groups): """ Performs required computations for groups of features. """ if self.backend.support_groups is False: raise AssertionError(f"Selected backend ({self.backend.name}) " f"does not support groups of features.") # Compute contributions for groups of features self.contributions_groups = self.state.compute_grouped_contributions(self.contributions, features_groups) self.features_imp_groups = None # Update features dict with groups names self._update_features_dict_with_groups(features_groups=features_groups) # Compute t-sne projections for groups of features self.x_init_groups = create_grouped_features_values( x_init=self.x_init, x_encoded=self.x_encoded, preprocessing=self.preprocessing, features_groups=self.features_groups, features_dict=self.features_dict, how="dict_of_values", ) # Compute data attribute for groups of features self.data_groups = self.state.assign_contributions( self.state.rank_contributions(self.contributions_groups, self.x_init_groups) ) self.columns_dict_groups = {i: col for i, col in enumerate(self.x_init_groups.columns)} def _compile_additional_features_dict(self, additional_features_dict): """ Performs required computations for additional features dict. """ if not isinstance(additional_features_dict, dict): raise ValueError( """ additional_features_dict must be a dict """ ) additional_features_dict = {f"_{key}": f"_{value}" for key, value in additional_features_dict.items()} return additional_features_dict def _compile_additional_data(self, additional_data): """ Performs required computations for additional data. """ if additional_data is not None: check_additional_data(self.x_init, additional_data) for feature in additional_data.columns: if feature in self.features_dict.keys() and feature not in self.columns_dict.values(): self.additional_features_dict[f"_{feature}"] = f"_{self.features_dict[feature]}" del self.features_dict[feature] additional_data = additional_data.add_prefix("_") for feature in set(list(additional_data.columns)) - set(self.additional_features_dict): self.additional_features_dict[feature] = feature return additional_data def define_style(self, palette_name=None, colors_dict=None): """ Set the color set to use in plots. 
""" if palette_name is None and colors_dict is None: raise ValueError("At least one of palette_name or colors_dict parameters must be defined") new_palette_name = palette_name or self.palette_name new_colors_dict = copy.deepcopy(select_palette(colors_loading(), new_palette_name)) if colors_dict is not None: new_colors_dict.update(colors_dict) self.colors_dict.update(new_colors_dict) self.plot.define_style_attributes(colors_dict=self.colors_dict)
def add( self, y_pred=None, proba_values=None, y_target=None, label_dict=None, features_dict=None, title_story: str = None, additional_data=None, additional_features_dict=None, ): """ The add method allows the user to add a label_dict, features_dict or y_pred without compiling again (which can take a few moments). y_pred can be used in the plots to color scatter points. y_pred is needed in the to_pandas method. label_dict and features_dict allow clearer displays of the results. Parameters ---------- y_pred : pandas.Series, optional (default: None) Prediction values (1 column only). The index must be identical to the index of x_init. proba_values : pandas.Series, optional (default: None) Probability values (1 column only). The index must be identical to the index of x_init. label_dict: dict, optional (default: None) Dictionary mapping integer labels to domain names. features_dict: dict, optional (default: None) Dictionary mapping technical feature names to domain names. title_story: str (default: None) The default title is empty. You can specify a custom title which can be used in the webapp, or by other methods y_target : pandas.Series or pandas.DataFrame, optional (default: None) Target values (1 column only). The index must be identical to the index of x_init. This is an interesting parameter for outputs on prediction additional_data : pandas.DataFrame, optional (default: None) Additional dataset of features outside the model. The index must be identical to the index of x_init. This is an interesting parameter for visualisation and filtering in the Shapash SmartApp. additional_features_dict : dict Dictionary mapping technical feature names to domain names for additional data. """ if y_pred is not None: self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred") if hasattr(self, "y_target"): self.prediction_error = predict_error(self.y_target, self.y_pred, self._case) if proba_values is not None: self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values") if y_target is not None: self.y_target = check_y(self.x_init, y_target, y_name="y_target") if hasattr(self, "y_pred"): self.prediction_error = predict_error(self.y_target, self.y_pred, self._case) if label_dict is not None: if isinstance(label_dict, dict) is False: raise ValueError( """ label_dict must be a dict """ ) self.label_dict = label_dict self.check_label_dict() self.inv_label_dict = {v: k for k, v in self.label_dict.items()} if features_dict is not None: if isinstance(features_dict, dict) is False: raise ValueError( """ features_dict must be a dict """ ) self.features_dict = features_dict self.check_features_dict() self.inv_features_dict = {v: k for k, v in self.features_dict.items()} if title_story is not None: self.title_story = title_story if additional_features_dict is not None: self.additional_features_dict = self._compile_additional_features_dict(additional_features_dict) if additional_data is not None: self.additional_data = self._compile_additional_data(additional_data)
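Continuing the hedged sketch: add() enriches an already-compiled explainer without recompiling; the prediction Series below is illustrative.

import pandas as pd

custom_pred = pd.Series(regressor.predict(X), index=X.index)   # e.g. user-defined predictions
xpl.add(
    y_pred=custom_pred,
    title_story="Diabetes progression - updated title",
)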
def get_interaction_values(self, n_samples_max=None, selection=None): """ Compute shap interaction values for each row of x_encoded. This function is only available for explainer of type TreeExplainer (used for tree based models). Please refer to the official tree shap paper for more information : https://arxiv.org/pdf/1802.03888.pdf Parameters ---------- n_samples_max : int, optional Limit the number of points for which we compute the interactions. selection : list, optional Contains list of index, subset of the input DataFrame that we want to plot Returns ------- np.ndarray Shap interaction values for each sample as an array of shape (# samples x # features x # features). """ x = copy.deepcopy(self.x_encoded) if selection: x = x.loc[selection] if hasattr(self, "x_interaction"): if self.x_interaction.equals(x[:n_samples_max]): return self.interaction_values self.x_interaction = x[:n_samples_max] self.interaction_values = get_shap_interaction_values(self.x_interaction, self.backend.explainer) return self.interaction_values def check_postprocessing_modif_strings(self, postprocessing=None): """ Check if any modification of postprocessing will convert numeric values into strings values. If so, return True, otherwise False. Parameters ---------- postprocessing: dict Dict of postprocessing modifications to apply. Returns ------- modif: bool Boolean which is True if any numerical variable will be converted into string. """ modif = False if postprocessing is not None: for key in postprocessing.keys(): dict_postprocess = postprocessing[key] if dict_postprocess["type"] in {"prefix", "suffix"} and pd.api.types.is_numeric_dtype(self.x_init[key]): modif = True return modif def modify_postprocessing(self, postprocessing=None): """ Modifies postprocessing parameter, to change only keys, with features name, in case of parameters are not real feature names (with columns_dict, or inv_features_dict). Parameters ---------- postprocessing : Dict Dictionnary of postprocessing to modify. Returns ------- Dict Modified dictionnary, with same values but keys directly referencing to feature names. """ if postprocessing: new_dic = dict() for key in postprocessing.keys(): if key in self.features_dict: new_dic[key] = postprocessing[key] elif key in self.columns_dict.keys(): new_dic[self.columns_dict[key]] = postprocessing[key] elif key in self.inv_features_dict: new_dic[self.inv_features_dict[key]] = postprocessing[key] else: raise ValueError(f"Feature name '{key}' not found in the dataset.") return new_dic def apply_postprocessing(self, postprocessing=None): """ Modifies x_init Dataframe according to postprocessing modifications, if exists. Parameters ---------- postprocessing: Dict Dictionnary of postprocessing modifications to apply in x_init. Returns ------- pandas.Dataframe Returns x_init if postprocessing is empty, modified dataframe otherwise. """ if postprocessing: return apply_postprocessing(self.x_init, postprocessing) else: return self.x_init def check_label_dict(self): """ Check if label_dict and model _classes match """ if self._case != "regression": return check_label_dict(self.label_dict, self._case, self._classes) def check_features_dict(self): """ Check the features_dict and add the necessary keys if all the input X columns are not present """ for feature in set(list(self.columns_dict.values())) - set(list(self.features_dict)): self.features_dict[feature] = feature def _update_features_dict_with_groups(self, features_groups): """ Add groups into features dict and inv_features_dict if not present. 
""" for group_name in features_groups.keys(): self.features_desc[group_name] = 1000 if group_name not in self.features_dict.keys(): self.features_dict[group_name] = group_name self.inv_features_dict[group_name] = group_name def check_contributions(self): """ Check if contributions and prediction set match in terms of shape and index. """ if not self.state.check_contributions(self.contributions, self.x_init): raise ValueError( """ Prediction set and contributions should have exactly the same number of lines and number of columns. the order of the columns must be the same Please check x, contributions and preprocessing arguments. """ ) def check_label_name(self, label, origin=None): """ Convert a string label in integer. If the label is already an integer nothing is done. In all other cases an error is raised. Parameters ---------- label: int or string Integer (id) or string (business names) origin: None, 'num', 'code', 'value' (default: None) Kind of the label used in parameter Returns ------- tuple label num, label code (class of the mode), label value """ if origin is None: if label in self._classes: origin = "code" elif self.label_dict is not None and label in self.label_dict.values(): origin = "value" elif isinstance(label, int) and label in range(-1, len(self._classes)): origin = "num" try: if origin == "num": label_num = label label_code = self._classes[label] label_value = self.label_dict[label_code] if self.label_dict else label_code elif origin == "code": label_code = label label_num = self._classes.index(label) label_value = self.label_dict[label_code] if self.label_dict else label_code elif origin == "value": label_code = self.inv_label_dict[label] label_num = self._classes.index(label_code) label_value = label else: raise ValueError except ValueError: raise Exception({"message": "Origin must be 'num', 'code' or 'value'."}) except Exception: raise Exception({"message": f"Label ({label}) not found for origin ({origin})"}) return label_num, label_code, label_value def check_features_name(self, features, use_groups=False): """ Convert a list of feature names (string) or features ids into features ids. Features names can be part of columns_dict or features_dict. Parameters ---------- features : List List of ints (columns ids) or of strings (business names) use_groups : bool Whether or not features parameter includes groups of features Returns ------- list of ints Columns ids compatible with var_dict """ columns_dict = self.columns_dict if use_groups is False else self.columns_dict_groups return check_features_name(columns_dict, self.features_dict, features) def check_attributes(self, attribute): """ Check that explainer has the attribute precised Parameters ---------- attribute: string the label of the attribute to test Returns ------- Object content of the attribute specified from SmartExplainer instance """ if not hasattr(self, attribute): raise ValueError( """ attribute {} isn't an attribute of the explainer precised. """.format( attribute ) ) return self.__dict__[attribute]
def filter(self, features_to_hide=None, threshold=None, positive=None, max_contrib=None, display_groups=None): """ The filter method is an important method which allows summarizing the local explainability using the user-defined parameters which correspond to the use case. The filter method is used with the local_plot method of SmartPlotter to see the concrete result of this summary with a local contribution bar chart. Please watch the local_plot tutorial to see how these two methods are combined with a concrete example Parameters ---------- features_to_hide : list, optional (default: None) List of strings, containing features to hide. threshold : float, optional (default: None) Absolute threshold below which any contribution is hidden. positive: bool, optional (default: None) If True, hide negative values. If False, hide positive values. If None, hide nothing. max_contrib : int, optional (default: None) Maximum number of contributions to show. display_groups : bool (default: None) Whether or not to display groups of features. This option is only useful if groups of features are declared when compiling the SmartExplainer object. """ display_groups = True if (display_groups is not False and self.features_groups is not None) else False if display_groups: data = self.data_groups else: data = self.data mask = [self.state.init_mask(data["contrib_sorted"], True)] if features_to_hide: mask.append( self.state.hide_contributions( data["var_dict"], features_list=self.check_features_name(features_to_hide, use_groups=display_groups), ) ) if threshold: mask.append(self.state.cap_contributions(data["contrib_sorted"], threshold=threshold)) if positive is not None: mask.append(self.state.sign_contributions(data["contrib_sorted"], positive=positive)) self.mask = self.state.combine_masks(mask) if max_contrib: self.mask = self.state.cutoff_contributions(self.mask, max_contrib=max_contrib) self.masked_contributions = self.state.compute_masked_contributions(data["contrib_sorted"], self.mask) self.mask_params = { "features_to_hide": features_to_hide, "threshold": threshold, "positive": positive, "max_contrib": max_contrib, }
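Continuing the hedged sketch: filter() summarizes local explanations, and SmartPlotter's local_plot (mentioned in the docstring above) displays the result; the index value is illustrative.

xpl.filter(max_contrib=4, threshold=0.01)
fig = xpl.plot.local_plot(index=X.index[0])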
[docs] def save(self, path): """ Save method allows user to save SmartExplainer object on disk using a pickle file. Save method can be useful: you don't have to recompile to display results later Parameters ---------- path : str File path to store the pickle file Example -------- >>> xpl.save('path_to_pkl/xpl.pkl') """ if hasattr(self, "smartapp"): self.smartapp = None save_pickle(self, path)
[docs] @classmethod def load(cls, path): """ Load method allows Shapash user to use pickled SmartExplainer. To use this method you must first declare your SmartExplainer object Watch the following example Parameters ---------- path : str File path of the pickle file. Example -------- >>> xpl = SmartExplainer.load('path_to_pkl/xpl.pkl') """ xpl = load_pickle(path) if isinstance(xpl, SmartExplainer): smart_explainer = cls(model=xpl.model) smart_explainer.__dict__.update(xpl.__dict__) return smart_explainer else: raise ValueError("File is not a SmartExplainer object")
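A hedged round-trip sketch of save() and load(); the path is illustrative.

xpl.save("./xpl.pkl")
xpl_restored = SmartExplainer.load("./xpl.pkl")
xpl_restored.plot.features_importance()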
def predict_proba(self): """ The predict_proba method computes the probability values for each x_encoded row """ self.proba_values = predict_proba(self.model, self.x_encoded, self._classes) def predict(self): """ The predict method computes the model output for each x_encoded row and stores it in the y_pred attribute """ self.y_pred = predict(self.model, self.x_encoded) if hasattr(self, "y_target"): self.prediction_error = predict_error(self.y_target, self.y_pred, self._case)
[docs] def to_pandas( self, features_to_hide=None, threshold=None, positive=None, max_contrib=None, proba=False, use_groups=None ): """ The to_pandas method allows to export the summary of local explainability. This method proposes a set of parameters to summarize the explainability of each point. If the user does not specify any, the to_pandas method uses the parameter specified during the last execution of the filter method. In classification case, The method to_pandas summarizes the explicability which corresponds to the predicted values specified by the user (with compile or add method). the proba parameter displays the corresponding predict proba value for each point In classification case, There are 2 ways to use this to pandas method. - Provide a real prediction set to explain - Focus on a constant target value and look at the proba and explainability corresponding to each point. (in that case, specify a constant pd.Series with add or compile method) Examples are presented in the tutorial local_plot (please check tutorial part of this doc) Parameters ---------- features_to_hide : list, optional (default: None) List of strings, containing features to hide. threshold : float, optional (default: None) Absolute threshold below which any contribution is hidden. positive: bool, optional (default: None) If True, hide negative values. Hide positive values otherwise. If None, hide nothing. max_contrib : int, optional (default: 5) Number of contributions to show in the pandas df proba : bool, optional (default: False) adding proba in output df use_groups : bool (optional) Whether or not to use groups of features contributions (only available if features_groups parameter was not empty when calling compile method). Returns ------- pandas.DataFrame - selected explanation of each row for classification case Examples -------- >>> summary_df = xpl.to_pandas(max_contrib=2,proba=True) >>> summary_df pred proba feature_1 value_1 contribution_1 feature_2 value_2 contribution_2 0 0 0.756416 Sex 1.0 0.322308 Pclass 3.0 0.155069 1 3 0.628911 Sex 2.0 0.585475 Pclass 1.0 0.370504 2 0 0.543308 Sex 2.0 -0.486667 Pclass 3.0 0.255072 """ use_groups = True if (use_groups is not False and self.features_groups is not None) else False if use_groups: data = self.data_groups else: data = self.data # Classification: y_pred is needed if self.y_pred is None: raise ValueError("You have to specify y_pred argument. 
Please use add() or compile() method") # Apply filter method if necessary if ( all(var is None for var in [features_to_hide, threshold, positive, max_contrib]) and hasattr(self, "mask_params") and ( # if the already computed mask does not have the right shape (this can happen when # we use groups of features once and then use method without groups) ( isinstance(data["contrib_sorted"], pd.DataFrame) and len(data["contrib_sorted"].columns) == len(self.mask.columns) ) or ( isinstance(data["contrib_sorted"], list) and len(data["contrib_sorted"][0].columns) == len(self.mask[0].columns) ) ) ): print("to_pandas params: " + str(self.mask_params)) else: self.filter( features_to_hide=features_to_hide, threshold=threshold, positive=positive, max_contrib=max_contrib, display_groups=use_groups, ) if use_groups: columns_dict = {i: col for i, col in enumerate(self.x_init_groups.columns)} else: columns_dict = self.columns_dict # Summarize information data["summary"] = self.state.summarize( data["contrib_sorted"], data["var_dict"], data["x_sorted"], self.mask, columns_dict, self.features_dict ) # Matching with y_pred if proba: self.predict_proba() proba_values = self.proba_values else: proba_values = None y_pred, summary = keep_right_contributions( self.y_pred, data["summary"], self._case, self._classes, self.label_dict, proba_values ) return pd.concat([y_pred, summary], axis=1)
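Continuing the hedged sketch: to_pandas() exports a per-row summary, reusing the last filter() parameters unless new ones are given (proba=True only makes sense for classification).

summary_df = xpl.to_pandas(max_contrib=3, threshold=0.01)
print(summary_df.head())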
def compute_features_import(self, force=False): """ Compute a relative features importance, sum of absolute values of the contributions for each. Features importance compute in base 100 Parameters ---------- force: bool (default: False) True to force de compute if features importance is already calculated Returns ------- pd.Serie (Regression) or list of pd.Serie (Classification: One Serie for each target modality) Each Serie: feature importance, One row by feature, index of the serie = contributions.columns """ self.features_imp = self.backend.get_global_features_importance( contributions=self.contributions, explain_data=self.explain_data, subset=None ) if self.features_groups is not None and self.features_imp_groups is None: self.features_imp_groups = self.state.compute_features_import(self.contributions_groups) def compute_features_stability(self, selection): """ For a selection of instances, compute features stability metrics used in methods `local_neighbors_plot` and `local_stability_plot`. - If selection is a single instance, the method returns the (normalized) contribution values of instance and corresponding neighbors. - If selection represents multiple instances, the method returns the average (normalized) contribution values of instances and neighbors (=amplitude), as well as the variability of those values in the neighborhood (=variability) Parameters ---------- selection: list Indices of rows to be displayed on the stability plot Returns ------- Dictionary Values that will be displayed on the graph. Keys are "amplitude", "variability" and "norm_shap" """ if (self._case == "classification") and (len(self._classes) > 2): raise AssertionError("Multi-class classification is not supported") all_neighbors = find_neighbors(selection, self.x_encoded, self.model, self._case) # Check if entry is a single instance or not if len(selection) == 1: # Compute explanations for instance and neighbors norm_shap, _, _ = shap_neighbors(all_neighbors[0], self.x_encoded, self.contributions, self._case) self.local_neighbors = {"norm_shap": norm_shap} else: numb_expl = len(selection) amplitude = np.zeros((numb_expl, self.x_init.shape[1])) variability = np.zeros((numb_expl, self.x_init.shape[1])) # For each instance (+ neighbors), compute explanation for i in range(numb_expl): ( _, variability[i, :], amplitude[i, :], ) = shap_neighbors(all_neighbors[i], self.x_encoded, self.contributions, self._case) self.features_stability = {"variability": variability, "amplitude": amplitude} def compute_features_compacity(self, selection, distance, nb_features): """ For a selection of instances, compute features compacity metrics used in method `compacity_plot`. 
The method returns : * the minimum number of features needed for a given approximation level * conversely, the approximation reached with a given number of features Parameters ---------- selection: list Indices of rows to be displayed on the stability plot distance : float How close we want to be from model with all features nb_features : int Number of features used """ if (self._case == "classification") and (len(self._classes) > 2): raise AssertionError("Multi-class classification is not supported") features_needed = get_min_nb_features(selection, self.contributions, self._case, distance) distance_reached = get_distance(selection, self.contributions, self._case, nb_features) # We clip large approximations to 100% distance_reached = np.clip(distance_reached, 0, 1) self.features_compacity = {"features_needed": features_needed, "distance_reached": distance_reached} def init_app(self, settings: dict = None): """ Simple init of SmartApp in case of host smartapp by another way Parameters ---------- settings : dict (default: None) A dict describing the default webapp settings values to be used Possible settings (dict keys) are 'rows', 'points', 'violin', 'features' Values should be positive ints """ self.smartapp = SmartApp(self, settings)
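A hedged sketch of the stability and compacity metrics; the selection of rows is illustrative, and the computed dictionaries are stored on the explainer as described above.

selection = list(range(20))                        # illustrative subset of rows
xpl.compute_features_stability(selection)          # fills xpl.features_stability ("variability", "amplitude")
xpl.compute_features_compacity(selection, distance=0.9, nb_features=5)
print(xpl.features_compacity["features_needed"][:5])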
[docs] def run_app( self, port: int = None, host: str = None, title_story: str = None, settings: dict = None ) -> CustomThread: """ run_app method launches the interpretability web app associated with the shapash object. run_app method can be used directly in a Jupyter notebook The link to the webapp is directly mentioned in the Jupyter output Use object.kill() method to kill the current instance Examples are presented in the web_app tutorial (please check tutorial part of this doc) Parameters ---------- port: int (default: None) The port is by default on 8050. You can specify a custom port for your webapp. host: str (default: None) The default host is '0.0.0.0'. You can specify a custom ip address for your webapp title_story: str (default: None) The default title is empty. You can specify a custom title for your webapp (can be reused in other methods like in a report, ...) settings : dict (default: None) A dict describing the default webapp settings values to be used Possible settings (dict keys) are 'rows', 'points', 'violin', 'features' Values should be positive ints Returns ------- CustomThread Return the thread instance of your server. Example -------- >>> app = xpl.run_app() >>> app.kill() """ if title_story is not None: self.title_story = title_story if hasattr(self, "_case"): self.smartapp = SmartApp(self, settings) if host is None: host = "0.0.0.0" if port is None: port = 8050 host_name = get_host_name() server_instance = CustomThread( target=lambda: self.smartapp.app.run_server(debug=False, host=host, port=port) ) if host_name is None: host_name = host elif host != "0.0.0.0": host_name = host server_instance.start() logging.info(f"Your Shapash application run on http://{host_name}:{port}/") logging.info("Use the method .kill() to down your app.") return server_instance else: raise ValueError("Explainer must be compiled before running app.")
def to_smartpredictor(self): """ Create a SmartPredictor object designed from the following attributes needed from the SmartExplainer Object : features_dict: dict Dictionary mapping technical feature names to domain names. label_dict: dict Dictionary mapping integer labels to domain names (classification - target values). columns_dict: dict Dictionary mapping integer column number to technical feature names. features_types: dict Dictionnary mapping features with the right types needed. model: model object model used to check the different values of target estimate predict proba backend : backend object backend used to compute contributions preprocessing: category_encoders, ColumnTransformer, list or dict The processing apply to the original data. postprocessing: dict Dictionnary of postprocessing modifications to apply in x_init dataframe. _case: string String that informs if the model used is for classification or regression problem. _classes: list, None List of labels if the model used is for classification problem, None otherwise. mask_params: dict (optional) Dictionnary allowing the user to define a apply a filter to summarize the local explainability. """ if self.backend is None: raise ValueError( """ SmartPredictor needs a backend (explainer). Please compile without contributions or specify the explainer used. Make change in compile() step. """ ) self.features_types = {features: str(self.x_init[features].dtypes) for features in self.x_init.columns} listattributes = [ "features_dict", "model", "columns_dict", "backend", "features_types", "label_dict", "preprocessing", "postprocessing", "features_groups", ] params_smartpredictor = [self.check_attributes(attribute) for attribute in listattributes] if not hasattr(self, "mask_params"): self.mask_params = {"features_to_hide": None, "threshold": None, "positive": None, "max_contrib": None} params_smartpredictor.append(self.mask_params) return shapash.explainer.smart_predictor.SmartPredictor(*params_smartpredictor) def check_x_y_attributes(self, x_str, y_str): """ Check if x_str and y_str are attributes of the SmartExplainer Parameters ---------- x_str: string label of the attribute x y_str: string label of the attribute y Returns ------- list of object detained by attributes x and y. """ if not (isinstance(x_str, str) and isinstance(y_str, str)): raise ValueError( """ x and y must be strings. """ ) params_checkypred = [] attributs_explainer = [x_str, y_str] for attribut in attributs_explainer: if hasattr(self, attribut): params_checkypred.append(self.__dict__[attribut]) else: params_checkypred.append(None) return params_checkypred
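A hedged deployment sketch: to_smartpredictor() produces the lighter SmartPredictor object; add_input and detail_contributions are SmartPredictor methods, and the new data frame and path are illustrative.

predictor = xpl.to_smartpredictor()
predictor.save("./predictor.pkl")                  # illustrative path
predictor.add_input(x=X.head(10))                  # new data to score and explain
contributions_df = predictor.detail_contributions()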
[docs] def generate_report( self, output_file, project_info_file, x_train=None, y_train=None, y_test=None, title_story=None, title_description=None, metrics=None, working_dir=None, notebook_path=None, kernel_name=None, ): """ This method will generate an HTML report containing different information about the project. It analyzes the data and the model used in order to provide interesting insights that can be shared using the HTML format. It requires a project info yml file on which can figure different information about the project. Parameters ---------- output_file : str Path to the HTML file to write. project_info_file : str Path to the file used to display some information about the project in the report. x_train : pd.DataFrame, optional DataFrame used for training the model. y_train: pd.Series or pd.DataFrame, optional Series of labels in the training set. y_test : pd.Series or pd.DataFrame, optional Series of labels in the test set. title_story : str, optional Report title. title_description : str, optional Report title description (as written just below the title). metrics : list, optional Metrics used in the model performance section. The metrics parameter should be a list of dict. Each dict contains they following keys : 'path' (path to the metric function, ex: 'sklearn.metrics.mean_absolute_error'), 'name' (optional, name of the metric as displayed in the report), and 'use_proba_values' (optional, possible values are False (default) or True if the metric uses proba values instead of predicted values). For example, metrics=[{'name': 'F1 score', 'path': 'sklearn.metrics.f1_score'}] working_dir : str, optional Working directory in which will be generated the notebook used to create the report and where the objects used to execute it will be saved. This parameter can be usefull if one wants to create its own custom report and debug the notebook used to generate the html report. If None, a temporary directory will be used. notebook_path : str, optional Path to the notebook used to generate the report. If None, the Shapash base report notebook will be used. kernel_name : str, optional Name of the kernel used to generate the report. This parameter can be usefull if you have multiple jupyter kernels and that the method does not use the right kernel by default. Examples -------- >>> xpl.generate_report( output_file='report.html', project_info_file='utils/project_info.yml', x_train=x_train, y_train=y_train, y_test=ytest, title_story="House prices project report", title_description="This document is a data science report of the kaggle house prices project." metrics=[ { 'path': 'sklearn.metrics.mean_squared_error', 'name': 'Mean squared error', # Optional : name that will be displayed next to the metric }, { 'path': 'sklearn.metrics.mean_absolute_error', 'name': 'Mean absolute error', } ] ) """ check_report_requirements() if x_train is not None: x_train = handle_categorical_missing(x_train) # Avoid Import Errors with requirements specific to the Shapash Report from shapash.report.generation import execute_report, export_and_save_report rm_working_dir = False if not working_dir: working_dir = tempfile.mkdtemp() rm_working_dir = True if not hasattr(self, "model"): raise AssertionError( "Explainer object was not compiled. Please compile the explainer " "object using .compile(...) method before generating the report." 
) try: execute_report( working_dir=working_dir, explainer=self, project_info_file=project_info_file, x_train=x_train, y_train=y_train, y_test=y_test, config=dict( title_story=title_story, title_description=title_description, metrics=metrics, ), notebook_path=notebook_path, kernel_name=kernel_name, ) export_and_save_report(working_dir=working_dir, output_file=output_file) if rm_working_dir: shutil.rmtree(working_dir) except Exception as e: if rm_working_dir: shutil.rmtree(working_dir) raise e