"""
Smart explainer module
"""
import copy
import logging
import shutil
import tempfile
import numpy as np
import pandas as pd
import shapash.explainer.smart_predictor
from shapash.backend import BaseBackend, get_backend_cls_from_name
from shapash.backend.shap_backend import get_shap_interaction_values
from shapash.manipulation.select_lines import keep_right_contributions
from shapash.manipulation.summarize import create_grouped_features_values
from shapash.report import check_report_requirements
from shapash.style.style_utils import colors_loading, select_palette
from shapash.utils.check import (
check_additional_data,
check_features_name,
check_label_dict,
check_model,
check_postprocessing,
check_y,
)
from shapash.utils.explanation_metrics import find_neighbors, get_distance, get_min_nb_features, shap_neighbors
from shapash.utils.io import load_pickle, save_pickle
from shapash.utils.model import predict, predict_error, predict_proba
from shapash.utils.threading import CustomThread
from shapash.utils.transform import apply_postprocessing, handle_categorical_missing, inverse_transform
from shapash.utils.utils import get_host_name
from shapash.webapp.smart_app import SmartApp
from .smart_plotter import SmartPlotter
logging.basicConfig(level=logging.INFO)
class SmartExplainer:
"""
The SmartExplainer class is the main object of the Shapash library.
It allows the Data Scientists to perform many operations to make the
results more understandable :
linking encoders, models, predictions, label dict and datasets.
SmartExplainer users have several methods which are described below.
Parameters
----------
model : model object
model used to consistency check. model object can also be used by some method to compute
predict and predict_proba values
backend : str or shapash.backend object (default: 'shap')
Select which computation method to use in order to compute contributions
and feature importance. Possible values are 'shap' or 'lime'. Default is 'shap'.
It is also possible to pass a backend class inherited from shapash.backend.BaseBackend.
preprocessing : category_encoders, ColumnTransformer, list, dict, optional (default: None)
--> Different types of preprocessing are available:
- A single category_encoders (OrdinalEncoder/OnehotEncoder/BaseNEncoder/BinaryEncoder/TargetEncoder)
- A single ColumnTransformer with scikit-learn encoding or category_encoders transformers
- A list with multiple category_encoders with optional (dict, list of dict)
- A list with a single ColumnTransformer with optional (dict, list of dict)
- A dict
- A list of dict
postprocessing : dict, optional (default: None)
Dictionnary of postprocessing modifications to apply in x_init dataframe.
Dictionnary with feature names as keys (or number, or well labels referencing to features names),
which modifies dataset features by features.
--> Different types of postprocessing are available, but the syntax is this one:
One key by features, 5 different types of modifications:
>>> {
'feature1' : { 'type' : 'prefix', 'rule' : 'age: ' },
'feature2' : { 'type' : 'suffix', 'rule' : '$/week ' },
'feature3' : { 'type' : 'transcoding', 'rule': { 'code1' : 'single', 'code2' : 'married'}},
'feature4' : { 'type' : 'regex' , 'rule': { 'in' : 'AND', 'out' : ' & ' }},
'feature5' : { 'type' : 'case' , 'rule': 'lower' }
}
Only one transformation by features is possible.
features_groups : dict, optional (default: None)
Dictionnary containing features that should be grouped together. This option allows
to compute and display the contributions and importance of this group of features.
Features that are grouped together will still be displayed in the webapp when clicking
on a group.
>>> {
'feature_group_1' : ['feature3', 'feature7', 'feature24'],
'feature_group_2' : ['feature1', 'feature12'],
}
features_dict: dict
Dictionary mapping technical feature names to domain names.
label_dict: dict
Dictionary mapping integer labels to domain names (classification - target values).
title_story: str (default: None)
The default title is empty. You can specify a custom title
which can be used the webapp, or other methods
palette_name : str
Name of the palette used for the colors of the report (refer to style folder).
colors_dict : dict
Dictionary containing every color palette. You can use this parameter to change
any color of the graphs.
**backend_kwargs : dict
Keyword parameters to be passed to the backend.
Attributes
----------
data: dict
Data dictionary has 3 entries. Each key returns a pd.DataFrame (regression) or a list of pd.DataFrame
(classification - The length of the lists is equivalent to the number of labels).
All pd.DataFrame have the same shape (n_samples, n_features).
For the regression case, data that should be regarded as a single array
of size (n_samples, n_features, 3).
data['contrib_sorted']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification)
Contains local contributions of the prediction set, with common line index.
Columns are 'contrib_1', 'contrib_2', ... and contains the top contributions
for each line from left to right. In multi-class problems, this is a list of
contributions, one for each class.
data['var_dict']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification)
Must contain only ints. It gives, for each line, the list of most import features
regarding the local decomposition. In order to save space, columns are denoted by
integers, the conversion being done with the columns_dict member. In multi-class
problems, this is a list of dataframes, one for each class.
data['x_sorted']: pandas.DataFrame (regression) or list of pandas.DataFrame (classification)
It gives, for each line, the list of most important features values regarding the local
decomposition. These values can only be understood with respect to data['var_dict']
backend_name:
backend name if backend passed is a string
x_encoded: pandas.DataFrame
preprocessed dataset used by the model to perform the prediction.
x_init: pandas.DataFrame
x_encoded dataset with inverse transformation with eventual postprocessing modifications.
x_contrib_plot: pandas.DataFrame
x_encoded dataset with inverse transformation, without postprocessing used for contribution_plot.
y_pred: pandas.DataFrame
User-specified prediction values.
contributions: pandas.DataFrame (regression) or list (classification)
local contributions aggregated if the preprocessing part requires it (e.g. one-hot encoding).
features_dict: dict
Dictionary mapping technical feature names to domain names.
inv_features_dict: dict
Inverse features_dict mapping.
label_dict: dict
Dictionary mapping integer labels to domain names (classification - target values).
inv_label_dict: dict
Inverse label_dict mapping.
columns_dict: dict
Dictionary mapping integer column number to technical feature names.
plot: object
Helper object containing all plotting functions (Bridge pattern).
model: model object
model used to check the different values of target estimate predict proba
features_desc: dict
Dictionary that references the numbers of feature values in the x_init
features_imp: pandas.Series (regression) or list (classification)
Features importance values
local_neighbors: dict
Dictionary of values to be displayed on the local_neighbors plot.
The key is "norm_shap (normalized contributions values of instance and neighbors)
features_stability: dict
Dictionary of arrays to be displayed on the stability plot.
The keys are "amplitude" (average contributions values for selected instances) and
"stability" (stability metric across neighborhood)
preprocessing : category_encoders, ColumnTransformer, list or dict
The processing apply to the original data.
postprocessing : dict
Dictionnary of postprocessing modifications to apply in x_init dataframe.
y_target : pandas.Series or pandas.DataFrame, optional (default: None)
Target values
Example
--------
>>> xpl = SmartExplainer(model, features_dict=featd,label_dict=labeld)
>>> xpl.compile(x=x_encoded, y_target=y)
>>> xpl.plot.features_importance()
"""
def __init__(
self,
model,
backend="shap",
preprocessing=None,
postprocessing=None,
features_groups=None,
features_dict=None,
label_dict=None,
title_story: str = None,
palette_name=None,
colors_dict=None,
**backend_kwargs,
):
if features_dict is not None and not isinstance(features_dict, dict):
raise ValueError(
"""
features_dict must be a dict
"""
)
if label_dict is not None and isinstance(label_dict, dict) is False:
raise ValueError(
"""
label_dict must be a dict
"""
)
self.model = model
self.preprocessing = preprocessing
self.backend_name = None
if isinstance(backend, str):
self.backend_name = backend
elif isinstance(backend, BaseBackend):
self.backend = backend
if backend.preprocessing is None and self.preprocessing is not None:
self.backend.preprocessing = self.preprocessing
else:
raise NotImplementedError(f"Unknown backend : {backend}")
self.backend_kwargs = backend_kwargs
self.features_dict = dict() if features_dict is None else copy.deepcopy(features_dict)
self.label_dict = label_dict
self.plot = SmartPlotter(self)
self.title_story = title_story if title_story is not None else ""
self.palette_name = palette_name if palette_name else "default"
self.colors_dict = copy.deepcopy(select_palette(colors_loading(), self.palette_name))
if colors_dict is not None:
self.colors_dict.update(colors_dict)
self.plot.define_style_attributes(colors_dict=self.colors_dict)
self._case, self._classes = check_model(self.model)
self.postprocessing = postprocessing
self.check_label_dict()
if self.label_dict:
self.inv_label_dict = {v: k for k, v in self.label_dict.items()}
self.features_groups = features_groups
self.local_neighbors = None
self.features_stability = None
self.features_compacity = None
self.contributions = None
self.explain_data = None
self.features_imp = None
def compile(
    self,
    x,
    contributions=None,
    y_pred=None,
    proba_values=None,
    y_target=None,
    additional_data=None,
    additional_features_dict=None,
):
    """
    Prepare every structure needed for plots and summaries.

    Instantiates the backend when it was given by name, inverse-
    transforms the dataset, checks the optional user inputs, computes
    (or formats) local contributions and ranks them. Must be called
    once before any plotting; can take a while on large datasets.

    Parameters
    ----------
    x : pandas.DataFrame
        Preprocessed prediction set (the model can be applied to it).
    contributions : pandas.DataFrame, np.ndarray or list, optional
        Precomputed contributions; computed by the backend when None.
        If a DataFrame, index/columns should match the prediction set;
        if an ndarray, they are generated from x.
    y_pred : pandas.Series or pandas.DataFrame, optional
        User predictions (1 column, same index as x_init).
    proba_values : pandas.Series or pandas.DataFrame, optional
        User probabilities (1 column, same index as x_init).
    y_target : pandas.Series or pandas.DataFrame, optional
        Ground-truth values (1 column, same index as x_init).
    additional_data : pandas.DataFrame, optional
        Extra features outside the model, shown/filterable in the
        SmartApp (same index as x_init).
    additional_features_dict : dict, optional
        Display names for the additional features.

    Example
    --------
    >>> xpl.compile(x=x_test)
    """
    # A backend given by name is instantiated now that x is known.
    if isinstance(self.backend_name, str):
        backend_cls = get_backend_cls_from_name(self.backend_name)
        self.backend = backend_cls(
            model=self.model, preprocessing=self.preprocessing, masker=x, **self.backend_kwargs
        )
    self.x_encoded = handle_categorical_missing(x)
    decoded = inverse_transform(self.x_encoded, self.preprocessing)
    self.x_init = handle_categorical_missing(decoded)
    # Predictions/probabilities: keep user values, else compute from the model.
    self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred")
    if self.y_pred is None and hasattr(self.model, "predict"):
        self.predict()
    self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values")
    if (self._case == "classification") and (self.proba_values is None) and hasattr(self.model, "predict_proba"):
        self.predict_proba()
    self.y_target = check_y(self.x_init, y_target, y_name="y_target")
    self.prediction_error = predict_error(self.y_target, self.y_pred, self._case)
    self._get_contributions_from_backend_or_user(x, contributions)
    self.check_contributions()
    self.columns_dict = dict(enumerate(self.x_init.columns))
    self.check_features_dict()
    self.inv_features_dict = {v: k for k, v in self.features_dict.items()}
    self._apply_all_postprocessing_modifications()
    self.data = self.state.assign_contributions(self.state.rank_contributions(self.contributions, self.x_init))
    self.features_desc = dict(self.x_init.nunique())
    if self.features_groups is not None:
        self._compile_features_groups(self.features_groups)
    if additional_features_dict is None:
        self.additional_features_dict = dict()
    else:
        self.additional_features_dict = self._compile_additional_features_dict(additional_features_dict)
    self.additional_data = self._compile_additional_data(additional_data)
def _get_contributions_from_backend_or_user(self, x, contributions):
# Computing contributions using backend
if contributions is None:
self.explain_data = self.backend.run_explainer(x=x)
self.contributions = self.backend.get_local_contributions(x=x, explain_data=self.explain_data)
else:
self.explain_data = contributions
self.contributions = self.backend.format_and_aggregate_local_contributions(
x=x,
contributions=contributions,
)
self.state = self.backend.state
def _apply_all_postprocessing_modifications(self):
    """
    Normalize the postprocessing dict keys, validate it against x_init
    and apply it, keeping an untouched copy of x_init when columns are
    about to be stringified.
    """
    normalized = self.modify_postprocessing(self.postprocessing)
    check_postprocessing(self.x_init, normalized)
    self.postprocessing_modifications = self.check_postprocessing_modif_strings(normalized)
    self.postprocessing = normalized
    if self.postprocessing_modifications:
        # Numeric columns will become strings: keep the numeric version
        # for contribution plots.
        self.x_contrib_plot = copy.deepcopy(self.x_init)
        self.x_init = self.apply_postprocessing(normalized)
def _compile_features_groups(self, features_groups):
    """
    Compute grouped contributions, grouped dataset values and ranked
    data for the user-declared groups of features.
    """
    if self.backend.support_groups is False:
        raise AssertionError(f"Selected backend ({self.backend.name}) does not support groups of features.")
    # Aggregate local contributions per group.
    self.contributions_groups = self.state.compute_grouped_contributions(self.contributions, features_groups)
    self.features_imp_groups = None
    # Make sure group names are known to the display dictionaries.
    self._update_features_dict_with_groups(features_groups=features_groups)
    # Build per-group values (used for projections in plots).
    self.x_init_groups = create_grouped_features_values(
        x_init=self.x_init,
        x_encoded=self.x_encoded,
        preprocessing=self.preprocessing,
        features_groups=self.features_groups,
        features_dict=self.features_dict,
        how="dict_of_values",
    )
    # Rank grouped contributions the same way as ungrouped ones.
    ranked = self.state.rank_contributions(self.contributions_groups, self.x_init_groups)
    self.data_groups = self.state.assign_contributions(ranked)
    self.columns_dict_groups = dict(enumerate(self.x_init_groups.columns))
def _compile_additional_features_dict(self, additional_features_dict):
"""
Performs required computations for additional features dict.
"""
if not isinstance(additional_features_dict, dict):
raise ValueError(
"""
additional_features_dict must be a dict
"""
)
additional_features_dict = {f"_{key}": f"_{value}" for key, value in additional_features_dict.items()}
return additional_features_dict
def _compile_additional_data(self, additional_data):
"""
Performs required computations for additional data.
"""
if additional_data is not None:
check_additional_data(self.x_init, additional_data)
for feature in additional_data.columns:
if feature in self.features_dict.keys() and feature not in self.columns_dict.values():
self.additional_features_dict[f"_{feature}"] = f"_{self.features_dict[feature]}"
del self.features_dict[feature]
additional_data = additional_data.add_prefix("_")
for feature in set(list(additional_data.columns)) - set(self.additional_features_dict):
self.additional_features_dict[feature] = feature
return additional_data
def define_style(self, palette_name=None, colors_dict=None):
"""
Set the color set to use in plots.
"""
if palette_name is None and colors_dict is None:
raise ValueError("At least one of palette_name or colors_dict parameters must be defined")
new_palette_name = palette_name or self.palette_name
new_colors_dict = copy.deepcopy(select_palette(colors_loading(), new_palette_name))
if colors_dict is not None:
new_colors_dict.update(colors_dict)
self.colors_dict.update(new_colors_dict)
self.plot.define_style_attributes(colors_dict=self.colors_dict)
[docs] def add(
self,
y_pred=None,
proba_values=None,
y_target=None,
label_dict=None,
features_dict=None,
title_story: str = None,
additional_data=None,
additional_features_dict=None,
):
"""
add method allows the user to add a label_dict, features_dict
or y_pred without compiling again (and it can last a few moments).
y_pred can be used in the plot to color scatter.
y_pred is needed in the to_pandas method.
label_dict and features_dict displays allow to display clearer results.
Parameters
----------
y_pred : pandas.Series, optional (default: None)
Prediction values (1 column only).
The index must be identical to the index of x_init.
proba_values : pandas.Series, optional (default: None)
Probability values (1 column only).
The index must be identical to the index of x_init.
label_dict: dict, optional (default: None)
Dictionary mapping integer labels to domain names.
features_dict: dict, optional (default: None)
Dictionary mapping technical feature names to domain names.
title_story: str (default: None)
The default title is empty. You can specify a custom title
which can be used the webapp, or other methods
y_target : pandas.Series or pandas.DataFrame, optional (default: None)
Target values (1 column only).
The index must be identical to the index of x_init.
This is an interesting parameter for outputs on prediction
additional_data : pandas.DataFrame, optional (default: None)
Additional dataset of features outsite the model.
The index must be identical to the index of x_init.
This is an interesting parameter for visualisation and filtering
in Shapash SmartApp.
additional_features_dict : dict
Dictionary mapping technical feature names to domain names for additional data.
"""
if y_pred is not None:
self.y_pred = check_y(self.x_init, y_pred, y_name="y_pred")
if hasattr(self, "y_target"):
self.prediction_error = predict_error(self.y_target, self.y_pred, self._case)
if proba_values is not None:
self.proba_values = check_y(self.x_init, proba_values, y_name="proba_values")
if y_target is not None:
self.y_target = check_y(self.x_init, y_target, y_name="y_target")
if hasattr(self, "y_pred"):
self.prediction_error = predict_error(self.y_target, self.y_pred, self._case)
if label_dict is not None:
if isinstance(label_dict, dict) is False:
raise ValueError(
"""
label_dict must be a dict
"""
)
self.label_dict = label_dict
self.check_label_dict()
self.inv_label_dict = {v: k for k, v in self.label_dict.items()}
if features_dict is not None:
if isinstance(features_dict, dict) is False:
raise ValueError(
"""
features_dict must be a dict
"""
)
self.features_dict = features_dict
self.check_features_dict()
self.inv_features_dict = {v: k for k, v in self.features_dict.items()}
if title_story is not None:
self.title_story = title_story
if additional_features_dict is not None:
self.additional_features_dict = self._compile_additional_features_dict(additional_features_dict)
if additional_data is not None:
self.additional_data = self._compile_additional_data(additional_data)
def get_interaction_values(self, n_samples_max=None, selection=None):
"""
Compute shap interaction values for each row of x_encoded.
This function is only available for explainer of type TreeExplainer (used for tree based models).
Please refer to the official tree shap paper for more information : https://arxiv.org/pdf/1802.03888.pdf
Parameters
----------
n_samples_max : int, optional
Limit the number of points for which we compute the interactions.
selection : list, optional
Contains list of index, subset of the input DataFrame that we want to plot
Returns
-------
np.ndarray
Shap interaction values for each sample as an array of shape (# samples x # features x # features).
"""
x = copy.deepcopy(self.x_encoded)
if selection:
x = x.loc[selection]
if hasattr(self, "x_interaction"):
if self.x_interaction.equals(x[:n_samples_max]):
return self.interaction_values
self.x_interaction = x[:n_samples_max]
self.interaction_values = get_shap_interaction_values(self.x_interaction, self.backend.explainer)
return self.interaction_values
def check_postprocessing_modif_strings(self, postprocessing=None):
"""
Check if any modification of postprocessing will convert numeric values into strings values.
If so, return True, otherwise False.
Parameters
----------
postprocessing: dict
Dict of postprocessing modifications to apply.
Returns
-------
modif: bool
Boolean which is True if any numerical variable will be converted into string.
"""
modif = False
if postprocessing is not None:
for key in postprocessing.keys():
dict_postprocess = postprocessing[key]
if dict_postprocess["type"] in {"prefix", "suffix"} and pd.api.types.is_numeric_dtype(self.x_init[key]):
modif = True
return modif
def modify_postprocessing(self, postprocessing=None):
"""
Modifies postprocessing parameter, to change only keys, with features name,
in case of parameters are not real feature names (with columns_dict,
or inv_features_dict).
Parameters
----------
postprocessing : Dict
Dictionnary of postprocessing to modify.
Returns
-------
Dict
Modified dictionnary, with same values but keys directly referencing to feature names.
"""
if postprocessing:
new_dic = dict()
for key in postprocessing.keys():
if key in self.features_dict:
new_dic[key] = postprocessing[key]
elif key in self.columns_dict.keys():
new_dic[self.columns_dict[key]] = postprocessing[key]
elif key in self.inv_features_dict:
new_dic[self.inv_features_dict[key]] = postprocessing[key]
else:
raise ValueError(f"Feature name '{key}' not found in the dataset.")
return new_dic
def apply_postprocessing(self, postprocessing=None):
"""
Modifies x_init Dataframe according to postprocessing modifications, if exists.
Parameters
----------
postprocessing: Dict
Dictionnary of postprocessing modifications to apply in x_init.
Returns
-------
pandas.Dataframe
Returns x_init if postprocessing is empty, modified dataframe otherwise.
"""
if postprocessing:
return apply_postprocessing(self.x_init, postprocessing)
else:
return self.x_init
def check_label_dict(self):
"""
Check if label_dict and model _classes match
"""
if self._case != "regression":
return check_label_dict(self.label_dict, self._case, self._classes)
def check_features_dict(self):
"""
Check the features_dict and add the necessary keys if all the
input X columns are not present
"""
for feature in set(list(self.columns_dict.values())) - set(list(self.features_dict)):
self.features_dict[feature] = feature
def _update_features_dict_with_groups(self, features_groups):
"""
Add groups into features dict and inv_features_dict if not present.
"""
for group_name in features_groups.keys():
self.features_desc[group_name] = 1000
if group_name not in self.features_dict.keys():
self.features_dict[group_name] = group_name
self.inv_features_dict[group_name] = group_name
def check_contributions(self):
"""
Check if contributions and prediction set match in terms of shape and index.
"""
if not self.state.check_contributions(self.contributions, self.x_init):
raise ValueError(
"""
Prediction set and contributions should have exactly the same number of lines
and number of columns. the order of the columns must be the same
Please check x, contributions and preprocessing arguments.
"""
)
def check_label_name(self, label, origin=None):
"""
Convert a string label in integer. If the label is already
an integer nothing is done. In all other cases an error is raised.
Parameters
----------
label: int or string
Integer (id) or string (business names)
origin: None, 'num', 'code', 'value' (default: None)
Kind of the label used in parameter
Returns
-------
tuple
label num, label code (class of the mode), label value
"""
if origin is None:
if label in self._classes:
origin = "code"
elif self.label_dict is not None and label in self.label_dict.values():
origin = "value"
elif isinstance(label, int) and label in range(-1, len(self._classes)):
origin = "num"
try:
if origin == "num":
label_num = label
label_code = self._classes[label]
label_value = self.label_dict[label_code] if self.label_dict else label_code
elif origin == "code":
label_code = label
label_num = self._classes.index(label)
label_value = self.label_dict[label_code] if self.label_dict else label_code
elif origin == "value":
label_code = self.inv_label_dict[label]
label_num = self._classes.index(label_code)
label_value = label
else:
raise ValueError
except ValueError:
raise Exception({"message": "Origin must be 'num', 'code' or 'value'."})
except Exception:
raise Exception({"message": f"Label ({label}) not found for origin ({origin})"})
return label_num, label_code, label_value
def check_features_name(self, features, use_groups=False):
    """
    Translate feature identifiers (column ids, technical names or
    display names) into column ids.

    Parameters
    ----------
    features : list
        Ints (column ids) or strings (technical/display names).
    use_groups : bool
        Resolve against grouped columns instead of raw columns.

    Returns
    -------
    list of ints
        Column ids compatible with var_dict.
    """
    columns_dict = self.columns_dict_groups if use_groups else self.columns_dict
    return check_features_name(columns_dict, self.features_dict, features)
def check_attributes(self, attribute):
"""
Check that explainer has the attribute precised
Parameters
----------
attribute: string
the label of the attribute to test
Returns
-------
Object content of the attribute specified from SmartExplainer instance
"""
if not hasattr(self, attribute):
raise ValueError(
"""
attribute {} isn't an attribute of the explainer precised.
""".format(
attribute
)
)
return self.__dict__[attribute]
[docs] def filter(self, features_to_hide=None, threshold=None, positive=None, max_contrib=None, display_groups=None):
"""
The filter method is an important method which allows to summarize the local explainability
by using the user defined parameters which correspond to its use case.
Filter method is used with the local_plot method of Smarplotter to see the concrete result of this summary
with a local contribution barchart
Please, watch the local_plot tutorial to see how these two methods are combined with a concrete example
Parameters
----------
features_to_hide : list, optional (default: None)
List of strings, containing features to hide.
threshold : float, optional (default: None)
Absolute threshold below which any contribution is hidden.
positive: bool, optional (default: None)
If True, hide negative values. False, hide positive values
If None, hide nothing.
max_contrib : int, optional (default: None)
Maximum number of contributions to show.
display_groups : bool (default: None)
Whether or not to display groups of features. This option is
only useful if groups of features are declared when compiling
SmartExplainer object.
"""
display_groups = True if (display_groups is not False and self.features_groups is not None) else False
if display_groups:
data = self.data_groups
else:
data = self.data
mask = [self.state.init_mask(data["contrib_sorted"], True)]
if features_to_hide:
mask.append(
self.state.hide_contributions(
data["var_dict"],
features_list=self.check_features_name(features_to_hide, use_groups=display_groups),
)
)
if threshold:
mask.append(self.state.cap_contributions(data["contrib_sorted"], threshold=threshold))
if positive is not None:
mask.append(self.state.sign_contributions(data["contrib_sorted"], positive=positive))
self.mask = self.state.combine_masks(mask)
if max_contrib:
self.mask = self.state.cutoff_contributions(self.mask, max_contrib=max_contrib)
self.masked_contributions = self.state.compute_masked_contributions(data["contrib_sorted"], self.mask)
self.mask_params = {
"features_to_hide": features_to_hide,
"threshold": threshold,
"positive": positive,
"max_contrib": max_contrib,
}
def save(self, path):
    """
    Pickle this SmartExplainer to disk so results can be displayed
    later without recompiling.

    Parameters
    ----------
    path : str
        File path to store the pickle file.

    Example
    --------
    >>> xpl.save('path_to_pkl/xpl.pkl')
    """
    # Drop any running SmartApp before pickling — presumably it is not
    # picklable; confirm against SmartApp internals.
    if hasattr(self, "smartapp"):
        self.smartapp = None
    save_pickle(self, path)
@classmethod
def load(cls, path):
    """
    Rebuild a SmartExplainer from a pickle produced by save().

    Parameters
    ----------
    path : str
        File path of the pickle file.

    Returns
    -------
    SmartExplainer

    Raises
    ------
    ValueError
        If the pickled object is not a SmartExplainer.

    Example
    --------
    >>> xpl = SmartExplainer.load('path_to_pkl/xpl.pkl')
    """
    loaded = load_pickle(path)
    if not isinstance(loaded, SmartExplainer):
        raise ValueError("File is not a SmartExplainer object")
    # Re-run __init__ with the pickled model, then copy over the whole
    # pickled state.
    fresh = cls(model=loaded.model)
    fresh.__dict__.update(loaded.__dict__)
    return fresh
def predict_proba(self):
    """
    Compute probability values for each x_encoded row and store them
    in the proba_values attribute.
    """
    probabilities = predict_proba(self.model, self.x_encoded, self._classes)
    self.proba_values = probabilities
def predict(self):
    """
    Compute the model output for each row of ``x_encoded`` and store it
    in the ``y_pred`` attribute. When a target is available, also compute
    and store the prediction error.
    """
    predictions = predict(self.model, self.x_encoded)
    self.y_pred = predictions
    # Prediction error is only meaningful when a ground-truth target was given.
    if hasattr(self, "y_target"):
        self.prediction_error = predict_error(self.y_target, predictions, self._case)
def to_pandas(
    self, features_to_hide=None, threshold=None, positive=None, max_contrib=None, proba=False, use_groups=None
):
    """
    Export a summary of local explainability as a pandas DataFrame.

    This method proposes a set of parameters to summarize the explainability of each point.
    If the user does not specify any, the to_pandas method uses the parameters specified during
    the last execution of the filter method.
    In the classification case, to_pandas summarizes the explainability which corresponds
    to the predicted values specified by the user (with compile or add method);
    the proba parameter adds the corresponding predict_proba value for each point.
    In the classification case, there are two ways to use this method:
    - Provide a real prediction set to explain
    - Focus on a constant target value and look at the proba and explainability corresponding to each point
      (in that case, specify a constant pd.Series with add or compile method)
    Examples are presented in the tutorial local_plot (please check the tutorial part of this doc).

    Parameters
    ----------
    features_to_hide : list, optional (default: None)
        List of strings, containing features to hide.
    threshold : float, optional (default: None)
        Absolute threshold below which any contribution is hidden.
    positive: bool, optional (default: None)
        If True, hide negative values. Hide positive values otherwise. If None, hide nothing.
    max_contrib : int, optional (default: None)
        Number of contributions to show in the pandas df.
    proba : bool, optional (default: False)
        adding proba in output df
    use_groups : bool (optional)
        Whether or not to use groups of features contributions (only available if features_groups
        parameter was not empty when calling compile method).

    Returns
    -------
    pandas.DataFrame
        - selected explanation of each row for classification case

    Examples
    --------
    >>> summary_df = xpl.to_pandas(max_contrib=2,proba=True)
    >>> summary_df
  	pred	proba	    feature_1	value_1	    contribution_1	feature_2	value_2	    contribution_2
    0	0	    0.756416	Sex	        1.0	        0.322308	    Pclass	    3.0	        0.155069
    1	3	    0.628911	Sex	        2.0	        0.585475	    Pclass	    1.0	        0.370504
    2	0	    0.543308	Sex	        2.0	        -0.486667	    Pclass	    3.0	        0.255072
    """
    # Groups of features are used by default when they were compiled, unless
    # the caller explicitly opts out with use_groups=False.
    use_groups = True if (use_groups is not False and self.features_groups is not None) else False
    if use_groups:
        data = self.data_groups
    else:
        data = self.data
    # Classification: y_pred is needed
    if self.y_pred is None:
        raise ValueError("You have to specify y_pred argument. Please use add() or compile() method")
    # Reuse the mask from the last filter() call only when no new summary
    # parameter was given AND the stored mask's shape still matches the data.
    if (
        all(var is None for var in [features_to_hide, threshold, positive, max_contrib])
        and hasattr(self, "mask_params")
        and (
            # the already computed mask can have the wrong shape when groups of
            # features were used once and the method is then called without groups
            (
                isinstance(data["contrib_sorted"], pd.DataFrame)
                and len(data["contrib_sorted"].columns) == len(self.mask.columns)
            )
            or (
                isinstance(data["contrib_sorted"], list)
                and len(data["contrib_sorted"][0].columns) == len(self.mask[0].columns)
            )
        )
    ):
        print("to_pandas params: " + str(self.mask_params))
    else:
        # Recompute the mask with the requested summary parameters.
        self.filter(
            features_to_hide=features_to_hide,
            threshold=threshold,
            positive=positive,
            max_contrib=max_contrib,
            display_groups=use_groups,
        )
    if use_groups:
        columns_dict = {i: col for i, col in enumerate(self.x_init_groups.columns)}
    else:
        columns_dict = self.columns_dict
    # Summarize information
    data["summary"] = self.state.summarize(
        data["contrib_sorted"], data["var_dict"], data["x_sorted"], self.mask, columns_dict, self.features_dict
    )
    # Matching with y_pred
    if proba:
        self.predict_proba()
        proba_values = self.proba_values
    else:
        proba_values = None
    y_pred, summary = keep_right_contributions(
        self.y_pred, data["summary"], self._case, self._classes, self.label_dict, proba_values
    )
    return pd.concat([y_pred, summary], axis=1)
def compute_features_import(self, force=False):
    """
    Compute a relative features importance: the sum of absolute contribution
    values for each feature, expressed in base 100.

    The result is stored in the ``features_imp`` attribute
    (and ``features_imp_groups`` when groups of features were compiled).

    Parameters
    ----------
    force: bool (default: False)
        True to force the compute even if features importance is
        already calculated.

    Returns
    -------
    pd.Serie (Regression)
    or list of pd.Serie (Classification: One Serie for each target modality)
        Each Serie: feature importance, One row by feature,
        index of the serie = contributions.columns
    """
    # Honor the documented contract: skip the (potentially expensive) backend
    # call when the importance is already available, unless force=True.
    if self.features_imp is None or force:
        self.features_imp = self.backend.get_global_features_importance(
            contributions=self.contributions, explain_data=self.explain_data, subset=None
        )
    if self.features_groups is not None and self.features_imp_groups is None:
        self.features_imp_groups = self.state.compute_features_import(self.contributions_groups)
def compute_features_stability(self, selection):
    """
    For a selection of instances, compute the features-stability metrics used
    by ``local_neighbors_plot`` and ``local_stability_plot``.

    - For a single instance, store the (normalized) contribution values of the
      instance and its neighbors in ``local_neighbors``.
    - For multiple instances, store in ``features_stability`` the average
      (normalized) contribution values of instances and neighbors (=amplitude)
      and the variability of those values in the neighborhood (=variability).

    Parameters
    ----------
    selection: list
        Indices of rows to be displayed on the stability plot

    Returns
    -------
    Dictionary
        Values that will be displayed on the graph. Keys are "amplitude", "variability" and "norm_shap"
    """
    if self._case == "classification" and len(self._classes) > 2:
        raise AssertionError("Multi-class classification is not supported")
    all_neighbors = find_neighbors(selection, self.x_encoded, self.model, self._case)
    if len(selection) == 1:
        # Single instance: only the normalized contributions are needed.
        norm_shap, _, _ = shap_neighbors(all_neighbors[0], self.x_encoded, self.contributions, self._case)
        self.local_neighbors = {"norm_shap": norm_shap}
    else:
        n_instances = len(selection)
        n_features = self.x_init.shape[1]
        amplitude = np.zeros((n_instances, n_features))
        variability = np.zeros((n_instances, n_features))
        # One explanation per instance (together with its neighborhood).
        for row in range(n_instances):
            _, variability[row, :], amplitude[row, :] = shap_neighbors(
                all_neighbors[row], self.x_encoded, self.contributions, self._case
            )
        self.features_stability = {"variability": variability, "amplitude": amplitude}
def compute_features_compacity(self, selection, distance, nb_features):
    """
    For a selection of instances, compute the features-compacity metrics used
    by ``compacity_plot`` and store them in ``features_compacity``:

    * the minimum number of features needed for a given approximation level
    * conversely, the approximation reached with a given number of features

    Parameters
    ----------
    selection: list
        Indices of rows to be displayed on the stability plot
    distance : float
        How close we want to be from model with all features
    nb_features : int
        Number of features used
    """
    if self._case == "classification" and len(self._classes) > 2:
        raise AssertionError("Multi-class classification is not supported")
    features_needed = get_min_nb_features(selection, self.contributions, self._case, distance)
    # Approximations above 100% are clipped back into [0, 1].
    distance_reached = np.clip(
        get_distance(selection, self.contributions, self._case, nb_features), 0, 1
    )
    self.features_compacity = {"features_needed": features_needed, "distance_reached": distance_reached}
def init_app(self, settings: dict = None):
    """
    Instantiate the SmartApp attribute without starting a server, for cases
    where the web app is hosted by other means.

    Parameters
    ----------
    settings : dict (default: None)
        A dict describing the default webapp settings values to be used
        Possible settings (dict keys) are 'rows', 'points', 'violin', 'features'
        Values should be positive ints
    """
    self.smartapp = SmartApp(self, settings)
def run_app(
    self, port: int = None, host: str = None, title_story: str = None, settings: dict = None
) -> CustomThread:
    """
    Launch the interpretability web app associated with this explainer.

    Can be called directly from a Jupyter notebook; the link to the webapp is
    printed in the output. Use the returned thread's ``kill()`` method to stop
    the current instance. Examples are presented in the web_app tutorial
    (please check the tutorial part of this doc).

    Parameters
    ----------
    port: int (default: None)
        Custom port for the webapp; defaults to 8050.
    host: str (default: None)
        Custom ip address for the webapp; defaults to '0.0.0.0'.
    title_story: str (default: None)
        Custom title for the webapp (can be reused in other methods
        like in a report, ...). Empty by default.
    settings : dict (default: None)
        A dict describing the default webapp settings values to be used
        Possible settings (dict keys) are 'rows', 'points', 'violin', 'features'
        Values should be positive ints

    Returns
    -------
    CustomThread
        Return the thread instance of your server.

    Raises
    ------
    ValueError
        If the explainer was not compiled beforehand.

    Example
    --------
    >>> app = xpl.run_app()
    >>> app.kill()
    """
    if title_story is not None:
        self.title_story = title_story
    # compile() sets _case; the app cannot run on an uncompiled explainer.
    if not hasattr(self, "_case"):
        raise ValueError("Explainer must be compiled before running app.")
    self.smartapp = SmartApp(self, settings)
    if host is None:
        host = "0.0.0.0"
    if port is None:
        port = 8050
    displayed_host = get_host_name()
    # Fall back to the bind address when no machine name is available, or
    # when a specific (non-wildcard) address was requested.
    if displayed_host is None or host != "0.0.0.0":
        displayed_host = host
    server_instance = CustomThread(
        target=lambda: self.smartapp.app.run_server(debug=False, host=host, port=port)
    )
    server_instance.start()
    logging.info(f"Your Shapash application run on http://{displayed_host}:{port}/")
    logging.info("Use the method .kill() to down your app.")
    return server_instance
def to_smartpredictor(self):
    """
    Build a SmartPredictor object from this explainer's state.

    The predictor is assembled from the attributes needed for deployment:

    features_dict: dict
        Dictionary mapping technical feature names to domain names.
    label_dict: dict
        Dictionary mapping integer labels to domain names (classification - target values).
    columns_dict: dict
        Dictionary mapping integer column number to technical feature names.
    features_types: dict
        Dictionnary mapping features with the right types needed.
    model: model object
        model used to check the different values of target estimate predict proba
    backend : backend object
        backend used to compute contributions
    preprocessing: category_encoders, ColumnTransformer, list or dict
        The processing apply to the original data.
    postprocessing: dict
        Dictionnary of postprocessing modifications to apply in x_init dataframe.
    mask_params: dict (optional)
        Dictionnary allowing the user to define a apply a filter to summarize the local explainability.

    Raises
    ------
    ValueError
        If no backend is available on this explainer.
    """
    if self.backend is None:
        raise ValueError(
            """
            SmartPredictor needs a backend (explainer).
            Please compile without contributions or specify the
            explainer used. Make change in compile() step.
            """
        )
    # Record the dtype of every column so the predictor can validate inputs.
    self.features_types = {column: str(self.x_init[column].dtypes) for column in self.x_init.columns}
    # Order matters: SmartPredictor receives these positionally.
    attribute_names = (
        "features_dict",
        "model",
        "columns_dict",
        "backend",
        "features_types",
        "label_dict",
        "preprocessing",
        "postprocessing",
        "features_groups",
    )
    predictor_params = [self.check_attributes(name) for name in attribute_names]
    if not hasattr(self, "mask_params"):
        # No filter() was applied yet: default to an empty summary filter.
        self.mask_params = {"features_to_hide": None, "threshold": None, "positive": None, "max_contrib": None}
    predictor_params.append(self.mask_params)
    return shapash.explainer.smart_predictor.SmartPredictor(*predictor_params)
def check_x_y_attributes(self, x_str, y_str):
    """
    Fetch two attributes of this SmartExplainer by name.

    Parameters
    ----------
    x_str: string
        label of the attribute x
    y_str: string
        label of the attribute y

    Returns
    -------
    list
        The two attribute values, in order; ``None`` for any attribute
        that does not exist.

    Raises
    ------
    ValueError
        If either name is not a string.
    """
    if not (isinstance(x_str, str) and isinstance(y_str, str)):
        raise ValueError(
            """
            x and y must be strings.
            """
        )
    # getattr (rather than self.__dict__[...]) also resolves class-level
    # attributes and properties, which hasattr reports as present but the
    # instance dict would miss, raising KeyError.
    return [getattr(self, attribut, None) for attribut in (x_str, y_str)]
def generate_report(
    self,
    output_file,
    project_info_file,
    x_train=None,
    y_train=None,
    y_test=None,
    title_story=None,
    title_description=None,
    metrics=None,
    working_dir=None,
    notebook_path=None,
    kernel_name=None,
):
    """
    Generate an HTML report containing different information about the project.

    It analyzes the data and the model used in order to provide interesting
    insights that can be shared using the HTML format.
    It requires a project info yml file on which can figure different information about the project.

    Parameters
    ----------
    output_file : str
        Path to the HTML file to write.
    project_info_file : str
        Path to the file used to display some information about the project in the report.
    x_train : pd.DataFrame, optional
        DataFrame used for training the model.
    y_train: pd.Series or pd.DataFrame, optional
        Series of labels in the training set.
    y_test : pd.Series or pd.DataFrame, optional
        Series of labels in the test set.
    title_story : str, optional
        Report title.
    title_description : str, optional
        Report title description (as written just below the title).
    metrics : list, optional
        Metrics used in the model performance section. The metrics parameter should be a list
        of dict. Each dict contains they following keys :
        'path' (path to the metric function, ex: 'sklearn.metrics.mean_absolute_error'),
        'name' (optional, name of the metric as displayed in the report),
        and 'use_proba_values' (optional, possible values are False (default) or True
        if the metric uses proba values instead of predicted values).
        For example, metrics=[{'name': 'F1 score', 'path': 'sklearn.metrics.f1_score'}]
    working_dir : str, optional
        Working directory in which will be generated the notebook used to create the report
        and where the objects used to execute it will be saved. This parameter can be usefull
        if one wants to create its own custom report and debug the notebook used to generate
        the html report. If None, a temporary directory will be used.
    notebook_path : str, optional
        Path to the notebook used to generate the report. If None, the Shapash base report
        notebook will be used.
    kernel_name : str, optional
        Name of the kernel used to generate the report. This parameter can be usefull if
        you have multiple jupyter kernels and that the method does not use the right kernel
        by default.

    Raises
    ------
    AssertionError
        If the explainer was not compiled beforehand.

    Examples
    --------
    >>> xpl.generate_report(
            output_file='report.html',
            project_info_file='utils/project_info.yml',
            x_train=x_train,
            y_train=y_train,
            y_test=ytest,
            title_story="House prices project report",
            title_description="This document is a data science report of the kaggle house prices project."
            metrics=[
                {
                    'path': 'sklearn.metrics.mean_squared_error',
                    'name': 'Mean squared error',
                },
                {
                    'path': 'sklearn.metrics.mean_absolute_error',
                    'name': 'Mean absolute error',
                }
            ]
        )
    """
    check_report_requirements()
    if x_train is not None:
        x_train = handle_categorical_missing(x_train)
    # Imported lazily to avoid import errors when the report-specific
    # optional requirements are not installed.
    from shapash.report.generation import execute_report, export_and_save_report

    # Check compilation BEFORE creating the temp dir, so a failed
    # precondition does not leak an empty temporary directory.
    if not hasattr(self, "model"):
        raise AssertionError(
            "Explainer object was not compiled. Please compile the explainer "
            "object using .compile(...) method before generating the report."
        )
    rm_working_dir = False
    if not working_dir:
        working_dir = tempfile.mkdtemp()
        rm_working_dir = True
    # try/finally guarantees the temp dir is removed on both the success
    # and the failure path, without duplicating the cleanup code; the
    # exception (if any) propagates unchanged with its original traceback.
    try:
        execute_report(
            working_dir=working_dir,
            explainer=self,
            project_info_file=project_info_file,
            x_train=x_train,
            y_train=y_train,
            y_test=y_test,
            config=dict(
                title_story=title_story,
                title_description=title_description,
                metrics=metrics,
            ),
            notebook_path=notebook_path,
            kernel_name=kernel_name,
        )
        export_and_save_report(working_dir=working_dir, output_file=output_file)
    finally:
        if rm_working_dir:
            shutil.rmtree(working_dir)