{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute Contributions with Shap - Summarize Them With Shapash\n",
    "\n",
    "Shapash uses Shap backend to compute the Shapley contributions <br />\n",
    "in order to satisfy the most hurry users who wish to display <br />\n",
    "results with little lines of code.\n",
    "\n",
    "But we recommend you to refer to the excellent [Shap library](https://github.com/slundberg/shap).\n",
    "\n",
    "This tutorial shows how to use precalculated contributions with Shap in Shapash \n",
    "\n",
    "Contents:\n",
    "- Build a Binary Classifier\n",
    "- Use Shap KernelExplainer\n",
    "- Compile Shapash SmartExplainer\n",
    "- Display local_plot\n",
    "- to_pandas export\n",
    "\n",
    "We used Kaggle's [Titanic](https://www.kaggle.com/c/titanic) dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from category_encoders import OrdinalEncoder\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import shap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapash.data.data_loader import data_loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "titan_df, titan_dict = data_loading('titanic')\n",
    "del titan_df['Name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Third class</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.25</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.28</td>\n",
       "      <td>Cherbourg</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>Third class</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.92</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Miss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.10</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>Third class</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.05</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived       Pclass     Sex   Age  SibSp  Parch   Fare  \\\n",
       "PassengerId                                                             \n",
       "1                   0  Third class    male  22.0      1      0   7.25   \n",
       "2                   1  First class  female  38.0      1      0  71.28   \n",
       "3                   1  Third class  female  26.0      0      0   7.92   \n",
       "4                   1  First class  female  35.0      1      0  53.10   \n",
       "5                   0  Third class    male  35.0      0      0   8.05   \n",
       "\n",
       "                Embarked Title  \n",
       "PassengerId                     \n",
       "1            Southampton    Mr  \n",
       "2              Cherbourg   Mrs  \n",
       "3            Southampton  Miss  \n",
       "4            Southampton   Mrs  \n",
       "5            Southampton    Mr  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titan_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Classification Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = titan_df['Survived']\n",
    "X = titan_df.drop('Survived', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "varcat=['Pclass','Sex','Embarked','Title']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "categ_encoding = OrdinalEncoder(cols=varcat, \\\n",
    "                                handle_unknown='ignore', \\\n",
    "                                return_df=True).fit(X)\n",
    "X = categ_encoding.transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train Test split + Random Forest fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(min_samples_leaf=3)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(min_samples_leaf=3)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "RandomForestClassifier(min_samples_leaf=3)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=1)\n",
    "\n",
    "rf = RandomForestClassifier(n_estimators=100,min_samples_leaf=3)\n",
    "rf.fit(Xtrain, ytrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ypred = pd.DataFrame(rf.predict(Xtest),columns=['pred'],index=Xtest.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use Shapash With Shapley Contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapash import SmartExplainer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Differents ways to compute Shapeley values with Shap"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let Shapash choose the method for you"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO: Shap explainer type - <shap.explainers._exact.ExactExplainer object at 0x7f0c1702dca0>\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ExactExplainer explainer: 224it [00:42,  4.62it/s]                         \n"
     ]
    }
   ],
   "source": [
    "xpl = SmartExplainer(\n",
    "    model=rf,\n",
    "    backend='shap',\n",
    "    preprocessing=categ_encoding,\n",
    "    features_dict=titan_dict\n",
    ")\n",
    "xpl.compile(\n",
    "    y_pred=ypred,\n",
    "    y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n",
    "    x=Xtest\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let Shap choose the method for you and give the masker you want"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO: Shap explainer type - <shap.explainers._exact.ExactExplainer object at 0x7f0c1702d910>\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ExactExplainer explainer: 224it [00:36,  4.36it/s]                         \n"
     ]
    }
   ],
   "source": [
    "xpl = SmartExplainer(\n",
    "    model=rf,\n",
    "    backend='shap',\n",
    "    explainer_args={'model': rf.predict_proba, 'masker': Xtest},\n",
    "    preprocessing=categ_encoding,\n",
    "    features_dict=titan_dict\n",
    ")\n",
    "xpl.compile(\n",
    "    y_pred=ypred,\n",
    "    y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n",
    "    x=Xtest\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tell Shap what do "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO: Shap explainer type - shap.explainers.PermutationExplainer()\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PermutationExplainer explainer: 224it [03:04,  1.14it/s]                         \n"
     ]
    }
   ],
   "source": [
    "xpl = SmartExplainer(\n",
    "    model=rf,\n",
    "    backend='shap',\n",
    "    explainer_args={'explainer': shap.explainers.PermutationExplainer, 'model': rf.predict_proba, 'masker': Xtest},\n",
    "    preprocessing=categ_encoding,\n",
    "    features_dict=titan_dict\n",
    ")\n",
    "xpl.compile(\n",
    "    y_pred=ypred,\n",
    "    y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n",
    "    x=Xtest\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use contributions parameter of compile method to declare Shapley contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PermutationExplainer explainer: 224it [00:23,  5.51it/s]                         \n"
     ]
    }
   ],
   "source": [
    "xpl = SmartExplainer(\n",
    "    model=rf,\n",
    "    preprocessing=categ_encoding,\n",
    "    features_dict=titan_dict\n",
    ")\n",
    "\n",
    "masker = pd.DataFrame(shap.kmeans(Xtest, 50).data, columns=Xtest.columns)\n",
    "explainer = shap.explainers.PermutationExplainer(model=rf.predict_proba, masker=masker)\n",
    "shap_contrib = explainer.shap_values(Xtest)\n",
    "\n",
    "xpl.compile(\n",
    "    contributions=shap_contrib, # Shap Contributions pd.DataFrame\n",
    "    y_pred=ypred,\n",
    "    y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n",
    "    x=Xtest\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Aucun(e)",
  "hide_input": false,
  "kernelspec": {
   "display_name": "myenv_39",
   "language": "python",
   "name": "myenv_39"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "6dbaec60c0b0d722a3fa908c2fd7b738d946da6332c67fea5eea602801fdaf43"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}