{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dictionnary Encoding tutorial\n",
    "\n",
    "This tutorial shows how to use simple python dictionnaries to reverse data preprocessing and display explicit labels\n",
    "\n",
    "Data from Kaggle [Titanic](https://www.kaggle.com/c/titanic)\n",
    "\n",
    "Content :\n",
    "- Encode data with dictionary\n",
    "- Build a Binary Classifier (Random Forest)\n",
    "- Using Shapash\n",
    "- Show inversed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load titanic Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapash.data.data_loader import data_loading\n",
    "\n",
    "titan_df, titan_dict = data_loading('titanic')\n",
    "del titan_df['Name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>Third class</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.25</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.28</td>\n",
       "      <td>Cherbourg</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>Third class</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.92</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Miss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.10</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mrs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>Third class</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.05</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived       Pclass     Sex   Age  SibSp  Parch   Fare  \\\n",
       "PassengerId                                                             \n",
       "1                   0  Third class    male  22.0      1      0   7.25   \n",
       "2                   1  First class  female  38.0      1      0  71.28   \n",
       "3                   1  Third class  female  26.0      0      0   7.92   \n",
       "4                   1  First class  female  35.0      1      0  53.10   \n",
       "5                   0  Third class    male  35.0      0      0   8.05   \n",
       "\n",
       "                Embarked Title  \n",
       "PassengerId                     \n",
       "1            Southampton    Mr  \n",
       "2              Cherbourg   Mrs  \n",
       "3            Southampton  Miss  \n",
       "4            Southampton   Mrs  \n",
       "5            Southampton    Mr  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titan_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare data for the model\n",
    "\n",
    "Create Target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = titan_df['Survived']\n",
    "X = titan_df.drop('Survived', axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build dict tranformation and reversed dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#construct new variable\n",
    "X['new_embarked'] = X.apply(lambda x : 1 if x.Embarked in ['Southampton','Cherbourg'] else 2 if x.Embarked in 'Queenstown' else 3, axis = 1)\n",
    "#Construct the reversed dict\n",
    "transfo_embarked = {'col': 'new_embarked',\n",
    "                    'mapping': pd.Series(data=[1, 2, np.nan], index=['Southampton-Cherbourg', 'Queenstown','missing']),\n",
    "                    'data_type': 'object'}\n",
    "\n",
    "#construct new variable\n",
    "X['new_ages'] = X.apply(lambda x : 1 if x.Age <= 25 else 2 if x.Age <= 40 else 3, axis = 1)\n",
    "#Construct the reversed dict\n",
    "transfo_age = dict()\n",
    "transfo_age = {'col': 'new_ages',\n",
    "                'mapping': pd.Series(data=[1, 2, 3, np.nan], index=['-25 years', '26-40 years', '+40 years','missing']),\n",
    "                'data_type': 'object'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#put transformation into list\n",
    "encoder = [transfo_age,transfo_embarked]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Title</th>\n",
       "      <th>new_embarked</th>\n",
       "      <th>new_ages</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Third class</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.25</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mr</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.28</td>\n",
       "      <td>Cherbourg</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Third class</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.92</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Miss</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>First class</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.10</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Mrs</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Pclass     Sex   Age  SibSp  Parch   Fare     Embarked  \\\n",
       "PassengerId                                                                \n",
       "1            Third class    male  22.0      1      0   7.25  Southampton   \n",
       "2            First class  female  38.0      1      0  71.28    Cherbourg   \n",
       "3            Third class  female  26.0      0      0   7.92  Southampton   \n",
       "4            First class  female  35.0      1      0  53.10  Southampton   \n",
       "\n",
       "            Title  new_embarked  new_ages  \n",
       "PassengerId                                \n",
       "1              Mr             1         1  \n",
       "2             Mrs             1         2  \n",
       "3            Miss             1         2  \n",
       "4             Mrs             1         2  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X[['new_embarked','new_ages','Fare','Parch','Age']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,\n",
       "              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
       "              importance_type='gain', interaction_constraints=None,\n",
       "              learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n",
       "              min_child_weight=2, missing=nan, monotone_constraints=None,\n",
       "              n_estimators=200, n_jobs=0, num_parallel_tree=1,\n",
       "              objective='binary:logistic', random_state=0, reg_alpha=0,\n",
       "              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,\n",
       "              validate_parameters=False, verbosity=None)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=1)\n",
    "\n",
    "clf = XGBClassifier(n_estimators=200,min_child_weight=2).fit(Xtrain,ytrain)\n",
    "clf.fit(Xtrain, ytrain)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Shapash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapash import SmartExplainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "xpl = SmartExplainer(model=clf, preprocessing=encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Backend: Shap TreeExplainer\n"
     ]
    }
   ],
   "source": [
    "xpl.compile(x=Xtest,\n",
    "y_target=ytest, # Optional: allows to display True Values vs Predicted Values\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize data in pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>new_embarked</th>\n",
       "      <th>new_ages</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>Southampton-Cherbourg</td>\n",
       "      <td>+40 years</td>\n",
       "      <td>25.93</td>\n",
       "      <td>0</td>\n",
       "      <td>48.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>Southampton-Cherbourg</td>\n",
       "      <td>26-40 years</td>\n",
       "      <td>7.90</td>\n",
       "      <td>0</td>\n",
       "      <td>29.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>Southampton-Cherbourg</td>\n",
       "      <td>-25 years</td>\n",
       "      <td>10.50</td>\n",
       "      <td>0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>Queenstown</td>\n",
       "      <td>26-40 years</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>29.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      new_embarked     new_ages   Fare  Parch   Age\n",
       "PassengerId                                                        \n",
       "863          Southampton-Cherbourg    +40 years  25.93      0  48.0\n",
       "224          Southampton-Cherbourg  26-40 years   7.90      0  29.5\n",
       "85           Southampton-Cherbourg    -25 years  10.50      0  17.0\n",
       "681                     Queenstown  26-40 years   8.14      0  29.5"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xpl.x_init.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>new_embarked</th>\n",
       "      <th>new_ages</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>25.93</td>\n",
       "      <td>0</td>\n",
       "      <td>48.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>7.90</td>\n",
       "      <td>0</td>\n",
       "      <td>29.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>10.50</td>\n",
       "      <td>0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>8.14</td>\n",
       "      <td>0</td>\n",
       "      <td>29.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             new_embarked  new_ages   Fare  Parch   Age\n",
       "PassengerId                                            \n",
       "863                     1         3  25.93      0  48.0\n",
       "224                     1         2   7.90      0  29.5\n",
       "85                      1         1  10.50      0  17.0\n",
       "681                     2         2   8.14      0  29.5"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xpl.x_encoded.head(4)"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}