{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Dictionary Encoding tutorial\n", "\n", "This tutorial shows how to use simple Python dictionaries to reverse data preprocessing and display explicit labels.\n", "\n", "Data from Kaggle [Titanic](https://www.kaggle.com/c/titanic)\n", "\n", "Content:\n", "- Encode data with a dictionary\n", "- Build a Binary Classifier (XGBoost)\n", "- Using Shapash\n", "- Show inverse-transformed data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from xgboost import XGBClassifier\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Titanic data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from shapash.data.data_loader import data_loading\n", "\n", "titan_df, titan_dict = data_loading('titanic')\n", "del titan_df['Name']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpParchFareEmbarkedTitle
PassengerId
10Third classmale22.0107.25SouthamptonMr
21First classfemale38.01071.28CherbourgMrs
31Third classfemale26.0007.92SouthamptonMiss
41First classfemale35.01053.10SouthamptonMrs
50Third classmale35.0008.05SouthamptonMr
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch Fare \\\n", "PassengerId \n", "1 0 Third class male 22.0 1 0 7.25 \n", "2 1 First class female 38.0 1 0 71.28 \n", "3 1 Third class female 26.0 0 0 7.92 \n", "4 1 First class female 35.0 1 0 53.10 \n", "5 0 Third class male 35.0 0 0 8.05 \n", "\n", " Embarked Title \n", "PassengerId \n", "1 Southampton Mr \n", "2 Cherbourg Mrs \n", "3 Southampton Miss \n", "4 Southampton Mrs \n", "5 Southampton Mr " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titan_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare data for the model\n", "\n", "Create Target" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "y = titan_df['Survived']\n", "X = titan_df.drop('Survived', axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Build dict transformation and reversed dict" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Encode the port of embarkation: 1 = Southampton or Cherbourg, 2 = Queenstown,\n", "# NaN = anything else, so that an unknown port decodes to the 'missing' label below.\n", "# The Queenstown test is an equality: `in 'Queenstown'` would be a substring test.\n", "X['new_embarked'] = X.apply(lambda x : 1 if x.Embarked in ['Southampton','Cherbourg'] else 2 if x.Embarked == 'Queenstown' else np.nan, axis = 1)\n", "# Reversed dict: the index holds the displayed label, the data holds the encoded value\n", "transfo_embarked = {'col': 'new_embarked',\n", "                    'mapping': pd.Series(data=[1, 2, np.nan], index=['Southampton-Cherbourg', 'Queenstown','missing']),\n", "                    'data_type': 'object'}\n", "\n", "# Bucket the age: 1 = up to 25, 2 = up to 40, 3 = above 40.\n", "# A NaN age fails every comparison and falls through to NaN ('missing' label below).\n", "X['new_ages'] = X.apply(lambda x : 1 if x.Age <= 25 else 2 if x.Age <= 40 else 3 if x.Age > 40 else np.nan, axis = 1)\n", "# Reversed dict: the index holds the displayed label, the data holds the encoded value\n", "transfo_age = {'col': 'new_ages',\n", "               'mapping': pd.Series(data=[1, 2, 3, np.nan], index=['-25 years', '26-40 years', '+40 years','missing']),\n", "               'data_type': 'object'}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Put the transformations into the list given to SmartExplainer as preprocessing\n", "encoder = [transfo_age, transfo_embarked]" ] }, { 
"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchFareEmbarkedTitlenew_embarkednew_ages
PassengerId
1Third classmale22.0107.25SouthamptonMr11
2First classfemale38.01071.28CherbourgMrs12
3Third classfemale26.0007.92SouthamptonMiss12
4First classfemale35.01053.10SouthamptonMrs12
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Fare Embarked \\\n", "PassengerId \n", "1 Third class male 22.0 1 0 7.25 Southampton \n", "2 First class female 38.0 1 0 71.28 Cherbourg \n", "3 Third class female 26.0 0 0 7.92 Southampton \n", "4 First class female 35.0 1 0 53.10 Southampton \n", "\n", " Title new_embarked new_ages \n", "PassengerId \n", "1 Mr 1 1 \n", "2 Mrs 1 2 \n", "3 Miss 1 2 \n", "4 Mrs 1 2 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fit a model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "X = X[['new_embarked','new_ages','Fare','Parch','Age']]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,\n", " colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n", " importance_type='gain', interaction_constraints=None,\n", " learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n", " min_child_weight=2, missing=nan, monotone_constraints=None,\n", " n_estimators=200, n_jobs=0, num_parallel_tree=1,\n", " objective='binary:logistic', random_state=0, reg_alpha=0,\n", " reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,\n", " validate_parameters=False, verbosity=None)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=1)\n", "\n", "# Build the estimator first, then fit it exactly once; the fitted model is the cell output.\n", "clf = XGBClassifier(n_estimators=200, min_child_weight=2)\n", "clf.fit(Xtrain, ytrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using Shapash" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from shapash import SmartExplainer" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, 
"outputs": [], "source": [ "xpl = SmartExplainer(model=clf, preprocessing=encoder)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Backend: Shap TreeExplainer\n" ] } ], "source": [ "xpl.compile(\n", "    x=Xtest,\n", "    y_target=ytest,  # Optional: allows to display True Values vs Predicted Values\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualize data in pandas" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
new_embarkednew_agesFareParchAge
PassengerId
863Southampton-Cherbourg+40 years25.93048.0
224Southampton-Cherbourg26-40 years7.90029.5
85Southampton-Cherbourg-25 years10.50017.0
681Queenstown26-40 years8.14029.5
\n", "
" ], "text/plain": [ " new_embarked new_ages Fare Parch Age\n", "PassengerId \n", "863 Southampton-Cherbourg +40 years 25.93 0 48.0\n", "224 Southampton-Cherbourg 26-40 years 7.90 0 29.5\n", "85 Southampton-Cherbourg -25 years 10.50 0 17.0\n", "681 Queenstown 26-40 years 8.14 0 29.5" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xpl.x_init.head(4)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
new_embarkednew_agesFareParchAge
PassengerId
8631325.93048.0
224127.90029.5
851110.50017.0
681228.14029.5
\n", "
" ], "text/plain": [ " new_embarked new_ages Fare Parch Age\n", "PassengerId \n", "863 1 3 25.93 0 48.0\n", "224 1 2 7.90 0 29.5\n", "85 1 1 10.50 0 17.0\n", "681 2 2 8.14 0 29.5" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xpl.x_encoded.head(4)" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" } }, "nbformat": 4, "nbformat_minor": 4 }