{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dictionary Encoding tutorial\n",
"\n",
"This tutorial shows how to use simple Python dictionaries to reverse data preprocessing and display explicit labels.\n",
"\n",
"Data from Kaggle [Titanic](https://www.kaggle.com/c/titanic)\n",
"\n",
"Content :\n",
"- Encode data with dictionary\n",
"- Build a Binary Classifier (XGBoost)\n",
"- Using Shapash\n",
"- Show inverse-transformed data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from xgboost import XGBClassifier\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Titanic data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from shapash.data.data_loader import data_loading\n",
"\n",
"# Load the Titanic frame and its column-label dictionary, then discard the\n",
"# free-text 'Name' column without mutating the loaded frame in place.\n",
"titan_df, titan_dict = data_loading('titanic')\n",
"titan_df = titan_df.drop(columns=['Name'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Survived | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Fare | \n",
" Embarked | \n",
" Title | \n",
"
\n",
" \n",
" PassengerId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0 | \n",
" Third class | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.25 | \n",
" Southampton | \n",
" Mr | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" First class | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.28 | \n",
" Cherbourg | \n",
" Mrs | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" Third class | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.92 | \n",
" Southampton | \n",
" Miss | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" First class | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.10 | \n",
" Southampton | \n",
" Mrs | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" Third class | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.05 | \n",
" Southampton | \n",
" Mr | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Parch Fare \\\n",
"PassengerId \n",
"1 0 Third class male 22.0 1 0 7.25 \n",
"2 1 First class female 38.0 1 0 71.28 \n",
"3 1 Third class female 26.0 0 0 7.92 \n",
"4 1 First class female 35.0 1 0 53.10 \n",
"5 0 Third class male 35.0 0 0 8.05 \n",
"\n",
" Embarked Title \n",
"PassengerId \n",
"1 Southampton Mr \n",
"2 Cherbourg Mrs \n",
"3 Southampton Miss \n",
"4 Southampton Mrs \n",
"5 Southampton Mr "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"titan_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare data for the model\n",
"\n",
"Create Target"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Separate the explanatory features from the label column\n",
"X = titan_df.drop(columns=['Survived'])\n",
"y = titan_df['Survived']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Build dict transformation and reversed dict"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Encode the port of embarkation as an integer code (vectorized, no row-wise apply).\n",
"# Exact comparisons replace the former `x.Embarked in 'Queenstown'` test, which was a\n",
"# substring check: it also matched '' or 'Q' and raised TypeError on non-string values.\n",
"# Any value that is neither of the three known ports (including NaN) gets code 3.\n",
"X['new_embarked'] = np.select(\n",
"    [X['Embarked'].isin(['Southampton', 'Cherbourg']), X['Embarked'] == 'Queenstown'],\n",
"    [1, 2],\n",
"    default=3,\n",
")\n",
"# Reversed dict for 'new_embarked': the Series index holds the explicit label and the\n",
"# Series value holds the integer code it stands for.\n",
"# NOTE: the fallback code 3 has no label in this mapping.\n",
"transfo_embarked = {'col': 'new_embarked',\n",
"                    'mapping': pd.Series(data=[1, 2, np.nan], index=['Southampton-Cherbourg', 'Queenstown', 'missing']),\n",
"                    'data_type': 'object'}\n",
"\n",
"# Bin the age into three integer classes: Age <= 25 -> 1, Age <= 40 -> 2, otherwise 3.\n",
"# A NaN age fails both comparisons and therefore lands in class 3, as it did before.\n",
"X['new_ages'] = np.select([X['Age'] <= 25, X['Age'] <= 40], [1, 2], default=3)\n",
"# Reversed dict for 'new_ages' (label in the index, integer code as the value).\n",
"transfo_age = {'col': 'new_ages',\n",
"               'mapping': pd.Series(data=[1, 2, 3, np.nan], index=['-25 years', '26-40 years', '+40 years', 'missing']),\n",
"               'data_type': 'object'}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Gather every reversed dict into the single list handed over as preprocessing\n",
"encoder = [\n",
"    transfo_age,\n",
"    transfo_embarked,\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Fare | \n",
" Embarked | \n",
" Title | \n",
" new_embarked | \n",
" new_ages | \n",
"
\n",
" \n",
" PassengerId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Third class | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.25 | \n",
" Southampton | \n",
" Mr | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" First class | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.28 | \n",
" Cherbourg | \n",
" Mrs | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" Third class | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.92 | \n",
" Southampton | \n",
" Miss | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" First class | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.10 | \n",
" Southampton | \n",
" Mrs | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pclass Sex Age SibSp Parch Fare Embarked \\\n",
"PassengerId \n",
"1 Third class male 22.0 1 0 7.25 Southampton \n",
"2 First class female 38.0 1 0 71.28 Cherbourg \n",
"3 Third class female 26.0 0 0 7.92 Southampton \n",
"4 First class female 35.0 1 0 53.10 Southampton \n",
"\n",
" Title new_embarked new_ages \n",
"PassengerId \n",
"1 Mr 1 1 \n",
"2 Mrs 1 2 \n",
"3 Miss 1 2 \n",
"4 Mrs 1 2 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head(4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fit a model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the columns the model is trained on (the two encoded features plus raw numerics)\n",
"feature_cols = ['new_embarked', 'new_ages', 'Fare', 'Parch', 'Age']\n",
"X = X[feature_cols]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints=None,\n",
" learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n",
" min_child_weight=2, missing=nan, monotone_constraints=None,\n",
" n_estimators=200, n_jobs=0, num_parallel_tree=1,\n",
" objective='binary:logistic', random_state=0, reg_alpha=0,\n",
" reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,\n",
" validate_parameters=False, verbosity=None)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hold out 25% of the passengers for the explanation step; the split is seeded for reproducibility.\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=1)\n",
"\n",
"# Build the classifier, then fit it exactly once (it was previously trained twice on the same data).\n",
"# The fit call is kept as the last expression so the cell still displays the fitted model.\n",
"clf = XGBClassifier(n_estimators=200, min_child_weight=2)\n",
"clf.fit(Xtrain, ytrain)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using Shapash"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from shapash import SmartExplainer"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Hand the fitted model and the list of reversed dicts to the explainer\n",
"xpl = SmartExplainer(\n",
"    model=clf,\n",
"    preprocessing=encoder,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Backend: Shap TreeExplainer\n"
]
}
],
"source": [
"xpl.compile(\n",
"    x=Xtest,\n",
"    y_target=ytest,  # Optional: allows to display True Values vs Predicted Values\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize data in pandas"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" new_embarked | \n",
" new_ages | \n",
" Fare | \n",
" Parch | \n",
" Age | \n",
"
\n",
" \n",
" PassengerId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 863 | \n",
" Southampton-Cherbourg | \n",
" +40 years | \n",
" 25.93 | \n",
" 0 | \n",
" 48.0 | \n",
"
\n",
" \n",
" 224 | \n",
" Southampton-Cherbourg | \n",
" 26-40 years | \n",
" 7.90 | \n",
" 0 | \n",
" 29.5 | \n",
"
\n",
" \n",
" 85 | \n",
" Southampton-Cherbourg | \n",
" -25 years | \n",
" 10.50 | \n",
" 0 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 681 | \n",
" Queenstown | \n",
" 26-40 years | \n",
" 8.14 | \n",
" 0 | \n",
" 29.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" new_embarked new_ages Fare Parch Age\n",
"PassengerId \n",
"863 Southampton-Cherbourg +40 years 25.93 0 48.0\n",
"224 Southampton-Cherbourg 26-40 years 7.90 0 29.5\n",
"85 Southampton-Cherbourg -25 years 10.50 0 17.0\n",
"681 Queenstown 26-40 years 8.14 0 29.5"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xpl.x_init.head(4)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" new_embarked | \n",
" new_ages | \n",
" Fare | \n",
" Parch | \n",
" Age | \n",
"
\n",
" \n",
" PassengerId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 863 | \n",
" 1 | \n",
" 3 | \n",
" 25.93 | \n",
" 0 | \n",
" 48.0 | \n",
"
\n",
" \n",
" 224 | \n",
" 1 | \n",
" 2 | \n",
" 7.90 | \n",
" 0 | \n",
" 29.5 | \n",
"
\n",
" \n",
" 85 | \n",
" 1 | \n",
" 1 | \n",
" 10.50 | \n",
" 0 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 681 | \n",
" 2 | \n",
" 2 | \n",
" 8.14 | \n",
" 0 | \n",
" 29.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" new_embarked new_ages Fare Parch Age\n",
"PassengerId \n",
"863 1 3 25.93 0 48.0\n",
"224 1 2 7.90 0 29.5\n",
"85 1 1 10.50 0 17.0\n",
"681 2 2 8.14 0 29.5"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xpl.x_encoded.head(4)"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}