Feature Engineering – Rahul Saraf

Import public packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split, cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Import private packages

# Environment detection: the notebook is treated as running on Kaggle when the
# `kaggle_secrets` module has already been loaded into the interpreter.
is_kaggle = 'kaggle_secrets' in sys.modules

if is_kaggle:
    # On Kaggle: copy the API credentials from Kaggle secrets into the
    # environment and install the private `aiking` package from GitHub.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    # IPython shell escape (not plain Python); the personal access token is
    # interpolated into the install URL.
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    # Elsewhere: `aiking` is expected to be installed already; fetch the
    # competition data through its `untar_data` helper and list the files.
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::titanic"); 
    print(path.ls())

[Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]

from aiking.ml.structured import *

Read the Dataset

data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input")); 
path = data_dir/"titanic"
path.ls()

(#3) [Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]

df_train = pd.read_csv(path/"train.csv"); df_train.head()
df_test = pd.read_csv(path/"test.csv"); df_test.head()

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

EDA

display(df_train.describe(include='number').T, df_test.describe(include='number').T)

	count	mean	std	min	25%	50%	75%	max
PassengerId	891.0	446.000000	257.353842	1.00	223.5000	446.0000	668.5	891.0000
Survived	891.0	0.383838	0.486592	0.00	0.0000	0.0000	1.0	1.0000
Pclass	891.0	2.308642	0.836071	1.00	2.0000	3.0000	3.0	3.0000
Age	714.0	29.699118	14.526497	0.42	20.1250	28.0000	38.0	80.0000
SibSp	891.0	0.523008	1.102743	0.00	0.0000	0.0000	1.0	8.0000
Parch	891.0	0.381594	0.806057	0.00	0.0000	0.0000	0.0	6.0000
Fare	891.0	32.204208	49.693429	0.00	7.9104	14.4542	31.0	512.3292

	count	mean	std	min	25%	50%	75%	max
PassengerId	418.0	1100.500000	120.810458	892.00	996.2500	1100.5000	1204.75	1309.0000
Pclass	418.0	2.265550	0.841838	1.00	1.0000	3.0000	3.00	3.0000
Age	332.0	30.272590	14.181209	0.17	21.0000	27.0000	39.00	76.0000
SibSp	418.0	0.447368	0.896760	0.00	0.0000	0.0000	1.00	8.0000
Parch	418.0	0.392344	0.981429	0.00	0.0000	0.0000	0.00	9.0000
Fare	417.0	35.627188	55.907576	0.00	7.8958	14.4542	31.50	512.3292

display(df_train.describe(include='object').T, df_test.describe(include='object').T)

	count	unique	top	freq
Name	891	891	Braund, Mr. Owen Harris	1
Sex	891	2	male	577
Ticket	891	681	347082	7
Cabin	204	147	B96 B98	4
Embarked	889	3	S	644

	count	unique	top	freq
Name	418	418	Kelly, Mr. James	1
Sex	418	2	male	266
Ticket	418	363	PC 17608	5
Cabin	91	76	B57 B59 B63 B66	3
Embarked	418	3	S	270

display(df_train['Cabin'].str[0], df_train['Cabin'].str[1:])

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object

0      NaN
1       85
2      NaN
3      123
4      NaN
      ... 
886    NaN
887     42
888    NaN
889    148
890    NaN
Name: Cabin, Length: 891, dtype: object

[i.rsplit(" ", -1) for i in df_train['Ticket'].tolist()[:5]]

[['A/5', '21171'],
 ['PC', '17599'],
 ['STON/O2.', '3101282'],
 ['113803'],
 ['373450']]

def get_ticket_features(row):
    """Split a tokenised ticket into ``[prefix, number]``.

    Parameters
    ----------
    row : list of str
        The space-separated tokens of one ticket, e.g. ``['A/5', '21171']``.
        Anything that is not a non-empty list/tuple (for instance the ``NaN``
        produced for a missing ticket) is treated as a missing ticket.

    Returns
    -------
    list
        A two-element list ``[prefix, number]``:

        * if the last token is numeric, ``number`` is that token as ``int``
          and ``prefix`` is the remaining tokens joined by a space
          (``pd.NA`` when there are none);
        * otherwise ``number`` is ``pd.NA`` and ``prefix`` is the whole
          ticket text.
    """
    if not isinstance(row, (list, tuple)) or not row:
        return [pd.NA, pd.NA]

    *head, last = row
    # ``isdecimal`` accepts exactly the strings ``int`` can parse, so the
    # conversion below cannot raise.
    if last.isdecimal():
        # Keep every leading token in the prefix so tickets with three or more
        # parts (e.g. "STON/O 2. 3101294") no longer lose their number.
        prefix = " ".join(head) if head else pd.NA
        return [prefix, int(last)]

    # No trailing number (e.g. "LINE"): the whole text is the prefix.
    return [" ".join(row), pd.NA]

def expand_ticket(X):
    """Expand a ticket column into ``prefix_ticket`` / ``num_ticket`` columns.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        A single-column frame (or a series) of raw ticket strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns ``prefix_ticket`` and
        ``num_ticket``, as produced by ``get_ticket_features``.
        NOTE(review): the result carries a fresh default index rather than
        ``X``'s index - confirm this is what ``ColExpanderTransform`` expects.
    """
    # A bare ``squeeze()`` turns a 1x1 frame into a scalar, which has no
    # ``.str`` accessor; squeezing only the column axis always yields a Series.
    tickets = X.squeeze(axis=1) if isinstance(X, pd.DataFrame) else X
    parsed = tickets.str.rsplit(" ").apply(get_ticket_features)
    return pd.DataFrame(parsed.tolist(), columns=['prefix_ticket', 'num_ticket'])
    

# def split_col(X, splitter=" "): return X.squeeze().str.split(splitter, expand=True).apply(pd.to_numeric, errors='ignore', downcast='integer')
# split_col(df_train[['Ticket']], splitter=' ')

# expand_ticket(df_train[['Ticket']])



ticket_transformer = ColExpanderTransform(names=['prefix_ticket', 'num_ticket'], func=expand_ticket, func_kw_args={})
display(ticket_transformer.fit_transform(df_train[['Ticket']]), ticket_transformer.get_feature_names())

	prefix_ticket	num_ticket
0	A/5	21171
1	PC	17599
2	STON/O2.	3101282
3	<NA>	113803
4	<NA>	373450
...	...	...
886	<NA>	211536
887	<NA>	112053
888	W./C.	6607
889	<NA>	111369
890	<NA>	370376

891 rows × 2 columns

['prefix_ticket', 'num_ticket']

df_train['Name'][1]

'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

feature_specs = {
    'Ticket':(ColExpanderTransform, {'names':['prefix_ticket', 'num_ticket'], 'func':expand_ticket, 'func_kw_args':{}}), 
}
gen_feature_layer(df_train, feature_specs)

[('Ticket', <aiking.ml.structured.ColExpanderTransform>, {}),
 ('PassengerId', None),
 ('Survived', None),
 ('Pclass', None),
 ('Name', None),
 ('Sex', None),
 ('Age', None),
 ('SibSp', None),
 ('Parch', None),
 ('Fare', None),
 ('Cabin', None),
 ('Embarked', None)]

Modelling

Define Pipeline

def get_model_pipeline(max_n_cat=0, 
                       cat_dict=None, 
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class':NumericalEncoder,'categories':None},
                       cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                      ):
    """Build an unfitted preprocessing + random-forest pipeline.

    The preprocessing step is a ``Proc`` made of two layer specs: one that
    expands ``Ticket`` through ``ColExpanderTransform``/``expand_ticket``, and
    one that delegates to ``get_default_feature_def`` with the options below.

    Parameters
    ----------
    max_n_cat : int
        Forwarded as ``'max_n_cat'`` to ``get_default_feature_def``.
    cat_dict : dict or None
        Forwarded unchanged as ``'cat_dict'``.
    scale_dict, cat_num_dict, cat_dummy_dict, imputer_dict : dict
        Component specs (a ``'class'`` entry plus keyword options) forwarded
        under the same keys. Each is shallow-copied before being handed on.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline(proc, RandomForestClassifier(oob_score=True, n_jobs=-1))``.
    """
    def _own(spec):
        # The defaults are module-lifetime dicts shared by every call; pass
        # copies on so downstream code can never alter them for later calls.
        return None if spec is None else dict(spec)

    ticket_spec = (ColExpanderTransform,
                   {'names': ['prefix_ticket', 'num_ticket'],
                    'func': expand_ticket,
                    'func_kw_args': {}})
    layer_spec_preprocess = (gen_feature_layer,
                             {'feature_specs': {'Ticket': ticket_spec}})
    layer_spec_default = (get_default_feature_def,
                          {
                              'skip_flds': None,
                              'ignored_flds': None,
                              'max_n_cat': max_n_cat,
                              'na_exclude_cols': [],
                              'scale_var_num': True,
                              'scale_var_cat': False,
                              'scale_dict': _own(scale_dict),
                              'cat_num_dict': _own(cat_num_dict),
                              'cat_dummy_dict': _own(cat_dummy_dict),
                              'imputer_dict': _own(imputer_dict),
                              'include_time_cols': True,
                              'keep_dt_cols': False,
                              'cat_dict': cat_dict,
                          })

    proc = Proc(layer_specs=[layer_spec_preprocess, layer_spec_default])
    model = RandomForestClassifier(oob_score=True, n_jobs=-1)
    return make_pipeline(proc, model)

pipeline = get_model_pipeline(cat_dict=None); pipeline

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2927bae80>),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=-1, oob_score=True))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Train on Partial Data

max_n_cat = 5

def get_xy(df, col='Survived'):
    """Split ``df`` into features and target.

    Returns a tuple ``(X, y)`` where ``X`` is ``df`` without column ``col``
    and ``y`` is the column ``col`` itself. ``df`` is not modified.
    """
    features = df.drop(labels=[col], axis='columns')
    target = df[col]
    return features, target
    
X, y = get_xy(df_train)

pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x29356ff40>),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=-1, oob_score=True))])

get_scorer_dict(scorer_names=['accuracy', 'precision', 'recall', 'roc_auc'])

{'accuracy': make_scorer(accuracy_score),
 'precision': make_scorer(precision_score, average=binary),
 'recall': make_scorer(recall_score, average=binary),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}

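The out-of-bag (OOB) score below gives a quick indication of the model's held-out accuracy. (The original note here read "estimate of msle around .26 to .30 [really 0.304 from validation estimate]"; MSLE is a regression metric that is not computed anywhere in this notebook, so that figure appears to be left over from a different notebook.)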

m = pipeline['randomforestclassifier']
m.oob_score_

0.8372615039281706

Cross validation estimate

# Start from a fresh, unfitted pipeline so nothing fitted earlier is reused.
pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
# scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
# Cross-validate with four metrics and keep the train-side scores as well, so
# the gap between train and test can be inspected. The result dict is wrapped
# in a DataFrame with one row per fold.
scores_df = pd.DataFrame(cross_validate(pipeline, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df

	fit_time	score_time	test_accuracy	train_accuracy	test_precision	train_precision	test_recall	train_recall	test_roc_auc	train_roc_auc
0	1.506517	0.046242	0.737430	1.0	0.739130	1.0	0.492754	1.0	0.837022	1.0
1	0.084883	0.039459	0.797753	1.0	0.735294	1.0	0.735294	1.0	0.805548	1.0
2	0.085794	0.038840	0.842697	1.0	0.785714	1.0	0.808824	1.0	0.917647	1.0
3	0.087844	0.038950	0.808989	1.0	0.814815	1.0	0.647059	1.0	0.888971	1.0
4	0.087772	0.038551	0.837079	1.0	0.803030	1.0	0.768116	1.0	0.881000	1.0

# m = pipeline['randomforestclassifier']
cv_scores = cross_val_score(pipeline, X, y, cv=5)
cv_scores

array([0.83798883, 0.79213483, 0.87640449, 0.82022472, 0.86516854])

scores_df.plot()

# pipeline = get_model_pipeline(cat_dict); pipeline
y_pred = cross_val_predict(pipeline, X, y)
# cm = confusion_matrix(y, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()
ConfusionMatrixDisplay.from_predictions(y, y_pred)

# Rough accuracy band: mean of the per-fold test accuracy plus/minus three
# standard deviations, printed as percentages.
scores = scores_df['test_accuracy']
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")

Expected Scores 67.85% to 93.10% with mean as 80.48%

Single tree model

max_n_cat = 5


def get_model_pipeline2(model, max_n_cat=0, 
                       cat_dict=None, 
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class':NumericalEncoder,'categories':None},
                       cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                        
                      ):
    """Build an unfitted preprocessing pipeline around a caller-supplied model.

    The preprocessing step is a ``Proc`` made of two layer specs: one that
    expands ``Ticket`` through ``ColExpanderTransform``/``expand_ticket``, and
    one that delegates to ``get_default_feature_def`` with the options below.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline; it is used as given (not copied).
    max_n_cat : int
        Forwarded as ``'max_n_cat'`` to ``get_default_feature_def``.
    cat_dict : dict or None
        Forwarded unchanged as ``'cat_dict'``.
    scale_dict, cat_num_dict, cat_dummy_dict, imputer_dict : dict
        Component specs (a ``'class'`` entry plus keyword options) forwarded
        under the same keys. Each is shallow-copied before being handed on.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline(proc, model)``.
    """
    def _own(spec):
        # The defaults are module-lifetime dicts shared by every call; pass
        # copies on so downstream code can never alter them for later calls.
        return None if spec is None else dict(spec)

    ticket_spec = (ColExpanderTransform,
                   {'names': ['prefix_ticket', 'num_ticket'],
                    'func': expand_ticket,
                    'func_kw_args': {}})
    layer_spec_preprocess = (gen_feature_layer,
                             {'feature_specs': {'Ticket': ticket_spec}})
    layer_spec_default = (get_default_feature_def,
                          {
                              'skip_flds': None,
                              'ignored_flds': None,
                              'max_n_cat': max_n_cat,
                              'na_exclude_cols': [],
                              'scale_var_num': True,
                              'scale_var_cat': False,
                              'scale_dict': _own(scale_dict),
                              'cat_num_dict': _own(cat_num_dict),
                              'cat_dummy_dict': _own(cat_dummy_dict),
                              'imputer_dict': _own(imputer_dict),
                              'include_time_cols': True,
                              'keep_dt_cols': False,
                              'cat_dict': cat_dict,
                          })

    proc = Proc(layer_specs=[layer_spec_preprocess, layer_spec_default])
    return make_pipeline(proc, model)

def get_xy(df, col='Survived'):
    """Return ``(X, y)``: ``df`` minus column ``col``, and column ``col`` itself."""
    without_target = df.drop(columns=[col])
    return without_target, df[col]
    
X, y = get_xy(df_train)
model = RandomForestClassifier(n_estimators=1, bootstrap=False, max_depth=3, oob_score=False, n_jobs=-1)
pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
pipeline2.fit(X, y)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a25fb850>),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, max_depth=3,
                                        n_estimators=1, n_jobs=-1))])

pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
X, y = get_xy(df_train)
scores_df = pd.DataFrame(cross_validate(pipeline2, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df

	fit_time	score_time	test_accuracy	train_accuracy	test_precision	train_precision	test_recall	train_recall	test_roc_auc	train_roc_auc
0	0.039830	0.025932	0.759777	0.800562	0.964286	0.945578	0.391304	0.509158	0.829117	0.832578
1	0.034812	0.024542	0.679775	0.730715	0.720000	0.741176	0.264706	0.459854	0.624733	0.749077
2	0.036440	0.025795	0.730337	0.812062	0.604167	0.751799	0.852941	0.762774	0.838770	0.851728
3	0.034458	0.024143	0.769663	0.823282	0.754717	0.780303	0.588235	0.751825	0.849799	0.862295
4	0.036281	0.025915	0.797753	0.799439	0.789474	0.773109	0.652174	0.673993	0.806010	0.833879

pipeline2['proc'].fit_transform(X)

	Ticket_prefix_ticket_nan	Ticket_num_ticket_nan	Age_nan	Cabin_nan	Ticket_prefix_ticket	Ticket_num_ticket	Name	Cabin	Sex_0	Sex_1	Embarked_0	Embarked_1	Embarked_2	Embarked_3	PassengerId	Pclass	Age	SibSp	Parch	Fare
0	False	False	False	True	5	225	109	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.730108	0.827377	-0.565736	0.432793	-0.473674	-0.502445
1	False	False	False	False	19	193	191	82	1.0	0.0	1.0	0.0	0.0	0.0	-1.726220	-1.566107	0.663861	0.432793	-0.473674	0.786845
2	False	False	False	True	38	651	354	0	1.0	0.0	0.0	0.0	1.0	0.0	-1.722332	0.827377	-0.258337	-0.474545	-0.473674	-0.488854
3	True	False	False	False	0	348	273	56	1.0	0.0	0.0	0.0	1.0	0.0	-1.718444	-1.566107	0.433312	0.432793	-0.473674	0.420730
4	True	False	False	True	0	618	16	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.714556	0.827377	0.433312	-0.474545	-0.473674	-0.486337
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	True	False	False	True	0	352	549	0	0.0	1.0	0.0	0.0	1.0	0.0	1.714556	-0.369365	-0.181487	-0.474545	-0.473674	-0.386671
887	True	False	False	False	0	313	304	31	1.0	0.0	0.0	0.0	1.0	0.0	1.718444	-1.566107	-0.796286	-0.474545	-0.473674	-0.044381
888	False	False	True	True	40	106	414	0	1.0	0.0	0.0	0.0	1.0	0.0	1.722332	0.827377	-0.104637	0.432793	2.008933	-0.176263
889	True	False	False	False	0	307	82	61	0.0	1.0	1.0	0.0	0.0	0.0	1.726220	-1.566107	-0.258337	-0.474545	-0.473674	-0.044381
890	True	False	False	True	0	612	221	0	0.0	1.0	0.0	1.0	0.0	0.0	1.730108	0.827377	0.202762	-0.474545	-0.473674	-0.492378

891 rows × 20 columns

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	Cumings, Mrs. John Bradley (Florence Briggs Thayer)	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...
886	887	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q

891 rows × 11 columns

scores_df.mean()

fit_time           0.036364
score_time         0.025265
test_accuracy      0.747461
train_accuracy     0.793212
test_precision     0.766529
train_precision    0.798393
test_recall        0.549872
train_recall       0.631521
test_roc_auc       0.789686
train_roc_auc      0.825911
dtype: float64

pipeline2['proc'].transform(X)

	Ticket_prefix_ticket_nan	Ticket_num_ticket_nan	Age_nan	Cabin_nan	Ticket_prefix_ticket	Ticket_num_ticket	Name	Cabin	Sex_0	Sex_1	Embarked_0	Embarked_1	Embarked_2	Embarked_3	PassengerId	Pclass	Age	SibSp	Parch	Fare
0	False	False	False	True	5	225	109	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.730108	0.827377	-0.565736	0.432793	-0.473674	-0.502445
1	False	False	False	False	19	193	191	82	1.0	0.0	1.0	0.0	0.0	0.0	-1.726220	-1.566107	0.663861	0.432793	-0.473674	0.786845
2	False	False	False	True	38	651	354	0	1.0	0.0	0.0	0.0	1.0	0.0	-1.722332	0.827377	-0.258337	-0.474545	-0.473674	-0.488854
3	True	False	False	False	0	348	273	56	1.0	0.0	0.0	0.0	1.0	0.0	-1.718444	-1.566107	0.433312	0.432793	-0.473674	0.420730
4	True	False	False	True	0	618	16	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.714556	0.827377	0.433312	-0.474545	-0.473674	-0.486337
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	True	False	False	True	0	352	549	0	0.0	1.0	0.0	0.0	1.0	0.0	1.714556	-0.369365	-0.181487	-0.474545	-0.473674	-0.386671
887	True	False	False	False	0	313	304	31	1.0	0.0	0.0	0.0	1.0	0.0	1.718444	-1.566107	-0.796286	-0.474545	-0.473674	-0.044381
888	False	False	True	True	40	106	414	0	1.0	0.0	0.0	0.0	1.0	0.0	1.722332	0.827377	-0.104637	0.432793	2.008933	-0.176263
889	True	False	False	False	0	307	82	61	0.0	1.0	1.0	0.0	0.0	0.0	1.726220	-1.566107	-0.258337	-0.474545	-0.473674	-0.044381
890	True	False	False	True	0	612	221	0	0.0	1.0	0.0	1.0	0.0	0.0	1.730108	0.827377	0.202762	-0.474545	-0.473674	-0.492378

891 rows × 20 columns

feature_names = pipeline2['proc'].transform(X).columns.tolist(); feature_names

['Ticket_prefix_ticket_nan',
 'Ticket_num_ticket_nan',
 'Age_nan',
 'Cabin_nan',
 'Ticket_prefix_ticket',
 'Ticket_num_ticket',
 'Name',
 'Cabin',
 'Sex_0',
 'Sex_1',
 'Embarked_0',
 'Embarked_1',
 'Embarked_2',
 'Embarked_3',
 'PassengerId',
 'Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare']

from sklearn import tree

# tree.plot_tree(model.estimators_[0], filled=True, feature_names=feature_names, fontsize=8);

# tree.plot_tree?

# pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
pipeline2['proc'].transform(X).head().T

	0	1	2	3	4
Ticket_prefix_ticket_nan	False	False	False	True	True
Ticket_num_ticket_nan	False	False	False	False	False
Age_nan	False	False	False	False	False
Cabin_nan	True	False	True	False	True
Ticket_prefix_ticket	5	19	38	0	0
Ticket_num_ticket	225	193	651	348	618
Name	109	191	354	273	16
Cabin	0	82	0	56	0
Sex_0	0.0	1.0	1.0	1.0	0.0
Sex_1	1.0	0.0	0.0	0.0	1.0
Embarked_0	0.0	1.0	0.0	0.0	0.0
Embarked_1	0.0	0.0	0.0	0.0	0.0
Embarked_2	1.0	0.0	1.0	1.0	1.0
Embarked_3	0.0	0.0	0.0	0.0	0.0
PassengerId	-1.730108	-1.72622	-1.722332	-1.718444	-1.714556
Pclass	0.827377	-1.566107	0.827377	-1.566107	0.827377
Age	-0.565736	0.663861	-0.258337	0.433312	0.433312
SibSp	0.432793	0.432793	-0.474545	0.432793	-0.474545
Parch	-0.473674	-0.473674	-0.473674	-0.473674	-0.473674
Fare	-0.502445	0.786845	-0.488854	0.42073	-0.486337

from sklearn.tree import export_graphviz
import IPython, graphviz
def draw_tree(t, df, size=10, ratio=0.6, precision=3):
    """Display one fitted decision tree inline as a Graphviz diagram.

    Parameters
    ----------
    t : fitted tree estimator
        Passed to ``export_graphviz`` (e.g. one element of a forest's
        ``estimators_``).
    df : pandas.DataFrame
        Only ``df.columns`` is used, as the feature names shown in the nodes.
    size, ratio :
        Written into the DOT source as the graph's ``size`` and ``ratio``
        attributes.
    precision : int
        Forwarded to ``export_graphviz``.
    """
    dot = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                          special_characters=True, rotate=True, precision=precision)
    # Inject the layout attributes right after the graph header. This is a
    # literal substitution, so plain ``str.replace`` suffices and the function
    # no longer depends on ``re`` being in scope via a star import.
    dot = dot.replace('Tree {', f'Tree {{ size={size}; ratio={ratio}')
    IPython.display.display(graphviz.Source(dot))

draw_tree(model.estimators_[0], pipeline2['proc'].transform(X))

Deep Tree

# Single tree with no depth limit and no bootstrapping.
model2 = RandomForestClassifier(n_estimators=1, bootstrap=False, oob_score=False, n_jobs=-1)
pipeline3 = get_model_pipeline2(model2, max_n_cat, cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate pipeline3, the pipeline that wraps model2. Passing pipeline2
# here would score the earlier depth-3 tree instead of this one.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y,
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'],
                                        return_train_score=True))
scores_df.mean()

fit_time           0.056928
score_time         0.030233
test_accuracy      0.788984
train_accuracy     0.800229
test_precision     0.816734
train_precision    0.820361
test_recall        0.596462
train_recall       0.629440
test_roc_auc       0.776113
train_roc_auc      0.840716
dtype: float64

X, y = get_xy(df_train)

pipeline3 = get_model_pipeline2(model2, max_n_cat,cat_dict=None)
pipeline3.fit(X, y)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a10a9550>),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, n_estimators=1,
                                        n_jobs=-1))])

pipeline3['proc'].transform(X)

	Ticket_prefix_ticket_nan	Ticket_num_ticket_nan	Age_nan	Cabin_nan	Ticket_prefix_ticket	Ticket_num_ticket	Name	Cabin	Sex_0	Sex_1	Embarked_0	Embarked_1	Embarked_2	Embarked_3	PassengerId	Pclass	Age	SibSp	Parch	Fare
0	False	False	False	True	5	225	109	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.730108	0.827377	-0.565736	0.432793	-0.473674	-0.502445
1	False	False	False	False	19	193	191	82	1.0	0.0	1.0	0.0	0.0	0.0	-1.726220	-1.566107	0.663861	0.432793	-0.473674	0.786845
2	False	False	False	True	38	651	354	0	1.0	0.0	0.0	0.0	1.0	0.0	-1.722332	0.827377	-0.258337	-0.474545	-0.473674	-0.488854
3	True	False	False	False	0	348	273	56	1.0	0.0	0.0	0.0	1.0	0.0	-1.718444	-1.566107	0.433312	0.432793	-0.473674	0.420730
4	True	False	False	True	0	618	16	0	0.0	1.0	0.0	0.0	1.0	0.0	-1.714556	0.827377	0.433312	-0.474545	-0.473674	-0.486337
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	True	False	False	True	0	352	549	0	0.0	1.0	0.0	0.0	1.0	0.0	1.714556	-0.369365	-0.181487	-0.474545	-0.473674	-0.386671
887	True	False	False	False	0	313	304	31	1.0	0.0	0.0	0.0	1.0	0.0	1.718444	-1.566107	-0.796286	-0.474545	-0.473674	-0.044381
888	False	False	True	True	40	106	414	0	1.0	0.0	0.0	0.0	1.0	0.0	1.722332	0.827377	-0.104637	0.432793	2.008933	-0.176263
889	True	False	False	False	0	307	82	61	0.0	1.0	1.0	0.0	0.0	0.0	1.726220	-1.566107	-0.258337	-0.474545	-0.473674	-0.044381
890	True	False	False	True	0	612	221	0	0.0	1.0	0.0	1.0	0.0	0.0	1.730108	0.827377	0.202762	-0.474545	-0.473674	-0.492378

891 rows × 20 columns

draw_tree(model2.estimators_[0], pipeline3['proc'].transform(X))

Multiple Estimators

# Forest of depth-3 trees (default number of estimators) with OOB scoring.
model3 = RandomForestClassifier(oob_score=True, n_jobs=-1, max_depth=3)
pipeline3 = get_model_pipeline2(model3, max_n_cat, cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate pipeline3, the pipeline that wraps model3. Passing pipeline2
# here would score the earlier single depth-3 tree instead of this forest.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y,
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'],
                                        return_train_score=True))
scores_df.mean()

fit_time           0.037160
score_time         0.024400
test_accuracy      0.753079
train_accuracy     0.768236
test_precision     0.723124
train_precision    0.737067
test_recall        0.614450
train_recall       0.625572
test_roc_auc       0.793409
train_roc_auc      0.798709
dtype: float64

X, y = get_xy(df_train)

pipeline3 = get_model_pipeline2(model3, max_n_cat,cat_dict=None)
pipeline3.fit(X, y)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a113f6a0>),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, n_jobs=-1,
                                        oob_score=True))])

draw_tree(model3.estimators_[0], pipeline3['proc'].transform(X))

# Preprocess once; the transformed frame is the same for every tree, so there
# is no need to recompute it inside the comprehension.
X_proc = pipeline3['proc'].transform(X)
# One row of predictions per tree in the forest.
preds = np.stack([t.predict(X_proc) for t in model3.estimators_])
# Every tree's vote for the first passenger, the mean of those votes, and the
# true label.
preds[:,0],np.mean(preds[:,0]), y[0]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 0.0,
 0)

More trees uncorrelated

from sklearn.ensemble import ExtraTreesClassifier

# ExtraTreesClassifier does not bootstrap by default, and out-of-bag scoring is
# only available with bootstrapping, so bootstrap=True is required alongside
# oob_score=True (fitting raises otherwise).
model4 = ExtraTreesClassifier(oob_score=True, bootstrap=True, n_jobs=-1, max_depth=3)
pipeline3 = get_model_pipeline2(model4, max_n_cat, cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate pipeline3, the pipeline that wraps model4. Passing pipeline2
# here would score the earlier single depth-3 tree instead of the extra-trees
# model.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y,
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'],
                                        return_train_score=True))
scores_df.mean()

fit_time           0.415596
score_time         0.024783
test_accuracy      0.750995
train_accuracy     0.796294
test_precision     0.632699
train_precision    0.777518
test_recall        0.483589
train_recall       0.670247
test_roc_auc       0.685051
train_roc_auc      0.839345
dtype: float64

Predictions

Retrain pipeline on complete dataset

# Fit a fresh pipeline on every training row.
pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)
# These predictions are for the same rows the model was just fitted on, so the
# confusion matrix and accuracy below are in-sample figures, not an estimate
# of performance on unseen data.
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>

1.0

Calculation for test set and submission

df_sample_submission = pd.read_csv(path/"gender_submission.csv"); df_sample_submission.head()

	PassengerId	Survived
0	892	0
1	893	1
2	894	0
3	895	0
4	896	1

os.getcwd()

'/Users/rahul1.saraf/rahuketu/programming/portfolio/curations/competitions/titanic'

# Predict on the test set; the single column is named after the target series.
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
# Place PassengerId next to the predictions (column-wise concat, aligned on
# the row index) to form the submission table.
df_submission = pd.concat([df_test['PassengerId'], predictions], axis=1); df_submission
# Write the table without the index column.
df_submission.to_csv('submission.csv', index=False)

# Outside Kaggle, upload submission.csv to the "titanic" competition through
# the kaggle package's API client.
if not is_kaggle:
    import kaggle
    kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="titanic")
    # from aiking.integrations.kaggle import push2kaggle
    # push2kaggle("00_index.ipynb")

Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.12 / client 1.5.16)

100%|██████████████████████████████████████| 2.77k/2.77k [00:02<00:00, 1.32kB/s]