Feature Engineering

Shows the usage of the aiking library on a Kaggle dataset.

Import public packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split, cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Import private packages

# Environment bootstrap: when running with Kaggle secrets available, export the
# Kaggle credentials, install aiking from GitHub and fetch the titanic data.
# True only if the `kaggle_secrets` module is already loaded in this process
# (presumably the case inside a Kaggle kernel -- TODO confirm).
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    # Credentials are exported through environment variables.
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    # IPython shell magic (not plain Python): installs aiking from GitHub.
    # NOTE(review): the token is interpolated into the install URL -- confirm it
    # cannot end up in saved notebook output or logs.
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
    # NOTE(review): as exported, the next two lines sit inside the `if`, so they
    # only run when `is_kaggle` is true -- confirm this matches the notebook.
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::titanic"); 
[Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]
from aiking.ml.structured import *

Read the Dataset

data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input")); 
path = data_dir/"titanic"
(#3) [Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]
df_train = pd.read_csv(path/"train.csv"); df_train.head()
df_test = pd.read_csv(path/"test.csv"); df_test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S


display(df_train.describe(include='number').T, df_test.describe(include='number').T)
count mean std min 25% 50% 75% max
PassengerId 891.0 446.000000 257.353842 1.00 223.5000 446.0000 668.5 891.0000
Survived 891.0 0.383838 0.486592 0.00 0.0000 0.0000 1.0 1.0000
Pclass 891.0 2.308642 0.836071 1.00 2.0000 3.0000 3.0 3.0000
Age 714.0 29.699118 14.526497 0.42 20.1250 28.0000 38.0 80.0000
SibSp 891.0 0.523008 1.102743 0.00 0.0000 0.0000 1.0 8.0000
Parch 891.0 0.381594 0.806057 0.00 0.0000 0.0000 0.0 6.0000
Fare 891.0 32.204208 49.693429 0.00 7.9104 14.4542 31.0 512.3292
count mean std min 25% 50% 75% max
PassengerId 418.0 1100.500000 120.810458 892.00 996.2500 1100.5000 1204.75 1309.0000
Pclass 418.0 2.265550 0.841838 1.00 1.0000 3.0000 3.00 3.0000
Age 332.0 30.272590 14.181209 0.17 21.0000 27.0000 39.00 76.0000
SibSp 418.0 0.447368 0.896760 0.00 0.0000 0.0000 1.00 8.0000
Parch 418.0 0.392344 0.981429 0.00 0.0000 0.0000 0.00 9.0000
Fare 417.0 35.627188 55.907576 0.00 7.8958 14.4542 31.50 512.3292
display(df_train.describe(include='object').T, df_test.describe(include='object').T)
count unique top freq
Name 891 891 Braund, Mr. Owen Harris 1
Sex 891 2 male 577
Ticket 891 681 347082 7
Cabin 204 147 B96 B98 4
Embarked 889 3 S 644
count unique top freq
Name 418 418 Kelly, Mr. James 1
Sex 418 2 male 266
Ticket 418 363 PC 17608 5
Cabin 91 76 B57 B59 B63 B66 3
Embarked 418 3 S 270
display(df_train['Cabin'].str[0], df_train['Cabin'].str[1:])
0      NaN
1        C
2      NaN
3        C
4      NaN
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object
0      NaN
1       85
2      NaN
3      123
4      NaN
886    NaN
887     42
888    NaN
889    148
890    NaN
Name: Cabin, Length: 891, dtype: object
[i.rsplit(" ", -1) for i in df_train['Ticket'].tolist()[:5]]
[['A/5', '21171'],
 ['PC', '17599'],
 ['STON/O2.', '3101282'],
def get_ticket_features(row):
    """Split a tokenised ticket into ``[prefix, number]``.

    Parameters
    ----------
    row : list of str
        The space-separated tokens of one ticket string, e.g.
        ``['A/5', '21171']`` or ``['113803']``.

    Returns
    -------
    list
        ``[prefix, number]``. ``number`` is the last token as an ``int`` when
        that token is all digits, otherwise ``pd.NA``. ``prefix`` is the
        remaining tokens joined by a single space, or ``pd.NA`` when the
        ticket is only a number. An empty token list gives ``[pd.NA, pd.NA]``.
    """
    if not row:
        return [pd.NA, pd.NA]

    # The number, when present, is always the trailing token; everything
    # before it (possibly several tokens) belongs to the prefix.
    *prefix_parts, last = row
    if last.isdigit():
        prefix = " ".join(prefix_parts) if prefix_parts else pd.NA
        return [prefix, int(last)]

    # No trailing number: the whole ticket is treated as the prefix.
    return [" ".join(row), pd.NA]

def expand_ticket(X):
    """Expand a ticket column into ``prefix_ticket`` / ``num_ticket`` columns.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        A single-column frame (or a series) of ticket strings.

    Returns
    -------
    pandas.DataFrame
        One row per input ticket with columns ``prefix_ticket`` and
        ``num_ticket``, as produced by ``get_ticket_features``. The result
        carries a fresh default index.
    """
    # A bare `DataFrame.squeeze()` turns a 1x1 frame into a scalar, which has
    # no `.str` accessor; squeezing only the column axis always yields a Series.
    tickets = X.squeeze(axis=1) if isinstance(X, pd.DataFrame) else X
    parsed = tickets.str.rsplit(" ").apply(get_ticket_features)
    return pd.DataFrame(parsed.tolist(), columns=['prefix_ticket', 'num_ticket'])

# def split_col(X, splitter=" "): return X.squeeze().str.split(splitter, expand=True).apply(pd.to_numeric, errors='ignore', downcast='integer')
# split_col(df_train[['Ticket']], splitter=' ')

# expand_ticket(df_train[['Ticket']])

ticket_transformer = ColExpanderTransform(names=['prefix_ticket', 'num_ticket'], func=expand_ticket, func_kw_args={})
display(ticket_transformer.fit_transform(df_train[['Ticket']]), ticket_transformer.get_feature_names())
prefix_ticket num_ticket
0 A/5 21171
1 PC 17599
2 STON/O2. 3101282
3 <NA> 113803
4 <NA> 373450
... ... ...
886 <NA> 211536
887 <NA> 112053
888 W./C. 6607
889 <NA> 111369
890 <NA> 370376

891 rows × 2 columns

['prefix_ticket', 'num_ticket']
'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
feature_specs = {
    'Ticket':(ColExpanderTransform, {'names':['prefix_ticket', 'num_ticket'], 'func':expand_ticket, 'func_kw_args':{}}), 
gen_feature_layer(df_train, feature_specs)
[('Ticket', <aiking.ml.structured.ColExpanderTransform>, {}),
 ('PassengerId', None),
 ('Survived', None),
 ('Pclass', None),
 ('Name', None),
 ('Sex', None),
 ('Age', None),
 ('SibSp', None),
 ('Parch', None),
 ('Fare', None),
 ('Cabin', None),
 ('Embarked', None)]


Define Pipeline

# Build the modelling pipeline: a `Proc` preprocessing step (fed by two layer
# specs) followed by a RandomForestClassifier with out-of-bag scoring enabled,
# combined with `make_pipeline`, and return it.
# NOTE(review): this cell looks truncated in the export -- the signature is
# never closed and both layer-spec tuples are incomplete. Callers pass
# `cat_dict=None`, so a `cat_dict` parameter presumably exists; restore the
# missing lines from the notebook before running.
# NOTE(review): the dict defaults below are shared between calls; whether they
# are mutated anywhere cannot be seen from here -- confirm.
def get_model_pipeline(max_n_cat=0, 
                       scale_dict={'class': StandardScaler},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},

    layer_spec_preprocess = (gen_feature_layer,
                        'Ticket':(ColExpanderTransform, {'names':['prefix_ticket', 'num_ticket'], 'func':expand_ticket, 'func_kw_args':{}}), 
    layer_spec_default = (get_default_feature_def, 

    layer_specs = [layer_spec_preprocess, layer_spec_default]
    proc = Proc(layer_specs=layer_specs); #proc.fit_transform(X)
    model = RandomForestClassifier(oob_score=True, n_jobs=-1)
    pipeline = make_pipeline(proc, model); pipeline
    return pipeline
pipeline = get_model_pipeline(cat_dict=None); pipeline
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2927bae80>),
                 RandomForestClassifier(n_jobs=-1, oob_score=True))])
Train on Partial Data

max_n_cat = 5

def get_xy(df, col='Survived'):
    """Return ``(features, target)``: `df` without column `col`, and `df[col]`."""
    features = df.drop(columns=[col])
    target = df[col]
    return features, target
X, y = get_xy(df_train)

pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x29356ff40>),
                 RandomForestClassifier(n_jobs=-1, oob_score=True))])
get_scorer_dict(scorer_names=['accuracy', 'precision', 'recall', 'roc_auc'])
{'accuracy': make_scorer(accuracy_score),
 'precision': make_scorer(precision_score, average=binary),
 'recall': make_scorer(recall_score, average=binary),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}

This gives an estimated MSLE of around 0.26 to 0.30 (actually 0.304 from the validation estimate).

m = pipeline['randomforestclassifier']

Cross validation estimate

pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
# scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
scores_df = pd.DataFrame(cross_validate(pipeline, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df
fit_time score_time test_accuracy train_accuracy test_precision train_precision test_recall train_recall test_roc_auc train_roc_auc
0 1.506517 0.046242 0.737430 1.0 0.739130 1.0 0.492754 1.0 0.837022 1.0
1 0.084883 0.039459 0.797753 1.0 0.735294 1.0 0.735294 1.0 0.805548 1.0
2 0.085794 0.038840 0.842697 1.0 0.785714 1.0 0.808824 1.0 0.917647 1.0
3 0.087844 0.038950 0.808989 1.0 0.814815 1.0 0.647059 1.0 0.888971 1.0
4 0.087772 0.038551 0.837079 1.0 0.803030 1.0 0.768116 1.0 0.881000 1.0
# m = pipeline['randomforestclassifier']
cv_scores = cross_val_score(pipeline, X, y, cv=5)
array([0.83798883, 0.79213483, 0.87640449, 0.82022472, 0.86516854])

# pipeline = get_model_pipeline(cat_dict); pipeline
y_pred = cross_val_predict(pipeline, X, y)
# cm = confusion_matrix(y, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()
ConfusionMatrixDisplay.from_predictions(y, y_pred)

scores = scores_df['test_accuracy']
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 67.85% to 93.10% with mean as 80.48%

Single tree model

max_n_cat = 5

# Variant of the pipeline builder that takes the estimator as an argument:
# a `Proc` preprocessing step (fed by two layer specs) followed by `model`,
# combined with `make_pipeline`, and returned.
# NOTE(review): this cell looks truncated in the export -- the signature is
# never closed and both layer-spec tuples are incomplete. Callers pass
# `cat_dict=None`, so a `cat_dict` parameter presumably exists; restore the
# missing lines from the notebook before running.
# NOTE(review): `model = model` below is a no-op self-assignment.
def get_model_pipeline2(model, max_n_cat=0, 
                       scale_dict={'class': StandardScaler},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},

    layer_spec_preprocess = (gen_feature_layer,
                        'Ticket':(ColExpanderTransform, {'names':['prefix_ticket', 'num_ticket'], 'func':expand_ticket, 'func_kw_args':{}}), 
    layer_spec_default = (get_default_feature_def, 

    layer_specs = [layer_spec_preprocess, layer_spec_default]
    proc = Proc(layer_specs=layer_specs); #proc.fit_transform(X)
    model = model
    pipeline = make_pipeline(proc, model); pipeline
    return pipeline

def get_xy(df, col='Survived'):
    """Split `df` into a feature frame (everything but `col`) and the `col` series."""
    features = df.drop(columns=[col])
    target = df[col]
    return features, target
X, y = get_xy(df_train)
model = RandomForestClassifier(n_estimators=1, bootstrap=False, max_depth=3, oob_score=False, n_jobs=-1)
pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
pipeline2.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a25fb850>),
                 RandomForestClassifier(bootstrap=False, max_depth=3,
                                        n_estimators=1, n_jobs=-1))])
pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
X, y = get_xy(df_train)
scores_df = pd.DataFrame(cross_validate(pipeline2, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df
fit_time score_time test_accuracy train_accuracy test_precision train_precision test_recall train_recall test_roc_auc train_roc_auc
0 0.039830 0.025932 0.759777 0.800562 0.964286 0.945578 0.391304 0.509158 0.829117 0.832578
1 0.034812 0.024542 0.679775 0.730715 0.720000 0.741176 0.264706 0.459854 0.624733 0.749077
2 0.036440 0.025795 0.730337 0.812062 0.604167 0.751799 0.852941 0.762774 0.838770 0.851728
3 0.034458 0.024143 0.769663 0.823282 0.754717 0.780303 0.588235 0.751825 0.849799 0.862295
4 0.036281 0.025915 0.797753 0.799439 0.789474 0.773109 0.652174 0.673993 0.806010 0.833879
Ticket_prefix_ticket_nan Ticket_num_ticket_nan Age_nan Cabin_nan Ticket_prefix_ticket Ticket_num_ticket Name Cabin Sex_0 Sex_1 Embarked_0 Embarked_1 Embarked_2 Embarked_3 PassengerId Pclass Age SibSp Parch Fare
0 False False False True 5 225 109 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.730108 0.827377 -0.565736 0.432793 -0.473674 -0.502445
1 False False False False 19 193 191 82 1.0 0.0 1.0 0.0 0.0 0.0 -1.726220 -1.566107 0.663861 0.432793 -0.473674 0.786845
2 False False False True 38 651 354 0 1.0 0.0 0.0 0.0 1.0 0.0 -1.722332 0.827377 -0.258337 -0.474545 -0.473674 -0.488854
3 True False False False 0 348 273 56 1.0 0.0 0.0 0.0 1.0 0.0 -1.718444 -1.566107 0.433312 0.432793 -0.473674 0.420730
4 True False False True 0 618 16 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.714556 0.827377 0.433312 -0.474545 -0.473674 -0.486337
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 True False False True 0 352 549 0 0.0 1.0 0.0 0.0 1.0 0.0 1.714556 -0.369365 -0.181487 -0.474545 -0.473674 -0.386671
887 True False False False 0 313 304 31 1.0 0.0 0.0 0.0 1.0 0.0 1.718444 -1.566107 -0.796286 -0.474545 -0.473674 -0.044381
888 False False True True 40 106 414 0 1.0 0.0 0.0 0.0 1.0 0.0 1.722332 0.827377 -0.104637 0.432793 2.008933 -0.176263
889 True False False False 0 307 82 61 0.0 1.0 1.0 0.0 0.0 0.0 1.726220 -1.566107 -0.258337 -0.474545 -0.473674 -0.044381
890 True False False True 0 612 221 0 0.0 1.0 0.0 1.0 0.0 0.0 1.730108 0.827377 0.202762 -0.474545 -0.473674 -0.492378

891 rows × 20 columns

PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
886 887 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 11 columns

fit_time           0.036364
score_time         0.025265
test_accuracy      0.747461
train_accuracy     0.793212
test_precision     0.766529
train_precision    0.798393
test_recall        0.549872
train_recall       0.631521
test_roc_auc       0.789686
train_roc_auc      0.825911
dtype: float64
Ticket_prefix_ticket_nan Ticket_num_ticket_nan Age_nan Cabin_nan Ticket_prefix_ticket Ticket_num_ticket Name Cabin Sex_0 Sex_1 Embarked_0 Embarked_1 Embarked_2 Embarked_3 PassengerId Pclass Age SibSp Parch Fare
0 False False False True 5 225 109 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.730108 0.827377 -0.565736 0.432793 -0.473674 -0.502445
1 False False False False 19 193 191 82 1.0 0.0 1.0 0.0 0.0 0.0 -1.726220 -1.566107 0.663861 0.432793 -0.473674 0.786845
2 False False False True 38 651 354 0 1.0 0.0 0.0 0.0 1.0 0.0 -1.722332 0.827377 -0.258337 -0.474545 -0.473674 -0.488854
3 True False False False 0 348 273 56 1.0 0.0 0.0 0.0 1.0 0.0 -1.718444 -1.566107 0.433312 0.432793 -0.473674 0.420730
4 True False False True 0 618 16 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.714556 0.827377 0.433312 -0.474545 -0.473674 -0.486337
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 True False False True 0 352 549 0 0.0 1.0 0.0 0.0 1.0 0.0 1.714556 -0.369365 -0.181487 -0.474545 -0.473674 -0.386671
887 True False False False 0 313 304 31 1.0 0.0 0.0 0.0 1.0 0.0 1.718444 -1.566107 -0.796286 -0.474545 -0.473674 -0.044381
888 False False True True 40 106 414 0 1.0 0.0 0.0 0.0 1.0 0.0 1.722332 0.827377 -0.104637 0.432793 2.008933 -0.176263
889 True False False False 0 307 82 61 0.0 1.0 1.0 0.0 0.0 0.0 1.726220 -1.566107 -0.258337 -0.474545 -0.473674 -0.044381
890 True False False True 0 612 221 0 0.0 1.0 0.0 1.0 0.0 0.0 1.730108 0.827377 0.202762 -0.474545 -0.473674 -0.492378

891 rows × 20 columns

feature_names = pipeline2['proc'].transform(X).columns.tolist(); feature_names
from sklearn import tree

# tree.plot_tree(model.estimators_[0], filled=True, feature_names=feature_names, fontsize=8);

# tree.plot_tree?
# pipeline2 = get_model_pipeline2(model, max_n_cat,cat_dict=None)
0 1 2 3 4
Ticket_prefix_ticket_nan False False False True True
Ticket_num_ticket_nan False False False False False
Age_nan False False False False False
Cabin_nan True False True False True
Ticket_prefix_ticket 5 19 38 0 0
Ticket_num_ticket 225 193 651 348 618
Name 109 191 354 273 16
Cabin 0 82 0 56 0
Sex_0 0.0 1.0 1.0 1.0 0.0
Sex_1 1.0 0.0 0.0 0.0 1.0
Embarked_0 0.0 1.0 0.0 0.0 0.0
Embarked_1 0.0 0.0 0.0 0.0 0.0
Embarked_2 1.0 0.0 1.0 1.0 1.0
Embarked_3 0.0 0.0 0.0 0.0 0.0
PassengerId -1.730108 -1.72622 -1.722332 -1.718444 -1.714556
Pclass 0.827377 -1.566107 0.827377 -1.566107 0.827377
Age -0.565736 0.663861 -0.258337 0.433312 0.433312
SibSp 0.432793 0.432793 -0.474545 0.432793 -0.474545
Parch -0.473674 -0.473674 -0.473674 -0.473674 -0.473674
Fare -0.502445 0.786845 -0.488854 0.42073 -0.486337
from sklearn.tree import export_graphviz
import IPython, graphviz
def draw_tree(t, df, size=10, ratio=0.6, precision=3):
    """Display the estimator `t` as a Graphviz diagram in IPython.

    `t` is exported with `export_graphviz`, using `df.columns` as the feature
    names. `size` and `ratio` are written into the graph header as Graphviz
    attributes; `precision` is forwarded to `export_graphviz`.
    """
    dot_source = export_graphviz(
        t,
        out_file=None,
        feature_names=df.columns,
        filled=True,
        special_characters=True,
        rotate=True,
        precision=precision,
    )
    # Insert the size/ratio attributes right after the opening "Tree {".
    sized_header = f'Tree {{ size={size}; ratio={ratio}'
    sized_source = re.sub('Tree {', sized_header, dot_source)
    IPython.display.display(graphviz.Source(sized_source))

draw_tree(model.estimators_[0], pipeline2['proc'].transform(X))

Deep Tree

# Deep tree: a single estimator without bootstrap and without a depth limit.
model2 = RandomForestClassifier(n_estimators=1, bootstrap=False, oob_score=False, n_jobs=-1)
pipeline3 = get_model_pipeline2(model2, max_n_cat,cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate the pipeline built just above (pipeline3). The earlier
# pipeline2 wraps the depth-3 single tree and would report the wrong model.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df.mean()
fit_time           0.056928
score_time         0.030233
test_accuracy      0.788984
train_accuracy     0.800229
test_precision     0.816734
train_precision    0.820361
test_recall        0.596462
train_recall       0.629440
test_roc_auc       0.776113
train_roc_auc      0.840716
dtype: float64
X, y = get_xy(df_train)

pipeline3 = get_model_pipeline2(model2, max_n_cat,cat_dict=None)
pipeline3.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a10a9550>),
                 RandomForestClassifier(bootstrap=False, n_estimators=1,
Ticket_prefix_ticket_nan Ticket_num_ticket_nan Age_nan Cabin_nan Ticket_prefix_ticket Ticket_num_ticket Name Cabin Sex_0 Sex_1 Embarked_0 Embarked_1 Embarked_2 Embarked_3 PassengerId Pclass Age SibSp Parch Fare
0 False False False True 5 225 109 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.730108 0.827377 -0.565736 0.432793 -0.473674 -0.502445
1 False False False False 19 193 191 82 1.0 0.0 1.0 0.0 0.0 0.0 -1.726220 -1.566107 0.663861 0.432793 -0.473674 0.786845
2 False False False True 38 651 354 0 1.0 0.0 0.0 0.0 1.0 0.0 -1.722332 0.827377 -0.258337 -0.474545 -0.473674 -0.488854
3 True False False False 0 348 273 56 1.0 0.0 0.0 0.0 1.0 0.0 -1.718444 -1.566107 0.433312 0.432793 -0.473674 0.420730
4 True False False True 0 618 16 0 0.0 1.0 0.0 0.0 1.0 0.0 -1.714556 0.827377 0.433312 -0.474545 -0.473674 -0.486337
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 True False False True 0 352 549 0 0.0 1.0 0.0 0.0 1.0 0.0 1.714556 -0.369365 -0.181487 -0.474545 -0.473674 -0.386671
887 True False False False 0 313 304 31 1.0 0.0 0.0 0.0 1.0 0.0 1.718444 -1.566107 -0.796286 -0.474545 -0.473674 -0.044381
888 False False True True 40 106 414 0 1.0 0.0 0.0 0.0 1.0 0.0 1.722332 0.827377 -0.104637 0.432793 2.008933 -0.176263
889 True False False False 0 307 82 61 0.0 1.0 1.0 0.0 0.0 0.0 1.726220 -1.566107 -0.258337 -0.474545 -0.473674 -0.044381
890 True False False True 0 612 221 0 0.0 1.0 0.0 1.0 0.0 0.0 1.730108 0.827377 0.202762 -0.474545 -0.473674 -0.492378

891 rows × 20 columns

draw_tree(model2.estimators_[0], pipeline3['proc'].transform(X))

Multiple Estimators

# Multiple estimators: a default-sized forest of depth-3 trees with OOB scoring.
model3 = RandomForestClassifier(oob_score=True, n_jobs=-1, max_depth=3)
pipeline3 = get_model_pipeline2(model3, max_n_cat,cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate the pipeline built just above (pipeline3). The earlier
# pipeline2 wraps the single-tree model and would report the wrong model.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df.mean()
fit_time           0.037160
score_time         0.024400
test_accuracy      0.753079
train_accuracy     0.768236
test_precision     0.723124
train_precision    0.737067
test_recall        0.614450
train_recall       0.625572
test_roc_auc       0.793409
train_roc_auc      0.798709
dtype: float64
X, y = get_xy(df_train)

pipeline3 = get_model_pipeline2(model3, max_n_cat,cat_dict=None)
pipeline3.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x2a113f6a0>),
                 RandomForestClassifier(max_depth=3, n_jobs=-1,
draw_tree(model3.estimators_[0], pipeline3['proc'].transform(X))

preds = np.stack([t.predict(pipeline3['proc'].transform(X)) for t in model3.estimators_])
preds[:,0],np.mean(preds[:,0]), y[0]
(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),

More uncorrelated trees

from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble of depth-3 trees. ExtraTreesClassifier does not
# bootstrap by default, and out-of-bag scoring is only available with
# bootstrap=True, so it is enabled explicitly to keep oob_score=True valid.
model4 = ExtraTreesClassifier(oob_score=True, bootstrap=True, n_jobs=-1, max_depth=3)
pipeline3 = get_model_pipeline2(model4, max_n_cat,cat_dict=None)
X, y = get_xy(df_train)
# Cross-validate the pipeline built just above (pipeline3). The earlier
# pipeline2 wraps the single-tree random forest and would report the wrong model.
scores_df = pd.DataFrame(cross_validate(pipeline3, X, y, 
                                        scoring=['accuracy', 'precision', 'recall', 'roc_auc'], 
                                        return_train_score=True)); scores_df.mean()
fit_time           0.415596
score_time         0.024783
test_accuracy      0.750995
train_accuracy     0.796294
test_precision     0.632699
train_precision    0.777518
test_recall        0.483589
train_recall       0.670247
test_roc_auc       0.685051
train_roc_auc      0.839345
dtype: float64


Retrain pipeline on complete dataset

pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))

Calculation for test set and submission

df_sample_submission = pd.read_csv(path/"gender_submission.csv"); df_sample_submission.head()
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
df_submission = pd.concat([df_test['PassengerId'], predictions], axis=1); df_submission
df_submission.to_csv('submission.csv', index=False)
if not is_kaggle:
    import kaggle
    kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="titanic")
    # from aiking.integrations.kaggle import push2kaggle
    # push2kaggle("00_index.ipynb")
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.12 / client 1.5.16)
