Import public packages
import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split, cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
Import private packages
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::titanic"); print(path.ls())
[Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'), Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]
from aiking.ml.structured import *
Read the Dataset
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input"))
path = data_dir/"titanic"
path.ls()
(#3) [Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/test.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/train.csv'),Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/titanic/gender_submission.csv')]
df_train = pd.read_csv(path/"train.csv"); df_train.head()
df_test = pd.read_csv(path/"test.csv"); df_test.head()
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
df_train.describe(include='all').T
| | count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | 891.0 | NaN | NaN | NaN | 446.0 | 257.353842 | 1.0 | 223.5 | 446.0 | 668.5 | 891.0 |
Survived | 891.0 | NaN | NaN | NaN | 0.383838 | 0.486592 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
Pclass | 891.0 | NaN | NaN | NaN | 2.308642 | 0.836071 | 1.0 | 2.0 | 3.0 | 3.0 | 3.0 |
Name | 891 | 891 | Braund, Mr. Owen Harris | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Sex | 891 | 2 | male | 577 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Age | 714.0 | NaN | NaN | NaN | 29.699118 | 14.526497 | 0.42 | 20.125 | 28.0 | 38.0 | 80.0 |
SibSp | 891.0 | NaN | NaN | NaN | 0.523008 | 1.102743 | 0.0 | 0.0 | 0.0 | 1.0 | 8.0 |
Parch | 891.0 | NaN | NaN | NaN | 0.381594 | 0.806057 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 |
Ticket | 891 | 681 | 347082 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Fare | 891.0 | NaN | NaN | NaN | 32.204208 | 49.693429 | 0.0 | 7.9104 | 14.4542 | 31.0 | 512.3292 |
Cabin | 204 | 147 | B96 B98 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Embarked | 889 | 3 | S | 644 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
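The count column above already shows that Age, Cabin and Embarked are incomplete. A direct check of the missing-value counts (my addition, not part of the original notebook) would be:

# Missing-value counts, restricted to columns that actually contain NaNs
df_train.isna().sum().loc[lambda s: s > 0]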
Modelling
Define Pipeline
def get_model_pipeline(max_n_cat=0,
                       cat_dict=None,
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class': NumericalEncoder, 'categories': None},
                       cat_dummy_dict={'class': OneHotEncoder, 'handle_unknown': 'ignore'},
                       imputer_dict={'class': SimpleImputer, 'strategy': 'median'},
                      ):
    # Single preprocessing layer: default feature definitions controlling imputation,
    # scaling of numeric columns, and encoding of categoricals (numeric codes vs.
    # one-hot dummies, governed by max_n_cat).
    layer_spec_default = (get_default_feature_def,
                          {'skip_flds': None,
                           'ignored_flds': None,
                           'max_n_cat': max_n_cat,
                           'na_exclude_cols': [],
                           'scale_var_num': True,
                           'scale_var_cat': False,
                           'scale_dict': scale_dict,
                           'cat_num_dict': cat_num_dict,
                           'cat_dummy_dict': cat_dummy_dict,
                           'imputer_dict': imputer_dict,
                           'include_time_cols': True,
                           'keep_dt_cols': False,
                           'cat_dict': cat_dict
                          }
                         )
    layer_specs = [layer_spec_default]
    proc = Proc(layer_specs=layer_specs)  # proc.fit_transform(X)
    model = RandomForestClassifier(n_jobs=-1)
    pipeline = make_pipeline(proc, model)
    return pipeline
pipeline = get_model_pipeline(cat_dict=None); pipeline
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x155257070>), ('randomforestclassifier', RandomForestClassifier(n_jobs=-1))])
Train on Partial Data
max_n_cat = 5
def get_xy(df, col='Survived'): return df.drop([col], axis=1), df[col]
X, y = get_xy(df_train)

pipeline = get_model_pipeline(max_n_cat, cat_dict=None)
pipeline.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x155257c40>), ('randomforestclassifier', RandomForestClassifier(n_jobs=-1))])
get_scorer_dict(scorer_names=['accuracy', 'precision', 'recall', 'roc_auc'])
{'accuracy': make_scorer(accuracy_score),
'precision': make_scorer(precision_score, average=binary),
'recall': make_scorer(recall_score, average=binary),
'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}
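get_scorer_dict is provided by aiking.ml.structured and is not defined in this notebook. A rough stand-in built only from scikit-learn's make_scorer (an assumption about its behaviour, not the library's actual code; my_scorer_dict is a hypothetical name) might look like:

from sklearn.metrics import (make_scorer, accuracy_score, precision_score,
                             recall_score, roc_auc_score)

def my_scorer_dict(scorer_names):
    # Hypothetical equivalent of get_scorer_dict: map metric names to scorer
    # objects that cross_validate/cross_val_score accept via `scoring=`.
    available = {'accuracy': make_scorer(accuracy_score),
                 'precision': make_scorer(precision_score, average='binary'),
                 'recall': make_scorer(recall_score, average='binary'),
                 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}
    return {name: available[name] for name in scorer_names}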
This gives an indication that the MSLE estimate is around 0.26 to 0.30 (the validation estimate was actually 0.304).
Cross validation estimate
pipeline = get_model_pipeline(max_n_cat, cat_dict=None)
# scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
scores_df = pd.DataFrame(cross_validate(pipeline, X, y, scoring=['accuracy', 'precision', 'recall', 'roc_auc'])); scores_df
| | fit_time | score_time | test_accuracy | test_precision | test_recall | test_roc_auc |
---|---|---|---|---|---|---|
0 | 1.532780 | 0.041193 | 0.776536 | 0.737705 | 0.652174 | 0.830303 |
1 | 0.068360 | 0.036002 | 0.747191 | 0.641975 | 0.764706 | 0.808690 |
2 | 0.063332 | 0.034494 | 0.792135 | 0.674157 | 0.882353 | 0.902473 |
3 | 0.063486 | 0.035383 | 0.814607 | 0.786885 | 0.705882 | 0.867112 |
4 | 0.060760 | 0.037809 | 0.808989 | 0.716049 | 0.840580 | 0.892833 |
scores_df.plot()
# pipeline = get_model_pipeline(cat_dict); pipeline
y_pred = cross_val_predict(pipeline, X, y)
# cm = confusion_matrix(y, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()
ConfusionMatrixDisplay.from_predictions(y, y_pred)
scores = scores_df['test_accuracy']
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 70.63% to 86.95% with mean as 78.79%
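The interval above is just the mean of the five fold accuracies plus or minus three standard deviations. As a quick arithmetic check against the cross-validation table (values copied from the test_accuracy column above; this check is my addition, not in the original notebook):

fold_acc = np.array([0.776536, 0.747191, 0.792135, 0.814607, 0.808989])  # test_accuracy column
mean, std = fold_acc.mean(), fold_acc.std(ddof=1)  # pandas' Series.std() defaults to ddof=1
print(f"{mean - 3*std:.2%} to {mean + 3*std:.2%} with mean as {mean:.2%}")
# ~70.63% to 86.95% with mean as 78.79%, matching the printed range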
Predictions
Retrain pipeline on complete dataset
pipeline = get_model_pipeline(max_n_cat, cat_dict=None)
pipeline.fit(X, y)
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
1.0
Calculation for test set and submission
= pd.read_csv(path/"gender_submission.csv"); df_sample_submission.head() df_sample_submission
| | PassengerId | Survived |
---|---|---|
0 | 892 | 0 |
1 | 893 | 1 |
2 | 894 | 0 |
3 | 895 | 0 |
4 | 896 | 1 |
os.getcwd()
'/Users/rahul1.saraf/rahuketu/programming/portfolio/curations/competitions/titanic'
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
df_submission = pd.concat([df_test['PassengerId'], predictions], axis=1); df_submission
df_submission.to_csv('submission.csv', index=False)
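Before submitting, a small sanity check (my addition, not in the original notebook) can confirm the file lines up with Kaggle's sample submission:

# The submission should match the sample file: same columns, same number of rows.
assert list(df_submission.columns) == list(df_sample_submission.columns)
assert len(df_submission) == len(df_sample_submission)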
if not is_kaggle:
    import kaggle
    kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="titanic")
# from aiking.integrations.kaggle import push2kaggle
# push2kaggle("00_index.ipynb")
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.12 / client 1.5.16)
100%|██████████████████████████████████████| 2.77k/2.77k [00:01<00:00, 1.77kB/s]