Spaceship Titanic

Shows the usage of the aiking library on a Kaggle dataset

Import Public Packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Import Private Packages

# Environment detection: treated as "on Kaggle" when the `kaggle_secrets` module
# is already loaded in this interpreter session.
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    # On Kaggle: read credentials from Kaggle secrets and export them as the
    # KAGGLE_USERNAME / KAGGLE_KEY environment variables.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    # Install aiking straight from GitHub, authenticating with the token read above.
    # NOTE(review): the token is interpolated into the shell command line — confirm
    # it cannot end up in saved notebook output.
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    # Off Kaggle: aiking is expected to be importable already. `untar_data` comes
    # from the star import; presumably it fetches the competition files and
    # returns their local path — verify against aiking.data.external.
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::spaceship-titanic"); path.ls()
# Needed in both environments (on Kaggle this relies on the pip install above).
from aiking.ml.structured import *
from aiking.integrations.kaggle import push2kaggle

Read the Dataset

# Root data folder: the DATA_DIR environment variable if set, otherwise Kaggle's input folder.
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input")); 
# Competition folder; this rebinds `path`, replacing any value assigned to it earlier.
path = data_dir/"spaceship-titanic"
# List the folder contents (`ls` is not a stock pathlib method; presumably patched in by fastcore).
path.ls()
(#3) [Path('/kaggle/input/spaceship-titanic/sample_submission.csv'),Path('/kaggle/input/spaceship-titanic/test.csv'),Path('/kaggle/input/spaceship-titanic/train.csv')]
# !rm -rf  {path}
# Load the training set and preview its first rows, transposed so each feature is one line.
df_train = pd.read_csv(path/"train.csv"); df_train.head().T
0 1 2 3 4
PassengerId 0001_01 0002_01 0003_01 0003_02 0004_01
HomePlanet Europa Earth Europa Europa Earth
CryoSleep False False False False False
Cabin B/0/P F/0/S A/0/S A/0/S F/1/S
Destination TRAPPIST-1e TRAPPIST-1e TRAPPIST-1e TRAPPIST-1e TRAPPIST-1e
Age 39.0 24.0 58.0 33.0 16.0
VIP False False True False False
RoomService 0.0 109.0 43.0 0.0 303.0
FoodCourt 0.0 9.0 3576.0 1283.0 70.0
ShoppingMall 0.0 25.0 0.0 371.0 151.0
Spa 0.0 549.0 6715.0 3329.0 565.0
VRDeck 0.0 44.0 49.0 193.0 2.0
Name Maham Ofracculy Juanna Vines Altark Susent Solam Susent Willy Santantines
Transported False True False False True
# Load the test set and preview its first rows, transposed so each feature is one line.
df_test = pd.read_csv(path/"test.csv"); df_test.head().T
0 1 2 3 4
PassengerId 0013_01 0018_01 0019_01 0021_01 0023_01
HomePlanet Earth Earth Europa Europa Earth
CryoSleep True False True False False
Cabin G/3/S F/4/S C/0/S C/1/S F/5/S
Destination TRAPPIST-1e TRAPPIST-1e 55 Cancri e TRAPPIST-1e TRAPPIST-1e
Age 27.0 19.0 31.0 38.0 20.0
VIP False False False False False
RoomService 0.0 0.0 0.0 0.0 10.0
FoodCourt 0.0 9.0 0.0 6652.0 0.0
ShoppingMall 0.0 0.0 0.0 0.0 635.0
Spa 0.0 2823.0 0.0 181.0 0.0
VRDeck 0.0 0.0 0.0 585.0 0.0
Name Nelly Carsoning Lerome Peckers Sabih Unhearfus Meratz Caltilter Brence Harperez

Feature Engineering

def split_col(X, splitter=" "):
    """Split one string column into several columns on `splitter`.

    Each resulting column is converted to a numeric dtype (downcast to the
    smallest integer type that fits) when every value parses as a number;
    columns that cannot be parsed are returned unchanged.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        A single-column DataFrame, or a Series, holding strings.
    splitter : str, default " "
        Separator passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        One column per split part, labelled 0..k-1, same index as `X`.
    """
    def _to_numeric_or_keep(col):
        # Same outcome as the deprecated ``errors='ignore'``: hand the column
        # back untouched when it does not parse as numeric.
        try:
            return pd.to_numeric(col, downcast='integer')
        except (ValueError, TypeError):
            return col

    # Squeeze only the column axis so that a one-row, one-column frame still
    # yields a Series (a bare ``squeeze()`` would collapse it to a scalar,
    # which has no ``.str`` accessor).
    series = X.squeeze(axis=1) if isinstance(X, pd.DataFrame) else X
    return series.str.split(splitter, expand=True).apply(_to_numeric_or_keep)
# Demo: split PassengerId on "_" into its two parts (passed as a one-column DataFrame).
split_col(df_train[['PassengerId']], splitter='_')
0 1
0 1 1
1 2 1
2 3 1
3 3 2
4 4 1
... ... ...
8688 9276 1
8689 9278 1
8690 9279 1
8691 9280 1
8692 9280 2

8693 rows × 2 columns

# Wrap split_col in a ColExpanderTransform (presumably provided by the aiking star
# import), giving names for the two columns it produces.
passenger_transformer = ColExpanderTransform(names=['Passenger_gggg', 'Passenger_nn'], func=split_col, func_kw_args={"splitter":"_"})
# Show the transformed PassengerId column next to the feature names the transformer reports.
display(passenger_transformer.fit_transform(df_train[['PassengerId']]), passenger_transformer.get_feature_names())
0 1
0 1 1
1 2 1
2 3 1
3 3 2
4 4 1
... ... ...
8688 9276 1
8689 9278 1
8690 9279 1
8691 9280 1
8692 9280 2

8693 rows × 2 columns

['Passenger_gggg', 'Passenger_nn']
def calc_service_cost(X, cols=None):
    """Return the row-wise total of the selected columns as a one-column frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns to add up.
    cols : iterable of column labels, optional
        Columns to sum. ``None`` (the default) selects no columns, which gives
        a total of 0 for every row.

    Returns
    -------
    pandas.DataFrame
        Single column labelled ``0`` holding the per-row sum; missing values
        are skipped (pandas' default for ``sum``).
    """
    # A None default avoids a mutable list shared across calls; list() lets
    # callers pass a tuple or other iterable of labels.
    selected = [] if cols is None else list(cols)
    return X[selected].sum(axis=1).to_frame()

# The five spend columns that are added together into a total service cost.
cols_sc = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Demo: total spend for the first rows of the training set.
calc_service_cost(df_train, cols_sc).head()
0
0 0.0
1 736.0
2 10383.0
3 5176.0
4 1091.0
# Layer 1: expand the composite string columns. Each entry maps a source column to
# (transformer class, constructor kwargs); in the recorded output the new columns
# appear as "<source>_<name>" (e.g. PassengerId_gggg, Cabin_deck).
layer_spec_preprocess = (gen_feature_layer,
                {
                    'feature_specs':{
                        'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}), 
                        'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                        'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                    }
                })
# Layer 2: total service cost. The key is the string form of the column list.
# NOTE(review): an alias of 'ServiceCost' is requested, but the recorded output
# names the column RoomService_FoodCourt_ShoppingMall_Spa_VRDeck — confirm how
# gen_feature_layer uses the alias.
layer_spec_calc = (gen_feature_layer,
                {
                    'feature_specs':{
                        str(cols_sc):(calc_service_cost,  {'cols':cols_sc}, {"alias":'ServiceCost'})
                    }
                }
               )
# Layers are listed in this order: column expansion first, then the cost total.
layer_specs = [layer_spec_preprocess, layer_spec_calc]
proc = Proc(layer_specs=layer_specs)
# Fit and apply the processor on the training frame to inspect the engineered columns.
proc.fit_transform(df_train)
RoomService_FoodCourt_ShoppingMall_Spa_VRDeck PassengerId_gggg PassengerId_nn Cabin_deck Cabin_num Cabin_side Name_first Name_last HomePlanet CryoSleep Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Transported
0 0.0 1 1 B 0.0 P Maham Ofracculy Europa False TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 False
1 736.0 2 1 F 0.0 S Juanna Vines Earth False TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 True
2 10383.0 3 1 A 0.0 S Altark Susent Europa False TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 False
3 5176.0 3 2 A 0.0 S Solam Susent Europa False TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 False
4 1091.0 4 1 F 1.0 S Willy Santantines Earth False TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8688 8536.0 9276 1 A 98.0 P Gravior Noxnuther Europa False 55 Cancri e 41.0 True 0.0 6819.0 0.0 1643.0 74.0 False
8689 0.0 9278 1 G 1499.0 S Kurta Mondalley Earth True PSO J318.5-22 18.0 False 0.0 0.0 0.0 0.0 0.0 False
8690 1873.0 9279 1 G 1500.0 S Fayey Connon Earth False TRAPPIST-1e 26.0 False 0.0 0.0 1872.0 1.0 0.0 True
8691 4637.0 9280 1 E 608.0 S Celeon Hontichre Europa False 55 Cancri e 32.0 False 0.0 1049.0 0.0 353.0 3235.0 False
8692 4826.0 9280 2 E 608.0 S Propsh Hontichre Europa False TRAPPIST-1e 44.0 False 126.0 4688.0 0.0 0.0 12.0 True

8693 rows × 19 columns

Modeling and Evaluation

def get_pipeline(max_n_cat=0, 
                 cat_dict=None, 
                 scale_var_cat=False,
                 scale_dict={'class': StandardScaler},
                 cat_num_dict={'class':NumericalEncoder,'categories':None},
                 cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                 imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                 random_state=None):
    """Build an unfitted pipeline: feature processing followed by a random forest.

    The processor is assembled from three layers, in this order:

    1. column expansion of ``PassengerId``, ``Cabin`` and ``Name`` via
       ``split_col``;
    2. the default feature definition (``get_default_feature_def``), which
       receives the scaling / encoding / imputation options below;
    3. the total service cost over the module-level ``cols_sc`` columns.

    Parameters
    ----------
    max_n_cat, cat_dict, scale_var_cat :
        Forwarded unchanged to ``get_default_feature_def``.
    scale_dict, cat_num_dict, cat_dummy_dict, imputer_dict : dict
        Specs forwarded to ``get_default_feature_def``. Each call hands over a
        shallow copy, so the shared default dicts (and caller-owned dicts) are
        not affected if downstream code modifies what it receives.
    random_state : int, RandomState instance or None, default None
        Seed for ``RandomForestClassifier``. ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline(Proc(...), RandomForestClassifier(...))``, not fitted.
    """
    def _fresh(spec):
        # Copy dict specs so mutable defaults are never shared between calls;
        # anything else (e.g. None) is passed through untouched.
        return dict(spec) if isinstance(spec, dict) else spec

    layer_spec_preprocess = (gen_feature_layer,
                {
                    'feature_specs':{
                        'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}),
                        'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                        'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                    }
                })

    layer_spec_default = (get_default_feature_def,
                      {
                          'skip_flds':None,
                          'ignored_flds':None,
                          'max_n_cat':max_n_cat,
                          'na_exclude_cols':[],
                          'scale_var_num':True,
                          'scale_var_cat':scale_var_cat,
                          'scale_dict':_fresh(scale_dict),
                          'cat_num_dict':_fresh(cat_num_dict),
                          'cat_dummy_dict':_fresh(cat_dummy_dict),
                          'imputer_dict':_fresh(imputer_dict),
                          'include_time_cols':True,
                          'keep_dt_cols':False,
                          'cat_dict':cat_dict
                      }
                     )

    # NOTE(review): this layer is listed after the default layer, which is
    # configured with scale_var_num=True. If that layer rewrites the spend
    # columns in place, the total is taken over processed values — confirm
    # that this ordering is intended.
    layer_spec_calc = (gen_feature_layer,
                    {
                        'feature_specs':{
                            str(cols_sc):(calc_service_cost,  {'cols':cols_sc}, {"alias":'ServiceCost'})
                        }
                    }
                   )

    layer_specs = [layer_spec_preprocess, layer_spec_default, layer_spec_calc]
    proc = Proc(layer_specs=layer_specs)
    model = RandomForestClassifier(random_state=random_state)
    return make_pipeline(proc, model)
# Features: every column except the target.
X = df_train.drop('Transported', axis=1)
# Target kept as a one-column DataFrame (double brackets), not a Series.
y = df_train[['Transported']]
display(X.head(), y.head())
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name
0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy
1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 Juanna Vines
2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 Altark Susent
3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 Solam Susent
4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 Willy Santantines
Transported
0 False
1 True
2 False
3 False
4 True
# Build a fresh, unfitted pipeline and estimate accuracy by cross-validation
# (no `cv` given; the recorded output holds five fold scores).
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
array([0.73260495, 0.73950546, 0.80448534, 0.82220944, 0.76524741])
# Record the scikit-learn version this run was produced with.
sklearn.__version__
'0.24.2'
# Cross-validated predictions: each row is predicted by a model fitted without it.
y_pred = cross_val_predict(pipeline, X, y)
# Confusion matrix of those out-of-fold predictions against the true labels.
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>

# Summarise the fold scores as mean ± 3 standard deviations, formatted as percentages.
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 66.69% to 87.87% with mean as 77.28%

Predictions

Retrain Pipeline on complete training data

# Rebuild the pipeline and fit it on the full training data for the final model.
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
pipeline.fit(X, y)
# Predictions on the same rows the model was fitted on: the accuracy shown below is
# training accuracy, not an estimate of performance on unseen data.
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
1.0

Calculations for Test Set and Submission

# Predict the test set; the single output column is named after the target column of `y`.
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
Transported
0 False
1 False
2 True
3 True
4 False
... ...
4272 True
4273 False
4274 True
4275 True
4276 False

4277 rows × 1 columns

# Put PassengerId and the predictions side by side; concat aligns them on the row index.
submission = pd.concat([df_test['PassengerId'], predictions], axis=1); submission
PassengerId Transported
0 0013_01 False
1 0018_01 False
2 0019_01 True
3 0021_01 True
4 0023_01 False
... ... ...
4272 9266_02 True
4273 9269_01 False
4274 9271_01 True
4275 9273_01 True
4276 9277_01 False

4277 rows × 2 columns

# On Kaggle: write the submission file without the index column. Elsewhere: push
# this notebook ('index.ipynb') to Kaggle via aiking's push2kaggle helper.
if is_kaggle: submission.to_csv('submission.csv', index=False)
else: push2kaggle('index.ipynb')
Kernel version 13 successfully pushed.  Please check progress at https://www.kaggle.com/code/rahuketu86/spaceship-titanic