import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
Import Public Packages
Import Private Packages
is_kaggle = 'kaggle_secrets' in sys.modulesif is_kaggle:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
github_pat = user_secrets.get_secret("GITHUB_PAT")
!pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
from aiking.data.external import *
path = untar_data("kaggle_competitions::spaceship-titanic"); path.ls()from aiking.ml.structured import *
from aiking.integrations.kaggle import push2kaggle
Read the Dataset
# Root folder for input data: taken from the DATA_DIR environment variable when
# it is set, otherwise the "/kaggle/input" default is used.
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input"));
# Competition folder; the code below reads train.csv and test.csv from it.
path = data_dir/"spaceship-titanic"
path.ls()
(#3) [Path('/kaggle/input/spaceship-titanic/sample_submission.csv'),Path('/kaggle/input/spaceship-titanic/test.csv'),Path('/kaggle/input/spaceship-titanic/train.csv')]
# !rm -rf {path}
df_train = pd.read_csv(path/"train.csv"); df_train.head().T
| | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| PassengerId | 0001_01 | 0002_01 | 0003_01 | 0003_02 | 0004_01 |
| HomePlanet | Europa | Earth | Europa | Europa | Earth |
| CryoSleep | False | False | False | False | False |
| Cabin | B/0/P | F/0/S | A/0/S | A/0/S | F/1/S |
| Destination | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e |
| Age | 39.0 | 24.0 | 58.0 | 33.0 | 16.0 |
| VIP | False | False | True | False | False |
| RoomService | 0.0 | 109.0 | 43.0 | 0.0 | 303.0 |
| FoodCourt | 0.0 | 9.0 | 3576.0 | 1283.0 | 70.0 |
| ShoppingMall | 0.0 | 25.0 | 0.0 | 371.0 | 151.0 |
| Spa | 0.0 | 549.0 | 6715.0 | 3329.0 | 565.0 |
| VRDeck | 0.0 | 44.0 | 49.0 | 193.0 | 2.0 |
| Name | Maham Ofracculy | Juanna Vines | Altark Susent | Solam Susent | Willy Santantines |
| Transported | False | True | False | False | True |
df_test = pd.read_csv(path/"test.csv"); df_test.head().T
| | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| PassengerId | 0013_01 | 0018_01 | 0019_01 | 0021_01 | 0023_01 |
| HomePlanet | Earth | Earth | Europa | Europa | Earth |
| CryoSleep | True | False | True | False | False |
| Cabin | G/3/S | F/4/S | C/0/S | C/1/S | F/5/S |
| Destination | TRAPPIST-1e | TRAPPIST-1e | 55 Cancri e | TRAPPIST-1e | TRAPPIST-1e |
| Age | 27.0 | 19.0 | 31.0 | 38.0 | 20.0 |
| VIP | False | False | False | False | False |
| RoomService | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 |
| FoodCourt | 0.0 | 9.0 | 0.0 | 6652.0 | 0.0 |
| ShoppingMall | 0.0 | 0.0 | 0.0 | 0.0 | 635.0 |
| Spa | 0.0 | 2823.0 | 0.0 | 181.0 | 0.0 |
| VRDeck | 0.0 | 0.0 | 0.0 | 585.0 | 0.0 |
| Name | Nelly Carsoning | Lerome Peckers | Sabih Unhearfus | Meratz Caltilter | Brence Harperez |
Feature Engineering
def split_col(X, splitter=" "):
    """Split one string column on ``splitter`` into several columns.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        A single column of strings, given either as a one-column DataFrame
        or as a Series.
    splitter : str, default " "
        Delimiter handed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        One column per piece, labelled 0, 1, ... . A column is converted with
        ``pd.to_numeric`` (integers downcast to the smallest integer dtype)
        when all of its values parse as numbers; otherwise it is returned
        unchanged.
    """
    # Squeeze only the column axis: a plain squeeze() turns a 1x1 frame into a
    # scalar, which has no ``.str`` accessor.
    column = X.squeeze(axis=1) if isinstance(X, pd.DataFrame) else X
    pieces = column.str.split(splitter, expand=True)

    def _to_numeric_or_keep(col):
        # Same outcome as the deprecated ``errors='ignore'`` option: keep the
        # column as it is when it cannot be parsed as numbers.
        try:
            return pd.to_numeric(col, downcast='integer')
        except (ValueError, TypeError):
            return col

    return pieces.apply(_to_numeric_or_keep)
split_col(df_train[['PassengerId']], splitter='_')
| | 0 | 1 |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 1 |
| 2 | 3 | 1 |
| 3 | 3 | 2 |
| 4 | 4 | 1 |
| ... | ... | ... |
| 8688 | 9276 | 1 |
| 8689 | 9278 | 1 |
| 8690 | 9279 | 1 |
| 8691 | 9280 | 1 |
| 8692 | 9280 | 2 |
8693 rows × 2 columns
passenger_transformer = ColExpanderTransform(names=['Passenger_gggg', 'Passenger_nn'], func=split_col, func_kw_args={"splitter":"_"})
display(passenger_transformer.fit_transform(df_train[['PassengerId']]), passenger_transformer.get_feature_names())
| | 0 | 1 |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 1 |
| 2 | 3 | 1 |
| 3 | 3 | 2 |
| 4 | 4 | 1 |
| ... | ... | ... |
| 8688 | 9276 | 1 |
| 8689 | 9278 | 1 |
| 8690 | 9279 | 1 |
| 8691 | 9280 | 1 |
| 8692 | 9280 | 2 |
8693 rows × 2 columns
['Passenger_gggg', 'Passenger_nn']
def calc_service_cost(X, cols=None):
    """Return the row-wise sum of the selected columns as a one-column frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns to add up.
    cols : list of str, optional
        Column labels to sum. ``None`` (the default) selects no columns,
        which matches the previous empty-list default.

    Returns
    -------
    pandas.DataFrame
        A single column labelled ``0`` holding one total per row of ``X``.
        Missing values are skipped by pandas' default ``sum`` behaviour.
    """
    # A ``None`` sentinel replaces the former mutable ``[]`` default.
    selected = [] if cols is None else cols
    return X[selected].sum(axis=1).to_frame()
cols_sc = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
calc_service_cost(df_train, cols_sc).head()
| | 0 |
|---|---|
| 0 | 0.0 |
| 1 | 736.0 |
| 2 | 10383.0 |
| 3 | 5176.0 |
| 4 | 1091.0 |
# First layer: break the composite string columns into their parts with
# split_col, using the delimiter that applies to each column.
layer_spec_preprocess = (
    gen_feature_layer,
    dict(
        feature_specs={
            'PassengerId': (
                ColExpanderTransform,
                dict(names=['gggg', 'nn'], func=split_col, func_kw_args=dict(splitter="_")),
            ),
            'Cabin': (
                ColExpanderTransform,
                dict(names=['deck', 'num', 'side'], func=split_col, func_kw_args=dict(splitter="/")),
            ),
            'Name': (
                ColExpanderTransform,
                dict(names=['first', 'last'], func=split_col, func_kw_args=dict(splitter=" ")),
            ),
        },
    ),
)
# Second layer: total of the service-spend columns listed in cols_sc.
# NOTE(review): the fit_transform output shown right after this cell labels the
# new column "RoomService_FoodCourt_ShoppingMall_Spa_VRDeck" rather than
# "ServiceCost" - confirm how the alias entry is consumed.
layer_spec_calc = (
    gen_feature_layer,
    dict(
        feature_specs={
            str(cols_sc): (calc_service_cost, dict(cols=cols_sc), dict(alias='ServiceCost')),
        },
    ),
)
# The layers are listed in this order: column expansion, then the spend total.
layer_specs = [layer_spec_preprocess, layer_spec_calc]
proc = Proc(layer_specs=layer_specs)
proc.fit_transform(df_train)
| | RoomService_FoodCourt_ShoppingMall_Spa_VRDeck | PassengerId_gggg | PassengerId_nn | Cabin_deck | Cabin_num | Cabin_side | Name_first | Name_last | HomePlanet | CryoSleep | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1 | 1 | B | 0.0 | P | Maham | Ofracculy | Europa | False | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
| 1 | 736.0 | 2 | 1 | F | 0.0 | S | Juanna | Vines | Earth | False | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | True |
| 2 | 10383.0 | 3 | 1 | A | 0.0 | S | Altark | Susent | Europa | False | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | False |
| 3 | 5176.0 | 3 | 2 | A | 0.0 | S | Solam | Susent | Europa | False | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | False |
| 4 | 1091.0 | 4 | 1 | F | 1.0 | S | Willy | Santantines | Earth | False | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8688 | 8536.0 | 9276 | 1 | A | 98.0 | P | Gravior | Noxnuther | Europa | False | 55 Cancri e | 41.0 | True | 0.0 | 6819.0 | 0.0 | 1643.0 | 74.0 | False |
| 8689 | 0.0 | 9278 | 1 | G | 1499.0 | S | Kurta | Mondalley | Earth | True | PSO J318.5-22 | 18.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
| 8690 | 1873.0 | 9279 | 1 | G | 1500.0 | S | Fayey | Connon | Earth | False | TRAPPIST-1e | 26.0 | False | 0.0 | 0.0 | 1872.0 | 1.0 | 0.0 | True |
| 8691 | 4637.0 | 9280 | 1 | E | 608.0 | S | Celeon | Hontichre | Europa | False | 55 Cancri e | 32.0 | False | 0.0 | 1049.0 | 0.0 | 353.0 | 3235.0 | False |
| 8692 | 4826.0 | 9280 | 2 | E | 608.0 | S | Propsh | Hontichre | Europa | False | TRAPPIST-1e | 44.0 | False | 126.0 | 4688.0 | 0.0 | 0.0 | 12.0 | True |
8693 rows × 19 columns
Modeling and Evaluation
def get_pipeline(max_n_cat=0,
                 cat_dict=None,
                 scale_var_cat=False,
                 scale_dict=None,
                 cat_num_dict=None,
                 cat_dummy_dict=None,
                 imputer_dict=None,
                 random_state=None):
    """Build an unfitted pipeline: feature processing followed by a random forest.

    The ``Proc`` step chains three layers in this order: expansion of the
    ``PassengerId``, ``Cabin`` and ``Name`` string columns, the default
    feature definition from ``get_default_feature_def``, and the service-cost
    total over the module-level ``cols_sc`` columns.

    Parameters
    ----------
    max_n_cat, cat_dict, scale_var_cat
        Forwarded unchanged to ``get_default_feature_def``.
    scale_dict : dict, optional
        Forwarded to ``get_default_feature_def``; defaults to
        ``{'class': StandardScaler}``.
    cat_num_dict : dict, optional
        Forwarded; defaults to ``{'class': NumericalEncoder, 'categories': None}``.
    cat_dummy_dict : dict, optional
        Forwarded; defaults to
        ``{'class': OneHotEncoder, 'handle_unknown': 'ignore'}``.
    imputer_dict : dict, optional
        Forwarded; defaults to ``{'class': SimpleImputer, 'strategy': 'median'}``.
    random_state : int or None, optional
        Passed to ``RandomForestClassifier``. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an integer for repeatable fits.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline(Proc(...), RandomForestClassifier(...))``, not yet fitted.
    """
    # Fresh default dicts on every call instead of mutable default arguments,
    # so nothing downstream can alter the defaults seen by later calls.
    if scale_dict is None:
        scale_dict = {'class': StandardScaler}
    if cat_num_dict is None:
        cat_num_dict = {'class': NumericalEncoder, 'categories': None}
    if cat_dummy_dict is None:
        cat_dummy_dict = {'class': OneHotEncoder, 'handle_unknown': 'ignore'}
    if imputer_dict is None:
        imputer_dict = {'class': SimpleImputer, 'strategy': 'median'}

    layer_spec_preprocess = (gen_feature_layer,
                             {
                                 'feature_specs': {
                                     'PassengerId': (ColExpanderTransform, {'names': ['gggg', 'nn'], 'func': split_col, 'func_kw_args': {"splitter": "_"}}),
                                     'Cabin': (ColExpanderTransform, {'names': ['deck', 'num', 'side'], 'func': split_col, 'func_kw_args': {"splitter": "/"}}),
                                     'Name': (ColExpanderTransform, {'names': ['first', 'last'], 'func': split_col, 'func_kw_args': {"splitter": " "}}),
                                 }
                             })
    layer_spec_default = (get_default_feature_def,
                          {
                              'skip_flds': None,
                              'ignored_flds': None,
                              'max_n_cat': max_n_cat,
                              'na_exclude_cols': [],
                              'scale_var_num': True,
                              'scale_var_cat': scale_var_cat,
                              'scale_dict': scale_dict,
                              'cat_num_dict': cat_num_dict,
                              'cat_dummy_dict': cat_dummy_dict,
                              'imputer_dict': imputer_dict,
                              'include_time_cols': True,
                              'keep_dt_cols': False,
                              'cat_dict': cat_dict
                          })
    # NOTE(review): this layer is listed after the default layer, so it
    # presumably sums the already processed service columns - confirm that
    # this is intended.
    layer_spec_calc = (gen_feature_layer,
                       {
                           'feature_specs': {
                               str(cols_sc): (calc_service_cost, {'cols': cols_sc}, {"alias": 'ServiceCost'})
                           }
                       })
    layer_specs = [layer_spec_preprocess, layer_spec_default, layer_spec_calc]
    proc = Proc(layer_specs=layer_specs)
    model = RandomForestClassifier(random_state=random_state)
    return make_pipeline(proc, model)


# Predictors: every training column except the target.
X = df_train.drop('Transported', axis=1)
y = df_train[['Transported']]
display(X.head(), y.head())
| | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy |
| 1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines |
| 2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent |
| 3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent |
| 4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines |
| | Transported |
|---|---|
| 0 | False |
| 1 | True |
| 2 | False |
| 3 | False |
| 4 | True |
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
array([0.73260495, 0.73950546, 0.80448534, 0.82220944, 0.76524741])
sklearn.__version__
'0.24.2'
# Cross-validated predictions: each row is predicted by a model fitted on the
# folds that exclude it.
y_pred = cross_val_predict(pipeline, X, y)
# Confusion matrix of those predictions against the true labels.
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 66.69% to 87.87% with mean as 77.28%
Predictions
Retrain Pipeline on complete training data
# Build a fresh pipeline and fit it on every training row before predicting
# the test set.
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
pipeline.fit(X, y)
# These predictions are for the same rows the model was fitted on, so the
# accuracy computed from them is an in-sample figure, not an estimate of
# performance on unseen data (the cross-validated scores serve that purpose).
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
1.0

Calculations for Test Set and Submission
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
| | Transported |
|---|---|
| 0 | False |
| 1 | False |
| 2 | True |
| 3 | True |
| 4 | False |
| ... | ... |
| 4272 | True |
| 4273 | False |
| 4274 | True |
| 4275 | True |
| 4276 | False |
4277 rows × 1 columns
submission = pd.concat([df_test['PassengerId'], predictions], axis=1); submission
| | PassengerId | Transported |
|---|---|---|
| 0 | 0013_01 | False |
| 1 | 0018_01 | False |
| 2 | 0019_01 | True |
| 3 | 0021_01 | True |
| 4 | 0023_01 | False |
| ... | ... | ... |
| 4272 | 9266_02 | True |
| 4273 | 9269_01 | False |
| 4274 | 9271_01 | True |
| 4275 | 9273_01 | True |
| 4276 | 9277_01 | False |
4277 rows × 2 columns
if is_kaggle: submission.to_csv('submission.csv', index=False)
else: push2kaggle('index.ipynb')
Kernel version 13 successfully pushed. Please check progress at https://www.kaggle.com/code/rahuketu86/spaceship-titanic