Bluebook Bulldozer – Rahul Saraf

Import public packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[1], line 12
     10 import matplotlib.pyplot as plt
     11 from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
---> 12 from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
     13 from IPython.display import display
     14 from sklearn.pipeline import make_pipeline

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/opt/homebrew/Caskroom/miniforge/base/envs/aiking/lib/python3.9/site-packages/sklearn/metrics/__init__.py)

Import private packages

# True when the `kaggle_secrets` module is already loaded in this interpreter;
# the rest of the notebook uses this as the "running on Kaggle" switch.
is_kaggle = 'kaggle_secrets' in sys.modules

if is_kaggle:
    # On Kaggle: copy the credentials stored as Kaggle secrets into environment variables.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    # Stop early with a readable message when the username secret is empty.
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    # IPython shell escape (not plain Python): installs the aiking package from GitHub,
    # authenticating with the personal access token read above.
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    # Elsewhere: aiking must already be importable. `untar_data` returns the dataset
    # directory (presumably fetching/extracting it first -- confirm in aiking), which is then listed.
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::bluebook-bulldozer-remix"); 
    print(path.ls())

[Path('/AIKING_HOME/data/bluebook-bulldozer-remix/bluebook-bulldozer-remix.zip'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Data Dictionary.xlsx'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Train'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/TrainingData'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Valid')]

from aiking.ml.structured import *

Read the Dataset

data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input")); 
path = data_dir/"bluebook-bulldozer-remix"
path.ls()

(#5) [Path('/kaggle/input/bluebook-bulldozer-remix/bluebook-bulldozer-remix.zip'),Path('/kaggle/input/bluebook-bulldozer-remix/Data Dictionary.xlsx'),Path('/kaggle/input/bluebook-bulldozer-remix/Train'),Path('/kaggle/input/bluebook-bulldozer-remix/TrainingData'),Path('/kaggle/input/bluebook-bulldozer-remix/Valid')]

# Read both files with the same options so that column dtypes are inferred the
# same way for train and test (previously only the training file used
# low_memory=False).
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0; it is kept
# here so behaviour on older pandas does not change -- drop it once the
# environment is pinned to pandas 2.x.
csv_opts = dict(low_memory=False, parse_dates=['saledate'], infer_datetime_format=True)
df_train = pd.read_csv(path/"Train/Train.csv", **csv_opts)
# Only the last expression of a cell is displayed, so preview the test frame here.
df_test = pd.read_csv(path/"Valid/Valid.csv", **csv_opts); df_test.head()

	SalesID	MachineID	ModelID	datasource	auctioneerID	YearMade	MachineHoursCurrentMeter	UsageBand	saledate	fiModelDesc	...	Undercarriage_Pad_Width	Stick_Length	Thumb	Pattern_Changer	Grouser_Type	Backhoe_Mounting	Blade_Type	Travel_Controls	Differential_Type	Steering_Controls
0	1222837	902859	1376	121	3	1000	0.0	NaN	2012-01-05	375L	...	None or Unspecified	None or Unspecified	None or Unspecified	None or Unspecified	Double	NaN	NaN	NaN	NaN	NaN
1	1222839	1048320	36526	121	3	2006	4412.0	Medium	2012-01-05	TX300LC2	...	None or Unspecified	12' 4"	None or Unspecified	Yes	Double	NaN	NaN	NaN	NaN	NaN
2	1222841	999308	4587	121	3	2000	10127.0	Medium	2012-01-05	270LC	...	None or Unspecified	12' 4"	None or Unspecified	None or Unspecified	Double	NaN	NaN	NaN	NaN	NaN
3	1222843	1062425	1954	121	3	1000	4682.0	Low	2012-01-05	892DLC	...	None or Unspecified	None or Unspecified	None or Unspecified	None or Unspecified	Double	NaN	NaN	NaN	NaN	NaN
4	1222845	1032841	4701	121	3	2002	8150.0	Medium	2012-01-04	544H	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Standard	Conventional

5 rows × 52 columns

Modelling

Define categories ordering

max_n_cat = 0
cat_dict = get_cat_dict(df_train, max_n_cat=max_n_cat)
display_cat(cat_dict)

	0	1
0	UsageBand	[High, Low, Medium]
1	fiModelDesc	[100C, 104, 1066, 1066E, 1080, 1080B, 1088, 1088CK, 1088LT, 1088TTL, 10B, 10C, 10DG, 110, 1105, 110S, 110TLB, 110Z, 110Z-2, 112, 112E, 112F, 115, 1150, 1150B, 1150BLGP, 1150C, 1150D, 1150E, 1150ELT, 1150G, 1150GLGP, 1150GLT, 1150H, 1150HLGP, 1150HLT, 1150HWT, 1150K, 1150KLGPSERIES3, 1150KWT, 1150KXLT, 1150KXLTIII, 115SRDZ, 115Z, 115ZIII, 115ZIV, 115ZIV-2, 115ZV, 116, 1166, 118, 1187C, 1188, 1188LC, 1188P, 118B, 118C, 11B, 11C, 12, 120, 120B, 120C, 120CLC, 120D, 120E, 120G, 120H, 120HNA, 120LC, 120M , 125, 125A, 125B, 125C, 125CKBNA, 1280, 1280B, 1288, 12E, 12F, 12G, 12H, 12HNA, 12JX, 130, 1300, 1300D, 130G, 130LC, 130LX, 1340XL, 135, 135C, 135CRTS, 135H, 135HNA, 135MSR SPIN ACE, 135SR, 135SRLC, ...]
2	fiBaseModel	[10, 100, 104, 1066, 1080, 1088, 10DG, 11, 110, 1105, 112, 115, 1150, 116, 1166, 118, 1187, 1188, 12, 120, 125, 1280, 1288, 130, 1300, 1340, 135, 137, 14, 140, 1400, 143, 1450, 15, 150, 1500, 153, 155, 1550, 16, 160, 1600, 163, 165, 1650, 166, 17, 170, 1700, 1737, 1740, 175, 1750, 1760XL, 17ZTS, 18, 1800, 1818, 1825, 1830, 1835, 1838, 1840, 1845, 185, 1850, 190, 1900, 198, 20, 200, 2000, 2022, 2026, 2040, 2042, 2044, 205, 2050, 2054, 2060, 2060XL, 2064, 2066, 2070, 2074, 2076, 208, 2086, 2095, 2099, 21, 210, 2105, 2109, 211, 212, 213, 213LC, 214, ...]
3	fiSecondaryDesc	[ MSR SPIN ACE, #NAME?, -2, -3, -5, -5L, -6, -7, 0.7, 1, 2, 3, 5, 7, A, AA, AB, AG, AW, AX, B, B , BEC, BL, BLGP, BLGPPS, BZ, BZD, C, C , CE, CH, CK, CKB, CL, CLR, CM, CR, CS, CX, D, DC, DL, DT, DX, DXT, DZ, E, EG, EL, ESS, EST, EW, EX, F, FR, G, GT, H, H , H90, HAG, HD, HDS, HDSL, HF, HL, HLGP, HLS, HX, HZ, IV, J, JX, K, L, LC, LC7A, LC7LR, LCD, LCH, LCLR, LCM, LD, LE, LGP, LR, LS, LT, LX, M, M , MC, MR, MRX, MSR, MSR SPIN ACE, MT, MU, MXT, ...]
4	fiModelSeries	[ III, #NAME?, -1, -1.50E+01, -11, -12, -15, -16, -17, -18, -1B, -1C, -1L, -2, -20, -21, -21A, -2A, -2C, -2E, -2LC, -2N, -3, -3C, -3EO, -3H, -3L, -3LC, -3LK, -3M, -3MC, -3PT, -4, -5, -5A, -5E, -5F, -5H, -5L, -5LC, -6, -6A, -6B, -6C, -6E, -6K, -6LC, -6LK, -7, -7B, -7E, -7K, -8, -8E, 1, 12, 14FT, 15, 16, 17, 18, 1970, 2, 20, 21KomStat, 21KomStatII, 22, 2B, 2T, 3, 3A, 3C, 4, 5, 5N, 6, 6.00E+00, 6F, 6L, 6LE, 6LK, 7, 7.00E+00, 7A, 7L, 8, A, AWS, D, D7, E, EX, Elite, FASTRRACK, G, GALEO, H, II, III, IV, ...]
5	fiModelDescriptor	[ 14FT, LGP, SUPER, XLT, XT, ZX, (BLADE RUNNER), 1, 2, 2.00E+00, 2N, 3, 3.00E+00, 3C, 3L, 3NO, 4WD, 4x4x4, 5, 6, 6K, 7, 7.00E+00, 7A, 8, A, AE0, AVANCE, B, BE, C, CK, CR, CRSB, CUSTOM, DA, DELUXE, DHP, DINGO, DLL, DT, DW, E, ESL, G, GALEO, H, H5, HD, HF, HSD, HT, High Lift, HighLift, II, III, IT, IV, K, K3, K5, KA, KBNA, L, LC, LC8, LCH, LCR, LCRTS, LE, LGP, LGPVP, LITRONIC, LK, LL, LM, LN, LR, LRC, LRR, LS, LT, LU, LX, LongReach, M, MC, ME, MH, N, NLC, NSUC, P, PLUS, PRO, RR, RTS, S, SA, SB, ...]
6	ProductSize	[Compact, Large, Large / Medium, Medium, Mini, Small]
7	fiProductClassDesc	[Backhoe Loader - 0.0 to 14.0 Ft Standard Digging Depth, Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth, Backhoe Loader - 15.0 to 16.0 Ft Standard Digging Depth, Backhoe Loader - 16.0 + Ft Standard Digging Depth, Backhoe Loader - Unidentified, Hydraulic Excavator, Track - 0.0 to 2.0 Metric Tons, Hydraulic Excavator, Track - 11.0 to 12.0 Metric Tons, Hydraulic Excavator, Track - 12.0 to 14.0 Metric Tons, Hydraulic Excavator, Track - 14.0 to 16.0 Metric Tons, Hydraulic Excavator, Track - 150.0 to 300.0 Metric Tons, Hydraulic Excavator, Track - 16.0 to 19.0 Metric Tons, Hydraulic Excavator, Track - 19.0 to 21.0 Metric Tons, Hydraulic Excavator, Track - 2.0 to 3.0 Metric Tons, Hydraulic Excavator, Track - 21.0 to 24.0 Metric Tons, Hydraulic Excavator, Track - 24.0 to 28.0 Metric Tons, Hydraulic Excavator, Track - 28.0 to 33.0 Metric Tons, Hydraulic Excavator, Track - 3.0 to 4.0 Metric Tons, Hydraulic Excavator, Track - 300.0 + Metric Tons, Hydraulic Excavator, Track - 33.0 to 40.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 5.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 40.0 to 50.0 Metric Tons, Hydraulic Excavator, Track - 5.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 50.0 to 66.0 Metric Tons, Hydraulic Excavator, Track - 6.0 to 8.0 Metric Tons, Hydraulic Excavator, Track - 66.0 to 90.0 Metric Tons, Hydraulic Excavator, Track - 8.0 to 11.0 Metric Tons, Hydraulic Excavator, Track - 90.0 to 150.0 Metric Tons, Hydraulic Excavator, Track - Unidentified, Hydraulic Excavator, Track - Unidentified (Compact Construction), Motorgrader - 130.0 to 145.0 Horsepower, Motorgrader - 145.0 to 170.0 Horsepower, Motorgrader - 170.0 to 200.0 Horsepower, Motorgrader - 200.0 + Horsepower, Motorgrader - 45.0 to 130.0 Horsepower, Motorgrader - Unidentified, Skid Steer Loader - 0.0 to 701.0 Lb Operating Capacity, Skid Steer Loader - 1251.0 to 1351.0 Lb Operating Capacity, Skid Steer Loader - 1351.0 
to 1601.0 Lb Operating Capacity, Skid Steer Loader - 1601.0 to 1751.0 Lb Operating Capacity, Skid Steer Loader - 1751.0 to 2201.0 Lb Operating Capacity, Skid Steer Loader - 2201.0 to 2701.0 Lb Operating Capacity, Skid Steer Loader - 2701.0+ Lb Operating Capacity, Skid Steer Loader - 701.0 to 976.0 Lb Operating Capacity, Skid Steer Loader - 976.0 to 1251.0 Lb Operating Capacity, Skid Steer Loader - Unidentified, Track Type Tractor, Dozer - 105.0 to 130.0 Horsepower, Track Type Tractor, Dozer - 130.0 to 160.0 Horsepower, Track Type Tractor, Dozer - 160.0 to 190.0 Horsepower, Track Type Tractor, Dozer - 190.0 to 260.0 Horsepower, Track Type Tractor, Dozer - 20.0 to 75.0 Horsepower, Track Type Tractor, Dozer - 260.0 + Horsepower, Track Type Tractor, Dozer - 75.0 to 85.0 Horsepower, Track Type Tractor, Dozer - 85.0 to 105.0 Horsepower, Track Type Tractor, Dozer - Unidentified, Wheel Loader - 0.0 to 40.0 Horsepower, Wheel Loader - 100.0 to 110.0 Horsepower, Wheel Loader - 1000.0 + Horsepower, Wheel Loader - 110.0 to 120.0 Horsepower, Wheel Loader - 120.0 to 135.0 Horsepower, Wheel Loader - 135.0 to 150.0 Horsepower, Wheel Loader - 150.0 to 175.0 Horsepower, Wheel Loader - 175.0 to 200.0 Horsepower, Wheel Loader - 200.0 to 225.0 Horsepower, Wheel Loader - 225.0 to 250.0 Horsepower, Wheel Loader - 250.0 to 275.0 Horsepower, Wheel Loader - 275.0 to 350.0 Horsepower, Wheel Loader - 350.0 to 500.0 Horsepower, Wheel Loader - 40.0 to 60.0 Horsepower, Wheel Loader - 500.0 to 1000.0 Horsepower, Wheel Loader - 60.0 to 80.0 Horsepower, Wheel Loader - 80.0 to 90.0 Horsepower, Wheel Loader - 90.0 to 100.0 Horsepower, Wheel Loader - Unidentified]
8	state	[Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut, Delaware, Florida, Georgia, Hawaii, Idaho, Illinois, Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts, Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada, New Hampshire, New Jersey, New Mexico, New York, North Carolina, North Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Puerto Rico, Rhode Island, South Carolina, South Dakota, Tennessee, Texas, Unspecified, Utah, Vermont, Virginia, Washington, Washington DC, West Virginia, Wisconsin, Wyoming]
9	ProductGroup	[BL, MG, SSL, TEX, TTT, WL]
10	ProductGroupDesc	[Backhoe Loaders, Motor Graders, Skid Steer Loaders, Track Excavators, Track Type Tractors, Wheel Loader]
11	Drive_System	[All Wheel Drive, Four Wheel Drive, No, Two Wheel Drive]
12	Enclosure	[EROPS, EROPS AC, EROPS w AC, NO ROPS, None or Unspecified, OROPS]
13	Forks	[None or Unspecified, Yes]
14	Pad_Type	[Grouser, None or Unspecified, Reversible, Street]
15	Ride_Control	[No, None or Unspecified, Yes]
16	Stick	[Extended, Standard]
17	Transmission	[AutoShift, Autoshift, Direct Drive, Hydrostatic, None or Unspecified, Powershift, Powershuttle, Standard]
18	Turbocharged	[None or Unspecified, Yes]
19	Blade_Extension	[None or Unspecified, Yes]
20	Blade_Width	[12', 13', 14', 16', <12', None or Unspecified]
21	Enclosure_Type	[High Profile, Low Profile, None or Unspecified]
22	Engine_Horsepower	[No, Variable]
23	Hydraulics	[2 Valve, 3 Valve, 4 Valve, Auxiliary, Base + 1 Function, Base + 2 Function, Base + 3 Function, Base + 4 Function, Base + 5 Function, Base + 6 Function, None or Unspecified, Standard]
24	Pushblock	[None or Unspecified, Yes]
25	Ripper	[Multi Shank, None or Unspecified, Single Shank, Yes]
26	Scarifier	[None or Unspecified, Yes]
27	Tip_Control	[None or Unspecified, Sideshift & Tip, Tip]
28	Tire_Size	[10 inch, 10", 13", 14", 15.5, 15.5", 17.5, 17.5", 20.5, 20.5", 23.1", 23.5, 23.5", 26.5, 29.5, 7.0", None or Unspecified]
29	Coupler	[Hydraulic, Manual, None or Unspecified]
30	Coupler_System	[None or Unspecified, Yes]
31	Grouser_Tracks	[None or Unspecified, Yes]
32	Hydraulics_Flow	[High Flow, None or Unspecified, Standard]
33	Track_Type	[Rubber, Steel]
34	Undercarriage_Pad_Width	[14 inch, 15 inch, 16 inch, 18 inch, 20 inch, 22 inch, 24 inch, 25 inch, 26 inch, 27 inch, 28 inch, 30 inch, 31 inch, 31.5 inch, 32 inch, 33 inch, 34 inch, 36 inch, None or Unspecified]
35	Stick_Length	[10' 10", 10' 2", 10' 6", 11' 0", 11' 10", 12' 10", 12' 4", 12' 8", 13' 10", 13' 7", 13' 9", 14' 1", 15' 4", 15' 9", 19' 8", 24' 3", 6' 3", 7' 10", 8' 10", 8' 2", 8' 4", 8' 6", 9' 10", 9' 2", 9' 5", 9' 6", 9' 7", 9' 8", None or Unspecified]
36	Thumb	[Hydraulic, Manual, None or Unspecified]
37	Pattern_Changer	[No, None or Unspecified, Yes]
38	Grouser_Type	[Double, Single, Triple]
39	Backhoe_Mounting	[None or Unspecified, Yes]
40	Blade_Type	[Angle, Coal, Landfill, No, None or Unspecified, PAT, Semi U, Straight, U, VPAT]
41	Travel_Controls	[1 Speed, 2 Pedal, Differential Steer, Finger Tip, Lever, None or Unspecified, Pedal]
42	Differential_Type	[Limited Slip, Locking, No Spin, Standard]
43	Steering_Controls	[Command Control, Conventional, Four Wheel Standard, No, Wheel]

updates = {
    'UsageBand':['High', 'Medium', 'Low'],
    'Blade_Width':[ "16'", "14'", "13'","12'", "<12'", "None or Unspecified"],
    'Grouser_Type':['Triple', 'Double', 'Single'],
    'ProductSize':['Large','Large / Medium', 'Medium', 'Compact', 'Small', 'Mini']
}

cat_dict.update(updates)
filter_dict = {k:v for k,v in cat_dict.items() if k in updates}
display_cat(filter_dict)

	0	1
0	UsageBand	[High, Medium, Low]
1	ProductSize	[Large, Large / Medium, Medium, Compact, Small, Mini]
2	Blade_Width	[16', 14', 13', 12', <12', None or Unspecified]
3	Grouser_Type	[Triple, Double, Single]

Define Validation Set

Code

def range_data(df_train, df_test, names=('Train', 'Test'), datecol='saledate'):
    """Summarise the date coverage and size of two data frames.

    Args:
        df_train: First frame; must contain the column `datecol`.
        df_test: Second frame; must contain the column `datecol`.
        names: Two labels, used as the index values for the first and second
            frame respectively.
        datecol: Name of the date column to summarise.

    Returns:
        A DataFrame indexed by 'Name' with one row per input frame and the
        columns 'Start' (earliest date), 'End' (latest date), 'Interval'
        (End - Start) and 'Size' (row count).
    """
    rows = []
    # names[0]/names[1] are indexed explicitly so a too-short `names` still
    # raises IndexError instead of silently dropping a row.
    for label, frame in ((names[0], df_train), (names[1], df_test)):
        start, end = frame[datecol].min(), frame[datecol].max()
        rows.append({'Name': label, 'Start': start, 'End': end,
                     'Interval': end - start, 'Size': len(frame)})
    return pd.DataFrame(rows).set_index('Name')

range_data(df_train, df_test)

	Start	End	Interval	Size
Name
Train	1989-01-17	2011-12-30	8382 days	401125
Test	2012-01-01	2012-04-28	118 days	11573

The test data starts right after the training data ends and covers about 4 months (118 days; the span includes February, with 28/29 days). We need to split off our internal validation set in a similar way, so we take the last 4 months (120 days) of the training data as the validation set.

# Cutoff for the internal validation set: 120 days (about 4 months) before the
# latest sale date in the training data, held as a numpy datetime64.
validation_date_start = (df_train['saledate'].max() - pd.Timedelta(days=120)).to_datetime64()
# Rows strictly before the cutoff are used for modelling; rows on or after it for validation.
df_model = df_train[df_train['saledate'] < validation_date_start]
df_valid = df_train[df_train['saledate'] >= validation_date_start]

range_data(df_model, df_valid, names=['Model', 'Valid'])

	Start	End	Interval	Size
Name
Model	1989-01-17	2011-08-31	8261 days	390213
Valid	2011-09-01	2011-12-30	120 days	10912

Define Pipeline

def get_model_pipeline(max_n_cat=0,
                       cat_dict=None,
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class':NumericalEncoder,'categories':None},
                       cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                      ):
    """Build an unfitted pipeline: aiking `Proc` preprocessing + random forest.

    Args:
        max_n_cat: Forwarded as 'max_n_cat' to `get_default_feature_def`.
        cat_dict: Forwarded as 'cat_dict'; in this notebook it is the mapping of
            column name to category ordering produced by `get_cat_dict`.
        scale_dict: Forwarded as 'scale_dict' (default: `StandardScaler`).
        cat_num_dict: Forwarded as 'cat_num_dict' (default: `NumericalEncoder`).
        cat_dummy_dict: Forwarded as 'cat_dummy_dict' (default: `OneHotEncoder`
            with handle_unknown='ignore').
        imputer_dict: Forwarded as 'imputer_dict' (default: `SimpleImputer` with
            the median strategy).

    Returns:
        An sklearn `Pipeline` of a `Proc` step followed by
        `RandomForestRegressor(n_jobs=-1)`.
    """
    def _own_copy(spec):
        # The dict defaults in the signature are created once and shared by every
        # call; pass a shallow copy downstream so a mutation there cannot leak
        # into later calls. Non-dict values (e.g. None) pass through untouched.
        return dict(spec) if isinstance(spec, dict) else spec

    feature_kwargs = {
        'skip_flds': None,
        'ignored_flds': None,
        'max_n_cat': max_n_cat,
        'na_exclude_cols': [],
        'scale_var_num': True,
        'scale_var_cat': False,
        'scale_dict': _own_copy(scale_dict),
        'cat_num_dict': _own_copy(cat_num_dict),
        'cat_dummy_dict': _own_copy(cat_dummy_dict),
        'imputer_dict': _own_copy(imputer_dict),
        'include_time_cols': True,
        'keep_dt_cols': False,
        'cat_dict': cat_dict,
    }
    # A single layer spec: (feature-definition callable, its keyword arguments).
    layer_specs = [(get_default_feature_def, feature_kwargs)]
    proc = Proc(layer_specs=layer_specs)
    model = RandomForestRegressor(n_jobs=-1)
    return make_pipeline(proc, model)

# Pass both arguments: a lone positional `cat_dict` would bind to the first
# parameter (`max_n_cat`) and leave `cat_dict` at its default of None.
pipeline = get_model_pipeline(max_n_cat, cat_dict); pipeline

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
                ('randomforestregressor', RandomForestRegressor(n_jobs=-1))])

Train on Partial Data

max_n_cat = 0

def get_xy(df, col='SalePrice'):
    """Split `df` into features and a log-transformed target.

    Returns a tuple ``(X, y)`` where `X` is `df` without the column `col` and
    `y` is the natural logarithm of `df[col]`.
    """
    features = df.drop(columns=[col])
    target = np.log(df[col])
    return features, target
    
df = df_model.sample(frac=0.04)
X, y = get_xy(df)
X_model, y_model = get_xy(df_model)
X_valid, y_valid = get_xy(df_valid)

pipeline = get_model_pipeline(max_n_cat,cat_dict)
pipeline.fit(X, y)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
                ('randomforestregressor', RandomForestRegressor(n_jobs=-1))])

get_score(pipeline, X, y, X_valid, y_valid, scorers=get_scorer_dict())

	Training	Validation
Metric
r2	0.978088	0.821789
neg_root_mean_squared_error	-0.102343	-0.305869
explained_variance	0.978089	0.825985
neg_median_absolute_error	-0.055909	-0.181245
neg_mean_absolute_percentage_error	-0.007455	-0.022704

get_score(pipeline, X_model, y_model, X_valid, y_valid, scorers=get_scorer_dict())

	Training	Validation
Metric
r2	0.853482	0.821789
neg_root_mean_squared_error	-0.265110	-0.305869
explained_variance	0.853483	0.825985
neg_median_absolute_error	-0.144865	-0.181245
neg_mean_absolute_percentage_error	-0.019361	-0.022704

This suggests an RMSLE (root mean squared error on log prices) of roughly 0.26 to 0.31; the validation estimate above is about 0.306.

Cross validation estimate

df_cv = timeseries_cv(df_model, 'saledate', 'SalePrice', 
                  pipeline_callback_dict={'func': get_model_pipeline, 'func_kwargs':{'max_n_cat':0, 'cat_dict':cat_dict}},
                  y_mod_func=np.log,
                  scorers = get_scorer_dict(),
                  n_train=15000, n_test=12000, n_splits=10)
df_cv

100.00% [10/10 01:06<00:00]

	Training	Validation	set	train_start	train_end	valid_start	valid_end
Metric
r2	0.986050	0.855443	1	2008-02-26	2008-06-28	2008-06-28	2008-11-07
neg_root_mean_squared_error	-0.082520	-0.262933	1	2008-02-26	2008-06-28	2008-06-28	2008-11-07
explained_variance	0.986051	0.863314	1	2008-02-26	2008-06-28	2008-06-28	2008-11-07
neg_median_absolute_error	-0.037583	-0.148024	1	2008-02-26	2008-06-28	2008-06-28	2008-11-07
neg_mean_absolute_percentage_error	-0.005600	-0.019542	1	2008-02-26	2008-06-28	2008-06-28	2008-11-07
r2	0.984919	0.839832	2	2008-02-26	2008-11-07	2008-11-07	2009-02-16
neg_root_mean_squared_error	-0.085098	-0.280481	2	2008-02-26	2008-11-07	2008-11-07	2009-02-16
explained_variance	0.984919	0.865230	2	2008-02-26	2008-11-07	2008-11-07	2009-02-16
neg_median_absolute_error	-0.040576	-0.167759	2	2008-02-26	2008-11-07	2008-11-07	2009-02-16
neg_mean_absolute_percentage_error	-0.005859	-0.021296	2	2008-02-26	2008-11-07	2008-11-07	2009-02-16
r2	0.985811	0.855938	3	2008-02-26	2009-02-16	2009-02-16	2009-05-05
neg_root_mean_squared_error	-0.083452	-0.268049	3	2008-02-26	2009-02-16	2009-02-16	2009-05-05
explained_variance	0.985812	0.870222	3	2008-02-26	2009-02-16	2009-02-16	2009-05-05
neg_median_absolute_error	-0.039221	-0.161301	3	2008-02-26	2009-02-16	2009-02-16	2009-05-05
neg_mean_absolute_percentage_error	-0.005723	-0.020529	3	2008-02-26	2009-02-16	2009-02-16	2009-05-05
r2	0.986149	0.859721	4	2008-02-29	2009-05-05	2009-05-05	2009-08-19
neg_root_mean_squared_error	-0.082841	-0.263684	4	2008-02-29	2009-05-05	2009-05-05	2009-08-19
explained_variance	0.986149	0.860370	4	2008-02-29	2009-05-05	2009-05-05	2009-08-19
neg_median_absolute_error	-0.039276	-0.152460	4	2008-02-29	2009-05-05	2009-05-05	2009-08-19
neg_mean_absolute_percentage_error	-0.005729	-0.019758	4	2008-02-29	2009-05-05	2009-05-05	2009-08-19
r2	0.985700	0.864896	5	2008-03-05	2009-08-19	2009-08-19	2009-12-04
neg_root_mean_squared_error	-0.084180	-0.254076	5	2008-03-05	2009-08-19	2009-08-19	2009-12-04
explained_variance	0.985700	0.864995	5	2008-03-05	2009-08-19	2009-08-19	2009-12-04
neg_median_absolute_error	-0.040860	-0.144788	5	2008-03-05	2009-08-19	2009-08-19	2009-12-04
neg_mean_absolute_percentage_error	-0.005935	-0.018973	5	2008-03-05	2009-08-19	2009-08-19	2009-12-04
r2	0.984630	0.847664	6	2008-05-01	2009-12-04	2009-12-04	2010-03-30
neg_root_mean_squared_error	-0.086090	-0.272201	6	2008-05-01	2009-12-04	2009-12-04	2010-03-30
explained_variance	0.984633	0.855422	6	2008-05-01	2009-12-04	2009-12-04	2010-03-30
neg_median_absolute_error	-0.040860	-0.155152	6	2008-05-01	2009-12-04	2009-12-04	2010-03-30
neg_mean_absolute_percentage_error	-0.005967	-0.019876	6	2008-05-01	2009-12-04	2009-12-04	2010-03-30
r2	0.985190	0.843297	7	2008-05-15	2010-03-30	2010-03-30	2010-08-24
neg_root_mean_squared_error	-0.084891	-0.278687	7	2008-05-15	2010-03-30	2010-03-30	2010-08-24
explained_variance	0.985192	0.844513	7	2008-05-15	2010-03-30	2010-03-30	2010-08-24
neg_median_absolute_error	-0.039577	-0.151982	7	2008-05-15	2010-03-30	2010-03-30	2010-08-24
neg_mean_absolute_percentage_error	-0.005821	-0.020361	7	2008-05-15	2010-03-30	2010-03-30	2010-08-24
r2	0.984002	0.871116	8	2008-11-12	2010-08-24	2010-08-24	2011-01-26
neg_root_mean_squared_error	-0.088833	-0.253561	8	2008-11-12	2010-08-24	2010-08-24	2011-01-26
explained_variance	0.984003	0.871649	8	2008-11-12	2010-08-24	2010-08-24	2011-01-26
neg_median_absolute_error	-0.043196	-0.143973	8	2008-11-12	2010-08-24	2010-08-24	2011-01-26
neg_mean_absolute_percentage_error	-0.006185	-0.018764	8	2008-11-12	2010-08-24	2010-08-24	2011-01-26
r2	0.985709	0.859266	9	2009-02-16	2011-01-26	2011-01-26	2011-04-28
neg_root_mean_squared_error	-0.084228	-0.265014	9	2009-02-16	2011-01-26	2011-01-26	2011-04-28
explained_variance	0.985709	0.867493	9	2009-02-16	2011-01-26	2011-01-26	2011-04-28
neg_median_absolute_error	-0.041602	-0.146238	9	2009-02-16	2011-01-26	2011-01-26	2011-04-28
neg_mean_absolute_percentage_error	-0.005938	-0.019083	9	2009-02-16	2011-01-26	2011-01-26	2011-04-28
r2	0.986866	0.860514	10	2009-05-06	2011-04-28	2011-04-28	2011-08-31
neg_root_mean_squared_error	-0.081017	-0.263815	10	2009-05-06	2011-04-28	2011-04-28	2011-08-31
explained_variance	0.986871	0.860803	10	2009-05-06	2011-04-28	2011-04-28	2011-08-31
neg_median_absolute_error	-0.037374	-0.151020	10	2009-05-06	2011-04-28	2011-04-28	2011-08-31
neg_mean_absolute_percentage_error	-0.005516	-0.019471	10	2009-05-06	2011-04-28	2011-04-28	2011-08-31

df_cv.loc['neg_root_mean_squared_error'][['train_end', 'Training', 'Validation']].set_index('train_end').plot()

df_cv.loc['neg_root_mean_squared_error'][[ 'Validation']].mean()

Validation   -0.26625
dtype: float64

Tuning

Number of points to select for training to improve validation

This requires a little experimentation with how many points to use for training.

# Train on only the `n` most recent sales before the validation window and
# score against the held-out validation set.
n = 8000
df_model_train = df_model.sort_values(by='saledate').iloc[-n:]
# DataFrame.sample returns a new frame; assign it so the shuffle actually
# takes effect (the bare call discarded its result).
df_model_train = df_model_train.sample(frac=1)
X_model_train , y_model_train = get_xy(df_model_train)
pipeline = get_model_pipeline(max_n_cat, cat_dict)
pipeline.fit(X_model_train, y_model_train)
score_df = get_score(pipeline, X_model_train, y_model_train, X_valid, y_valid, scorers=get_scorer_dict())
# Tag the scores with the training-set size they were obtained with.
score_df['n'] = n
score_df

	Training	Validation	n
Metric
r2	0.986471	0.848635	8000
neg_root_mean_squared_error	-0.081511	-0.281891	8000
explained_variance	0.986471	0.849255	8000
neg_median_absolute_error	-0.038784	-0.157271	8000
neg_mean_absolute_percentage_error	-0.005624	-0.020596	8000

np.linspace(20000, 5000,16)

array([20000., 19000., 18000., 17000., 16000., 15000., 14000., 13000.,
       12000., 11000., 10000.,  9000.,  8000.,  7000.,  6000.,  5000.])

def get_score_for_n(n=8000):
    """Fit a fresh pipeline on the `n` most recent rows of `df_model` and score it.

    Reads the notebook-level names `df_model`, `max_n_cat`, `cat_dict`,
    `X_valid` and `y_valid`.

    Args:
        n: Number of rows to keep, taken from the end of `df_model` after
            sorting by 'saledate'.

    Returns:
        The frame returned by `get_score` (training vs. validation metrics)
        with an extra column 'n' holding the training-set size.
    """
    df_model_train = df_model.sort_values(by='saledate').iloc[-n:]
    # DataFrame.sample returns a new frame; assign it so the shuffle actually
    # takes effect (the bare call discarded its result).
    df_model_train = df_model_train.sample(frac=1)
    X_model_train, y_model_train = get_xy(df_model_train)
    pipeline = get_model_pipeline(max_n_cat, cat_dict)
    pipeline.fit(X_model_train, y_model_train)
    score_df = get_score(pipeline, X_model_train, y_model_train, X_valid, y_valid, scorers=get_scorer_dict())
    score_df['n'] = n
    return score_df

scores  = pd.concat([get_score_for_n(n=int(n)) for n in np.linspace(20000, 5000,16)]); scores

	Training	Validation	n
Metric
r2	0.987612	0.869907	20000
neg_root_mean_squared_error	-0.078250	-0.261333	20000
explained_variance	0.987613	0.870216	20000
neg_median_absolute_error	-0.037010	-0.147194	20000
neg_mean_absolute_percentage_error	-0.005343	-0.019104	20000
...	...	...	...
r2	0.985790	0.833147	5000
neg_root_mean_squared_error	-0.083945	-0.295961	5000
explained_variance	0.985790	0.834022	5000
neg_median_absolute_error	-0.038053	-0.166622	5000
neg_mean_absolute_percentage_error	-0.005691	-0.021768	5000

80 rows × 3 columns

scores.loc['neg_root_mean_squared_error']

	Training	Validation	n
Metric
neg_root_mean_squared_error	-0.078250	-0.261333	20000
neg_root_mean_squared_error	-0.078630	-0.260586	19000
neg_root_mean_squared_error	-0.079291	-0.261858	18000
neg_root_mean_squared_error	-0.080207	-0.262133	17000
neg_root_mean_squared_error	-0.080376	-0.265427	16000
neg_root_mean_squared_error	-0.081357	-0.265461	15000
neg_root_mean_squared_error	-0.081260	-0.267446	14000
neg_root_mean_squared_error	-0.080965	-0.270179	13000
neg_root_mean_squared_error	-0.080303	-0.270147	12000
neg_root_mean_squared_error	-0.081419	-0.273167	11000
neg_root_mean_squared_error	-0.081806	-0.276547	10000
neg_root_mean_squared_error	-0.081851	-0.279289	9000
neg_root_mean_squared_error	-0.082139	-0.281973	8000
neg_root_mean_squared_error	-0.081183	-0.284795	7000
neg_root_mean_squared_error	-0.081215	-0.285585	6000
neg_root_mean_squared_error	-0.083945	-0.295961	5000

The conclusion is that we can try 2 models:

1. Take the last 19000 points from df_model and add df_valid to them
2. Combine df_model with df_valid and take the last 19000 points

Predictions

# Final training set: the 19000 most recent modelling rows plus the whole
# internal validation window.
df_model_train = df_model.sort_values(by='saledate').iloc[-19000:]
df_sel_train = pd.concat([df_model_train, df_valid])
# DataFrame.sample returns a new frame; assign it so the shuffle actually
# takes effect (the bare call discarded its result).
df_sel_train = df_sel_train.sample(frac=1)
X_sel_train, y_sel_train = get_xy(df_sel_train)

pipeline = get_model_pipeline(max_n_cat, cat_dict)
pipeline.fit(X_sel_train, y_sel_train)

Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
                ('randomforestregressor', RandomForestRegressor(n_jobs=-1))])

plt.hist(np.exp(y_sel_train))

(array([10186.,  8235.,  4265.,  2565.,  1941.,  1222.,   922.,   350.,
          166.,    60.]),
 array([  4750.,  18475.,  32200.,  45925.,  59650.,  73375.,  87100.,
        100825., 114550., 128275., 142000.]),
 <BarContainer object of 10 artists>)

plt.hist(np.exp(pipeline.predict(df_test)))

(array([3914., 2885., 1674., 1086.,  809.,  575.,  409.,  141.,   48.,
          32.]),
 array([  5632.98698052,  17963.68207264,  30294.37716476,  42625.07225689,
         54955.76734901,  67286.46244114,  79617.15753326,  91947.85262538,
        104278.54771751, 116609.24280963, 128939.93790175]),
 <BarContainer object of 10 artists>)

# The model is fitted on log prices, so predictions are exponentiated back to
# the price scale before being written out.
df_submission = pd.DataFrame({
    'Id': df_test['SalesID'],
    'Predicted': np.exp(pipeline.predict(df_test)),
})
df_submission.to_csv('submission.csv', index=False)

# Only when running outside Kaggle (is_kaggle is False): publish from the local machine.
if not is_kaggle:
    import kaggle
    from aiking.integrations.kaggle import push2kaggle
    # Alternative kept for reference: submit the CSV directly through the Kaggle API.
    # kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="bluebook-bulldozer-remix")
    # Presumably pushes this notebook to Kaggle -- confirm against aiking.integrations.kaggle.
    push2kaggle("00_index.ipynb")