Important
- What variables most affect our predictions?

Imports
import sklearn
sklearn.__version__
'1.5.0'
from aiking.data.external import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import seaborn as sns
import matplotlib.pyplot as plt
# import eli5  # alternative: eli5's show_weights offers a similar display
Read Data
path = get_ds('fifa-2018-match-statistics'); path.ls()[0]
Path('/Users/rahul1.saraf/rahuketu/programming/AIKING_HOME/data/fifa-2018-match-statistics/FIFA 2018 Statistics.csv')
df = pd.read_csv(path/"FIFA 2018 Statistics.csv"); df
y = (df['Man of the Match'] == "Yes"); y
X = df.select_dtypes(np.int64); X
df_train, df_val, y_train, y_val = train_test_split(X, y, random_state=1)
df_train.shape, df_val.shape, y_train.shape, y_val.shape
((96, 18), (32, 18), (96,), (32,))
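select_dtypes(np.int64) keeps only the integer-valued match statistics as features; the target, team names, dates, and any float or object columns are dropped. A quick way to see what was excluded (a sketch, not part of the original notebook):

```python
# Columns present in df but not used as model features
[c for c in df.columns if c not in X.columns]
```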
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(df_train, y_train); model
RandomForestClassifier(random_state=0)
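Permutation importance is measured as the drop from the model's baseline score, so it is worth checking that first (a quick sketch; this output was not shown in the original):

```python
# Baseline accuracy on the held-out validation set
model.score(df_val, y_val)
```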
result = permutation_importance(model, df_val, y_val, n_repeats=30, random_state=42, n_jobs=2); result.keys()
dict_keys(['importances_mean', 'importances_std', 'importances'])
result.importances_mean
array([ 0.159375 , 0.00208333, -0.05 , 0.02083333, 0.03020833,
0.009375 , 0. , 0.003125 , 0.02083333, -0.00520833,
0.02916667, -0.05208333, 0.06145833, 0.01979167, 0.02395833,
0. , 0. , -0.00833333])
result.importances_std
array([0.07003998, 0.03864008, 0.04389856, 0.06002025, 0.02734251,
0.02440703, 0.03952847, 0.02835554, 0.03260932, 0.02562161,
0.02901748, 0.03054766, 0.03992452, 0.02962805, 0.02873565,
0. , 0. , 0.01381927])
result.importances[0], len(result.importances[0])
(array([0.1875 , 0.21875, 0.0625 , 0.1875 , 0.15625, 0.21875, 0.125 ,
0.09375, 0.21875, 0.03125, 0.15625, 0.125 , 0.21875, 0.09375,
0.15625, 0.09375, 0.09375, 0.25 , 0.03125, 0.1875 , 0.28125,
0.125 , 0.1875 , 0.0625 , 0.28125, 0.15625, 0.125 , 0.15625,
0.21875, 0.28125]),
30)
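So result.importances has one row per feature and one column per repeat: each entry is the score drop from a single shuffle of that feature, and importances_mean / importances_std summarize across the 30 repeats. Conceptually, permutation_importance boils down to the following loop (a hand-rolled sketch for intuition, not sklearn's actual implementation):

```python
# Hand-rolled permutation importance, using model/df_val/y_val from above
rng = np.random.default_rng(42)
baseline = model.score(df_val, y_val)

drops = {}
for col in df_val.columns:
    repeats = []
    for _ in range(30):
        shuffled = df_val.copy()
        # Break the association between this one feature and the target
        shuffled[col] = rng.permutation(shuffled[col].values)
        repeats.append(baseline - model.score(shuffled, y_val))
    # Mean and std across repeats ~ importances_mean, importances_std
    drops[col] = (np.mean(repeats), np.std(repeats))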
def show_weights(result):
    # Tabulate mean/std importance per feature, colored on a diverging
    # scale centered at zero (mimics eli5's show_weights display)
    perm_sorted_idx = result.importances_mean.argsort()
    feat_df = pd.DataFrame({'features': df_val.columns[perm_sorted_idx].tolist(),
                            'importance_mean': result.importances_mean[perm_sorted_idx],
                            'importance_std': result.importances_std[perm_sorted_idx]}).sort_values(by='importance_mean', ascending=False)
    a = feat_df['importance_mean'].abs().max()  # symmetric color limits around 0
    return feat_df.style.background_gradient(axis=0, gmap=feat_df['importance_mean'], cmap='RdBu', vmax=a, vmin=-a)
show_weights(result)
| | features | importance_mean | importance_std |
|---|---|---|---|
| 17 | Goal Scored | 0.159375 | 0.070040 |
| 16 | Distance Covered (Kms) | 0.061458 | 0.039925 |
| 15 | Off-Target | 0.030208 | 0.027343 |
| 14 | Pass Accuracy % | 0.029167 | 0.029017 |
| 13 | Yellow Card | 0.023958 | 0.028736 |
| 12 | On-Target | 0.020833 | 0.060020 |
| 11 | Free Kicks | 0.020833 | 0.032609 |
| 10 | Fouls Committed | 0.019792 | 0.029628 |
| 9 | Blocked | 0.009375 | 0.024407 |
| 8 | Offsides | 0.003125 | 0.028356 |
| 7 | Ball Possession % | 0.002083 | 0.038640 |
| 6 | Red | 0.000000 | 0.000000 |
| 5 | Corners | 0.000000 | 0.039528 |
| 4 | Yellow & Red | 0.000000 | 0.000000 |
| 3 | Saves | -0.005208 | 0.025622 |
| 2 | Goals in PSO | -0.008333 | 0.013819 |
| 1 | Attempts | -0.050000 | 0.043899 |
| 0 | Passes | -0.052083 | 0.030548 |
def show_plot(result, ax):
    # Box plot of the per-repeat importances, ordered by mean importance
    perm_sorted_idx = result.importances_mean.argsort()
    ax.boxplot(
        result.importances[perm_sorted_idx].T,
        vert=False,
        labels=X.columns[perm_sorted_idx],
    )
    ax.axvline(x=0, color="k", linestyle="--")  # reference line at zero
    return ax
ax = plt.subplot(111)
show_plot(result, ax)
Important
A negative value means the model's validation score was better with the feature shuffled than with the original values. The importance of such a feature is effectively zero; the negative values arise purely from random chance.
Unsurprisingly, Goal Scored is the most important predictor for Man of the Match.
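One way to act on this observation (an illustrative sketch, not from the original notebook) is to treat any feature whose mean importance does not clearly exceed the shuffle noise as unimportant:

```python
# Features whose mean importance is within ~2 standard deviations of zero
# are indistinguishable from noise under this rough heuristic
noisy = result.importances_mean - 2 * result.importances_std < 0
print(df_val.columns[noisy].tolist())
```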
What if we run this on the train data?
result_train = permutation_importance(model, df_train, y_train, n_repeats=30, random_state=42, n_jobs=2); result_train.keys()
dict_keys(['importances_mean', 'importances_std', 'importances'])
show_weights(result_train)
| | features | importance_mean | importance_std |
|---|---|---|---|
| 17 | Goal Scored | 0.151736 | 0.025328 |
| 16 | Attempts | 0.017361 | 0.008646 |
| 15 | Free Kicks | 0.008681 | 0.006634 |
| 14 | Blocked | 0.008681 | 0.003882 |
| 13 | Ball Possession % | 0.007292 | 0.004774 |
| 12 | Distance Covered (Kms) | 0.006944 | 0.004910 |
| 11 | On-Target | 0.005208 | 0.006449 |
| 10 | Pass Accuracy % | 0.004861 | 0.005197 |
| 9 | Saves | 0.001736 | 0.003882 |
| 8 | Off-Target | 0.001389 | 0.003541 |
| 7 | Fouls Committed | 0.001042 | 0.003125 |
| 6 | Passes | 0.000347 | 0.001870 |
| 5 | Corners | 0.000347 | 0.001870 |
| 1 | Yellow & Red | 0.000000 | 0.000000 |
| 4 | Yellow Card | 0.000000 | 0.000000 |
| 3 | Red | 0.000000 | 0.000000 |
| 2 | Offsides | 0.000000 | 0.000000 |
| 0 | Goals in PSO | 0.000000 | 0.000000 |
ax = plt.subplot(111)
show_plot(result_train, ax=ax)
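On the training data the importances are much less noisy (the model has already seen these rows) and Goal Scored still dominates, while most other features shrink toward zero. To compare the two runs directly, something like this works (a sketch assuming result and result_train from above):

```python
# Mean permutation importance on validation vs. train, per feature
cmp = pd.DataFrame({
    'feature': X.columns,
    'val_mean': result.importances_mean,
    'train_mean': result_train.importances_mean,
}).sort_values('val_mean', ascending=False)
cmp
```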