Model Evaluation & Selection¶

Objective¶

This module introduces you to:

  1. Cross-validation procedures for more robust model performance assessment.
  2. The different ways to apply evaluation metrics in scikit-learn.
  3. Hyperparameter tuning to find optimal model parameter settings.

Quick refresher¶

But first, let's review a few things that we learned in the previous modules.

Data prep¶

In [1]:
# packages used
import pandas as pd
from sklearn.model_selection import train_test_split

# import data
adult_census = pd.read_csv('../data/adult-census.csv')

# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

# drop the duplicated column `"education-num"`
features = features.drop(columns='education-num')

# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=123
)

Feature engineering¶

In [2]:
# packages used
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# preprocessors to handle numeric and categorical features
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# transformer to associate each of these preprocessors with their 
# respective columns
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

Modeling¶

In [3]:
# packages used
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Pipeline object to chain together modeling processes
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

# fit our model
_ = model.fit(X_train, y_train)

# score on test set
model.score(X_test, y_test)
Out[3]:
0.8503808041929408

Resampling & cross-validation¶

In our "03-first_model.ipynb" notebook we split our data into training and testing sets and we assessed the performance of our model on the test set. Unfortunately, there are a few pitfalls to this approach:

  1. If our dataset is small, a single test set may not provide realistic expectations of our model's performance on unseen data.
  2. A single test set does not provide us any insight on variability of our model's performance.
  3. Using our test set to drive our model building process can bias our results via data leakage.

Resampling methods provide an alternative approach by allowing us to repeatedly fit a model of interest to parts of the training data and test its performance on other parts of the training data.

The two most commonly used resampling methods include k-fold cross-validation and bootstrap sampling. This module focuses on using k-fold cross-validation.

K-fold cross-validation¶

Cross-validation consists of repeating the train/test split procedure such that the training and testing sets differ each time. Generalization performance metrics are collected for each repetition and then aggregated. As a result, we get an estimate of the variability of the model's generalization performance.

k-fold cross-validation (aka k-fold CV) is a resampling method that randomly divides the training data into k groups (aka folds) of approximately equal size.

The model is fit on $k-1$ folds and then the remaining fold is used to compute model performance. This procedure is repeated k times; each time, a different fold is treated as the validation set.

This process results in k estimates of the generalization error (say $\epsilon_1, \epsilon_2, \dots, \epsilon_k$). Thus, the k-fold CV estimate is computed by averaging the k test errors, providing us with an approximation of the error we might expect on unseen data.
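
To make the mechanics concrete, below is a minimal sketch of this procedure done by hand with KFold, assuming the model, X_train, and y_train objects defined above; in practice we let scikit-learn handle this loop for us.

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=123)

scores = []
for train_idx, val_idx in kf.split(X_train):
    # fit a fresh copy of the pipeline on the k-1 training folds
    fold_model = clone(model).fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    # evaluate on the single held-out fold
    scores.append(fold_model.score(X_train.iloc[val_idx], y_train.iloc[val_idx]))

# the k-fold CV estimate is the average of the k fold scores
np.mean(scores), np.std(scores)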

In scikit-learn, the cross_validate function performs cross-validation for us; we pass it the model, the data, and the target. Since several cross-validation strategies exist, cross_validate also takes a cv parameter that defines the splitting strategy.

Tip

In practice, one typically uses k=5 or k=10. There is no formal rule for the size of k; however, as k gets larger, the difference between the estimated performance and the true performance to be seen on the test set tends to decrease, at the cost of fitting more models.

In [4]:
%%time
from sklearn.model_selection import cross_validate

cv_result = cross_validate(model, X_train, y_train, cv=5)
cv_result
CPU times: user 1.71 s, sys: 66.7 ms, total: 1.77 s
Wall time: 1.78 s
Out[4]:
{'fit_time': array([0.32632685, 0.33183312, 0.33147502, 0.34435606, 0.32457304]),
 'score_time': array([0.01660299, 0.01529479, 0.016078  , 0.01629019, 0.015697  ]),
 'test_score': array([0.85191757, 0.84548185, 0.85790336, 0.85094185, 0.85558286])}

The output of cross_validate is a Python dictionary, which by default contains three entries:

  • fit_time: the time to train the model on the training data for each fold,
  • score_time: the time to predict with the model on the testing data for each fold, and
  • test_score: the default score on the testing data for each fold.
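
The "by default" matters: cross_validate can return more. For example, passing return_train_score=True adds a train_score entry for each fold, which is handy for spotting overfitting (a minimal sketch using the same model and training data as above).

cv_result_with_train = cross_validate(
    model, X_train, y_train, cv=5, return_train_score=True
)
cv_result_with_train.keys()
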
In [5]:
scores = cv_result["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")
The mean cross-validation accuracy is: 0.852 +/- 0.004

Your Turn

Using KNeighborsClassifier(), run a 5-fold cross-validation procedure and compare the mean accuracy and standard deviation to the logistic regression results above. Note: Don't forget to create a new model pipeline object.

Evaluation metrics¶

  • Evaluation metrics allow us to measure the predictive accuracy of our model – the difference between the predicted value ($\hat{y}_i$) and the actual value ($y_i$).

  • We often refer to evaluation metrics as loss functions: $f(y_i, \hat{y}_i)$

  • Scikit-Learn provides multiple ways to compute evaluation metrics and refers to this concept as scoring.

    1. Estimator scoring method
    2. Individual scoring functions
    3. Scoring parameters

Estimator scoring method¶

Every estimator (regression/classification model) has a default scoring method.

Most classifiers return the mean accuracy of the model on the supplied $X$ and $y$:

In [6]:
# toy data
from sklearn.datasets import load_breast_cancer
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# fit model
clf = LogisticRegression(solver='liblinear').fit(X_cancer, y_cancer)

# score 
clf.score(X_cancer, y_cancer)
Out[6]:
0.9595782073813708

While most regressors return the $R^2$ metric:

In [7]:
# toy data
from sklearn.datasets import fetch_california_housing
X_cali, y_cali = fetch_california_housing(return_X_y=True)

# fit model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_cali, y_cali)

# score
reg.score(X_cali, y_cali)
Out[7]:
0.606232685199805

Individual scoring functions¶

However, these default evaluation metrics are often not the metrics most suitable to the business problem.

There are many loss functions to choose from; each with unique characteristics that can be beneficial for certain problems.

  • Regression problems
    • Mean squared error (MSE)
    • Root mean squared error (RMSE)
    • Mean absolute error (MAE)
    • etc.
  • Classification problems
    • Area under the curve (AUC)
    • Cross-entropy (aka Log loss)
    • Precision
    • etc.

Scikit-Learn provides many scoring functions to choose from.

In [8]:
from sklearn import metrics

The functions take the actual y values and the predicted y values: $f(y_i, \hat{y}_i)$.

Example regression metrics:

In [9]:
y_pred = reg.predict(X_cali)

# Mean squared error
metrics.mean_squared_error(y_cali, y_pred)
Out[9]:
0.5243209861846072
In [10]:
# Mean absolute percentage error
metrics.mean_absolute_percentage_error(y_cali, y_pred)
Out[10]:
0.31715404597233343

Example classification metrics:

In [11]:
y_pred = clf.predict(X_cancer)

# Area under the curve
metrics.roc_auc_score(y_cancer, y_pred)
Out[11]:
0.9543760900586651
In [12]:
# F1 score
metrics.f1_score(y_cancer, y_pred)
Out[12]:
0.968011126564673
In [13]:
# multiple metrics at once!
print(metrics.classification_report(y_cancer, y_pred))
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       212
           1       0.96      0.97      0.97       357

    accuracy                           0.96       569
   macro avg       0.96      0.95      0.96       569
weighted avg       0.96      0.96      0.96       569

Scoring parameters¶

Since we prefer cross-validation procedures, scikit-learn's cross-validation functions also accept a scoring parameter.

Most evaluation metrics have a predefined text string that can be supplied as a scoring argument.
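
If you are unsure of the exact string for a metric, recent versions of scikit-learn (1.0+) let you list the registered scorer names; a quick sketch:

from sklearn.metrics import get_scorer_names

# peek at a few of the available scoring strings
sorted(get_scorer_names())[:10]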

In [14]:
# say we wanted to use AUC as our loss function while using 5-fold validation
cross_validate(model, X_train, y_train, cv=5, scoring='roc_auc')
Out[14]:
{'fit_time': array([0.33399296, 0.33877516, 0.33839607, 0.32831979, 0.32219696]),
 'score_time': array([0.01821494, 0.0176549 , 0.01770401, 0.01844382, 0.01791096]),
 'test_score': array([0.90485391, 0.90327043, 0.91316917, 0.90553718, 0.90816423])}

Note

The unified scoring API in scikit-learn always maximizes the score, so metrics which need to be minimized are negated in order for the unified scoring API to work correctly. Consequently, some metrics such as mean_squared_error() will use a predefined text string starting with neg_ (i.e. 'neg_mean_squared_error').

In [15]:
# applying (negated) root mean squared error with k-fold cross-validation
cross_validate(
    reg, X_cali, y_cali, cv=5, scoring='neg_root_mean_squared_error'
)
Out[15]:
{'fit_time': array([0.01037192, 0.00827122, 0.00409293, 0.00524306, 0.00397301]),
 'score_time': array([0.00218201, 0.00053596, 0.00089598, 0.00042582, 0.00049281]),
 'test_score': array([-0.69631786, -0.78898504, -0.80387217, -0.73702076, -0.70333835])}

You can also supply more than one metric, or even define your own custom metric (a sketch of the latter follows the example below).

In [16]:
# example of supplying more than one metric
scoring_metrics = ['accuracy', 'roc_auc']

cross_validate(model, X_train, y_train, cv=5, scoring=scoring_metrics)
Out[16]:
{'fit_time': array([0.3343451 , 0.35279107, 0.342453  , 0.36462188, 0.32294607]),
 'score_time': array([0.03534794, 0.03328896, 0.03542399, 0.03617978, 0.03345275]),
 'test_accuracy': array([0.85191757, 0.84548185, 0.85790336, 0.85094185, 0.85558286]),
 'test_roc_auc': array([0.90485391, 0.90327043, 0.91316917, 0.90553718, 0.90816423])}
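
For the custom-metric case, make_scorer wraps any metric function into an object that the scoring parameter accepts. A minimal sketch using balanced accuracy (which also has the predefined string 'balanced_accuracy', but the same pattern works for your own metric functions):

from sklearn.metrics import balanced_accuracy_score, make_scorer

# wrap the metric function into a scorer object
balanced_acc_scorer = make_scorer(balanced_accuracy_score)

cross_validate(model, X_train, y_train, cv=5, scoring=balanced_acc_scorer)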

Your Turn

Using the KNeighborsClassifier() from the previous Your Turn exercises, perform a 5-fold cross-validation and compute the accuracy and ROC AUC.

Hyperparameter tuning¶

Given two different models (blue lines) fit to the same data (gray dots), which model do you prefer?

[Figure: two candidate fits to the same data, Model A (left) and Model B (right)]

Prediction errors can be decomposed into two main subcomponents we care about:

  • error due to “bias”
  • error due to “variance”
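
For squared-error loss, this decomposition can be written explicitly. The expected prediction error at a point $x_0$ splits into squared bias, variance, and irreducible noise:

$$E\big[(y_0 - \hat{f}(x_0))^2\big] = \underbrace{\big(E[\hat{f}(x_0)] - f(x_0)\big)^2}_{\text{bias}^2} + \underbrace{E\Big[\big(\hat{f}(x_0) - E[\hat{f}(x_0)]\big)^2\Big]}_{\text{variance}} + \underbrace{\sigma^2}_{\text{irreducible error}}$$

Reducing one of the first two terms typically increases the other, which is why we tune model complexity rather than simply minimizing training error.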

Bias¶

Error due to bias is the difference between the expected (or average) prediction of our model and the correct value which we are trying to predict.

It measures how far off in general a model’s predictions are from the correct value, which provides a sense of how well a model can conform to the underlying structure of the data.

High-bias models (e.g., generalized linear models) are rarely affected by the noise introduced by new, unseen data, but they can underfit the underlying structure of the data.

Variance¶

Error due to variance is the variability of a model prediction for a given data point.

Many models (e.g., k-nearest neighbor, decision trees, gradient boosting machines) are very adaptable and offer extreme flexibility in the patterns they can fit. However, this flexibility comes at a cost: they run the risk of overfitting to the training data.

Although you may achieve very good performance on your training data, the model will not automatically generalize well to unseen data.

Hyperparameters (aka tuning parameters) are the "knobs to twiddle" to control the complexity of machine learning algorithms and, therefore, the bias-variance trade-off.

Some models have very few hyperparameters. For example, in a K-nearest neighbor (KNN) model, K (the number of neighbors) is the primary hyperparameter.

Other models, such as gradient boosted machines (GBMs) and deep learning models, can have many.

Hyperparameter tuning is the process of screening hyperparameter values (or combinations of hyperparameter values) to find a model that balances bias & variance so that the model generalizes well to unseen data.

In [17]:
%%time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# set hyperparameter in KNN model 
model = KNeighborsClassifier(n_neighbors=10)

# create preprocessor & modeling pipeline
pipeline = make_pipeline(preprocessor, model)

# 5-fold cross validation using AUC error metric
results = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')

f'KNN model with 10 neighbors: AUC = {np.mean(results):.3f}'
CPU times: user 1min 45s, sys: 1.21 s, total: 1min 46s
Wall time: 21.6 s
Out[17]:
'KNN model with 10 neighbors: AUC = 0.883'

But what if we wanted to assess and compare n_neighbors = 5, 10, 15, 20, ... ?

For this we could use a full Cartesian grid search with Scikit-Learn's GridSearchCV():

In [20]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# basic model object
knn = KNeighborsClassifier()

# Create grid of hyperparameter values
hyper_grid = {'knn__n_neighbors': [5, 10, 15, 20]}

# create preprocessor & modeling pipeline
pipeline = Pipeline([('prep', preprocessor), ('knn', knn)])

# Tune a knn model using grid search
grid_search = GridSearchCV(pipeline, hyper_grid, cv=5, scoring='roc_auc', n_jobs=-1)
results = grid_search.fit(X_train, y_train)

# Best model's cross validated AUC
abs(results.best_score_)
CPU times: user 256 ms, sys: 42.5 ms, total: 298 ms
Wall time: 1min 14s
Out[20]:
0.8937157593356954

Tip

We use Pipeline rather than make_pipeline above because it lets us name the individual steps in the pipeline; those step names are how we target hyperparameters to distinct steps (e.g., knn__n_neighbors).
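
If you are unsure which parameter names a pipeline exposes, get_params() lists them; the step name and parameter name are joined by a double underscore. For example, with the pipeline defined above:

# list the KNN hyperparameters reachable through the pipeline
[name for name in pipeline.get_params() if name.startswith('knn__')]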

In [21]:
results.best_params_
Out[21]:
{'knn__n_neighbors': 20}

However, a Cartesian grid-search approach has limitations.

  • It does not scale well as the number of parameters to tune increases.
  • It forces a regular grid of candidate values rather than sampling from distributions that reflect plausible ranges for each hyperparameter.

Note

Random search based on hyperparameter distributions has been shown to perform as well as, if not better than, a standard grid search.

For example, say we want to train a random forest classifier. Random forests are very flexible algorithms and can have several hyperparameters.

In [22]:
from sklearn.ensemble import RandomForestClassifier

# basic model object
rf = RandomForestClassifier(random_state=123)

# create preprocessor & modeling pipeline
pipeline = Pipeline([('prep', preprocessor), ('rf', rf)])

For this particular random forest algorithm we'll assess the following hyperparameters. Don't worry if you are not familiar with what these do.

  • n_estimators: number of trees in the forest,
  • max_features: number of features to consider when looking for the best split,
  • max_depth: maximum depth of each tree built,
  • min_samples_leaf: minimum number of samples required in a leaf node,
  • max_samples: number of samples to draw from our training data to train each tree.

A standard grid search would be very computationally intense.

Instead, we'll use a randomized search over hyperparameter distributions with RandomizedSearchCV.

To build our grid, we need to specify distributions for our hyperparameters.
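
Note that loguniform comes from scipy.stats, while loguniform_int is not part of scikit-learn or SciPy. Below is a minimal sketch of such a helper, assuming it simply wraps scipy.stats.loguniform and casts samples to integers:

from scipy.stats import loguniform

class loguniform_int:
    """Integer-valued log-uniform distribution for use with RandomizedSearchCV."""

    def __init__(self, a, b):
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        # draw samples from the underlying log-uniform distribution
        # and cast them to integers
        return self._distribution.rvs(*args, **kwargs).astype(int)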

In [24]:
# specify hyperparameter distributions to randomly sample from
param_distributions = {
    'rf__n_estimators': loguniform_int(50, 1000),
    'rf__max_features': loguniform(.1, .5),
    'rf__max_depth': loguniform_int(4, 20),
    'rf__min_samples_leaf': loguniform_int(1, 100),
    'rf__max_samples': loguniform(.5, 1),
}

Now, we can define the randomized search using the different distributions.

Executing 10 iterations of 5-fold cross-validation for random parametrizations of this model on this dataset can take from 10 seconds to several minutes, depending on the speed of the host computer and the number of available processors.

In [25]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# perform 10 random iterations
random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=10,
    cv=5, 
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)

results = random_search.fit(X_train, y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 50.9 s, sys: 790 ms, total: 51.7 s
Wall time: 2min 55s
In [26]:
results.best_score_
Out[26]:
0.9159613847988037
In [27]:
results.best_params_
Out[27]:
{'rf__max_depth': 14,
 'rf__max_features': 0.4233891550145859,
 'rf__max_samples': 0.8068442678419226,
 'rf__min_samples_leaf': 12,
 'rf__n_estimators': 939}

Wrapping up¶

This module discussed:

  1. How to use cross_validate to perform k-fold cross-validation procedures.
  2. The three different ways to apply evaluation metrics:
     a. Default estimator score method - estimator.score()
     b. Individual scoring functions - from sklearn import metrics
     c. Scoring parameters - cross_validate(..., scoring='roc_auc')
  3. How to use GridSearchCV and RandomizedSearchCV to perform hyperparameter tuning.