Machine learning (ML) continues to grow in importance for many organizations across nearly all domains, with example applications ranging from predicting real estate sale prices to identifying customer segments. To address each scenario, we can use a given set of features to train an algorithm and extract insights.
These algorithms, or learners, can be classified according to the amount and type of supervision needed during training.
There are two primary categories of algorithms: ___supervised learners___, which build predictive models, and ___unsupervised learners___, which build descriptive models. Which type you need depends on the learning task you hope to accomplish.
A ___predictive model___ is used for tasks that involve the prediction of a given output (or target) using other variables (or features) in the data set.
The learning algorithm in a predictive model attempts to discover and model the relationship between the target variable (the variable being predicted) and the other features (a.k.a. predictor variables).
Examples of predictive modeling include using home attributes to predict sale price (a regression problem) and using email attributes to predict whether a message is spam (a classification problem). Most supervised learning problems can be bucketed into one of two categories: regression or classification.
___Unsupervised learning___, by contrast, provides a set of statistical tools to better understand $n$ observations that contain a set of features without being guided by a response variable. In essence, unsupervised learning is concerned with identifying groups in a data set.
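For instance, here is a minimal sketch of what unsupervised learning looks like in code (purely illustrative; the data are synthetic and the cluster count is an assumption):

# group synthetic observations by their features, with no response variable
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(123)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])  # two blobs
labels = KMeans(n_clusters=2, n_init=10, random_state=123).fit_predict(X)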
In this module we will focus on supervised learning for a regression problem: using home attributes to predict real estate sales price.
Objective: understand the basic supervised learning modeling process and how to implement it with scikit-learn
Our ___Advanced Python workshop___ will go into much more detail than we have time for here.
The machine learning process is very iterative and heuristic-based. It is common for many ML approaches to be applied, evaluated, and modified before a final, optimal model can be determined. A proper process needs to be implemented to have confidence in our results.
This module provides an introduction to the modeling process and the concepts that are useful for any type of machine learning model:
data splitting
model application
resampling
bias-variance trade-off and hyperparameter tuning
model evaluation
# Helper packages
import math
import numpy as np
import pandas as pd
from plotnine import (
ggplot, aes, geom_density,
geom_line, geom_point, ggtitle
)
# Modeling process
from sklearn.model_selection import (
    train_test_split, KFold, RepeatedKFold, cross_val_score, GridSearchCV
)
from sklearn.metrics import mean_squared_error
Our response (target) variable is `Sale_Price` (i.e., \$195,000, \$215,000).

# Ames housing data
ames = pd.read_csv("../data/ames.csv")
ames.head()
 | MS_SubClass | MS_Zoning | Lot_Frontage | Lot_Area | Street | Alley | Lot_Shape | Land_Contour | Utilities | Lot_Config | ... | Fence | Misc_Feature | Misc_Val | Mo_Sold | Year_Sold | Sale_Type | Sale_Condition | Sale_Price | Longitude | Latitude |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 141 | 31770 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Corner | ... | No_Fence | NaN | 0 | 5 | 2010 | WD | Normal | 215000 | -93.619754 | 42.054035 |
1 | One_Story_1946_and_Newer_All_Styles | Residential_High_Density | 80 | 11622 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | Minimum_Privacy | NaN | 0 | 6 | 2010 | WD | Normal | 105000 | -93.619756 | 42.053014 |
2 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 81 | 14267 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Corner | ... | No_Fence | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 | -93.619387 | 42.052659 |
3 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 93 | 11160 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Corner | ... | No_Fence | NaN | 0 | 4 | 2010 | WD | Normal | 244000 | -93.617320 | 42.051245 |
4 | Two_Story_1946_and_Newer | Residential_Low_Density | 74 | 13830 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Inside | ... | Minimum_Privacy | NaN | 0 | 3 | 2010 | WD | Normal | 189900 | -93.638933 | 42.060899 |
5 rows × 81 columns
Take 5 minutes to explore the housing data:

What does the distribution of the response variable (`Sale_Price`) look like?
How could the different features be helpful in predicting the sales price?
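One way to begin exploring (a sketch; many approaches work):

# summary stats for the response
ames['Sale_Price'].describe()

# density of the response
(ggplot(ames, aes(x='Sale_Price'))
 + geom_density(color='blue')
 + ggtitle("Distribution of Sale_Price"))

# numeric features most correlated with Sale_Price
(ames
 .select_dtypes(include='number')
 .corr()['Sale_Price']
 .sort_values(ascending=False)
 .head(10))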
Generalizability: we want an algorithm that not only fits well to our past data, but more importantly, one that predicts a future outcome accurately.
Training Set: these data are used to develop feature sets, train our algorithms, tune hyper-parameters, compare across models, and all of the other activities required to reach a final model decision.
Test Set: having chosen a final model, these data are used to estimate an unbiased assessment of the model’s performance (generalization error).
DO NOT TOUCH THE TEST SET UNTIL THE VERY END!!!
Typical recommendations for splitting your data into training-testing splits include 60% (training)-40% (testing), 70%-30%, or 80%-20%. Keep in mind:

as data sets get smaller ($n < 500$): spending too much on training leaves a small, noisy test set for assessing performance, while spending too much on testing leaves too few observations to train a good model
as $n$ gets larger ($n > 100\text{K}$): the exact split ratio matters less, since even a modest test proportion yields plenty of observations for a reliable performance estimate
as $p$ gets larger ($p \geq n$): larger training samples are typically needed to estimate the larger number of parameters
# create train/test split
train, test = train_test_split(ames, train_size=0.7, random_state=123)
# dimensions of training data
train.shape
(2051, 81)
# dimensions of testing data
test.shape
(879, 81)
It is always good practice to ensure the distribution of our target variable is similar across the training and test sets.
(ggplot(train, aes(x='Sale_Price'))
+ geom_density(color='blue')
+ geom_density(data = test, color = "red")
+ ggtitle("Distribution of Sale_Price"))
[Figure: density of `Sale_Price` for the training (blue) and test (red) sets]
Scikit-learn requires us to separate our features from our label into discrete data sets. For our first model we will simply use two features from our training data, total square feet of the home (`Gr_Liv_Area`) and year built (`Year_Built`), to predict the sale price.
# separate features from labels
X_train = train[["Gr_Liv_Area", "Year_Built"]]
y_train = train["Sale_Price"]
Scikit-learn has many modules for supervised learning. To apply these models, we follow a similar pattern each time:
# 1. Prerequisite
from sklearn.linear_model import LinearRegression
# 2. Instantiate the model object
reg = LinearRegression()
# 3. Fit the model
reg.fit(X_train, y_train)
LinearRegression()
# 4. Make predictions
reg.predict(X_train)
array([211888.77551558, 119021.83893513, 177818.03700616, ..., 294633.08255954, 213774.91574325, 166398.33102108])
# 1. Prerequisite
from sklearn.neighbors import KNeighborsRegressor
# 2. Instantiate the model object
knn = KNeighborsRegressor()
# 3. Fit the model
knn.fit(X_train, y_train)
KNeighborsRegressor()
# 4. Make predictions
knn.predict(X_train)
array([218400. , 131280. , 142600. , ..., 318647.2, 180820. , 149480. ])
Create and predict a model using the random forest algorithm.
# 1. Prerequisite
from sklearn.ensemble import RandomForestRegressor
# 2. Instantiate the model object
rf = RandomForestRegressor()
# 3. Fit the model
rf.fit(X_train, y_train)
RandomForestRegressor()
# 4. Make predictions
rf.predict(X_train)
array([171438. , 118076. , 142605.42388167, ..., 286305. , 194271.98133333, 135037.48 ])
It is important to understand how our model is performing.
With ML models, measuring performance means understanding the predictive accuracy -- the difference between a predicted value and the actual value.
We measure predictive accuracy with ___loss functions___.
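For regression problems such as ours, the most common loss functions are the mean squared error (MSE) and its square root (RMSE):

$$MSE = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2, \qquad RMSE = \sqrt{MSE}$$

where $y_i$ is the actual value and $\hat{y}_i$ the predicted value for observation $i$. RMSE is often preferred for interpretation because it is expressed in the same units as the response.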
# compute MSE for linear model
pred = reg.predict(X_train)
mse = mean_squared_error(y_train, pred)
mse
2313058425.399425
rmse = math.sqrt(mse)
rmse
48094.2660345225
On average, our model's predictions are over \$48,000 off from the actual sales price!!
With MSE & RMSE our objective is to ___minimize___ this value.
Compare the MSE & RMSE for the K-nearest neighbor and random forest model to our linear model.
Which model performs best?
Are we certain this is the best way to measure our models' performance?
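One possible solution sketch, reusing the `knn` and `rf` model objects fit above (the random forest numbers will vary slightly from run to run since the algorithm is stochastic):

# compute training MSE & RMSE for the KNN and random forest models
for name, model in [("knn", knn), ("rf", rf)]:
    pred = model.predict(X_train)
    mse = mean_squared_error(y_train, pred)
    print(f"{name}: MSE = {mse:,.0f}, RMSE = {math.sqrt(mse):,.0f}")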
___Resampling___ provides an approach for us to repeatedly fit a model of interest to parts of the training data and test its performance on other parts. It allows us to estimate the generalization error while training, tuning, and comparing models without using the test data set.
The two most commonly used resampling methods are ___k-fold cross-validation___ and the ___bootstrap___. We focus on k-fold cross-validation, which:

randomly divides the training data into k groups (folds) of approximately equal size
assigns one fold as the test block and the rest as the training block
trains the model on each fold's training block and evaluates it on the test block
averages performance across all k folds
We use `KFold` to create k-fold objects and then `cross_val_score` to train our model across all k folds and provide our loss score for each fold.

# define loss function
loss = 'neg_root_mean_squared_error'
# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=123, shuffle=True)
# fit KNN model with 10-fold CV
results = cross_val_score(
knn, X_train, y_train, cv=kfold, scoring=loss
)
results
array([-44371.6172573 , -40053.49695081, -52321.53768451, -46339.69606634, -45736.42785737, -53089.81175919, -42830.25107754, -45540.49867468, -48512.75628167, -58652.48566654])
Note: The unified scoring API in scikit-learn always maximizes the score, so scores that need to be minimized are negated in order for the API to work correctly. Consequently, you can interpret the RMSE values above as $RMSE \times -1$.
# summary stats for all 10 folds
pd.DataFrame(results * -1).describe()
 | 0 |
---|---|
count | 10.000000 |
mean | 47744.857928 |
std | 5524.060076 |
min | 40053.496951 |
25% | 44663.837612 |
50% | 46038.061962 |
75% | 51369.342334 |
max | 58652.485667 |
Compute K-fold results for the linear model and/or the random forest model.
How do the results compare?
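One possible solution sketch, reusing the `kfold` object and `loss` metric defined above (the random forest will take noticeably longer, since it fits a full forest on each of the 10 folds):

# 10-fold CV RMSE for the linear model
lm_results = cross_val_score(reg, X_train, y_train, cv=kfold, scoring=loss)
abs(lm_results.mean())

# 10-fold CV RMSE for the random forest
rf_results = cross_val_score(rf, X_train, y_train, cv=kfold, scoring=loss)
abs(rf_results.mean())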
Prediction errors can be decomposed into two main subcomponents we have control over: error due to ___bias___ and error due to ___variance___.
There is a tradeoff between a model’s ability to minimize bias and variance.
Understanding how different sources of error lead to bias and variance helps us improve the data fitting process resulting in more accurate models.
Bias is the difference between the expected (or average) prediction of our model and the correct value which we are trying to predict.
Some models are naturally ___high bias___: for example, linear models such as linear regression, which assume a particular (linear) functional form.
Error due to ___variance___ is defined as the variability of a model prediction for a given data point.
Some models are naturally ___high variance___: for example, k-nearest neighbors and decision trees, whose predictions can change substantially with small perturbations of the training data.
So what does this mean to you?
Hyperparameters are the "knobs to twiddle" that control the complexity of machine learning algorithms and, therefore, the bias-variance trade-off. Consider a k-nearest neighbor model with differing values for k: a small k value has too much variance, while a big k value has too much bias. How do we find the optimal value?
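To see the trade-off concretely, here is a minimal sketch (reusing the `kfold`, `loss`, `X_train`, and `y_train` objects from above) comparing the cross-validated RMSE of a very flexible model (k = 2) against a very rigid one (k = 25):

# compare CV error at the two extremes of model complexity
for k in [2, 25]:
    knn_k = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn_k, X_train, y_train, cv=kfold, scoring=loss)
    print(f"k = {k:2d}: CV RMSE = {abs(scores.mean()):,.0f}")

Rather than checking values one at a time, we can search over many candidate values automatically.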
A grid search is an automated approach to searching across many combinations of hyperparameter values. We perform a grid search with `GridSearchCV()` and supply it a model object and the hyperparameter values we want to assess. Also notice that we supply it with the `kfold` object we created previously and the `loss` function we want to optimize for.
# Basic model object
knn = KNeighborsRegressor()
# Hyperparameter values to assess
hyper_grid = {'n_neighbors': range(2, 26)}
# Create grid search object
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
# Tune a knn model using grid search
results = grid_search.fit(X_train, y_train)
# Best model's cross validated RMSE
abs(results.best_score_)
46651.2105708044
# Best model's k value
results.best_estimator_.get_params().get('n_neighbors')
13
# Plot all RMSE results
all_rmse = pd.DataFrame({
'k': range(2, 26),
'RMSE': np.abs(results.cv_results_['mean_test_score'])
})
(ggplot(all_rmse, aes(x='k', y='RMSE'))
+ geom_line()
+ geom_point()
+ ggtitle("Cross validated grid search results"))
[Figure: cross-validated RMSE for each value of k]
You've been exposed to a lot in a very short amount of time. Let's bring these pieces together, but rather than just look at the 2 features that we included thus far (`Gr_Liv_Area` & `Year_Built`), we'll include ___all numeric features___.
Steps:
Split into training vs testing data
Separate features from labels and only use numeric features
Create KNN model object
Define loss function
Specify K-fold resampling procedure
Create our hyperparameter grid
Execute grid search
Evaluate performance
# 1. Split into training vs testing data
train, test = train_test_split(ames, train_size=0.7, random_state=123)
# 2. Separate features from labels and only use numeric features
X_train = (
train.select_dtypes(include='number').drop("Sale_Price", axis=1)
)
y_train = train["Sale_Price"]
# 3. Create KNN model object
knn = KNeighborsRegressor()
# 4. Define loss function
loss = 'neg_root_mean_squared_error'
# 5. Specify K-fold resampling procedure
kfold = KFold(n_splits=10, random_state=123, shuffle=True)
# 6. Create grid of hyperparameter values
hyper_grid = {'n_neighbors': range(2, 26)}
# 7. Tune a knn model using grid search
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
results = grid_search.fit(X_train, y_train)
# 8. Evaluate performance: Best model's cross validated RMSE
abs(results.best_score_)
41915.408581298376
# 8. Evaluate performance: Best model's k value
results.best_estimator_.get_params().get('n_neighbors')
5
# 8. Evaluate performance: Plot all RMSE results
all_rmse = pd.DataFrame({
'k': range(2, 26),
'RMSE': np.abs(results.cv_results_['mean_test_score'])
})
(ggplot(all_rmse, aes(x='k', y='RMSE'))
+ geom_line()
+ geom_point()
+ ggtitle("Cross validated grid search results"))
[Figure: cross-validated RMSE for each value of k, using all numeric features]
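Having settled on a final model, the last step is the one we promised at the start: estimate the generalization error on the untouched test set. A minimal sketch (assuming the grid search above; `GridSearchCV` refits the best estimator on the full training set by default):

# prepare the test features the same way as the training features
X_test = test.select_dtypes(include='number').drop("Sale_Price", axis=1)
y_test = test["Sale_Price"]

# unbiased estimate of RMSE on the held-out test set
pred = results.best_estimator_.predict(X_test)
math.sqrt(mean_squared_error(y_test, pred))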
Is this the best we can do?
Do you think other models could perform better?
Are we doing the best with the features we've been given?
Are there any questions before moving on?