Data preprocessing and engineering techniques generally refer to the addition, deletion, or transformation of data.
Identifying data engineering needs can take significant effort and requires you to spend substantial time understanding your data...
"Live with your data before you plunge into modeling" - Leo Breiman
In this module we introduce how to scale numeric features, encode categorical features, and combine preprocessing and modeling steps into a single pipeline.
Let's go ahead and import a couple of required libraries along with our data.
Note
We will import additional libraries and functions as we proceed, introducing each at the point where it is used, as that provides better learning context.
import pandas as pd
# to display nice model diagram
from sklearn import set_config
set_config(display='diagram')
# import data
adult_census = pd.read_csv('../data/adult-census.csv')
# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')
Typically, data types fall into two categories: numerical and categorical.
features.dtypes
age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
dtype: object
We can separate categorical and numerical variables using their data types. There are a few ways to do this; here, we make use of the `make_column_selector` helper to select the corresponding columns.
from sklearn.compose import make_column_selector as selector
# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)
# results in a list containing relevant column names
numerical_columns
['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
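For reference, the complementary selector gives the object-typed (categorical) column names; based on the dtypes above this should be workclass, education, marital-status, occupation, relationship, race, sex, and native-country:

# the complementary list of categorical (object dtype) column names
categorical_columns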
Scikit-learn works "out of the box" with numeric features. However, some algorithms make assumptions regarding the distribution of our features.
We see that our numeric features span across different ranges:
numerical_features = features[numerical_columns]
numerical_features.describe()
| | age | education-num | capital-gain | capital-loss | hours-per-week |
|---|---|---|---|---|---|
| count | 48842.000000 | 48842.000000 | 48842.000000 | 48842.000000 | 48842.000000 |
| mean | 38.643585 | 10.078089 | 1079.067626 | 87.502314 | 40.422382 |
| std | 13.710510 | 2.570973 | 7452.019058 | 403.004552 | 12.391444 |
| min | 17.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 28.000000 | 9.000000 | 0.000000 | 0.000000 | 40.000000 |
| 50% | 37.000000 | 10.000000 | 0.000000 | 0.000000 | 40.000000 |
| 75% | 48.000000 | 12.000000 | 0.000000 | 0.000000 | 45.000000 |
| max | 90.000000 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 |
Normalizing our features so that they have mean = 0 and standard deviation = 1 helps ensure they align with such algorithm assumptions.
Tip
Whether or not a machine learning model requires scaling of the features depends on the model family. Linear models such as logistic regression generally benefit from scaling the features, while other models such as tree-based models (e.g. decision trees, random forests) do not need such preprocessing (but will not suffer from it).
We can apply such a normalization using the scikit-learn transformer called `StandardScaler`.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(numerical_features)
StandardScaler()
The `fit` method for transformers is similar to the `fit` method for predictors. The main difference is that the former has a single argument (the feature matrix), whereas the latter has two arguments (the feature matrix and the target).
In this case, the algorithm needs to compute the mean and standard deviation for each feature and store them into some NumPy arrays. Here, these statistics are the model states.
Note
The fact that the model states of this scaler are arrays of means and standard deviations is specific to the StandardScaler. Other scikit-learn transformers will compute different statistics and store them as model states, in the same fashion.
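As a quick illustration (not part of the original workflow), a different transformer such as MinMaxScaler stores per-feature minimums and maximums as its model states:

# illustration only: MinMaxScaler stores different fitted statistics as its model states
from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
minmax_scaler.fit(numerical_features)
# per-feature minimums and maximums learned during fit; these should match
# the min and max rows of the describe() output above
minmax_scaler.data_min_, minmax_scaler.data_max_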
We can inspect the computed means and standard deviations.
scaler.mean_
array([ 38.64358544, 10.07808853, 1079.06762622, 87.50231358, 40.42238238])
scaler.scale_
array([1.37103696e+01, 2.57094644e+00, 7.45194277e+03, 4.03000427e+02, 1.23913172e+01])
Once we have called the `fit` method, we can perform the data transformation by calling the `transform` method.
numerical_features_scaled = scaler.transform(numerical_features)
numerical_features_scaled
array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])
Let's illustrate the internal mechanism of the `transform` method and put it in perspective with what we already saw with predictors.

The `transform` method for transformers is similar to the `predict` method for predictors. It applies a predefined transformation function that uses the model states and the input data. However, instead of outputting predictions, the job of the `transform` method is to output a transformed version of the input data.
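As a small sanity check (a sketch assuming NumPy is imported as np), we can reproduce the transformation manually from the stored model states:

import numpy as np

# manually apply the transformation function using the fitted model states
manual_scaled = (numerical_features - scaler.mean_) / scaler.scale_
# should be True: transform() produces exactly this result
np.allclose(manual_scaled, scaler.transform(numerical_features))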
Finally, the `fit_transform` method is a shorthand that successively calls `fit` and then `transform`.
# fitting and transforming in one step
scaler.fit_transform(numerical_features)
array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])
Notice that the mean of all the columns is close to 0 and the standard deviation in all cases is close to 1:
numerical_features = pd.DataFrame(
numerical_features_scaled,
columns=numerical_columns
)
numerical_features.describe()
| | age | education-num | capital-gain | capital-loss | hours-per-week |
|---|---|---|---|---|---|
| count | 4.884200e+04 | 4.884200e+04 | 4.884200e+04 | 4.884200e+04 | 4.884200e+04 |
| mean | 2.281092e-16 | -9.208746e-17 | 1.047440e-17 | -1.018345e-17 | 4.466169e-17 |
| std | 1.000010e+00 | 1.000010e+00 | 1.000010e+00 | 1.000010e+00 | 1.000010e+00 |
| min | -1.578629e+00 | -3.531030e+00 | -1.448035e-01 | -2.171271e-01 | -3.181452e+00 |
| 25% | -7.763164e-01 | -4.193353e-01 | -1.448035e-01 | -2.171271e-01 | -3.408696e-02 |
| 50% | -1.198790e-01 | -3.037346e-02 | -1.448035e-01 | -2.171271e-01 | -3.408696e-02 |
| 75% | 6.824334e-01 | 7.475502e-01 | -1.448035e-01 | -2.171271e-01 | 3.694214e-01 |
| max | 3.745808e+00 | 2.303397e+00 | 1.327438e+01 | 1.059179e+01 | 4.727312e+00 |
We can easily combine sequential operations with a scikit-learn `Pipeline`, which chains together operations and can be used like any other classifier or regressor. The helper function `make_pipeline` will create a `Pipeline`: it takes as arguments the successive transformations to perform, followed by the classifier or regressor model.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
model = make_pipeline(StandardScaler(), LogisticRegression())
model
Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())])
Let's divide our data into train and test sets and then apply and score our logistic regression model:
from sklearn.model_selection import train_test_split
# split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(
numerical_features, target, random_state=123
)
# fit our pipeline model
model.fit(X_train, y_train)
# score our model on the test data
model.score(X_test, y_test)
0.8135287855212513
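As a side note (illustrative, not required for the workflow), the fitted steps of a pipeline built with make_pipeline can be inspected through named_steps; the step names default to the lowercased class names:

# inspect the fitted estimators inside the pipeline
model.named_steps['standardscaler'].mean_        # means learned by the scaler
model.named_steps['logisticregression'].coef_    # coefficients of the fitted model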
Unfortunately, Scikit-learn does not accept categorical features in their raw form. Consequently, we need to transform them into numerical representations.
The following presents typical ways of dealing with categorical variables by encoding them, namely ordinal encoding and one-hot encoding.
The most intuitive strategy is to encode each category with a different number. The `OrdinalEncoder` will transform the data in such a manner. We will start by encoding a single column to understand how the encoding works.
from sklearn.preprocessing import OrdinalEncoder
# let's illustrate with the 'education' feature
education_column = features[["education"]]
encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded
array([[ 1.], [11.], [ 7.], ..., [11.], [11.], [11.]])
We see that each category in `"education"` has been replaced by a numeric value. We can check the mapping between the categories and the numerical values by inspecting the fitted attribute `categories_`.
encoder.categories_
[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college'], dtype=object)]
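If we ever need to map the integer codes back to the original labels, the encoder provides an inverse_transform method (shown here purely as an illustration):

# recover the original category labels from the first few encoded values
encoder.inverse_transform(education_encoded[:3])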
However, be careful when applying this encoding strategy: using this integer representation leads downstream predictive models to assume that the values are ordered (0 < 1 < 2 < 3... for instance).
By default, `OrdinalEncoder` uses a lexicographical strategy to map string category labels to integers. This strategy is arbitrary and often meaningless. For instance, suppose the dataset has a categorical variable named `"size"` with categories such as "S", "M", "L", "XL". We would like the integer representation to respect the meaning of the sizes by mapping them to increasing integers such as `0, 1, 2, 3`.
However, the lexicographical strategy used by default would map the labels
"S", "M", "L", "XL" to 2, 1, 0, 3, by following the alphabetical order.
The `OrdinalEncoder` class accepts a `categories` argument to pass the categories in the expected order explicitly (`categories[i]` holds the categories expected in the ith column).
ed_levels = [
' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th',
' 12th', ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm',
' Assoc-voc', ' Bachelors', ' Masters', ' Doctorate'
]
encoder = OrdinalEncoder(categories=[ed_levels])
education_encoded = encoder.fit_transform(education_column)
education_encoded
array([[ 6.], [ 8.], [11.], ..., [ 8.], [ 8.], [ 8.]])
encoder.categories_
[array([' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Masters', ' Doctorate'], dtype=object)]
`OneHotEncoder` is an alternative encoder that converts each categorical level into a new column. We will start by encoding a single feature (e.g. `"education"`) to illustrate how the encoding works.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
education_encoded = encoder.fit_transform(education_column)
education_encoded
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
Viewing this as a data frame provides a more intuitive illustration:
feature_names = encoder.get_feature_names_out(
input_features=["education"]
)
pd.DataFrame(education_encoded, columns=feature_names).head(5)
| | education_ 10th | education_ 11th | education_ 12th | education_ 1st-4th | education_ 5th-6th | education_ 7th-8th | education_ 9th | education_ Assoc-acdm | education_ Assoc-voc | education_ Bachelors | education_ Doctorate | education_ HS-grad | education_ Masters | education_ Preschool | education_ Prof-school | education_ Some-college |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Let's apply this encoding to all the categorical features:
# one-hot encode all features
cat_features_encoded = encoder.fit_transform(
features[categorical_columns]
)
# view as a data frame
columns_encoded = encoder.get_feature_names_out(
categorical_columns
)
pd.DataFrame(cat_features_encoded, columns=columns_encoded).head(3)
| | workclass_ ? | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay | education_ 10th | ... | native-country_ Portugal | native-country_ Puerto-Rico | native-country_ Scotland | native-country_ South | native-country_ Taiwan | native-country_ Thailand | native-country_ Trinadad&Tobago | native-country_ United-States | native-country_ Vietnam | native-country_ Yugoslavia |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |

3 rows × 102 columns
Warning
One-hot encoding can significantly increase the number of features in our data. In this case we went from 8 features to 102! If you have a data set with many categorical variables and those categorical variables in turn have many unique levels, the number of features can explode. In these cases you may want to explore ordinal encoding or some other alternative.
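To see where those 102 columns come from, we can count the unique levels of each categorical feature (a quick check using pandas; the sum of these counts gives the total number of one-hot encoded columns):

# number of unique levels per categorical feature
features[categorical_columns].nunique()
# total number of columns produced by one-hot encoding
features[categorical_columns].nunique().sum()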
Choosing an encoding strategy will depend on the underlying models and the type of categories (i.e. ordinal vs. nominal).
Tip
In general, `OneHotEncoder` is the encoding strategy used when the downstream models are linear models, while `OrdinalEncoder` is often a good strategy with tree-based models.
Using an `OrdinalEncoder` will output ordinal categories. This means that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The impact of violating this ordering assumption really depends on the downstream models. Linear models will be impacted by misordered categories, while tree-based models will not.
You can still use an `OrdinalEncoder` with linear models, but you need to be sure that the original categories have a meaningful order and that the encoded integers follow that same order.
One-hot encoding categorical variables with high cardinality can cause computational inefficiency in tree-based models. Because of this, it is not recommended to use `OneHotEncoder` in such cases even if the original categories do not have a given order.
Now let's look at how to combine some of these tasks so we can preprocess both numeric and categorical data.
First, let's get our train & test data established:
# drop the duplicated column `"education-num"` as stated in the data exploration notebook
features = features.drop(columns='education-num')
# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)
# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
features, target, random_state=123
)
Scikit-learn provides a `ColumnTransformer` class that sends specific columns to a specific transformer, making it easy to fit a single predictive model on a dataset that combines both kinds of variables.
We have already defined the columns based on their data type. We now create our `ColumnTransformer`, which requires three values for each transformer: a name, the transformer itself, and the columns it should be applied to.
First, let's create the preprocessors for the numerical and categorical parts.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
Now, we create the transformer and associate each of these preprocessors with their respective columns.
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer([
('one-hot-encoder', categorical_preprocessor, categorical_columns),
('standard_scaler', numerical_preprocessor, numerical_columns)
])
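Note that, by default, any column not listed in one of the transformers is dropped by the ColumnTransformer. If you want to keep such columns untouched, you can pass remainder='passthrough'; the variant below is a hypothetical sketch, not part of the original workflow:

# hypothetical variant: pass through any columns not handled by a transformer
preprocessor_keep_all = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
], remainder='passthrough')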
We can take a minute to represent graphically the structure of a `ColumnTransformer` within a pipeline:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'), ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']), ('standard_scaler', StandardScaler(), ['age', 'capital-gain', 'capital-loss', 'hours-per-week'])])), ('logisticregression', LogisticRegression(max_iter=500))])
# fit our model
_ = model.fit(X_train, y_train)
# score on test set
model.score(X_test, y_test)
0.8503808041929408
Unfortunately, we only have time to scratch the surface of feature engineering in this workshop. However, this module should provide you with a strong foundation of how to apply the more common feature preprocessing tasks.
Tip
Scikit-learn provides many feature engineering options. Learn more here: https://scikit-learn.org/stable/modules/preprocessing.html
In this module we learned how to:
- normalize numeric features with `StandardScaler`,
- encode categorical features with `OrdinalEncoder` and `OneHotEncoder`, and
- combine preprocessing and modeling steps with `ColumnTransformer` and `make_pipeline`.