import mlflow

# Make "Predicting income" the active MLflow experiment. The returned object
# is kept because its experiment_id is needed later to query the runs.
experiment = mlflow.set_experiment(experiment_name="Predicting income")


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

# Load the adult census data set.
adult_census = pd.read_csv('../data/adult-census.csv')

# 'class' is the prediction target; it and 'education-num' are both removed
# from the predictor frame.
target = adult_census['class']
features = adult_census.drop(['class', 'education-num'], axis=1)

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

# Column selectors keyed on dtype: object columns are handled as categorical,
# every other column as numeric.
cat_col_selector = selector(dtype_include=object)
numer_col_selector = selector(dtype_exclude=object)

# One transformer per column family.
encoder = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

# Resolve the selectors against the feature frame so the transformer receives
# explicit column-name lists.
categorical_columns = cat_col_selector(features)
numeric_columns = numer_col_selector(features)

preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', encoder, categorical_columns),
        ('standard_scaler', scaler, numeric_columns),
    ]
)


from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


# Manually managed run: start_run() opens it and end_run() has to close it.
# The try/finally guarantees the run is closed even when fitting or scoring
# raises; without it the run would stay active and a later start_run() call
# would fail because a run is already in progress.
mlflow.start_run(run_name='first_mlflow_run')
run_status = 'FAILED'  # assumed until every step below has succeeded
try:
    # Record the hyperparameter alongside the model that uses it.
    mlflow.log_param('max_iter', 500)
    model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

    _ = model.fit(X_train, y_train)

    # Score on the held-out split and store the result with the run.
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric('accuracy', accuracy)
    run_status = 'FINISHED'
finally:
    mlflow.end_run(status=run_status)


# Same workflow as a manually started run, but the run's lifetime is handled
# by the context manager instead of explicit start/end calls.
with mlflow.start_run(run_name='run_as_context_mgr') as run:
    # Record the hyperparameter before training.
    mlflow.log_param(key='max_iter', value=500)

    # Preprocessing + logistic regression, fitted on the training split.
    log_reg = LogisticRegression(max_iter=500)
    model = make_pipeline(preprocessor, log_reg)
    _ = model.fit(X_train, y_train)

    # Held-out accuracy, stored as a run metric.
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric(key='accuracy', value=accuracy)


# Baseline run: besides the parameter and metric, this one also tags the run
# with the estimator type and stores the fitted pipeline itself.
with mlflow.start_run(run_name='baseline_model') as run:
    # Metadata describing what this run trains.
    mlflow.set_tag(key='Estimator', value='LogisticRegression')
    mlflow.log_param(key='max_iter', value=500)

    # Fit preprocessing + logistic regression on the training split.
    log_reg = LogisticRegression(max_iter=500)
    model = make_pipeline(preprocessor, log_reg)
    _ = model.fit(X_train, y_train)

    # Persist the fitted pipeline with the run under 'baseline_model'.
    mlflow.sklearn.log_model(model, 'baseline_model')

    # Held-out accuracy, stored as a run metric.
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric(key='accuracy', value=accuracy)

/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.
  warnings.warn("Setuptools is replacing distutils.")


#!mlflow ui


# Turn on scikit-learn autologging for the fits that follow.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='autolog_run') as run:
    # No explicit log_param / log_model calls in this run, unlike the
    # earlier ones.
    log_reg = LogisticRegression(max_iter=500)
    model = make_pipeline(preprocessor, log_reg)
    _ = model.fit(X_train, y_train)

    # The held-out score is still logged by hand.
    test_accuracy = model.score(X_test, y_test)
    mlflow.log_metric('test_accuracy', test_accuracy)

2024/01/07 16:09:34 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/data/digest_utils.py:26: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead."
2024/01/07 16:09:34 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/data/pandas_dataset.py:134: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/01/07 16:09:36 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/models/signature.py:212: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."


#!mlflow ui


from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Untuned k-nearest-neighbours classifier.
knn = KNeighborsClassifier()

# Candidate neighbour counts; the 'knn__' prefix addresses the pipeline step
# named 'knn' below.
hyper_grid = {'knn__n_neighbors': [5, 10, 15, 20]}

# Preprocessing followed by the classifier, with explicit step names.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('knn', knn)])

# Autologging is switched on before the search is fitted.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='knn_grid_search') as run:
    # 5-fold cross-validated search over the grid, scored by ROC AUC.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=hyper_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    results = grid_search.fit(X_train, y_train)

2024/01/07 16:09:37 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/data/digest_utils.py:26: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead."
2024/01/07 16:09:37 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/data/pandas_dataset.py:134: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/01/07 16:11:58 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/opt/homebrew/anaconda3/envs/uc-python-advanced/lib/python3.11/site-packages/mlflow/models/signature.py:212: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/01/07 16:12:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


#!mlflow ui


# Fetch the runs recorded under this experiment as a DataFrame.
# experiment_ids is documented as a list of IDs, so the single ID is wrapped
# in a list instead of relying on a bare string being accepted.
df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
# Preview the first three rows.
df.head(3)


# Boolean mask over the runs whose name is 'knn_grid_search'.
model_filter = df['tags.mlflow.runName'].eq('knn_grid_search')

# Take the run ID from the last matching row.
# NOTE(review): search_runs is documented to return newest runs first by
# default, so the last row would be the OLDEST matching run if this notebook
# has been executed more than once - confirm that is the intended pick.
matching_run_ids = df.loc[model_filter, 'run_id']
run_id = matching_run_ids.iloc[-1]


# Address the tuned pipeline through a run-relative model URI. A literal
# 'mlruns/<experiment>/<run>/artifacts/...' path only resolves when the
# default local file store sits in the current working directory, whereas a
# 'runs:/' URI lets MLflow look up the run's artifact location itself.
model_path = f'runs:/{run_id}/best_estimator'

# Load the 'best_estimator' artifact stored with the grid-search run.
model = mlflow.sklearn.load_model(model_path)
model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['age', 'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week'])])),
                ('knn', KNeighborsClassifier(n_neighbors=20))])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['age', 'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week'])])),
                ('knn', KNeighborsClassifier(n_neighbors=20))])

ColumnTransformer(transformers=[('one-hot-encoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['workclass', 'education', 'marital-status',
                                  'occupation', 'relationship', 'race', 'sex',
                                  'native-country']),
                                ('standard_scaler', StandardScaler(),
                                 ['age', 'capital-gain', 'capital-loss',
                                  'hours-per-week'])])

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

OneHotEncoder(handle_unknown='ignore')

['age', 'capital-gain', 'capital-loss', 'hours-per-week']

StandardScaler()

KNeighborsClassifier(n_neighbors=20)

	run_id	experiment_id	status	artifact_uri	start_time	end_time	metrics.std_score_time	metrics.std_test_score	metrics.std_fit_time	metrics.rank_test_score	...	tags.mlflow.runName	tags.estimator_name	tags.mlflow.source.type	tags.estimator_class	tags.mlflow.autologging	tags.mlflow.source.name	tags.mlflow.user	tags.mlflow.parentRunId	tags.mlflow.log-model.history	tags.Estimator
0	02fc5f7cf19d433ab194e17c9796d984	756538018307011023	FINISHED	file:///Users/jamescunningham/Projects/advance...	2024-01-07 21:09:37.743000+00:00	2024-01-07 21:12:00.166000+00:00	0.033195	0.004886	0.007759	4.0	...	resilient-bird-785	Pipeline	LOCAL	sklearn.pipeline.Pipeline	sklearn	/opt/homebrew/anaconda3/envs/uc-python-advance...	jamescunningham	d9324a90f4ab4762b69110d473fd3aa1	None	None
1	432ab39a489b48e28c9fbd7262e11799	756538018307011023	FINISHED	file:///Users/jamescunningham/Projects/advance...	2024-01-07 21:09:37.743000+00:00	2024-01-07 21:12:00.166000+00:00	0.041490	0.002964	0.002496	1.0	...	skittish-cod-799	Pipeline	LOCAL	sklearn.pipeline.Pipeline	sklearn	/opt/homebrew/anaconda3/envs/uc-python-advance...	jamescunningham	d9324a90f4ab4762b69110d473fd3aa1	None	None
2	43ec7480d8114fc1b2f998f4a69d008d	756538018307011023	FINISHED	file:///Users/jamescunningham/Projects/advance...	2024-01-07 21:09:37.743000+00:00	2024-01-07 21:12:00.166000+00:00	0.101619	0.003366	0.001141	2.0	...	valuable-koi-692	Pipeline	LOCAL	sklearn.pipeline.Pipeline	sklearn	/opt/homebrew/anaconda3/envs/uc-python-advance...	jamescunningham	d9324a90f4ab4762b69110d473fd3aa1	None	None

ML Lifecycle Management

Objective

Problem

Intro to MLflow

Model Tracking

MLflow UI

Auto logging

Hyperparameter tuning

Accessing run information

Registering models

Wrapping up