Machine Learning with Scikit Learn: regularized regressions

machine learning
Author

João Ramalho

Published

January 13, 2023

Introduction

Setup

library(reticulate)
library(here)
here() starts at /home/joao/JR-IA
py_discover_config()
python:         /home/joao/JR-IA/renv/python/condaenvs/renv-python/bin/python
libpython:      /home/joao/JR-IA/renv/python/condaenvs/renv-python/lib/libpython3.7m.so
pythonhome:     /home/joao/JR-IA/renv/python/condaenvs/renv-python:/home/joao/JR-IA/renv/python/condaenvs/renv-python
version:        3.7.13 (default, Mar 29 2022, 02:18:16)  [GCC 7.5.0]
numpy:          /home/joao/JR-IA/renv/python/condaenvs/renv-python/lib/python3.7/site-packages/numpy
numpy_version:  1.21.6

NOTE: Python version was forced by RETICULATE_PYTHON
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

Data load and transform

#diabetes_df = pd.read_csv("posts/20230113/data/diabetes_clean.csv")

diabetes_df = pd.read_csv("data/diabetes_clean.csv")
X = diabetes_df.drop("glucose", axis=1).values
y = diabetes_df["glucose"].values
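
Before fitting anything it is worth a quick look at the loaded table; a minimal sanity check, assuming the CSV loaded as above:

# quick look at the data: dimensions, column names and missing values
print(diabetes_df.shape)
print(diabetes_df.columns.tolist())
print(diabetes_df.isna().sum().sum())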

Ridge regression

Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Select alpha

alphas = [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
ridge_scores = []
for alpha in alphas:
  ridge = Ridge(alpha=alpha)
  ridge.fit(X_train, y_train)
  score = ridge.score(X_test, y_test)
  ridge_scores.append(score)
Ridge(alpha=0.1)
Ridge()
Ridge(alpha=10.0)
Ridge(alpha=100.0)
Ridge(alpha=1000.0)
Ridge(alpha=10000.0)
print(ridge_scores)
[0.2828466623222219, 0.2832063357480473, 0.2853000732200003, 0.2642398481266811, 0.19292424694100896, 0.17682728550498084]

We select an alpha of 10.
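
The scores above rely on a single train/test split. Since cross_val_score and KFold are already imported, an alternative is to average the R2 across several folds of the training set before choosing alpha; a minimal sketch (the fold count is an arbitrary choice, not from the original run):

# cross-validated alpha selection: mean R2 over 6 folds of the training data
kf = KFold(n_splits=6, shuffle=True, random_state=42)
for alpha in alphas:
  cv_scores = cross_val_score(Ridge(alpha=alpha), X_train, y_train, cv=kf)
  print(alpha, cv_scores.mean().round(3))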

Fit

ridge = Ridge(alpha = 10)
ridge.fit(X_train, y_train)
Ridge(alpha=10)

Predict

y_pred = ridge.predict(X_test)

Metrics

rg_r2 = ridge.score(X_test, y_test).round(2)
# squared=False returns the root mean squared error (RMSE)
rg_rmse = mean_squared_error(y_test, y_pred, squared=False).round(1)
print("R2:", rg_r2, "RMSE:", rg_rmse)
R2: 0.29 RMSE: 26.3
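
Note that squared=False makes mean_squared_error return the root mean squared error; the same value can be obtained by taking the square root of the default output:

# equivalent RMSE computation, for reference
print(np.sqrt(mean_squared_error(y_test, y_pred)).round(1))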

Fit plot

print(y_test.shape, y_pred.shape)
(231,) (231,)
dict_lmplot = {'y_test' : y_test, 'y_pred' : y_pred}
data_lmplot = pd.DataFrame(data=dict_lmplot)
plt.clf()
sns.set_palette("RdBu")
g = sns.lmplot(x = 'y_test', y = 'y_pred', data = data_lmplot)
g.set(xlabel="Glucose (real)", ylabel="Glucose (predicted)")
g.fig.suptitle("Fit plot of glucose (ridge)")
plt.tight_layout()
plt.show()

plt.savefig("posts/20230113/glucose_fit_ridge.png")

Lasso regression

Select alpha

alphas = [0.1, 0.5, 1, 1.5, 2]
lasso_scores = []
for alpha in alphas:
  lasso = Lasso(alpha=alpha)
  lasso.fit(X_train, y_train)
  score = lasso.score(X_test, y_test)
  lasso_scores.append(score)
Lasso(alpha=0.1)
Lasso(alpha=0.5)
Lasso(alpha=1)
Lasso(alpha=1.5)
Lasso(alpha=2)
print(lasso_scores)
[0.2857532569482625, 0.2949263183045806, 0.29795609755827845, 0.2918166945814026, 0.2825959618544781]

We select an alpha of 1.
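
scikit-learn can also search the penalty automatically with LassoCV, which cross-validates a grid of alphas; a minimal sketch, not part of the original workflow:

# automatic alpha search over the same grid, with 5-fold cross-validation
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)
print(lasso_cv.alpha_)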

Fit

lasso = Lasso(alpha = 1)
# fit on the training set only, consistent with the ridge model above
lasso.fit(X_train, y_train)
Lasso(alpha=1)

Predict

y_pred = lasso.predict(X_test)

Metrics

ls_r2 = lasso.score(X_test, y_test).round(2)
# squared=False again returns the RMSE
ls_rmse = mean_squared_error(y_test, y_pred, squared=False).round(1)
print("R2:", ls_r2, "RMSE:", ls_rmse)
R2: 0.3 RMSE: 25.8

Fit plot

print(y_test.shape, y_pred.shape)
(231,) (231,)
dict_lmplot = {'y_test' : y_test, 'y_pred' : y_pred}
data_lmplot = pd.DataFrame(data=dict_lmplot)
plt.clf()
sns.set_palette("PRGn")
g = sns.lmplot(x = 'y_test', y = 'y_pred', data = data_lmplot)
g.set(xlabel="Glucose (real)", ylabel="Glucose (predicted)")
g.fig.suptitle("Fit plot of glucose (lasso)")
plt.tight_layout()
plt.show()

plt.savefig("posts/20230113/glucose_fit_lasso.png")

Coefficients

# Compute and print the coefficients
lasso_coef = lasso.fit(X,y).coef_
print(lasso_coef)
[-0.22906289  0.1058408  -0.28282436  0.09302862  0.38436673  0.
  0.49564721 19.95533798]
lasso_names = diabetes_df.drop("glucose", axis=1).columns
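
Pairing the coefficients with the column names makes the zeroed-out predictor easier to spot; a small sketch, sorted by absolute size for readability:

# coefficients labelled by feature name, largest absolute effect first
coef_series = pd.Series(lasso_coef, index=lasso_names)
print(coef_series.reindex(coef_series.abs().sort_values(ascending=False).index))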

VI plot

plt.clf()
plt.bar(lasso_names, lasso_coef)
<BarContainer object of 8 artists>
plt.title("Glucose predictors (lasso)\n(model coefficients)")
plt.xticks(rotation=45)
([0, 1, 2, 3, 4, 5, 6, 7], <a list of 8 Text major ticklabel objects>)
plt.tight_layout()
plt.show()

plt.savefig("posts/20230113/glucose_vi_lasso.png")

Compare models

Model   R2     RMSE
lm      0.28   26.3
lasso   0.30   25.8
ridge   0.29   26.3
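
The same summary can be assembled from the metrics computed above; a minimal sketch, with the lm row typed in by hand from the earlier linear regression post:

# summary table: earlier linear model plus the two regularized models
results = pd.DataFrame(
  {"R2": [0.28, ls_r2, rg_r2], "RMSE": [26.3, ls_rmse, rg_rmse]},
  index=["lm", "lasso", "ridge"])
print(results)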