# Exercise material for Linear Regression Methods
import sklearn.datasets as ds
import numpy as np
# we refer to the book
# Elements of Statistical Learning: Data Mining, Inference, and Prediction [ESL]
# by Trevor Hastie, Robert Tibshirani, and Jerome H. Friedman
# https://web.stanford.edu/~hastie/Papers/ESLII.pdf
# load dataset
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so fall back to the diabetes regression dataset (suggested below) when it
# is unavailable, keeping the exercises runnable on modern scikit-learn.
try:
    boston = ds.load_boston()
    # http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston
except (AttributeError, ImportError):
    # on scikit-learn >= 1.2 the name is a stub raising ImportError (or is
    # absent entirely); use the diabetes dataset instead
    boston = ds.load_diabetes()
    # http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes
X = boston.data    # feature matrix, shape (n_samples, n_features)
y = boston.target  # regression target, shape (n_samples,)
# split train and test: first 70% of rows for training, rest for testing
# (no shuffling — the split is deterministic on the dataset's row order)
percentage = 0.7
length = int(len(y) * percentage)
X_train = X[:length, :]
y_train = y[:length]
X_test = X[length:, :]
y_test = y[length:]
# remember to add a column of all 1s to the matrix X to include the intercept in the estimation
# Exercise 1
# implement the OLS estimator as a function which takes X,y as input and returns beta_hat
# Exercise 2
# implement subset selection, following [ESL] Section 3.3.1
# to generate all subsets from {0, .., n-1} of cardinality k use
import itertools
# demo for Exercise 2: enumerate all subsets of {0, .., n-1} of cardinality k
n = 10
k = 2
# bind the result to a name instead of discarding it, so it can be inspected
# and reused when implementing best-subset selection
subsets = list(itertools.combinations(range(n), k))
#
# train the regressor using the train set
# plot the MSE for both train and test over different values for the number of variables in the subset, as in Figure 3.5 of [ESL]
# Exercise 3
# implement forward step-wise selection, following [ESL] Section 3.3.2
# train the regressor using the train set
# plot the MSE for both train and test over different values for the number of variables in the subset, as in Figure 3.5 of [ESL]
# compare the plot with those of Exercise 2, as in Figure 3.6 of [ESL]. What do you observe?
# Exercise 4
# implement ridge regression, following [ESL] Section 3.4.1
# train the regressor using the train set
# plot the MSE for both train and test over different values of the regularization parameter
# plot the values of each beta_i for ridge regression over different values of the regularization parameter, as in Figure 3.8 of [ESL]
# Exercise 5
# train lasso/lars using the train set, described in 3.4.1 of [ESL]
# use the function sklearn.linear_model.lars_path
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.lars_path.html#sklearn.linear_model.lars_path
# plot the MSE for both train and test over different values of the regularization parameter
# plot the values of each beta_i for lasso over different values of the regularization parameter, as in Figure 3.10 of [ESL]
# compare the plot with those of Exercise 4. What do you observe?
# Exercise 6
# Add to the matrix X, for each observation, all the monomials of the form X_i * X_j
# Re-run the code of all the Exercises 1-5
# What do you observe?