Import the libraries I need

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

Define the DoKFold helper function

def DoKFold(model, your_feature_obj, your_target_obj, k, standardize = False, random_state = 146):
    """Run shuffled K-fold cross-validation and collect per-fold metrics.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
        Re-fit from scratch on the training portion of every fold.
    your_feature_obj : array-like, 2-D (samples x features)
        Feature matrix; converted with ``np.asarray`` so arrays and
        DataFrames are both accepted.
    your_target_obj : array-like (one entry per sample)
        Target values; converted with ``np.asarray``.
    k : int
        Number of folds.
    standardize : bool, default False
        If True, scale the features inside each fold with a scaler that is
        fit on the training rows only.
    random_state : int, default 146
        Seed for the shuffled fold assignment.

    Returns
    -------
    tuple of four lists (train_score, test_score, train_mse, test_mse)
        One entry per fold. Scores come from ``model.score`` (R^2 for
        scikit-learn regressors); MSEs are mean squared prediction errors.
    """
    # Use the arguments that were passed in rather than module-level globals.
    X = np.asarray(your_feature_obj)
    y = np.asarray(your_target_obj)

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    scaler = SS() if standardize else None

    train_score = []
    test_score = []
    train_mse = []
    test_mse = []

    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain, :]
        Xtest = X[idxTest, :]
        ytrain = y[idxTrain]
        ytest = y[idxTest]

        if standardize:
            # Learn the scaling from the training fold only, then apply that
            # same scaling to the test fold. Re-fitting on the test fold
            # would leak held-out statistics and scale the two folds
            # inconsistently.
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)

        model.fit(Xtrain, ytrain)

        train_score.append(model.score(Xtrain, ytrain))
        test_score.append(model.score(Xtest, ytest))

        y_train_predict = model.predict(Xtrain)
        y_test_predict = model.predict(Xtest)

        train_mse.append(np.mean((ytrain - y_train_predict) ** 2))
        test_mse.append(np.mean((ytest - y_test_predict) ** 2))

    return train_score, test_score, train_mse, test_mse

Import the data; set X as the features and y as the target

# Load the California housing dataset through scikit-learn.
data = fetch_california_housing()

# Feature table with named columns; the regression target is appended last.
housing = pd.DataFrame(data.data, columns=data.feature_names)
housing['target'] = data.target

# NumPy arrays used for modelling: the first eight columns are the features.
X = np.array(housing.iloc[:, :8])
y = np.array(housing['target'])

Question 15

To calculate the correlations between every feature and target, we could just calculate correlations between all variables

# Pairwise correlation matrix of every column in `housing`; the 'target'
# row/column holds each feature's correlation with the target.
housing.corr()

which gives us the following output:

From that, we know that MedInc is the variable that is most strongly correlated with the target.

Question 16

For this question, we can create a copy of the original dataframe, standardize all the feature columns, and calculate the correlations again. This gave me exactly the same results as in the previous question, which is expected because correlation is unchanged by shifting and rescaling a variable.

ss = SS()
# Work on a real copy so the original `housing` frame is left untouched
# (plain assignment would only create a second name for the same object).
housing_trans = housing.copy()
# fit_transform returns a new array and does not modify its input, so the
# standardized values must be written back into the feature columns.
feature_cols = housing_trans.columns[:8]
housing_trans[feature_cols] = ss.fit_transform(housing_trans[feature_cols])
# Correlations of the standardized features (and the unscaled target).
housing_trans.corr()

Question 17

For this question, we can calculate the correlation coefficient matrix for MedInc and the target. The off-diagonal entries (row 1, column 2 and row 2, column 1) both hold the correlation between MedInc and the target. We can take either of these two values and square it to get the coefficient of determination, then use np.round to round the number to the desired number of digits after the decimal point.

# Squared correlation between MedInc and the target, rounded to two decimals.
np.round(np.corrcoef(housing['MedInc'], y)[0, 1] ** 2, 2)

Question 18

Call linear regression method and use the DoKFold function to fit the model. Set k=20 and standardize = True

# 20-fold cross-validation of ordinary least squares with standardization on.
lin_reg = LinearRegression()
train_score, test_score, mse_train, mse_test = DoKFold(
    lin_reg, X, y, 20, standardize=True
)

# Fold-averaged training score, testing score, training MSE, testing MSE,
# one value per line.
print(
    np.mean(train_score),
    np.mean(test_score),
    np.mean(mse_train),
    np.mean(mse_test),
    sep='\n',
)

Question 19

To find the alpha value that maximizes the mean testing score, record the average training/testing score and MSE for each alpha in the range, then use argmax to find the index of the highest average testing score and report the metrics at that index.

# Candidate Ridge penalties: 101 evenly spaced values from 20 to 30.
a_range = np.linspace(20, 30, 101)

k = 20

# Fold-averaged metrics, one entry per candidate alpha.
avg_tr_score = []
avg_te_score = []
avg_tr_mse = []
avg_te_mse = []

for a in a_range:
    rid_reg = Ridge(alpha=a)
    # Standardize within each fold, matching the Question 18 settings: the
    # Ridge penalty acts on coefficient magnitudes, which depend on feature
    # scale, so alphas are only comparable on standardized features.
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        rid_reg, X, y, k, standardize=True
    )
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))
    avg_tr_mse.append(np.mean(train_mse))
    avg_te_mse.append(np.mean(test_mse))

# Pick the alpha with the highest mean testing score.
idx = np.argmax(avg_te_score)
print(avg_tr_score[idx], avg_te_score[idx], avg_tr_mse[idx], avg_te_mse[idx])
print('Optimal alpha value: ' + format(a_range[idx], '.3f'))

Question 20

Exactly the same as question 19, except with a different alpha range and a Lasso regression model this time.

# Candidate Lasso penalties: 101 evenly spaced values from 0.001 to 0.003.
a_range = np.linspace(0.001, 0.003, 101)

k = 20

# Fold-averaged metrics, one entry per candidate alpha.
avg_tr_score = []
avg_te_score = []
avg_tr_mse = []
avg_te_mse = []

for a in a_range:
    las_reg = Lasso(alpha=a)
    # Standardize within each fold, matching the Question 18 settings: the
    # Lasso penalty acts on coefficient magnitudes, which depend on feature
    # scale, so alphas are only comparable on standardized features.
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        las_reg, X, y, k, standardize=True
    )
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))
    avg_tr_mse.append(np.mean(train_mse))
    avg_te_mse.append(np.mean(test_mse))

# Pick the alpha with the highest mean testing score.
idx = np.argmax(avg_te_score)
print(avg_tr_score[idx], avg_te_score[idx], avg_tr_mse[idx], avg_te_mse[idx])
# The grid step is 0.00002, so five decimals are needed to tell neighbouring
# alphas apart (three decimals would print 0.00186 as 0.002).
print('Optimal alpha value: ' + format(a_range[idx], '.5f'))

Question 21

Firstly, standardize all the features using ss.fit_transform

# Standardize every feature column using statistics from the full dataset.
ss = SS()
ss.fit(X)
X_trans = ss.transform(X)

Then do the fitting for linear, ridge and lasso regression respectively

# Build the three estimators, then fit each one on the standardized features.
# fit() returns the estimator itself, so lin/rid/las are the fitted models.
lin_reg, rid_reg, las_reg = LinearRegression(), Ridge(alpha=20.8), Lasso(alpha=0.00186)
lin, rid, las = (est.fit(X_trans, y) for est in (lin_reg, rid_reg, las_reg))

From the correlation table in question 15, we know that AveOccup is the variable least correlated with the target. Its index among the features is 5, so we look at the coefficient at position 5 for each model.

# Coefficient at feature index 5 for the linear, ridge and lasso fits, in that order.
print(*(fitted.coef_[5] for fitted in (lin, rid, las)))

Question 22

Now we take a look at the coefficient for MedInc, which has an index 0.

# Coefficient at feature index 0 for the linear, ridge and lasso fits, in that order.
print(*(fitted.coef_[0] for fitted in (lin, rid, las)))

Question 23

The code is pretty much the same as in question 19. The only difference is that we now choose the index that corresponds to the smallest average MSE on the testing set.

# Candidate Ridge penalties: 101 evenly spaced values from 20 to 30.
a_range = np.linspace(20, 30, 101)

k = 20

# Fold-averaged metrics, one entry per candidate alpha.
avg_tr_score = []
avg_te_score = []
avg_tr_mse = []
avg_te_mse = []

for a in a_range:
    rid_reg = Ridge(alpha=a)
    # Standardize within each fold, matching the Question 18 settings: the
    # Ridge penalty acts on coefficient magnitudes, which depend on feature
    # scale, so alphas are only comparable on standardized features.
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        rid_reg, X, y, k, standardize=True
    )
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))
    avg_tr_mse.append(np.mean(train_mse))
    avg_te_mse.append(np.mean(test_mse))

# Pick the alpha with the lowest mean testing MSE.
idx = np.argmin(avg_te_mse)
print(avg_tr_score[idx], avg_te_score[idx], avg_tr_mse[idx], avg_te_mse[idx])
print('Optimal alpha value: ' + format(a_range[idx], '.3f'))

Question 24

Same as question 23, except with a different alpha range and Lasso regression this time.

# Candidate Lasso penalties: 101 evenly spaced values from 0.001 to 0.003.
a_range = np.linspace(0.001, 0.003, 101)

k = 20

# Fold-averaged metrics, one entry per candidate alpha.
avg_tr_score = []
avg_te_score = []
avg_tr_mse = []
avg_te_mse = []

for a in a_range:
    las_reg = Lasso(alpha=a)
    # Standardize within each fold, matching the Question 18 settings: the
    # Lasso penalty acts on coefficient magnitudes, which depend on feature
    # scale, so alphas are only comparable on standardized features.
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        las_reg, X, y, k, standardize=True
    )
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))
    avg_tr_mse.append(np.mean(train_mse))
    avg_te_mse.append(np.mean(test_mse))

# Pick the alpha with the lowest mean testing MSE.
idx = np.argmin(avg_te_mse)
print(avg_tr_score[idx], avg_te_score[idx], avg_tr_mse[idx], avg_te_mse[idx])
# The grid step is 0.00002, so five decimals are needed to tell neighbouring
# alphas apart (three decimals would print 0.00186 as 0.002).
print('Optimal alpha value: ' + format(a_range[idx], '.5f'))

On the test, I did not get the last question correct, and I still have not quite figured out why. I had pretty much the exact same code during the exam, but it kept giving me 0.001 as the result.

DATA146