Create \( X \) and \( \epsilon \) vectors of \( 100 \) standard normal values each.
set.seed(1)
X = rnorm(100)
eps = rnorm(100)
We set \( \beta_0 = 3 \), \( \beta_1 = 2 \), \( \beta_2 = -3 \) and \( \beta_3 = 0.3 \), and generate \( Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon \).
beta0 = 3
beta1 = 2
beta2 = -3
beta3 = 0.3
Y = beta0 + beta1 * X + beta2 * X^2 + beta3 * X^3 + eps
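A quick scatterplot of \( X \) against \( Y \) (not shown here) confirms the cubic shape of the simulated data:
plot(X, Y)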
Use \( regsubsets \) to select the best model among polynomials of \( X \) up to degree 10. Note that raw = T makes poly return the raw powers \( X, X^2, \ldots, X^{10} \) rather than orthogonal polynomials, so the fitted coefficients are directly comparable to the \( \beta \)s.
library(leaps)
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)
# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
## [1] 3
which.min(mod.summary$bic)
## [1] 3
which.max(mod.summary$adjr2)
## [1] 3
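The which matrix of the summary records which predictors enter each model size; inspecting its third row shows the membership of the selected \( 3 \)-variable model (output not shown):
# Predictors included in the best 3-variable model
mod.summary$which[3, ]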
# Plot cp, BIC and adjr2
plot(mod.summary$cp, xlab = "Subset Size", ylab = "Cp", pch = 20, type = "l")
points(3, mod.summary$cp[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$bic, xlab = "Subset Size", ylab = "BIC", pch = 20, type = "l")
points(3, mod.summary$bic[3], pch = 4, col = "red", lwd = 7)
plot(mod.summary$adjr2, xlab = "Subset Size", ylab = "Adjusted R2", pch = 20,
type = "l")
points(3, mod.summary$adjr2[3], pch = 4, col = "red", lwd = 7)
We find that the Cp, BIC and adjusted R2 criteria all select the \( 3 \)-variable model.
coefficients(mod.full, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 3.07627 2.35624 -3.16515
## poly(x, 10, raw = T)7
## 0.01047
All three criteria pick \( X^7 \) instead of \( X^3 \); the remaining coefficient estimates are quite close to the true \( \beta \)s.
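As a visual check of that claim, we can plot the fitted \( 3 \)-variable polynomial (intercept, \( X \), \( X^2 \), \( X^7 \)) against the true cubic; this is a quick sketch with illustrative variable names, output not shown:
# Compare the selected fit with the true curve on a grid
x.grid = seq(min(X), max(X), length.out = 200)
true.f = beta0 + beta1 * x.grid + beta2 * x.grid^2 + beta3 * x.grid^3
coefs = coefficients(mod.full, id = 3)
fit.f = coefs[1] + coefs[2] * x.grid + coefs[3] * x.grid^2 + coefs[4] * x.grid^7
plot(x.grid, true.f, type = "l", xlab = "x", ylab = "f(x)")
lines(x.grid, fit.f, col = "red", lty = 2)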
We fit forward and backward stepwise models to the data.
mod.fwd = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10,
method = "forward")
mod.bwd = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10,
method = "backward")
fwd.summary = summary(mod.fwd)
bwd.summary = summary(mod.bwd)
which.min(fwd.summary$cp)
## [1] 3
which.min(bwd.summary$cp)
## [1] 3
which.min(fwd.summary$bic)
## [1] 3
which.min(bwd.summary$bic)
## [1] 3
which.max(fwd.summary$adjr2)
## [1] 3
which.max(bwd.summary$adjr2)
## [1] 4
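Forward and backward search can disagree on which predictors enter at a given size; comparing the which matrices of the two summaries makes the difference explicit (output not shown):
# Compare 3-variable membership between the two searches
fwd.summary$which[3, ]
bwd.summary$which[3, ]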
# Plot the statistics
par(mfrow = c(3, 2))
plot(fwd.summary$cp, xlab = "Subset Size", ylab = "Forward Cp", pch = 20, type = "l")
points(3, fwd.summary$cp[3], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$cp, xlab = "Subset Size", ylab = "Backward Cp", pch = 20, type = "l")
points(3, bwd.summary$cp[3], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$bic, xlab = "Subset Size", ylab = "Forward BIC", pch = 20,
type = "l")
points(3, fwd.summary$bic[3], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$bic, xlab = "Subset Size", ylab = "Backward BIC", pch = 20,
type = "l")
points(3, bwd.summary$bic[3], pch = 4, col = "red", lwd = 7)
plot(fwd.summary$adjr2, xlab = "Subset Size", ylab = "Forward Adjusted R2",
pch = 20, type = "l")
points(3, fwd.summary$adjr2[3], pch = 4, col = "red", lwd = 7)
plot(bwd.summary$adjr2, xlab = "Subset Size", ylab = "Backward Adjusted R2",
pch = 20, type = "l")
points(4, bwd.summary$adjr2[4], pch = 4, col = "red", lwd = 7)
We see that all criteria pick \( 3 \)-variable models except backward stepwise with adjusted R2, which picks \( 4 \) variables. Here are the coefficients:
coefficients(mod.fwd, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 3.07627 2.35624 -3.16515
## poly(x, 10, raw = T)7
## 0.01047
coefficients(mod.bwd, id = 3)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 3.07888 2.41982 -3.17724
## poly(x, 10, raw = T)9
## 0.00187
coefficients(mod.fwd, id = 4)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 3.112359 2.369859 -3.275727
## poly(x, 10, raw = T)4 poly(x, 10, raw = T)7
## 0.027674 0.009997
Here forward stepwise picks \( X^7 \) over \( X^3 \), and its \( 4 \)-variable model adds \( X^4 \). Backward stepwise with \( 3 \) variables picks \( X^9 \) instead of \( X^3 \). All other coefficients are close to the true \( \beta \)s.
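Note that the \( 4 \)-variable coefficients shown above come from the forward fit; the \( 4 \)-variable backward model, the one its adjusted R2 actually selects, can be inspected the same way (output not shown):
coefficients(mod.bwd, id = 4)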
Next we train the lasso on the data.
library(glmnet)
## Loading required package: Matrix
## Loading required package: lattice
## Loaded glmnet 1.9-5
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1]
mod.lasso = cv.glmnet(xmat, Y, alpha = 1)
best.lambda = mod.lasso$lambda.min
best.lambda
## [1] 0.03991
plot(mod.lasso)
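Besides lambda.min, cv.glmnet also reports lambda.1se, the largest \( \lambda \) whose cross-validation error is within one standard error of the minimum; it is a common, more conservative choice that yields a sparser fit (output not shown):
# A more conservative penalty from the same CV object
mod.lasso$lambda.1se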
# Next fit the lasso on the entire data and extract coefficients at the best lambda
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.0398151
## poly(x, 10, raw = T)1 2.2303371
## poly(x, 10, raw = T)2 -3.1033193
## poly(x, 10, raw = T)3 .
## poly(x, 10, raw = T)4 .
## poly(x, 10, raw = T)5 0.0498411
## poly(x, 10, raw = T)6 .
## poly(x, 10, raw = T)7 0.0008068
## poly(x, 10, raw = T)8 .
## poly(x, 10, raw = T)9 .
## poly(x, 10, raw = T)10 .
The lasso also picks \( X^5 \) instead of \( X^3 \), along with \( X^7 \), whose coefficient is negligible.
Now create a new \( Y \) from the model \( Y = \beta_0 + \beta_7 X^7 + \epsilon \) with \( \beta_7 = 7 \).
beta7 = 7
Y = beta0 + beta7 * X^7 + eps
# Fit best subset selection on the new data
data.full = data.frame(y = Y, x = X)
mod.full = regsubsets(y ~ poly(x, 10, raw = T), data = data.full, nvmax = 10)
mod.summary = summary(mod.full)
# Find the model size for best cp, BIC and adjr2
which.min(mod.summary$cp)
## [1] 2
which.min(mod.summary$bic)
## [1] 1
which.max(mod.summary$adjr2)
## [1] 4
coefficients(mod.full, id = 1)
## (Intercept) poly(x, 10, raw = T)7
## 2.959 7.001
coefficients(mod.full, id = 2)
## (Intercept) poly(x, 10, raw = T)2 poly(x, 10, raw = T)7
## 3.0705 -0.1417 7.0016
coefficients(mod.full, id = 4)
## (Intercept) poly(x, 10, raw = T)1 poly(x, 10, raw = T)2
## 3.0763 0.2914 -0.1618
## poly(x, 10, raw = T)3 poly(x, 10, raw = T)7
## -0.2527 7.0091
We see that BIC picks the most accurate \( 1 \)-variable model, whose coefficients match the true values; Cp and adjusted R2 pick larger models whose extra coefficients are small.
xmat = model.matrix(y ~ poly(x, 10, raw = T), data = data.full)[, -1]
mod.lasso = cv.glmnet(xmat, Y, alpha = 1)
best.lambda = mod.lasso$lambda.min
best.lambda
## [1] 12.37
best.model = glmnet(xmat, Y, alpha = 1)
predict(best.model, s = best.lambda, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.820
## poly(x, 10, raw = T)1 .
## poly(x, 10, raw = T)2 .
## poly(x, 10, raw = T)3 .
## poly(x, 10, raw = T)4 .
## poly(x, 10, raw = T)5 .
## poly(x, 10, raw = T)6 .
## poly(x, 10, raw = T)7 6.797
## poly(x, 10, raw = T)8 .
## poly(x, 10, raw = T)9 .
## poly(x, 10, raw = T)10 .
The lasso also picks the best \( 1 \)-variable model, but its intercept is quite far off (\( 3.8 \) vs \( 3 \)).
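Since the lasso shrinks coefficients toward zero, a common follow-up is to refit ordinary least squares on the selected variable, which removes the shrinkage bias; a minimal sketch (output not shown):
# Refit least squares on the single lasso-selected term
refit = lm(y ~ I(x^7), data = data.full)
coefficients(refit)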