library(ISLR)
sum(is.na(Hitters$Salary))
## [1] 59
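# remove players whose Salary is missing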
Hitters = Hitters[-which(is.na(Hitters$Salary)), ]
sum(is.na(Hitters$Salary))
## [1] 0
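# log-transform the response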
Hitters$Salary = log(Hitters$Salary)
train = 1:200
Hitters.train = Hitters[train, ]
Hitters.test = Hitters[-train, ]
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
set.seed(103)
pows = seq(-10, -0.2, by = 0.1)
lambdas = 10^pows
length.lambdas = length(lambdas)
train.errors = rep(NA, length.lambdas)
test.errors = rep(NA, length.lambdas)
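# fit a boosted model for each shrinkage value and record training and test MSE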
for (i in 1:length.lambdas) {
boost.hitters = gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
n.trees = 1000, shrinkage = lambdas[i])
train.pred = predict(boost.hitters, Hitters.train, n.trees = 1000)
test.pred = predict(boost.hitters, Hitters.test, n.trees = 1000)
train.errors[i] = mean((Hitters.train$Salary - train.pred)^2)
test.errors[i] = mean((Hitters.test$Salary - test.pred)^2)
}
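# plot training and test MSE as a function of the shrinkage parameter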
plot(lambdas, train.errors, type = "b", xlab = "Shrinkage", ylab = "Train MSE",
col = "blue", pch = 20)
plot(lambdas, test.errors, type = "b", xlab = "Shrinkage", ylab = "Test MSE",
col = "red", pch = 20)
min(test.errors)
## [1] 0.2561
lambdas[which.min(test.errors)]
## [1] 0.05012
The minimum test MSE, about \( 0.256 \), is obtained at \( \lambda \approx 0.05 \).
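As a side note (not part of the original analysis), gbm can also estimate the optimal number of trees by cross-validation through its cv.folds argument; a minimal sketch, reusing the shrinkage value of roughly 0.05 selected by the grid search above and assuming 5 folds:
# sketch: 5-fold CV inside gbm; shrinkage = 0.05 and cv.folds = 5 are assumptions
set.seed(103)
boost.cv = gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
    n.trees = 1000, shrinkage = 0.05, cv.folds = 5)
best.iter = gbm.perf(boost.cv, method = "cv")  # CV-optimal number of trees
cv.pred = predict(boost.cv, Hitters.test, n.trees = best.iter)
mean((Hitters.test$Salary - cv.pred)^2)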
lm.fit = lm(Salary ~ ., data = Hitters.train)
lm.pred = predict(lm.fit, Hitters.test)
mean((Hitters.test$Salary - lm.pred)^2)
## [1] 0.4918
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-5
set.seed(134)
x = model.matrix(Salary ~ ., data = Hitters.train)
y = Hitters.train$Salary
x.test = model.matrix(Salary ~ ., data = Hitters.test)
lasso.fit = glmnet(x, y, alpha = 1)
lasso.pred = predict(lasso.fit, s = 0.01, newx = x.test)
mean((Hitters.test$Salary - lasso.pred)^2)
## [1] 0.4701
Both the linear model and regularization via the lasso have higher test MSE (about \( 0.49 \) and \( 0.47 \), respectively) than boosting (about \( 0.26 \)).
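The lasso fit above uses a fixed penalty of \( s = 0.01 \); a minimal sketch (not part of the original analysis) that instead lets cv.glmnet pick the penalty by cross-validation:
# sketch: select the lasso penalty by cross-validation rather than fixing s = 0.01
set.seed(134)
cv.lasso = cv.glmnet(x, y, alpha = 1)
lasso.cv.pred = predict(cv.lasso, s = "lambda.min", newx = x.test)
mean((Hitters.test$Salary - lasso.cv.pred)^2)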
boost.best = gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
n.trees = 1000, shrinkage = lambdas[which.min(test.errors)])
summary(boost.best)
## var rel.inf
## CAtBat CAtBat 22.7563
## CWalks CWalks 10.4280
## CHits CHits 8.6198
## PutOuts PutOuts 6.6159
## Years Years 6.4612
## Walks Walks 6.2331
## CRBI CRBI 6.0927
## CHmRun CHmRun 5.1076
## RBI RBI 4.5322
## CRuns CRuns 4.4728
## Assists Assists 3.8367
## HmRun HmRun 3.1554
## Hits Hits 3.1229
## AtBat AtBat 2.4339
## Errors Errors 2.4324
## Runs Runs 2.1425
## Division Division 0.7042
## NewLeague NewLeague 0.6675
## League League 0.1849
\( \tt{CAtBat} \), \( \tt{CWalks} \) and \( \tt{CHits} \) are the three most important variables, in that order.
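As an illustrative sketch (not in the original write-up), the partial dependence of log salary on the top variable can be drawn with gbm's plot method:
# sketch: partial dependence of log(Salary) on CAtBat from the tuned boosting model
plot(boost.best, i.var = "CAtBat", n.trees = 1000)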
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
set.seed(21)
rf.hitters = randomForest(Salary ~ ., data = Hitters.train, ntree = 500, mtry = 19)
rf.pred = predict(rf.hitters, Hitters.test)
mean((Hitters.test$Salary - rf.pred)^2)
## [1] 0.2298
With \( \tt{mtry} = 19 \), all 19 predictors are considered at every split, so this random forest is equivalent to bagging. The test MSE for bagging is about \( 0.23 \), slightly lower than the best test MSE for boosting (about \( 0.26 \)).
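As a hedged follow-up sketch (not part of the original analysis), the same comparison can be run with a smaller \( \tt{mtry} \), e.g. the usual regression default of about \( p/3 = 6 \), to get a decorrelated random forest rather than bagging:
# sketch: random forest with mtry = 6 (approx. p/3, an assumed value) instead of all 19 predictors
set.seed(21)
rf.small = randomForest(Salary ~ ., data = Hitters.train, ntree = 500, mtry = 6)
rf.small.pred = predict(rf.small, Hitters.test)
mean((Hitters.test$Salary - rf.small.pred)^2)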