Chapter 5: Exercise 5

library(ISLR)
# Print per-column summaries of the Default data set; the console output is
# reproduced in the `##` lines below.
summary(Default)
##  default    student       balance         income     
##  No :9667   No :7056   Min.   :   0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 482   1st Qu.:21340  
##                        Median : 824   Median :34553  
##                        Mean   : 835   Mean   :33517  
##                        3rd Qu.:1166   3rd Qu.:43808  
##                        Max.   :2654   Max.   :73554
# Put the columns of Default on the search path so they can be referenced by
# bare name.
# NOTE(review): every model fit and lookup visible in this section already
# passes `data = Default` or indexes `Default` directly, so this attach()
# looks unnecessary here -- confirm nothing else relies on it before removing.
attach(Default)

a

# Part (a): logistic regression of default status on income and balance,
# fitted to the complete data set. The RNG seed is fixed first; glm() itself
# draws no random numbers, so the seed only affects random sampling done
# after this point.
set.seed(1)
glm.fit <- glm(
    formula = default ~ income + balance,
    family = binomial,
    data = Default
)

b

# Estimate the test error of the logistic model `default ~ income + balance`
# with a single validation-set split.
#
# Args:
#   data: data frame with a two-level factor column `default` (levels "No",
#         "Yes") and numeric columns `income` and `balance`. Defaults to the
#         global `Default` data set, so a bare `FiveB()` call behaves as before.
#
# Returns:
#   The misclassification rate (a single number in [0, 1]) on the rows that
#   were held out of training.
FiveB <- function(data = Default) {
    n <- nrow(data)
    # i. Randomly assign half of the rows (rounded down) to the training set.
    train <- sample(n, n %/% 2)
    held_out <- data[-train, , drop = FALSE]
    # ii. Fit the logistic regression on the training rows only.
    fit <- glm(default ~ income + balance, data = data[train, , drop = FALSE],
        family = binomial)
    # iii. Predict the probability of "Yes" for each held-out row and classify
    # at the 0.5 threshold. The labels are derived from the probabilities
    # themselves, so their count always matches the hold-out set -- a vector
    # pre-sized to n/2 is one element short when n is odd and yields NA.
    probs <- predict(fit, held_out, type = "response")
    pred <- ifelse(probs > 0.5, "Yes", "No")
    # iv. Validation-set error: share of held-out rows that are misclassified.
    mean(pred != held_out$default)
}
# Run the validation-set estimate once and display the resulting error rate.
print(FiveB())
## [1] 0.0286

The validation set approach gives an estimated test error rate of 2.86%.

c

# Part (c): repeat the validation-set estimate on three fresh random splits,
# printing the error rate from each one.
for (run in seq_len(3)) {
    print(FiveB())
}
## [1] 0.0236
## [1] 0.028
## [1] 0.0268

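Across these three additional random splits the test error rate is 2.36%, 2.80% and 2.68%, averaging about 2.6%. The estimate varies from run to run because it depends on which observations fall into the training set and which into the validation set.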

d

# Part (d): the same validation-set procedure, now with the student dummy
# variable added to the model.

# Draw half of the row indices at random to form the training set.
train <- sample(nrow(Default), nrow(Default) / 2)

# Logistic regression fitted on the training rows only.
glm.fit <- glm(
    default ~ income + balance + student,
    data = Default,
    family = binomial,
    subset = train
)

# Predicted probability of default for every held-out row, turned into a
# "No"/"Yes" label at the 0.5 threshold (index 1 = "No", index 2 = "Yes").
glm.probs <- predict(glm.fit, newdata = Default[-train, ], type = "response")
glm.pred <- c("No", "Yes")[1L + (glm.probs > 0.5)]

# Validation-set error: share of held-out rows whose label is wrong.
mean(glm.pred != Default$default[-train])
## [1] 0.0264

With the student dummy variable included, the test error rate is 2.64%. This is within the range of estimates obtained without it (2.36% to 2.86%), so under the validation set approach adding the student dummy variable does not appear to reduce the test error rate.