## On lecture 13 April 2015
# How does pH in Nowegian lakes depend on sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes have been drawn 

acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE)

# Fitted full model previously:
fitfull <- lm(y~.,data=acidrain) # lm: linear model
summary(fitfull)

# Model selection with Cp and R^2adj

library(leaps)

## Best subset

best <- regsubsets(y~.,data=acidrain)
sumbest <- summary(best)
sumbest
# (stars in each line indicate M_k - see James et al.)
# Note that x4 is in M1 but drops out of M2, ..., M5.
sumbest$outmat # only the table
sumbest$which # table in another format
# Or use regsubsets' plot method:
plot(best,scale="Cp")

names(sumbest) # choices when proceeding
sumbest$cp # the C_p's of the 7 models

# RSS (SSE) or R^2 no good for selecting model due to monotonical improvement
plot(sumbest$rss,type="l")
plot(sumbest$rsq,type="l")

# Try adjusted R^2 and Mallows' C_p:
plot(sumbest$adjr2)
lines(sumbest$adjr2) # add lines to see better
plot(sumbest$cp)
lines(sumbest$cp)
which.max(sumbest$adjr2) # 5
which.min(sumbest$cp) # 3
# look at sumbest again to see models 3 and 5

## To compute C_p and adjusted R^2 manually, e.g. for model 2 - just to see that regsubsets and we use the same definitions:

fit <- lm(y~x1+x3,data=acidrain)
y <- acidrain$y
sse <- sum((y-fit$fitted.values)^2)
sigmahat<-summary(fitfull)$sigma
n <- length(y)
p <- 3 # intercept and two predictors

sse/sigmahat^2-n+2*p # C_p
sumbest$cp # the two agree

1-sse/(n-p)/(sum((y-mean(y))^2)/(n-1)) # adjusted R^2
sumbest$adjr2 # agreement again

## Forward and backward

forw <- regsubsets(y~.,data=acidrain,method="forward")
backw <- regsubsets(y~.,data=acidrain,method="backward")
# Some differences:
par(mfrow=c(2,3))
plot(forw,scale="Cp",main="Forward")
plot(backw,scale="Cp",main="Backward")
plot(best,scale="Cp",main="Best subset") # all find same best model
plot(forw,scale="adjr2")
plot(backw,scale="adjr2")
plot(best,scale="adjr2") # backward and best subset find same best model

# But what is this!? The models obtained by forward and backward are not nested!
# Reorder predictors - just too see what happens:
acidrain <- acidrain[,c(1,4:8,2:3)]
forw <- regsubsets(y~.,data=acidrain,method="forward")
backw <- regsubsets(y~.,data=acidrain,method="backward")
par(mfrow=c(2,3))
plot(forw,scale="Cp",main="Forward")
plot(backw,scale="Cp",main="Backward")
plot(best,scale="Cp",main="Best subset") # all find same best model
plot(forw,scale="adjr2")
plot(backw,scale="adjr2")
plot(best,scale="adjr2") # backward and best subset find same best model
# Looks OK now. The outhor of 'leaps', confirms that this looks like bug in an e-mail.