## On lecture 13 April 2015 # How does pH in Nowegian lakes depend on sulfate, nitrate, calcium, aluminium and organic content (x1, ... x5), area of lake (x6) and location (x7 = 0, Telemark, or x7 = 1, Trøndelag)? Data from Statens forurensningstilsyn (1986). Here 26 random lakes from Telemark and Trøndelag out of 1005 lakes have been drawn acidrain <- read.table("http://www.math.ntnu.no/~mettela/TMA4267/Data/acidrain.txt",header=TRUE) # Fitted full model previously: fitfull <- lm(y~.,data=acidrain) # lm: linear model summary(fitfull) # Model selection with Cp and R^2adj library(leaps) ## Best subset best <- regsubsets(y~.,data=acidrain) sumbest <- summary(best) sumbest # (stars in each line indicate M_k - see James et al.) # Note that x4 is in M1 but drops out of M2, ..., M5. sumbest$outmat # only the table sumbest$which # table in another format # Or use regsubsets' plot method: plot(best,scale="Cp") names(sumbest) # choices when proceeding sumbest$cp # the C_p's of the 7 models # RSS (SSE) or R^2 no good for selecting model due to monotonical improvement plot(sumbest$rss,type="l") plot(sumbest$rsq,type="l") # Try adjusted R^2 and Mallows' C_p: plot(sumbest$adjr2) lines(sumbest$adjr2) # add lines to see better plot(sumbest$cp) lines(sumbest$cp) which.max(sumbest$adjr2) # 5 which.min(sumbest$cp) # 3 # look at sumbest again to see models 3 and 5 ## To compute C_p and adjusted R^2 manually, e.g. for model 2 - just to see that regsubsets and we use the same definitions: fit <- lm(y~x1+x3,data=acidrain) y <- acidrain$y sse <- sum((y-fit$fitted.values)^2) sigmahat<-summary(fitfull)$sigma n <- length(y) p <- 3 # intercept and two predictors sse/sigmahat^2-n+2*p # C_p sumbest$cp # the two agree 1-sse/(n-p)/(sum((y-mean(y))^2)/(n-1)) # adjusted R^2 sumbest$adjr2 # agreement again ## Forward and backward forw <- regsubsets(y~.,data=acidrain,method="forward") backw <- regsubsets(y~.,data=acidrain,method="backward") # Some differences: par(mfrow=c(2,3)) plot(forw,scale="Cp",main="Forward") plot(backw,scale="Cp",main="Backward") plot(best,scale="Cp",main="Best subset") # all find same best model plot(forw,scale="adjr2") plot(backw,scale="adjr2") plot(best,scale="adjr2") # backward and best subset find same best model # But what is this!? The models obtained by forward and backward are not nested! # Reorder predictors - just too see what happens: acidrain <- acidrain[,c(1,4:8,2:3)] forw <- regsubsets(y~.,data=acidrain,method="forward") backw <- regsubsets(y~.,data=acidrain,method="backward") par(mfrow=c(2,3)) plot(forw,scale="Cp",main="Forward") plot(backw,scale="Cp",main="Backward") plot(best,scale="Cp",main="Best subset") # all find same best model plot(forw,scale="adjr2") plot(backw,scale="adjr2") plot(best,scale="adjr2") # backward and best subset find same best model # Looks OK now. The outhor of 'leaps', confirms that this looks like bug in an e-mail.