## ----echo=F, results='hide', fig.show='hide'-----------------------------
library(knitr)
opts_chunk$set(tidy=TRUE, tidy.opts = list(only.comment=TRUE, width.cutoff=80),
               warning=FALSE, message=FALSE)

## ----cv_secpay, echo=T---------------------------------------------------
## read in the data
library(boot)
dat <- read.csv("http://www.quantoid.net/files/reg3/weakliem.txt", header=T)
## delete the Czech Republic and Slovakia (rows 25 and 49)
dat <- dat[-c(25,49), ]
## Note that mod1 is very flexible and probably overfitting;
## it has a much higher R-squared than mod2
mod1 <- glm(secpay ~ poly(gini, 3)*democrat, data=dat)
mod2 <- glm(secpay ~ gini*democrat, data=dat)
## Find the cross-validation error for both models.
## In practice we would repeat this many times (e.g., 200),
## but here we do only 25 for the sake of time.
## initialize deltas
deltas <- NULL
## do 25 times
for(i in 1:25){
  deltas <- rbind(deltas, c(
    ## use cv.glm to calculate the 5-fold cross-validation error;
    ## you might use K=10 with a bigger dataset
    cv.glm(dat, mod1, K=5)$delta,
    cv.glm(dat, mod2, K=5)$delta
  ))
}
colMeans(deltas)

## ----read, echo=T--------------------------------------------------------
## set the random number generator seed
set.seed(1)
## set the number of observations and create x
n <- 400
x <- 0:(n-1)/(n-1)
## generate f(x)
f <- 0.2*x^11*(10*(1-x))^6 + 10*(10*x)^3*(1-x)^10
## generate y as f(x) + e
y <- f + rnorm(n, 0, sd = 2)
## put the data into a data frame
tmp <- data.frame(y=y, x=x)
## estimate the loess model
lo.mod <- loess(y ~ x, data=tmp, span=.75)

## ----mindir, echo=T------------------------------------------------------
library(DAMisc)
## the value in the "minimum" element
## is the span with the lowest CV error
best.span <- optimize(cv.lo2, c(.05,.95), form=y ~ x, data=tmp,
                      numiter=5, K=10)
best.span

## ----mindir2, echo=T-----------------------------------------------------
library(fANCOVA)
best.span2 <- loess.as(tmp$x, tmp$y, criterion="gcv")
best.span2$pars$span

## ----curve, echo=F, fig.show='hide'--------------------------------------
## cross-validate the smoother span by choosing
## consecutively greater span parameters
source("http://www.quantoid.net/files/reg3/cv.lo.r")
out.cv <- matrix(ncol=2, nrow=10)
out.cv[1,] <- cv.lo(tmp, "y", update(lo.mod, span=.1), numiter=25, K=10)
out.cv[2,] <- cv.lo(tmp, "y", update(lo.mod, span=.2), numiter=25, K=10)
out.cv[3,] <- cv.lo(tmp, "y", update(lo.mod, span=.3), numiter=25, K=10)
out.cv[4,] <- cv.lo(tmp, "y", update(lo.mod, span=.4), numiter=25, K=10)
out.cv[5,] <- cv.lo(tmp, "y", update(lo.mod, span=.5), numiter=25, K=10)
out.cv[6,] <- cv.lo(tmp, "y", update(lo.mod, span=.6), numiter=25, K=10)
out.cv[7,] <- cv.lo(tmp, "y", update(lo.mod, span=.7), numiter=25, K=10)
out.cv[8,] <- cv.lo(tmp, "y", update(lo.mod, span=.8), numiter=25, K=10)
out.cv[9,] <- cv.lo(tmp, "y", update(lo.mod, span=.9), numiter=25, K=10)
out.cv[10,] <- cv.lo(tmp, "y", update(lo.mod, span=1), numiter=25, K=10)
## use a sequence of numbers s in [0,1]
## to make predictions
s <- seq(from=0, to=1, length=250)
## initialize y-hat
yhat <- NULL
## loop over the candidate spans and get the
## predicted values at each point in s
for(i in seq(.1,1,by=.1)){
  yhat <- cbind(yhat, predict(update(lo.mod, span=i),
                              newdata=data.frame(x=s)))
}
## plot the data and the ten candidate curves
plot(x,y)
for(i in c(1:10)){
  lines(s, yhat[,i], col="gray25", lwd=1.5)
}
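
## A minimal added check: pull out the gridded span with the smallest
## cross-validated error. This assumes, as with cv.glm()'s delta, that
## the first value returned by cv.lo is the raw CV error, so it sits in
## column 1 of out.cv; span.grid is just a helper vector matching the
## rows of out.cv.
span.grid <- seq(.1, 1, by=.1)
span.grid[which.min(out.cv[,1])]
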
## plot the curve using the CV-optimized span from above in red,
## and the GCV-optimized span in blue
lines(s, predict(update(lo.mod, span=best.span$minimum),
                 newdata = data.frame(x=s)), col="red", lwd=3)
lines(s, predict(update(lo.mod, span=best.span2$pars$span),
                 newdata = data.frame(x=s)), col="blue", lwd=3)
legend("topright", c("CV", "GCV"), lty=c(1,1), col=c("red", "blue"),
       lwd=c(3,3), inset=.01)

## ----cvman, echo=T, fig.show='hide', fig.height=8, fig.width=8-----------
## compute the CV error of a model in which the variables in trans.vars
## are Yeo-Johnson transformed with the parameter(s) in pars
cvoptim_yj <- function(pars, form, data, trans.vars, K=5, numiter=10){
  require(boot)
  require(VGAM)
  ## rewrite the formula, replacing each variable in trans.vars
  ## with yeo.johnson(variable, lambda)
  form <- as.character(form)
  for(i in 1:length(trans.vars)){
    form <- gsub(trans.vars[i],
                 paste("yeo.johnson(", trans.vars[i], ",", pars[i], ")", sep=""),
                 form)
  }
  form <- as.formula(paste0(form[2], form[1], form[3]))
  ## fit the model with the transformed variable(s)
  m <- glm(form, data, family=gaussian)
  ## average the raw K-fold CV error over numiter repetitions
  d <- lapply(1:numiter, function(x)cv.glm(data, m, K=K))
  mean(sapply(d, function(x)x$delta[1]))
}
## the Prestige data come from the car package
library(car)
## evaluate the CV error on a grid of lambda values in [0,2]
## (this overwrites s, which held the prediction grid above)
lams <- seq(0,2, by=.1)
s <- sapply(lams, function(x)cvoptim_yj(x, form=prestige ~ income + education + women,
                                        data=Prestige, trans.vars="income", K=3))
plot(s ~ lams, type="l", xlab=expression(lambda), ylab="CV Error")
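
## As a rough sketch, the same optimize() approach used for the loess span
## above could also be pointed at cvoptim_yj to search directly for the
## lambda with the lowest CV error, rather than scanning a grid. The CV
## error is noisy, so the minimizer will vary somewhat across runs;
## increasing numiter (or K) makes it more stable.
best.lam <- optimize(cvoptim_yj, c(0, 2),
                     form=prestige ~ income + education + women,
                     data=Prestige, trans.vars="income", K=3, numiter=10)
best.lam$minimum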