## ----echo=F, results='hide', fig.show='hide', message=FALSE, warning=FALSE---- library(knitr) library(DAMisc) opts_chunk$set(tidy=TRUE, tidy.opts = list(only.comment=TRUE, comment = FALSE, width.cutoff=80), warning=FALSE, message=FALSE) ## ----bsmed1, echo=T------------------------------------------------------ ## set random number generator seed set.seed(123) ## set number of observations n <- 25 ## draw x variable from a chi-squared distribution x <- rchisq(n,1,1) ## initialize a vector to save results meds <- rep(NA, 1000) ## loop 1000 times for(i in 1:1000){ ## save the median for each bootstrapped sample meds[i] <- median(x[sample(1:n, n, replace=T)]) } ## print the true median median(x) ## summarize the bs replicates of median summary(meds) ## ----normci, echo=T------------------------------------------------------ ## showing how to calculate normal theory BS-CI se <- sd(meds) ## calculate bias bias <- mean(meds) - median(x) norm.ci <- median(x)- bias + qnorm((1+.95)/2)*se*c(-1,1) ## ----pctci, echo=T------------------------------------------------------- ## calculating percentile CIs med.ord <- meds[order(meds)] pct.ci <- med.ord[c(25,975)] ## or alternatively pct.ci <- quantile(meds, c(.025,.975)) ## ----bcaci, echo=T------------------------------------------------------- ## calculate the BC_a CI by hand ## zhat is the normal quantile for # bs replicates ## less than observed value (estimates bias). zhat <- qnorm(mean(meds < median(x))) ## calculate jackknife medians medi <- sapply(1:n, function(i)median(x[-i])) ## find the mean of the jackknifed medians Tdot <- mean(medi) ## calculate a-hat, f(diff between jackknifed medians and mean of jk medians) ahat <- sum((Tdot-medi)^3)/(6*sum((Tdot-medi)^2)^(3/2)) zalpha <- 1.96 z1alpha <- -1.96 ## calculate the quantiles required to make the ## appropriate interval, these will likely not ## be .025 and .975 a1 <- pnorm(zhat + ((zhat + zalpha)/(1-ahat*(zhat + zalpha)))) a2 <- pnorm(zhat + ((zhat + z1alpha)/(1-ahat*(zhat + z1alpha)))) ## alternatively, using quantile bca.ci <- quantile(meds, c(a1, a2)) ## Find the observation number for the appropriate values, ## 1000 is the number of bootstrap replicates a1 <- floor(a1*1000) a2 <- ceiling(a2*1000) bca.ci <- med.ord[c(a2, a1)] ## ----makemat, echo=T----------------------------------------------------- ## collect all of the BS-CI results together mat <- rbind(norm.ci, pct.ci, bca.ci) rownames(mat) <- c("norm", "pct", "bca") colnames(mat) <- c("lower", "upper") round(mat, 3) ## ----withboot, echo=T---------------------------------------------------- ## Bootstrapping the median with the boot package library(boot) ## set random number generator set.seed(123) ## define function that we will bootstrap med.fun <- function(x, inds){ ## calculate the median for the resampled x values med <- median(x[inds]) ## return the bootstrapped median med } ## use the boot function to botstrap the median boot.med <- boot(x, med.fun, R=1000) boot.ci(boot.med) ## ----bscor, echo=T------------------------------------------------------- ## bootstrap difference in correlations (requires car and boot packages) set.seed(123) ## load Mroz data data(Mroz) ## define function to be bootstrapped cor.fun <- function(dat, inds){ ## save resampled data in tmp tmp <- dat[inds, ] ## calculate correlation between age and lwg for hc == no cor1 <- with(tmp[which(tmp$hc == "no"), ], cor(age, lwg, use="complete")) ## calculate correlation between age and lwg for hc == yes cor2 <- with(tmp[which(tmp$hc == "yes"), ], cor(age, lwg, use="complete")) ## return difference in correlations cor2-cor1 } ## use boot function to estimate and collect the results. ## notice that we're using Mroz$hc as the strata, which means ## that each bootstrap sample will contain the same number of ## hc=yes and hc==no, even though the observations chosen within ## those sub-samples will be different. boot.cor <- boot(Mroz, cor.fun, R=2000, strata=Mroz$hc) ## ----printcor, echo=T---------------------------------------------------- boot.ci(boot.cor) ## ----secpayfig, echo=F, results='hide', fig.show='hide', fig.height=8, fig.width=8---- dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) library(dplyr) m1 <- lm(secpay ~ gini + democrat, data=dat) m2 <- lm(secpay ~ gini*democrat, data=dat) ins <- dat %>% filter(!(country %in% c("CzechRepublic", "Slovakia"))) m3 <- lm(secpay ~ gini*democrat, data=ins) d1 <- coef(m1) d2 <- coef(m2) d3 <- coef(m3) a11 <- d1[1] a12 <- d1[1] + d1[3] b1 <- d1[2] a21 <- d2[1] a22 <- d2[1] + d2[3] b21 <- d2[2] b22 <- d2[2] + d2[4] a31 <- d3[1] a32 <- d3[1] + d3[3] b31 <- d3[2] b32 <- d3[2] + d3[4] library(lattice) cols <- trellis.par.get()$superpose.line$col xyplot( secpay ~ gini, groups=democrat, data=dat, pch=16, panel = function(x,y,...){ panel.xyplot(x,y,...) panel.abline(a11, b1, col=cols[1]) panel.abline(a12, b1, col=cols[2]) }, key =list(space="top", points=list(pch=16, col=cols[1:2]), text=list(c("Non-Democacy", "Democracy")))) xyplot( secpay ~ gini, groups=democrat, data=dat, pch=16, panel = function(x,y,...){ panel.xyplot(x,y,...) panel.abline(a21, b21, col=cols[1]) panel.abline(a22, b22, col=cols[2]) }, key =list(space="top", points=list(pch=16, col=cols[1:2]), text=list(c("Non-Democacy", "Democracy")))) xyplot( secpay ~ gini, groups=democrat, data=ins, pch=16, panel = function(x,y,...){ panel.xyplot(x,y,...) panel.abline(a31, b31, col=cols[1]) panel.abline(a32, b32, col=cols[2]) }, key =list(space="top", points=list(pch=16, col=cols[1:2]), text=list(c("Non-Democacy", "Democracy")))) ## ----weak, echo=T-------------------------------------------------------- ## read in data dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## drop two outlying observations dat <- dat[-c(25,49), ] ## save only complete cases of dat for variables in model ## this is important because you don't want the bootstrapping ## to be able to sample missing data, making the mdoel df different ## from sample to sample dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## summarize model summary(mod) ## ----fixx, echo=T-------------------------------------------------------- ## use the Boot function from the car package ## I am only using R=500 for convenience, in real ## life, it should be probably 1500 or 2000 boot.reg1 <- Boot(mod, R=500, method="residual") ## ----fixx_writefun, echo=F, eval=F--------------------------------------- ## ## set random number generator seed ## set.seed(123) ## ## save model residuals (these will get bootstrapped later) ## resids <- mod$residuals ## ## save model fitted values ## yhat <- mod$fitted ## ## define function to be bootstrapped ## ## resample model residuals ## boot.e <- resids[inds] ## ## add resampled residuals to original fitted values ## ## to make bootstrapped y ## boot.y <- yhat + boot.e ## ## estimate model with bootstrapped y as DV ## mod <- lm(boot.y ~ gini*democrat, data=dat) ## ## return model coefficients ## mod$coef ## } ## ## use the boot package to estimate and collect results ## ## Make sure you did as suggested above to make ## ## sure the data don't have missing values on the ## ## variables in your model ## boot.reg1 <- boot(dat, reg.fun, R=2000) ## ----fixxci, echo=T------------------------------------------------------ ## calculate percentile intervals for the four coefficients perc.ci <- sapply(1:4, function(x){boot.ci(boot.reg1, type=c("perc"), index = x)$percent})[4:5,] ## calculate BCa intervals for the four coefficients bca.ci <- sapply(1:4, function(x){boot.ci(boot.reg1, type=c("bca"), index = x)$bca})[4:5,] colnames(bca.ci) <- colnames(perc.ci) <- names(coef(mod)) rownames(bca.ci) <- rownames(perc.ci) <- c("lower", "upper") perc.ci bca.ci ## ----ranxci, echo=T------------------------------------------------------ ## Use Boot function from car to do bootstrapping boot.reg2 <- Boot(mod, R=500, method="case") ## calculate percentile intervals for the four coefficients perc.ci <- sapply(1:4, function(x){boot.ci(boot.reg2, type=c("perc"), index = x)$percent})[4:5,] ## calculate BCa intervals for the four coefficients bca.ci <- sapply(1:4, function(x){boot.ci(boot.reg2, type=c("bca"), index = x)$bca})[4:5,] # change row and column names of output colnames(bca.ci) <- colnames(perc.ci) <- names(coef(mod)) rownames(bca.ci) <- rownames(perc.ci) <- c("lower", "upper") # print output perc.ci bca.ci ## ----ranx_writefun, echo=F, eval=F--------------------------------------- ## ## Alternatively, you could write your ## ## own function and use the boot function ## ## from the boot package as below: ## ## ## define function to be bootstrapped ## reg.fun2 <- function(dat, inds){ ## ## generate a local copy of the data with re-sampled rows ## tmp <- dat[inds, ] ## ## re-estimate model using new resampled data ## mod <- lm(secpay ~ gini*democrat, data=tmp) ## ## return model coefficients ## mod$coef ## } ## ## use boot to estimate and collect results ## boot.reg2 <- boot(dat, reg.fun2, R=2000) ## ----refit, echo=F, results='hide'--------------------------------------- ## reload data dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## rescale gini variable to rescale coefficient dat$gini <- dat$gini/100 ## keep only complete cases of variables in model dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## use Boot from car to do bootstrapping boot.reg1 <- Boot(mod, R=500, method="resid") boot.reg2 <- Boot(mod, R=500, method="case") ## ----morefixxci, echo=F, results='hide'---------------------------------- ## save appropriate confidence intervals ## for both models p1 <- sapply(1:4, function(x)boot.ci(boot.reg1, type=c("perc"), index = x)$percent)[4:5,] p2 <- sapply(1:4, function(x)boot.ci(boot.reg1, type=c("bca"), index = x)$bca)[4:5,] p3 <- sapply(1:4, function(x)boot.ci(boot.reg2, type=c("perc"), index = x)$percent)[4:5,] p4 <- sapply(1:4, function(x)boot.ci(boot.reg2, type=c("bca"), index = x)$bca)[4:5,] ## Collect all of the results into a data frame plot.dat <- data.frame(rbind(t(p1), t(p2), t(p3), t(p4))) names(plot.dat) <- c("lower", "upper") ## identify the rows with variable, outlier and type plot.dat$variable <- factor(names(coef(mod)), levels=names(coef(mod))) plot.dat$outliers <- factor(rep(c(1,2), each=8), labels=c("No Outlier", "Out liers In")) plot.dat$type <- factor(rep(c(1,2,1,2), each=4), levels=c(1,2), labels=c("Percentile", "BCa")) ## Make figure library(latticeExtra) png("outrob.png", height=7, width=12, units="in", res=300) trellis.par.set(strip.background=list(col="gray85")) useOuterStrips( xyplot(variable ~ lower | type + outliers, data=plot.dat, xlim = range(c(plot.dat[,c("lower", "upper")]))+c(-.1, .1), ylab = "", xlab = "95% Confidence Interval", panel = function(x,y,subscripts,...){ panel.segments(plot.dat$lower[subscripts],y, plot.dat$upper[subscripts],y, col="black") panel.abline(v=0, lty=2)} )) dev.off() ## ----bsdens, echo=T, fig.show='hide', fig.height=10, fig.width=10-------- ## set plot layout to 2x2 n <- c("Intercept", "GINI", "Democrat", "Gini*Democrat") par(mfrow=c(2,2)) ## plot density of fixed-x and random-x bootstrap distributions for(j in 1:4){ plot(density(boot.reg1$t[,j]), xlab="T*", ylab="Density", main = n[j]) lines(density(boot.reg2$t[,j]), lty=2) } ## ----jab1, echo=T, fig.show='hide', fig.height=10, fig.width=10---------- ## set plot layout to 2x2 par(mfrow=c(2,2)) ## do jackknife after bootstrapping for each coefficient jack.after.boot(boot.reg1, index=1) jack.after.boot(boot.reg1, index=2) jack.after.boot(boot.reg1, index=3) jack.after.boot(boot.reg1, index=4) ## ----jab2, echo=T, fig.show='hide', fig.height=10, fig.width=10---------- ## set plot layout to 2x2 par(mfrow=c(2,2)) ## do jackknife after bootstrapping for each coefficient jack.after.boot(boot.reg2, index=1) jack.after.boot(boot.reg2, index=2) jack.after.boot(boot.reg2, index=3) jack.after.boot(boot.reg2, index=4) ## ----fixx_loess, echo=T-------------------------------------------------- library(foreign) ## read in the data dat <- read.dta("http://quantoid.net/files/reg3/jacob.dta") ## define function to rescale in [0,1] rescale01 <- function(x)(x-min(x, na.rm=T)/diff(range(x, na.rm=T))) ## define function to create sequence over range of variable seq.range <- function(x)seq(min(x), max(x), length=250) ## rescale variables into [0,1] dat$perot <- rescale01(dat$perot) dat$chal <- rescale01(dat$chal_vote) ## generate a sequence of values of perot ## vote used to get predictions of challenger vote ## and save in a data frame s <- seq.range(dat$perot) df <- data.frame(perot = s) ## estimate the loess model lo.mod <- loess(chal ~ perot, data=dat, span=.75) ## save residuals and fitted vals from model resids <- lo.mod$residuals yhat <- lo.mod$fitted ## define function to bootstrap lo.fun <- function(dat, inds){ ## resample residuals boot.e <- resids[inds] ## define bootstrapped y boot.y <- yhat + boot.e ## re-estimate model tmp.lo <- loess(boot.y ~ perot, data=dat, span=.75) ## generate model predictions preds <- predict(tmp.lo, newdata=df) ## return predictions preds } boot.perot1 <- boot(dat, lo.fun, R=1000) ## ----ranx_loess, echo=T-------------------------------------------------- ## define random-x bootstrapping function lo.fun2 <- function(dat, inds){ ## estimate model replace data with ## resampled version of original data tmp.lo <- loess(boot.y ~ perot, data=dat[inds,], span=.75) ## generate predictions from model with ## new fake data preds <- predict(tmp.lo, newdata=df) ## return predictions preds } boot.perot2 <- boot(dat, lo.fun, R=1000) ## ----loci, echo=T, results='hide', fig.show='hide'----------------------- ## make confidence intervals for all ## 250 predictions from the fixed-x resampling out1 <- sapply(1:250, function(i)boot.ci(boot.perot1, index=i, type="perc")$perc) ## make confidence intervals for all ## 250 predictions from the random-x resampling out2 <- sapply(1:250, function(i)boot.ci(boot.perot2, index=i, type="perc")$perc) ## find predictions from original model preds <- predict(lo.mod, newdata=df,se=T) ## calculate analytical confidence intervals ## assuming n=infinity lower <- preds$fit - 1.96*preds$se upper <- preds$fit + 1.96*preds$se ## collect bs confidence intervals ci1 <- t(out1[4:5, ]) ci2 <- t(out2[4:5, ]) ci3 <- cbind(lower, upper) ## plot hypothetical sequence of perot values ## created above against the predictions plot(s, preds$fit, type = "l", ## make the y-axis big enough to accommodate ## all of the confidence intercals ylim=range(c(ci1, ci2, ci3))) ## define x-values for plotting the confidence polygons poly.x <- c(s, rev(s)) ## plot confidence polygon for fixed-x bs polygon(poly.x, y=c(ci1[,1], rev(ci1[,2])), col=rgb(1,0,0,.25), border=NA) ## plot confidence polygon for random-x bs polygon(poly.x, y=c(ci2[,1], rev(ci2[,2])), col=rgb(0,1,0,.25), border=NA) ## plot normal theory interval for original result polygon(poly.x, y=c(ci3[,1], rev(ci3[,2])), col=rgb(0,0,1,.25), border=NA) ## include a legend legend("topleft", c("Random-X", "Fixed-X", "Analytical"), fill = rgb(c(1,0,0), c(0,1,0), c(0,0,1), c(.25,.25,.25)), inset=.01) ## ----cv_secpay, echo=T--------------------------------------------------- ## read in data dat <- read.csv("http://www.quantoid.net/files/reg3/weakliem.txt", header=T) ## delete CzechRepublic and Slovakia dat <- dat[-c(25,49), ] ## Note that mod1 is very flexible and probably ## overfitting, it has a much higher r-squared ## than mod2 mod1 <- glm(secpay ~ poly(gini, 3)*democrat, data=dat) mod2 <- glm(secpay ~ gini*democrat, data=dat) ## finds cross-validation error for both models ## in real life, we would want to repeat this lots of times ## (like 200), but here we're just going to do 25 ## for the sake of time and convenience ## initialize deltas deltas <- NULL ## do 25 times for(i in 1:25){ deltas <- rbind(deltas, c( ## use cv.glm to calculate cross-validation error ## for 5 folds. You might use 10 if you had a bigger ## dataset cv.glm(dat, mod1, K=5)$delta, cv.glm(dat, mod2, K=5)$delta) )} colMeans(deltas) ## ----read, echo=T-------------------------------------------------------- ## set random number generator seed set.seed(1) ## set number of observations and create x n <- 400 x <- 0:(n-1)/(n-1) ## generate f(x) f <- 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10 ## generate y as f(x)+e y <- f + rnorm(n, 0, sd = 2) ## make data into data frame tmp <- data.frame(y=y, x=x) ## estimate the loess model lo.mod <- loess(y ~ x, data=tmp, span=.75) ## ----mindir, echo=T------------------------------------------------------ ## the value in the "minimum" element ## is the span with the lowest CV error. best.span <- optimize(cv.lo2, c(.05,.95), form=y ~ x, data=tmp, numiter=5, K=10) best.span ## ----mindir2, echo=T----------------------------------------------------- library(fANCOVA) best.span2 <- loess.as(tmp$x, tmp$y, criterion="gcv") best.span2$pars$span ## ----curve, echo=F, fig.show='hide'-------------------------------------- ## cross-validate the smoother span by choosing ## consecutively greater span parameters source("http://www.quantoid.net/files/reg3/cv.lo.r") out.cv <- matrix(ncol=2, nrow=10) out.cv[1,] <- cv.lo(tmp, "y", update(lo.mod, span=.1), numiter=25, K=10) out.cv[2,] <- cv.lo(tmp, "y", update(lo.mod, span=.2), numiter=25, K=10) out.cv[3,] <- cv.lo(tmp, "y", update(lo.mod, span=.3), numiter=25, K=10) out.cv[4,] <- cv.lo(tmp, "y", update(lo.mod, span=.4), numiter=25, K=10) out.cv[5,] <- cv.lo(tmp, "y", update(lo.mod, span=.5), numiter=25, K=10) out.cv[6,] <- cv.lo(tmp, "y", update(lo.mod, span=.6), numiter=25, K=10) out.cv[7,] <- cv.lo(tmp, "y", update(lo.mod, span=.7), numiter=25, K=10) out.cv[8,] <- cv.lo(tmp, "y", update(lo.mod, span=.8), numiter=25, K=10) out.cv[9,] <- cv.lo(tmp, "y", update(lo.mod, span=.9), numiter=25, K=10) out.cv[10,] <- cv.lo(tmp, "y", update(lo.mod, span=1), numiter=25, K=10) ## Use a sqeuence of numbers, s=(0,1) ## t0 make predictions s <- seq(from=0, to=1, length=250) ## initialize y-hat yhat <- NULL ## Loop over relevant values of s and get the ## predicted values for each number in s for(i in seq(.1,1,by=.1)){ yhat <- cbind(yhat, predict(update(lo.mod, span=i), newdata=data.frame(x=s))) } ## plot the results plot(x,y) for(i in c(1:10)){ lines(s, yhat[,i], col="gray25", lwd=1.5) } ## plot the curve in red using the cv-optimized ## span from above. lines(s, predict(update(lo.mod, span=best.span$minimum), newdata = data.frame(x=s)), col="red", lwd=3) lines(s, predict(update(lo.mod, span=best.span2$pars$span), newdata = data.frame(x=s)), col="blue", lwd=3) legend("topright", c("CV", "GCV"), lty=c(1,1), col=c("red", "blue"), lwd=c(3,3), inset=.01) ## ----cvman, echo=T, fig.show='hide', fig.height=8, fig.width=8----------- cvoptim_yj <- function(pars, form, data, trans.vars, K=5, numiter=10){ require(boot) require(VGAM) form <- as.character(form) for(i in 1:length(trans.vars)){ form <- gsub(trans.vars[i], paste("yeo.johnson(", trans.vars[i], ",", pars[i], ")", sep=""), form) } form <- as.formula(paste0(form[2], form[1], form[3])) m <- glm(as.formula(form), data, family=gaussian) d <- lapply(1:numiter, function(x)cv.glm(data, m, K=K)) mean(sapply(d, function(x)x$delta[1])) } lams <- seq(0,2, by=.1) s <- sapply(lams, function(x)cvoptim_yj(x, form=prestige ~ income + education + women, data=Prestige, trans.vars="income", K=3)) plot(s ~ lams, type="l", xlab=expression(lambda), ylab="CV Error")