## ----setup, include=FALSE--------------------------------------------------------------------------------------------------------------------- library(tibble) library(ggplot2) library(formatR) library(knitr) library(pander) library(xtable) library(dplyr) library(tidyr) mytheme <- function(){ theme(axis.title=element_text(size=15), axis.text=element_text(size=12), text=element_text(size=12), title=element_text(size=15)) } uwogray <- "#807F83" purples <- c("#4F2683", "#8463AD", "#26064E", "#643F94", "#3B166A") pal2 <- c("#4F2683","#807F83") # pal3 <- c("#4F2683", "#811D7C", "302E87") pal3 <- c(rgb(79,38,131, maxColorValue = 255), rgb(129,29,124, maxColorValue = 255), rgb(48,46,135, maxColorValue = 255)) # pal4 <- c("#4F2683", "#C1582C", "#1E845A", "#C1B52C") pal4 <- c(rgb(79,38,131, maxColorValue = 255), rgb(193,88,44, maxColorValue = 255), rgb(30,132,90, maxColorValue = 255), rgb(193,181,44, maxColorValue = 255)) ## ---- echo=FALSE, cache=TRUE, fig.height=6, fig.width=12, out.width="80%", fig.align="center"------------------------------------------------- set.seed(2493) qtl <- c(.5, .75, .9, .95, .99) samps <- list() samps[[1]] <- matrix(rnorm(500*1500), ncol=1500) samps[[2]] <- matrix(rnorm(1000*1500), ncol=1500) samps[[3]] <- matrix(rnorm(1500*1500), ncol=1500) samps[[4]] <- matrix(rnorm(2500*1500), ncol=1500) samps[[5]] <- matrix(rnorm(5000*1500), ncol=1500) out <- lapply(samps, function(x)t(apply(x, 2, quantile, probs=qtl))) out <- do.call(rbind, out) out <- as_tibble(out) names(out) <- paste0("p", c(50,75,90,95,99)) out <- out %>% mutate(n = factor(rep(1:5, each=1500), labels=c("500", "1000", "1500", "2500", "5000"))) out <- out %>% pivot_longer(p50:p99, names_to="qtl", values_to="vals") sds <- out %>% group_by(n, qtl) %>% summarise(sd = sd(vals)) sds %>% ggplot(aes(x=n, y=sd)) + geom_linerange(aes(ymin=0, ymax=sd), col=uwogray) + geom_point() + facet_wrap(~qtl, nrow=2) + theme_bw() + mytheme() + labs(x= "N", y="SD") ## ----bsmed1, echo=T--------------------------------------------------------------------------------------------------------------------------- ## set random number generator seed set.seed(123) ## set number of observations n <- 25 ## draw x variable from a chi-squared distribution x <- rchisq(n,1,1) x <- rnorm(n,0,1) ## initialize a vector to save results set.seed(123) meds <- rep(NA, 5000) ## loop 1000 times for(i in 1:5000){ ## save the median for each bootstrapped sample meds[i] <- median(x[sample(1:n, n, replace=T)]) } ## print the true median median(x) ## summarize the bs replicates of median summary(meds) ## ----normci, echo=T--------------------------------------------------------------------------------------------------------------------------- ## showing how to calculate normal theory BS-CI se <- sd(meds) ## calculate bias bias <- mean(meds) - median(x) norm.ci <- median(x)- bias + qnorm((1+.95)/2)*se*c(-1,1) ## ----pctci, echo=T---------------------------------------------------------------------------------------------------------------------------- ## calculating percentile CIs med.ord <- meds[order(meds)] pct.ci <- med.ord[c(25,975)] ## or alternatively pct.ci <- quantile(meds, c(.025,.975)) ## ----bcaci, echo=T---------------------------------------------------------------------------------------------------------------------------- ## calculate the BC_a CI by hand ## zhat is the normal quantile for # bs replicates ## less than observed value (estimates bias). zhat <- qnorm(mean(meds < median(x))) ## calculate jackknife medians medi <- sapply(1:n, function(i)median(x[-i])) ## find the mean of the jackknifed medians Tdot <- mean(medi) ## calculate a-hat, f(diff between jackknifed medians and mean of jk medians) ahat <- sum((Tdot-medi)^3)/(6*sum((Tdot-medi)^2)^(3/2)) zalpha <- 1.96 z1alpha <- -1.96 ## calculate the quantiles required to make the ## appropriate interval, these will likely not ## be .025 and .975 a1 <- pnorm(zhat + ((zhat + zalpha)/(1-ahat*(zhat + zalpha)))) a2 <- pnorm(zhat + ((zhat + z1alpha)/(1-ahat*(zhat + z1alpha)))) ## alternatively, using quantile bca.ci <- quantile(meds, c(a1, a2)) ## Find the observation number for the appropriate values, ## 1000 is the number of bootstrap replicates a1 <- floor(a1*1000) a2 <- ceiling(a2*1000) bca.ci <- med.ord[c(a2, a1)] ## ----makemat, echo=T, results='asis'---------------------------------------------------------------------------------------------------------- ## collect all of the BS-CI results together mat <- rbind(norm.ci, pct.ci, bca.ci) rownames(mat) <- c("Normal", "Percentile", "BCa") colnames(mat) <- c("Lower", "Upper") print(xtable(mat), digits=3, type="html", html.table.attributes = "border = 0") ## ----withboot, echo=T------------------------------------------------------------------------------------------------------------------------- ## Bootstrapping the median with the boot package library(boot) ## set random number generator set.seed(123) ## define function that we will bootstrap med.fun <- function(x, inds){ ## calculate the median for the resampled x values med <- median(x[inds]) ## return the bootstrapped median med } ## use the boot function to botstrap the median boot.med <- boot(x, med.fun, R=1000) boot.ci(boot.med) ## ----bscor, echo=T---------------------------------------------------------------------------------------------------------------------------- ## bootstrap difference in correlations (requires car and boot packages) set.seed(123) ## load Mroz data data(Mroz, package="carData") ## define function to be bootstrapped cor.fun <- function(dat, inds){ ## save resampled data in tmp tmp <- dat[inds, ] ## calculate correlation between age and lwg for hc == no cor1 <- with(tmp[which(tmp$hc == "no"), ], cor(age, lwg, use="complete")) ## calculate correlation between age and lwg for hc == yes cor2 <- with(tmp[which(tmp$hc == "yes"), ], cor(age, lwg, use="complete")) ## return difference in correlations cor2-cor1 } ## use boot function to estimate and collect the results. ## notice that we're using Mroz$hc as the strata, which means ## that each bootstrap sample will contain the same number of ## hc=yes and hc==no, even though the observations chosen within ## those sub-samples will be different. boot.cor <- boot(Mroz, cor.fun, R=2000, strata=Mroz$hc) ## ----printcor, echo=T------------------------------------------------------------------------------------------------------------------------- boot.ci(boot.cor) ## ---- secpayfig1, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"------------------------------------------------- dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) library(dplyr) library(ggplot2) library(broom) dat <- dat %>% mutate(democrat = factor(democrat, levels=c(0,1), labels=c("No", "Yes"))) m1 <- lm(secpay ~ gini + democrat, data=dat) m2 <- lm(secpay ~ gini*democrat, data=dat) ins <- dat %>% filter(!(country %in% c("CzechRepublic", "Slovakia"))) m3 <- lm(secpay ~ gini*democrat, data=ins) augment(m1) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Additive, All Obs.") ## ---- secpayfig2, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"------------------------------------------------- augment(m2) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Multiplicative, All Obs") ## ---- secpayfig3, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"------------------------------------------------- augment(m3) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Multiplicative, No Outliers") ## ----weak, echo=T----------------------------------------------------------------------------------------------------------------------------- ## read in data library(car) dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## drop two outlying observations dat <- dat[-c(25,49), ] ## save only complete cases of dat for variables in model ## this is important because you don't want the bootstrapping ## to be able to sample missing data, making the mdoel df different ## from sample to sample dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## summarize model S(mod, brief=TRUE) ## ----fixx, echo=T----------------------------------------------------------------------------------------------------------------------------- library(car) ## use the Boot function from the car package ## I am only using R=500 for convenience, in real ## life, it should be probably 1500 or 2000 boot.reg1 <- Boot(mod, R=500, method="residual") ## ----fixx_writefun, echo=F, eval=F------------------------------------------------------------------------------------------------------------ ## define a function reg.fun <- function(dat, inds){ ## set random number generator seed set.seed(123) ## save model residuals (these will get bootstrapped later) resids <- mod$residuals ## save model fitted values yhat <- mod$fitted ## define function to be bootstrapped ## resample model residuals boot.e <- resids[inds] ## add resampled residuals to original fitted values ## to make bootstrapped y boot.y <- yhat + boot.e ## estimate model with bootstrapped y as DV mod <- lm(boot.y ~ gini*democrat, data=dat) ## return model coefficients mod$coef } ## use the boot package to estimate and collect results ## Make sure you did as suggested above to make ## sure the data don't have missing values on the ## variables in your model boot.reg1 <- boot(dat, reg.fun, R=2000) ## ----fixxci, echo=T--------------------------------------------------------------------------------------------------------------------------- ## calculate percentile intervals for the four coefficients est1 <- boot.reg1$t0 perc.ci1 <- sapply(1:4, function(x){boot.ci(boot.reg1, type=c("perc"), index = x)$percent})[4:5,] ## calculate BCa intervals for the four coefficients bca.ci1 <- sapply(1:4, function(x){boot.ci(boot.reg1, type=c("bca"), index = x)$bca})[4:5,] colnames(bca.ci1) <- colnames(perc.ci1) <- names(coef(mod)) rownames(bca.ci1) <- rownames(perc.ci1) <- c("lower", "upper") perc.ci1 bca.ci1 ## ----ranxci, echo=T--------------------------------------------------------------------------------------------------------------------------- ## Use Boot function from car to do bootstrapping boot.reg2 <- Boot(mod, R=500, method="case") est2 <- boot.reg2$t0 ## calculate percentile intervals for the four coefficients perc.ci2 <- sapply(1:4, function(x){boot.ci(boot.reg2, type=c("perc"), index = x)$percent})[4:5,] ## calculate BCa intervals for the four coefficients bca.ci2 <- sapply(1:4, function(x){boot.ci(boot.reg2, type=c("bca"), index = x)$bca})[4:5,] # change row and column names of output colnames(bca.ci2) <- colnames(perc.ci2) <- names(coef(mod)) rownames(bca.ci2) <- rownames(perc.ci2) <- c("lower", "upper") # print output perc.ci2 bca.ci2 ## ----ranx_writefun, echo=F, eval=F------------------------------------------------------------------------------------------------------------ ## Alternatively, you could write your ## own function and use the boot function ## from the boot package as below: ## define function to be bootstrapped reg.fun2 <- function(dat, inds){ ## generate a local copy of the data with re-sampled rows tmp <- dat[inds, ] ## re-estimate model using new resampled data mod <- lm(secpay ~ gini*democrat, data=tmp) ## return model coefficients mod$coef } ## use boot to estimate and collect results boot.reg2 <- boot(dat, reg.fun2, R=2000) ## ----refit, echo=F, results='hide'------------------------------------------------------------------------------------------------------------ ## reload data dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## rescale gini variable to rescale coefficient dat$gini <- dat$gini/100 ## keep only complete cases of variables in model dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## use Boot from car to do bootstrapping boot.reg1a <- Boot(mod, R=500, method="resid") est1a <- boot.reg1a$t0 boot.reg2a <- Boot(mod, R=500, method="case") est2a <- boot.reg2a$t0 ## ----morefixxci, echo=F, fig.height=6.5, fig.width=15, out.width="100%"----------------------------------------------------------------------- ## save appropriate confidence intervals ## for both models p1 <- sapply(1:4, function(x)boot.ci(boot.reg1a, type=c("perc"), index = x)$percent)[4:5,] p2 <- sapply(1:4, function(x)boot.ci(boot.reg1a, type=c("bca"), index = x)$bca)[4:5,] p3 <- sapply(1:4, function(x)boot.ci(boot.reg2a, type=c("perc"), index = x)$percent)[4:5,] p4 <- sapply(1:4, function(x)boot.ci(boot.reg2a, type=c("bca"), index = x)$bca)[4:5,] ## Collect all of the results into a data frame plot.dat <- data.frame( rbind(t(p1), t(p2), t(p3), t(p4), t(perc.ci1), t(bca.ci1), t(perc.ci2), t(bca.ci2))) names(plot.dat) <- c("lower", "upper") plot.dat$variable <- factor(names(coef(mod)), levels=names(coef(mod))) plot.dat$outliers <- factor(rep(c(1,2), each=16), labels=c("Outliers In", "No Outliers")) plot.dat$type <- factor(rep(c(1,2,1,2,1,2,1,2), each=4), levels=c(1,2), labels=c("Percentile", "BCa")) plot.dat$bootstrap = factor(rep(c(1,2,1,2), each=8), levels=c(1,2), labels=c("Fixed-X", "Random-X")) plot.dat$est <- c(rep(est1a,2), rep(est2a, 2), rep(est1, 2), rep(est2, 2)) ## Make figure ggplot(plot.dat, aes(x=bootstrap, colour=type)) + geom_point(aes(y=est), position=position_dodge(width=.5)) + geom_linerange(aes(ymin=lower, ymax=upper), position=position_dodge(width=.5)) + facet_grid(outliers ~ variable) + scale_colour_manual(values=pal2) + theme_bw() + mytheme() + theme(legend.position = "top") + labs(x="Bootstrap Type", y="Coefficient (95% CI)", colour="Interval Type") ## ----bsdens, echo=F, fig.height=6.5, fig.width=15, out.width="100%"--------------------------------------------------------------------------- library(tidyr) # collect all of the data n <- c("Intercept", "GINI", "Democrat", "Gini*Democrat") dens.dat <- rbind( boot.reg1a$t, boot.reg2a$t, boot.reg1$t, boot.reg2$t) colnames(dens.dat) <- n dens.dat <- as.data.frame(dens.dat) dens.dat$bootstrap <- factor(rep(c(1,2,1,2), each=500), labels=c("Fixed-X", "Random-X")) dens.dat$outliers <- factor(rep(c(1,2), each=1000), labels=c("Outliers In", "No Outliers")) dens.dat <- dens.dat %>% pivot_longer(cols=Intercept:`Gini*Democrat`, names_to="variable", values_to = "vals") ggplot(dens.dat, aes(x=vals, fill=bootstrap)) + geom_density(alpha=.5, col="transparent") + facet_wrap(outliers ~ variable, scales="free", ncol=4) + scale_fill_manual(values=pal2) + theme_bw() + mytheme()+ theme(legend.position="top") + labs(x="Bootstrap Estimates", y="Density", fill="Bootstrap Type") ## ----jab1, echo=F, fig.height=9, fig.width=9, out.height="60%", fig.align="center"------------------------------------------------------------ ## set plot layout to 2x2 par(mfrow=c(2,2)) ## do jackknife after bootstrapping for each regression ## model for the second parameter estimate (the coefficient on GINI) jack.after.boot(boot.reg1, index=2) jack.after.boot(boot.reg1a, index=2) jack.after.boot(boot.reg2, index=2) jack.after.boot(boot.reg2a, index=2) ### Exercises ## Bootstrap R2 difference mod1 <- lm(log(cases) ~ urban_rural , data=counties) mod2 <- lm(log(cases) ~ white_pop + black_pop + hisp_pop + over60_pop + lt25k + nohs, data=counties) boot.r2diff <- function(data, inds){ tmp <- data[inds, ] m1 <- update(mod1, data=tmp) m2 <- update(mod2, data=tmp) r2diff <- summary(m1)$adj.r.squared - summary(m2)$adj.r.squared r2diff } out1 <- boot(counties, boot.r2diff, R=2000) boot.ci(out, type="perc") e1 <- residuals(mod1) e2 <- residuals(mod2) yhat1 <- fitted(mod1) yhat2 <- fitted(mod2) boot.r2diffb <- function(data, inds){ tmp <- data tmp$y1new <- yhat1 + e1[inds] tmp$y2new <- yhat2 + e2[inds] m1 <- update(mod1, y1new ~ ., data=tmp) m2 <- update(mod2, y2new ~ ., data=tmp) r2diff <- summary(m1)$adj.r.squared - summary(m2)$adj.r.squared r2diff } out <- boot(counties, boot.r2diffb, R=2000) boot.ci(out, type="perc") library(ggplot2) plot.df <- data.frame( r2diff = c(out1$t, out$t), method = factor(rep(1:2, each=2000), labels=c("Random-X", "Fixed-x")) ) ggplot(plot.df, aes(x= r2diff, fill=method)) + geom_density(alpha=.15, col="transparent") ## bootstrap loess md <- counties %>% filter(state=="Maryland" & !is.na(cases) & !is.na(repvote)) %>% mutate(cases_pc = cases/tpop) ggplot(md, aes(y=cases_pc, x=repvote)) + geom_smooth() s <- seq( min(md$repvote), max(md$repvote), length=50) pred.data <- data.frame(repvote=s) boot.loess <- function(dat, inds, ...){ tmp <- dat[inds, ] lo <- loess(cases_pc ~ repvote, data=tmp, ...) yhat <- predict(lo, newdata=pred.data) yhat } out <- boot(md, boot.loess, R=2500, span=.75) cis <- t(apply(out$t, 2, quantile, prob=c(.025,.975), na.rm=TRUE)) pred.data$lower <- cis[,1] pred.data$upper <- cis[,2] pred.data$cases_pc <- out$t0 ggplot(pred.data, aes(x=repvote, y=cases_pc)) + geom_ribbon(aes(ymin=lower, ymax=upper), alpha=.15, col="transparent") + geom_smooth(data=md, method="loess", se=T, alpha=.15, fill="blue") + geom_line() + theme_bw() + labs(x="Republican Vote", y="Predicated Cases/Capita")