## ----setup, include=FALSE------------------------------------------------------------- library(tibble) library(ggplot2) library(formatR) library(knitr) library(pander) library(xtable) library(dplyr) mytheme <- function(...){ theme(axis.title=element_text(size=15), axis.text=element_text(size=12), text=element_text(size=12), title=element_text(size=15), ...) } uwogray <- "#807F83" purples <- c("#4F2683", "#8463AD", "#26064E", "#643F94", "#3B166A") pal2 <- c("#4F2683","#807F83") # pal3 <- c("#4F2683", "#811D7C", "302E87") pal3 <- c(rgb(79,38,131, maxColorValue = 255), rgb(129,29,124, maxColorValue = 255), rgb(48,46,135, maxColorValue = 255)) # pal4 <- c("#4F2683", "#C1582C", "#1E845A", "#C1B52C") pal4 <- c(rgb(79,38,131, maxColorValue = 255), rgb(193,88,44, maxColorValue = 255), rgb(30,132,90, maxColorValue = 255), rgb(193,181,44, maxColorValue = 255)) ## ---- echo=FALSE, cache=TRUE, fig.height=6, fig.width=12, out.width="80%", fig.align="center"---- set.seed(2493) qtl <- c(.5, .75, .9, .95, .99) samps <- list() samps[[1]] <- matrix(rnorm(500*1500), ncol=1500) samps[[2]] <- matrix(rnorm(1000*1500), ncol=1500) samps[[3]] <- matrix(rnorm(1500*1500), ncol=1500) samps[[4]] <- matrix(rnorm(2500*1500), ncol=1500) samps[[5]] <- matrix(rnorm(5000*1500), ncol=1500) out <- lapply(samps, function(x)t(apply(x, 2, quantile, probs=qtl))) out <- do.call(rbind, out) out <- as_tibble(out) names(out) <- paste0("p", c(50,75,90,95,99)) out <- out %>% mutate(n = factor(rep(1:5, each=1500), labels=c("500", "1000", "1500", "2500", "5000"))) out <- out %>% pivot_longer(p50:p99, names_to="qtl", values_to="vals") sds <- out %>% group_by(n, qtl) %>% summarise(sd = sd(vals)) sds %>% ggplot(aes(x=n, y=sd)) + geom_linerange(aes(ymin=0, ymax=sd), col=uwogray) + geom_point() + facet_wrap(~qtl, nrow=2) + theme_bw() + mytheme() + labs(x= "N", y="SD") ## ----bsmed1, echo=T------------------------------------------------------------------- ## set random number generator seed set.seed(123) ## set number of observations n <- 25 ## draw x variable from a chi-squared distribution x <- rchisq(n,1,1) ## Bootstrapping the median with the boot package library(boot) ## set random number generator set.seed(123) ## define function that we will bootstrap med.fun <- function(x, inds){ ## calculate the median for the resampled x values med <- median(x[inds]) ## return the bootstrapped median med } ## use the boot function to botstrap the median boot.med <- boot(x, med.fun, R=1000) ## print the true median median(x) ## summarize the bs replicates of median meds <- boot.med$t t(summary(meds)) ## ----normci, echo=T------------------------------------------------------------------- ## showing how to calculate normal theory BS-CI se <- sd(meds) ## calculate bias bias <- mean(meds) - median(x) norm.ci <- median(x)- bias + qnorm((1+.95)/2)*se*c(-1,1) ## ----pctci, echo=T-------------------------------------------------------------------- ## calculating percentile CIs med.ord <- meds[order(meds)] pct.ci <- med.ord[c(25,975)] ## or alternatively pct.ci <- quantile(meds, c(.025,.975)) ## ----bcaci, echo=T-------------------------------------------------------------------- ## calculate the BC_a CI by hand ## zhat is the normal quantile for # bs replicates ## less than observed value (estimates bias). zhat <- qnorm(mean(meds < median(x))) ## calculate jackknife medians medi <- sapply(1:n, function(i)median(x[-i])) ## find the mean of the jackknifed medians Tdot <- mean(medi) ## calculate a-hat, f(diff between jackknifed medians and mean of jk medians) ahat <- sum((Tdot-medi)^3)/(6*sum((Tdot-medi)^2)^(3/2)) zalpha <- 1.96 z1alpha <- -1.96 ## calculate the quantiles required to make the ## appropriate interval, these will likely not ## be .025 and .975 a1 <- pnorm(zhat + ((zhat + zalpha)/(1-ahat*(zhat + zalpha)))) a2 <- pnorm(zhat + ((zhat + z1alpha)/(1-ahat*(zhat + z1alpha)))) ## alternatively, using quantile bca.ci <- quantile(meds, c(a1, a2)) ## Find the observation number for the appropriate values, ## 1000 is the number of bootstrap replicates a1 <- floor(a1*1000) a2 <- ceiling(a2*1000) bca.ci <- med.ord[c(a2, a1)] ## ----makemat, echo=T, results='asis'-------------------------------------------------- ## collect all of the BS-CI results together mat <- rbind(norm.ci, pct.ci, bca.ci) rownames(mat) <- c("Normal", "Percentile", "BCa") colnames(mat) <- c("Lower", "Upper") # print(xtable(mat), digits=4, type="html", # html.table.attributes = "border = 0") knitr::kable(mat, digits=3) ## ----withboot, echo=T----------------------------------------------------------------- boot.ci(boot.med) ## ----bscor, echo=T-------------------------------------------------------------------- ## bootstrap difference in correlations (requires car and boot packages) set.seed(123) ## load Mroz data data(Mroz, package="carData") ## define function to be bootstrapped cor.fun <- function(dat, inds){ ## save resampled data in tmp tmp <- dat[inds, ] ## calculate correlation between age and lwg for hc == no cor1 <- with(tmp[which(tmp$hc == "no"), ], cor(age, lwg, use="complete")) ## calculate correlation between age and lwg for hc == yes cor2 <- with(tmp[which(tmp$hc == "yes"), ], cor(age, lwg, use="complete")) ## return difference in correlations c(cor1, cor2, cor2-cor1) } ## use boot function to estimate and collect the results. ## notice that we're using Mroz$hc as the strata, which means ## that each bootstrap sample will contain the same number of ## hc=yes and hc==no, even though the observations chosen within ## those sub-samples will be different. boot.cor <- boot(Mroz, cor.fun, R=2000, strata=Mroz$hc) ## ----printcor, echo=T----------------------------------------------------------------- library(DAMisc) out1 <- tidy_boot_ci(boot.cor, type="perc", term_names = c("R no HC", "R HC", "Difference")) out1 out2 <- tidy_boot_ci(boot.cor, type="bca", term_names = c("R no HC", "R HC", "Difference")) out2 ## ---- secpayfig1, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"---- dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) library(dplyr) library(ggplot2) library(broom) dat <- dat %>% mutate(democrat = factor(democrat, levels=c(0,1), labels=c("No", "Yes"))) m1 <- lm(secpay ~ gini + democrat, data=dat) m2 <- lm(secpay ~ gini*democrat, data=dat) ins <- dat %>% filter(!(country %in% c("CzechRepublic", "Slovakia"))) m3 <- lm(secpay ~ gini*democrat, data=ins) augment(m1) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Additive, All Obs.") ## ---- secpayfig2, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"---- augment(m2) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Multiplicative, All Obs") ## ---- secpayfig3, echo=FALSE, fig.height=6, fig.width=6, fig.align="center", out.width="100%"---- augment(m3) %>% ggplot(aes(x=gini)) + geom_point(aes(y=secpay, colour=democrat)) + geom_line(aes(y=.fitted, colour=democrat)) + scale_colour_manual(values=pal2) + theme_bw()+ mytheme() + theme(legend.position = "bottom") + labs(x="GINI", y="Perceived Inequality", colour="Democracy") + ggtitle("Multiplicative, No Outliers") ## ----weak, echo=T--------------------------------------------------------------------- ## read in data library(car) dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## drop two outlying observations dat <- dat[-c(25,49), ] ## save only complete cases of dat for variables in model ## this is important because you don't want the bootstrapping ## to be able to sample missing data, making the mdoel df different ## from sample to sample dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## summarize model S(mod, brief=TRUE) ## ----fixx, echo=T--------------------------------------------------------------------- library(car) ## use the Boot function from the car package ## I am only using R=500 for convenience, in real ## life, it should be probably 1500 or 2000 boot.reg1 <- Boot(mod, R=500, method="residual") ## ----fixx_writefun, echo=F, eval=F---------------------------------------------------- ## define a function reg.fun <- function(dat, inds){ ## set random number generator seed set.seed(123) ## save model residuals (these will get bootstrapped later) resids <- mod$residuals ## save model fitted values yhat <- mod$fitted ## define function to be bootstrapped ## resample model residuals boot.e <- resids[inds] ## add resampled residuals to original fitted values ## to make bootstrapped y boot.y <- yhat + boot.e ## estimate model with bootstrapped y as DV mod <- lm(boot.y ~ gini*democrat, data=dat) ## return model coefficients mod$coef } ## use the boot package to estimate and collect results ## Make sure you did as suggested above to make ## sure the data don't have missing values on the ## variables in your model boot.reg1 <- boot(dat, reg.fun, R=2000) ## ----fixxci, echo=T------------------------------------------------------------------- ## calculate percentile intervals for the four coefficients est1 <- tidy_boot_ci(boot.reg1, type="perc", term_names = names(coef(mod))) ## calculate BCa intervals for the four coefficients est2 <- tidy_boot_ci(boot.reg1, type="bca", term_names = names(coef(mod))) est1 est2 ## ----ranxci, echo=T------------------------------------------------------------------- ## Use Boot function from car to do bootstrapping boot.reg2 <- Boot(mod, R=500, method="case") ## ------------------------------------------------------------------------------------- est1a <- tidy_boot_ci(boot.reg2, type="perc", term_names = names(coef(mod))) ## calculate BCa intervals for the four coefficients est2a <- tidy_boot_ci(boot.reg2, type="bca", term_names = names(coef(mod))) est1a est2a ## ----ranx_writefun, echo=F, eval=F---------------------------------------------------- ## Alternatively, you could write your ## own function and use the boot function ## from the boot package as below: ## define function to be bootstrapped reg.fun2 <- function(dat, inds){ ## generate a local copy of the data with re-sampled rows tmp <- dat[inds, ] ## re-estimate model using new resampled data mod <- lm(secpay ~ gini*democrat, data=tmp) ## return model coefficients mod$coef } ## use boot to estimate and collect results boot.reg2 <- boot(dat, reg.fun2, R=2000) ## ----refit, echo=F, results='hide'---------------------------------------------------- ## reload data dat <- read.csv("http://quantoid.net/files/reg3/weakliem.txt", header=T) ## rescale gini variable to rescale coefficient dat$gini <- dat$gini/100 ## keep only complete cases of variables in model dat <- dat[complete.cases( dat[,c("secpay", "gini", "democrat")]), ] ## estimate model mod <- lm(secpay ~ gini*democrat,data=dat) ## use Boot from car to do bootstrapping boot.reg1a <- Boot(mod, R=500, method="resid") boot.reg2a <- Boot(mod, R=500, method="case") ## ----morefixxci, echo=F, fig.height=6.5, fig.width=15, out.width="100%"--------------- ## save appropriate confidence intervals ## for both models p1 <- tidy_boot_ci(boot.reg1a, type="perc", term_names = names(coef(mod))) p2 <- tidy_boot_ci(boot.reg1a, type="bca", term_names = names(coef(mod))) p3 <- tidy_boot_ci(boot.reg2a, type="perc", term_names = names(coef(mod))) p4 <- tidy_boot_ci(boot.reg2a, type="bca", term_names = names(coef(mod))) ## Collect all of the results into a data frame plot.dat <- bind_rows(p1, p2, p3, p4, est1, est2, est1a, est2a) %>% mutate(outliers = factor(rep(c(1,2), each=16), labels=c("Outliers In", "No Outliers")), type = factor(rep(c(1,2,1,2,1,2,1,2), each=4), levels=c(1,2), labels=c("Percentile", "BCa")), bootstrap = factor(rep(c(1,2,1,2), each=8), levels=c(1,2), labels=c("Fixed-X", "Random-X"))) ## Make figure ggplot(plot.dat, aes(x=bootstrap, colour=type)) + geom_point(aes(y=estimate), position=position_dodge(width=.5)) + geom_linerange(aes(ymin=conf.low, ymax=conf.high), position=position_dodge(width=.5)) + facet_grid(outliers ~ term) + scale_colour_manual(values=pal2) + theme_bw() + mytheme() + theme(legend.position = "top") + labs(x="Bootstrap Type", y="Coefficient (95% CI)", colour="Interval Type") ## ----bsdens, echo=F, fig.height=6.5, fig.width=15, out.width="100%"------------------- library(tidyr) # collect all of the data n <- c("Intercept", "GINI", "Democrat", "Gini*Democrat") dens.dat <- rbind( boot.reg1a$t, boot.reg2a$t, boot.reg1$t, boot.reg2$t) colnames(dens.dat) <- n dens.dat <- as.data.frame(dens.dat) dens.dat$bootstrap <- factor(rep(c(1,2,1,2), each=500), labels=c("Fixed-X", "Random-X")) dens.dat$outliers <- factor(rep(c(1,2), each=1000), labels=c("Outliers In", "No Outliers")) dens.dat <- dens.dat %>% pivot_longer(cols=Intercept:`Gini*Democrat`, names_to="variable", values_to = "vals") ggplot(dens.dat, aes(x=vals, fill=bootstrap)) + geom_density(alpha=.5, col="transparent") + facet_wrap(outliers ~ variable, scales="free", ncol=4) + scale_fill_manual(values=pal2) + theme_bw() + mytheme()+ theme(legend.position="top") + labs(x="Bootstrap Estimates", y="Density", fill="Bootstrap Type") ## ----jab1, echo=F, fig.height=9, fig.width=9, out.height="60%", fig.align="center"---- ## set plot layout to 2x2 par(mfrow=c(2,2)) ## do jackknife after bootstrapping for each regression ## model for the second parameter estimate (the coefficient on GINI) jack.after.boot(boot.reg1, index=2) jack.after.boot(boot.reg1a, index=2) jack.after.boot(boot.reg2, index=2) jack.after.boot(boot.reg2a, index=2) ## ---- eval=FALSE---------------------------------------------------------------------- library(DAMisc) library(progress) load(file("https://quantoid.net/files/reg3/wvs.rda")) wvs$class_num <- as.numeric(wvs$class) wvs <- wvs %>% select(age, class_num, educ, income, sex, happy) %>% na.omit() nreps <- 500 form <- as.formula(class_num ~ age + I(age^2) + educ + income + sex + happy) ## ------------------------------------------------------------------------------------- data(Prestige, package="carData") mod <- lm(prestige ~ log(income) + poly(women, 2, raw=TRUE) + poly(education, 3), data=Prestige) b1 <- unname(coef(mod)[3]) b2 <- unname(coef(mod)[4]) minprest <- -b1/(2*b2) minprest ## ------------------------------------------------------------------------------------- boot.min <- function(data, inds){ m <- update(mod, data=data[inds, ]) b1 <- unname(coef(m)[3]) b2 <- unname(coef(m)[4]) -b1/(2*b2) } out1 <- boot(Prestige, boot.min, R=5000) boot.ci(out1, type="perc") ## ------------------------------------------------------------------------------------- B <- MASS::mvrnorm(5000, coef(mod), vcov(mod)) out2 <- -B[,3]/(2*B[,4]) quantile(out2, c(.025, .975))