## ----echo=F, results='hide', fig.show='hide'----------------------------- library(knitr) opts_chunk$set(tidy=TRUE, tidy.opts = list(only.comment=TRUE, width.cutoff=80), warning=FALSE, message=FALSE) ## ----nntest2, echo=T----------------------------------------------------- library(car) data(Prestige) ## estimate two non-nested models using the ## Prestige data mod1 <- lm(prestige ~ income + women, data=na.omit(Prestige), y=T) mod2 <- lm(prestige ~ education + type + women, data=na.omit(Prestige), y=T) ## calculate IC measures AIC(mod1) ## AICc is in the AICcmodavg packasge library(AICcmodavg) AICc(mod1) BIC(mod1) ## ----vc, echo=T---------------------------------------------------------- library(games) vuong(mod1, mod2) clarke(mod1, mod2) ## ----rcv, echo=T, results='hide', fig.show='hide', fig.width=7, fig.height=12---- library(DAMisc) ## parcor package has function for fitting ridge regression ## with cross-validation for the ridge penalty library(parcor) ## load tha banks data library(readstata13) banks99 <- read.dta13( "http://quantoid.net/files/reg3/banks99.dta") ## standardize quantitative variables in banks data banks99s <- scaleDataFrame(banks99[,-c(1,2,4)]) ## Make X and y so that under5_mort is the dv ## and all other variables are the independent variables, ## except for country code, year and country name, ## columns 1, 2 and 4, respectively X <- model.matrix(gdppc_mp ~. , data=banks99s)[,-1] y <- model.response(model.frame(gdppc_mp ~. , data=banks99s)) ## use X and y from Ericksen data above and ## the cross-validating ridge function rcv <- ridge.cv(X,y) mod <- lm(y ~ X) ## calculate the ratio of the coefficients ## to show how different they are between ## constrained and unconstrained. rat <- with(rcv, c(intercept, coefficients))/coef(mod) ## remove X from the front of the variable names names(rat) <- gsub("X", "", names(rat)) ## Make a dot-plot of the coefficients library(lattice) dotplot(sort(rat), col="black") ## focus in on the panel to add a ## vertical line at 0 trellis.focus("panel", 1, 1) panel.abline(v=0, lty=2, col="gray65") panel.abline(v=c(-1,1), lty=3, col="gray75") trellis.unfocus() ## ----lasso1, echo=T------------------------------------------------------ library(glmnet) ## estimate the LASSO for the Banks data g <- glmnet(X, y) ## use cross-validation to choose the penalty ## parameter for the LASSO cvg <- cv.glmnet(X,y) ## compare coefficients with the linear model round(cbind(coef(cvg), coef(mod)), 4) ## ----cors, echo=T-------------------------------------------------------- ## make a data frame that contains predictions tmp <- data.frame( ridge = c(cbind(1, X) %*% with(rcv, c(intercept, coefficients))), lasso = c(cbind(1, X) %*% as.matrix(coef(cvg))), ols = fitted(mod) ) ## correlate and round to 4 decimal places. round(cor(tmp), 4) ## ----alasso, echo=T------------------------------------------------------ # estimate initial ridge regression and save coefficients b.ridge <- coef(ridge.cv(X,y)) # calculate weights gamma <- 1 w <- 1/(abs(b.ridge)^gamma) # estimate the LASSO with the weights cvg <- cv.glmnet(X,y, penalty.factor=w) coef(cvg) ## ----hinge, echo=F, out.width="\\textwidth"------------------------------ ## Example of hinge functions. x <- c(seq(-2, 0, length=51), seq(0, 2, length=51)[-1]) t <- 0 h1 <- function(x, t)ifelse(x> t, x-t, 0) h2 <- function(x, t)ifelse(t >x, t-x, 0) plot(x, h1(x, 0), type="l") lines(x, h2(x, 0), lty=2) legend("top", c("(x-t)+", "(t-x)+", lty=c(1,2)), inset=.01) ## ----earth1, echo=T------------------------------------------------------ ## Make the Friedman data again set.seed(11) n = 200 p = 5 X = data.frame(matrix(runif(n * p), ncol = p)) y = 10 * sin(pi* X[ ,1] * X[,2]) +20 * (X[,3] -.5)^2 + 10 * X[ ,4] + 5 * X[,5] + rnorm(n) df <- as.data.frame(cbind(X,y)) ## Load the earth package and estimate the model library(earth) ## Here, the pmethod corresponds to pruning (backward pass) e1 <- earth(X,y, nfold=10, ncross=10, pmethod="cv", degree=2) ## ----earthsum, echo=T---------------------------------------------------- summary(e1) ## ----e1pdp, echo=T, fig.show='hide', results='hide'---------------------- library(RColorBrewer) cols <- brewer.pal(5, "Set1") ## Plot partial dependence plot for X1 from the MARS model library(pdp) ep1 <- partial(e1, train=X, pred.var="X1") plotPartial(ep1) ## ----e2pdp, echo=T, fig.show='hide', results='hide'---------------------- library(ICEbox) ## Plot the ICE plot for X1 from the MARS model ep2 <- ice(e1, X=X, y=y, predictor="X1") clusterICE(ep2, nClusters=5, plot_legend=TRUE, colorvec=cols) ## ----fitvals, echo=T, fig.show='hide'------------------------------------ ## The MARS package doesn't require mgcv, so if you're ## going to use the gam variance model, you'll have to load ## mgcv manually first. library(mgcv) ## estimate the MARS model with prediction variances e2 <- earth(X,y, nfold=10, ncross=10, pmethod="cv", degree=2, varmod.meth="gam") ## plot the model using the earth package's plotting method plotmo(e2, pt.col=1, level=.95) ## ----polywog1, echo=T---------------------------------------------------- library(polywog) p1 <- polywog(y ~ ., data=df) sort(coef(p1)[-which(coef(p1) == 0)]) ## ----polyice, echo=T, fig.show='hide', results='hide'-------------------- pice <- ice(p1, X, y, predictor="X3") clusterICE(pice, nClusters=5, plot_legend=T, colorvec=cols) ## ----e3, echo=T---------------------------------------------------------- library(readstata13) ## read in the banks data banks <- read.dta13("http://quantoid.net/files/reg3/banks99.dta") ## take out the variables we don't want in the prediction banks.dat <- banks[,-c(1,2,4,5)] ## make the X and y matrices to feed into the MARS routine banks.X <- model.matrix(gdppc_mp ~ ., data=banks.dat)[,-1] banks.y <- log(model.response(model.frame(gdppc_mp~ ., data=banks.dat))) ## Estimate the MARS model on the banks data ## set ncross to something higher than 5 IRL e3 <- earth(banks.X, banks.y, nfold=10, ncross=5, degree=2, pmethod="cv") summary(e3) ## ----i3, echo=T, fig.show='hide', results='hide'------------------------- ## make ICE plot for vehicles/capita i3 <- ice(e3, X=banks.X, y=banks.y, predictor="all_veh_pc") clusterICE(i3, nClusters=5, plot_legend=TRUE, colorvec=cols) ## ----mods, echo=T, results='hide'---------------------------------------- ## Estimate all the models above for the GDP data library(earth) library(polywog) library(foreign) dat <- read.dta("http://quantoid.net/files/reg3/gdp_data_2000.dta") Xm <- model.matrix(log(rgdpna_pc) ~ ., data=dat)[,-1] X <- as.data.frame(Xm) y <- model.response(model.frame(log(rgdpna_pc) ~ ., data=dat)) m5 <- earth(log(rgdpna_pc) ~ ., data=dat, pmethod="cv", ncross=10, nfold=10, degree=3) m6 <- polywog(log(rgdpna_pc) ~ primsch_enroll_pc + polity2 + pop_c100k_pc, data=dat, degree=3) ## ----realpred, echo=T---------------------------------------------------- preds <- cbind( c(predict(m5, newdata=X)), predict(m6, newdata=X) ) colnames(preds) <- c("MARS", "PWOG") cor(preds, y)^2 ## ----pdm5, echo=T, results=F, fig.show='hide'---------------------------- plotPartial(partial(m5, train=X, pred.var="polity2")) ## ----pdm6, echo=T, results=F, fig.show='hide'---------------------------- plotPartial(partial(m6, X=X, pred.var="polity2", type="regression")) ## ----icm5, echo=T, results='hide', fig.show='hide'----------------------- clusterICE(ice(m5, X=X, y=y, predictor="polity2"), plot_legend=T, colorvec=cols, nClusters=5) ## ----icm6, echo=T, results='hide', fig.show='hide'----------------------- clusterICE(ice(m6, X=X, y=y, predictor="polity2"), plot_legend=T, colorvec=cols, nClusters=5) ## ----compmars, echo=T, fig.show='hide', results='hide'------------------- ## load the effects package library(splines) library(effects) ## estimate the polynomial regression model with polity2 lm.mod <- lm(log(rgdpna_pc) ~ poly(polity2, 2, raw=TRUE) + bs(pop_c100k_pc, df=8) + primsch_enroll_pc, data=dat) ## use the effects package to calculate the effect of polity2 from ## the polynomial model. eff <- effect("poly(polity2, 2, raw=TRUE)", lm.mod, xlevels=21) ## create the partial dependence plot from the MARS ## model for polity 2. plotPartial(partial(m5, train=X, pred.var="polity2")) ## plotPartial is a lattice plot, so to add elements ## to an existing plot, you need to focus in on the ## relevant panel. The 1,1 correspond to row and column ## numbers of the panel display. For single-panel displays, ## this will always be 1,1 trellis.focus("panel", 1, 1) ## add the polity predictions panel.lines(eff$x$polity2, eff$fit, col="red", lwd=2) trellis.unfocus() ## ----compbart, echo=T, fig.show='hide', results='hide'------------------- ## make the BART partial dependence plot plotPartial(partial(m6, X=X, pred.var="polity2", type="regression")) trellis.focus("panel", 1, 1) ## add the polity predictions panel.lines(eff$x$polity2, eff$fit, col="red", lwd=2) trellis.unfocus()