## ----echo=F, results='hide', fig.show='hide'-----------------------------
library(knitr)
opts_chunk$set(tidy=TRUE, tidy.opts = list(only.comment=TRUE, width.cutoff=80),
               warning=FALSE, message=FALSE)

## ----regtree, echo=T------------------------------------------------------
## rpart has the rpart function for estimating CART models
library(rpart)
library(car)
## load the data
data(SLID)
## listwise delete on wages, age and education
SLID <- SLID[complete.cases(SLID[,c("wages", "age", "education")]), ]
## estimate the CART model
mod <- rpart(log(wages) ~ age + education, data=SLID)
## print the CART model results
mod

## ----regtreeplot, echo=T, fig.show='hide'----------------------------------
## plot the decision tree
plot(mod)
## add text (variable names and split values) to the plot
text(mod)

## ----regtreesurf, echo=T, fig.show='hide'----------------------------------
## create values of age from 20-95 and education from 8-20
age.s <- 20:95
educ.s <- 8:20
## create all possible combinations of age and education
eg <- expand.grid(age=age.s, education=educ.s)
## get fitted values from the CART model
p <- predict(mod, newdata=eg)
## save the predicted values in the "fit" variable of the
## fake data we made to get predictions
eg$fit <- p
## load the lattice package to have access to the wireframe function
library(lattice)
## make the 3-D plot
wireframe(fit ~ age + education, data=eg, drape=T)

## ----comptreemods, echo=T---------------------------------------------------
## load the mgcv and KRLS packages
library(mgcv)
library(KRLS)
## estimate a linear model, multiplicative in age and education
lm.mod <- lm(log(wages) ~ education*age, data=SLID)
## estimate the GAM, interactive in age and education
gam.mod <- gam(log(wages) ~ ti(education) + ti(age) + ti(age, education),
               data=SLID)
## The KRLS model takes a long time to estimate, so I saved
## the predictions in the file "predictions.txt" and I'm reading
## them in, but if you wanted to run the model, the syntax is below
## (note that krls takes an X matrix and a y vector, not a formula)
# krls.mod <- krls(X=SLID[,c("education", "age")], y=log(SLID$wages),
#                  derivative=FALSE)
preds <- dget("http://quantoid.net/files/reg3/predictions.txt")
## correlate all of the predictions with the observed response
r <- do.call(cor, preds)^2
colnames(r) <- "R-squared"
r
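## A minimal sketch (an addition, not in the original) of how the "preds"
## object read in above is presumably structured: a two-element list whose
## "x" element is a matrix of fitted values (one column per model) and whose
## "y" element is the observed log(wages), so that do.call(cor, preds)
## evaluates cor(x, y). This version omits the slow KRLS column.
preds.sketch <- list(
  x = cbind(lm = fitted(lm.mod), gam = fitted(gam.mod)),
  y = log(SLID$wages)
)
do.call(cor, preds.sketch)^2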
## ----rf, echo=T--------------------------------------------------------------
## load the randomForest package
library(randomForest)
## estimate the random forest model
rfmod <- randomForest(log(wages) ~ age + education, data=SLID)
## add the random forest predictions to our set of predicted values
preds$x <- cbind(preds$x, rf=rfmod$predicted)
## get squared correlations between the predictions and the observed y
r <- do.call(cor, preds)^2
colnames(r) <- "R-squared"
## print the correlation matrix
r

## ----rfsurf, echo=T, fig.show='hide'-----------------------------------------
## make new fake data for generating predictions
eg <- expand.grid(age=age.s, education=educ.s)
## generate predictions from the model
p <- predict(rfmod, newdata=eg)
## add the predictions to the eg data frame
eg$fit <- p
## make the 3-D plot
wireframe(fit ~ age + education, data=eg, drape=T)

## ----gbm, echo=T-------------------------------------------------------------
## load the gbm package to estimate gradient boosting machines
library(gbm)
## estimate the GBM model
gbm.mod <- gbm(log(wages) ~ age + education, data=SLID, interaction.depth=10)
## generate predictions and put them in preds
preds$x <- cbind(preds$x, GBM = gbm.mod$fit)
## generate squared correlations with the observed y
r <- do.call(cor, preds)^2
colnames(r) <- "R-squared"
r

## ----gbmsurf, echo=T, fig.show='hide'----------------------------------------
## make fake data to plot the fitted surface
eg <- expand.grid(age=age.s, education=educ.s)
## generate predictions
p <- predict(gbm.mod, newdata=eg, n.trees=100)
## add the predictions to eg in the "fit" variable
eg$fit <- p
## make the 3-D plot
wireframe(fit ~ age + education, data=eg, drape=T)

## ----xgb, echo=T-------------------------------------------------------------
## load the xgboost package
library(xgboost)
## organize the data in the way that xgboost expects
form <- as.formula(log(wages) ~ age + education)
X <- model.matrix(form, data=SLID)[,-1]
y <- model.response(model.frame(form, SLID))
## Use xgboost to estimate the model. You could set nrounds to a high
## value first (like 100) and look at a plot of the training RMSE
## relative to the number of rounds (code commented out right below).
# xgb.mod <- xgboost(X, label=y, subsample=.5, nrounds=100)
# plot(1:100, as.vector(xgb.mod$evaluation_log[,2])[[1]],
#      type="l")
## It looks like about 10 rounds is sufficient
xgb.mod <- xgboost(X, label=y, subsample=.5, nrounds=10)
## generate predictions and put them in preds
xgb.fit <- predict(xgb.mod, newdata=X)
preds$x <- cbind(preds$x, XGB = xgb.fit)
## generate squared correlations with the observed y
r <- do.call(cor, preds)^2
colnames(r) <- "R-squared"
r

## ----xgbsurf, echo=T, fig.show='hide'----------------------------------------
## make fake data to plot the fitted surface
eg <- expand.grid(age=age.s, education=educ.s)
## generate predictions
p <- predict(xgb.mod, newdata=as.matrix(eg))
## add the predictions to eg in the "fit" variable
eg$fit <- p
## make the 3-D plot
wireframe(fit ~ age + education, data=eg, drape=T)

## ----xgbcv, echo=T-----------------------------------------------------------
## cross-validate the learning rate (eta) on a random half-sample
rsamp <- sample(1:nrow(X), floor(nrow(X)/2), replace=F)
dtrain <- xgb.DMatrix(X[rsamp, ], label = y[rsamp])
cv1 <- xgb.cv(data = dtrain, nrounds = 10, nthread = 2, nfold = 5,
              metrics = list("rmse"), max_depth = 3, eta = 1,
              objective = "reg:linear")
cv2 <- update(cv1, eta=.5)
cv3 <- update(cv1, eta=.25)
xgb.mod2 <- xgboost(X, label=y, nrounds=8, eta=.5, max_depth=3, subsample=.5)
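## A small sketch (an addition, not in the original) of one way to compare
## the three learning rates that motivated the eta=.5 choice above: pull the
## cross-validated test RMSE out of each result's evaluation log and take the
## best value for each eta. The column name test_rmse_mean is the one used by
## recent versions of xgboost.
sapply(list(eta1 = cv1, eta.5 = cv2, eta.25 = cv3),
       function(cv) min(cv$evaluation_log$test_rmse_mean))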
## ----xgbsurf2, echo=T, fig.show='hide'---------------------------------------
## create all possible combinations of age and education
age.s <- 20:95
educ.s <- 8:20
eg <- expand.grid(age=age.s, education=educ.s)
## generate predictions
p <- predict(xgb.mod2, newdata=as.matrix(eg))
## add the predictions to eg in the "fit" variable
eg$fit <- p
## make the 3-D plot
wireframe(fit ~ age + education, data=eg, drape=T)

## ----bart, echo=T------------------------------------------------------------
## For bartMachine to work, you need to install the rJava package. For the
## rJava package to work, you need the Java JDK for the appropriate R
## architecture to be installed. If you type version into R and hit enter,
## you'll be able to find out which architecture you're using from the
## "platform" entry. If you see x86_64 (or something else with 64), then
## you're using the 64-bit architecture; otherwise it's 32-bit. You can
## download the JRE here:
## http://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html
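## A quick way to run the architecture check described above (this line is
## an addition, not part of the original code):
version$platform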
library(rJava)
library(bartMachine)
## bartMachine requires a design matrix (stored as a data frame) in X
## and a vector of response values in y.
X <- model.matrix(log(wages) ~ age + education, data=SLID)[,-1]
y <- model.response(model.frame(log(wages) ~ age + education, data=SLID))
## estimate the BART model
bart.mod <- bartMachine(as.data.frame(X), y, verbose=F)
## generate predictions from the BART model
preds$x <- cbind(preds$x, BART = bart.mod$y_hat_train)
## make and print the squared correlations
r <- do.call(cor, preds)^2
colnames(r) <- "R-squared"
r

## ----bartsurf, echo=T, fig.show='hide'---------------------------------------
## make fake data to generate the predicted surface
eg <- expand.grid(age=age.s, education=educ.s)
## generate predicted values from the fake data
pred.bart <- predict(bart.mod, new_data=eg)
## add the predictions to the eg data frame
eg$fit <- pred.bart
## make the 3-D plot of the fitted surface
wireframe(fit ~ age + education, data=eg, drape=T)

## ----gdp, echo=T, results='hide'---------------------------------------------
## Using the banks data, estimate all the models we just talked about,
## generate predictions and correlate them with the observed response
library(readstata13)
banks <- read.dta13("http://quantoid.net/files/reg3/banks99.dta")
banks.dat <- banks[,-c(1,2,4,5)]
X <- model.matrix(gdppc_mp ~ ., data=banks.dat)
y <- model.response(model.frame(gdppc_mp ~ ., data=banks.dat))
cart1 <- rpart(y ~ X)
rf1 <- randomForest(X, y)
gbm1 <- gbm(gdppc_mp ~ ., data=banks.dat, interaction.depth=10)
rsamp <- sample(1:nrow(X), floor(nrow(X)/2), replace=F)
xgbc <- xgb.cv(data=X[rsamp, ], label=y[rsamp], max_depth=5, eta=.5,
               nrounds=25, nfold=3)
xgb1 <- xgboost(data = X, label=y, subsample=.5, max_depth=5, eta=.5,
                nrounds=10)
bart1 <- bartMachine(as.data.frame(X), y, verbose=F)

## ----gdpcor, echo=T----------------------------------------------------------
preds <- cbind(
  CART = predict(cart1, newdata=as.data.frame(X)),
  RF = rf1$predicted,
  GBM = gbm1$fit,
  XGB = predict(xgb1, newdata=X),
  BART = bart1$y_hat_train)
cor(preds, y)^2

## ----varimp, echo=F, results='hide', fig.show='hide', fig.height=8, fig.width=8----
## get variable importance measures from the BART model
i <- investigate_var_importance(bart1, type="splits", plot=F)
## vars has the names of all variables
vars <- colnames(X)
## get the CART importance measures and scale them into [0,1]
cart.imp <- cart1$variable.importance/sum(cart1$variable.importance)
## take the "X" off the start of the variable names from the CART model
names(cart.imp) <- gsub("X", "", names(cart.imp))
## get importance from the randomForest model and scale into [0,1]
rf.imp <- rf1$importance/sum(rf1$importance)
## get relative influence from the GBM model and scale into [0,1]
gbm.imp <- relative.influence(gbm1)/sum(relative.influence(gbm1))
## importance from the XGBoost model
xgb.imp <- xgb.importance(model=xgb1)
## take the average variable inclusion proportions from the BART model
bart.imp <- i$avg_var_props
## collect all of the results and store them in imp.data
imp.data <- data.frame(
  var = rep(vars, 5),
  imp = c(
    c(cart.imp[match(vars, names(cart.imp))]),
    c(rf.imp[match(vars, rownames(rf.imp))]),
    c(gbm.imp[match(vars, names(gbm.imp))]),
    c(unlist(xgb.imp[match(vars, xgb.imp$Feature), 2])),
    c(bart.imp[match(vars, names(bart.imp))])),
  model = factor(rep(c("CART", "RF", "GBM", "XGB", "BART"),
                     each = length(vars)),
                 levels=c("CART", "RF", "GBM", "XGB", "BART"))
)
## data management to clean up before plotting:
## replace missing importances with zero
imp.data$imp <- ifelse(is.na(imp.data$imp), 0, imp.data$imp)
## drop the intercept
imp.data <- imp.data[-grep("Intercept", imp.data$var), ]
## order variables by average importance
imp.data$var <- reorder(imp.data$var, imp.data$imp, mean, na.rm=T)
## make the strips light gray in color
trellis.par.set(strip.background=list(col="gray85"))
## make a dot plot of the results, ordered by average importance
dotplot(var ~ imp | model, data=imp.data, pch=16, col="black",
        layout=c(3,2), as.table=T)

## ----ice1, echo=T, results='hide', fig.show='hide'---------------------------
library(RColorBrewer)
cols <- brewer.pal(5, "Set1")
## load ICEbox, which calculates individual conditional expectation (ICE) plots
library(ICEbox)
## make the ICE curves from the XGBoost model
ice1 <- ice(xgb1, X=X, y=y, predictor="imports_pc")
## cluster the ICE curves into 5 groups
cice1 <- clusterICE(ice1, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----ice2, echo=T, results='hide', fig.show='hide'---------------------------
## get the ICE curves from the BART model
ice2 <- ice(bart1, as.data.frame(X), y, predictor="imports_pc")
## cluster the ICE curves into 5 groups
cice2 <- clusterICE(ice2, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----pdp1a, echo=T, fig.show='hide'------------------------------------------
## load the partial dependence plot package
library(pdp)
## get the partial effect of imports/capita from the GBM model
p1 <- partial(gbm1, pred.var="imports_pc", n.trees=100, chull=TRUE)
## plot the partial effect
plotPartial(p1)

## ----pdp1b, echo=T, results='hide', fig.show='hide'--------------------------
## use pd_plot from the bartMachine package to get the
## partial dependence plot from the BART model
p2 <- pd_plot(bart1, j="imports_pc")
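## A minimal sketch (an addition, not in the original) of what partial
## dependence computes: fix the focal predictor at a grid value, leave every
## other column at its observed values, and average the model's predictions
## over the data. It assumes imports_pc is a column of banks.dat and uses the
## 100 trees that gbm fits by default.
pd.sketch <- sapply(seq(min(banks.dat$imports_pc, na.rm=TRUE),
                        max(banks.dat$imports_pc, na.rm=TRUE), length=10),
  function(v) {
    tmp <- banks.dat
    tmp$imports_pc <- v
    mean(predict(gbm1, newdata=tmp, n.trees=100), na.rm=TRUE)
  })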
## ----friedman1, echo=T-------------------------------------------------------
## Below is the code to produce the "Friedman" data, commonly used as a
## benchmark for understanding flexible-fitting models. Note the interaction
## between x1 and x2 along with the non-linearity in x3.
set.seed(11)
n <- 200
p <- 5
X <- data.frame(matrix(runif(n * p), ncol = p))
Xm <- as.matrix(X)
y <- 10 * sin(pi * X[,1] * X[,2]) + 20 * (X[,3] - .5)^2 +
  10 * X[,4] + 5 * X[,5] + rnorm(n)
df <- as.data.frame(cbind(X, y))
## estimate CART, random forest, XGBoost and BART on the Friedman data
cart2 <- rpart(y ~ ., data=df)
rf2 <- randomForest(y ~ ., data=df)
gbm2 <- xgboost(data=Xm, label=y, nrounds=5, subsample=.5)
bart2 <- bartMachine(X, y, verbose=F)

## ----truex12, echo=F, fig.show='hide', fig.height=8, fig.width=8-------------
## plot the true x1-x2 interaction surface
tmpX <- expand.grid(seq(0, 1, length=15), seq(0, 1, length=15))
tmpy <- matrix(sin(pi * tmpX[,1] * tmpX[,2]), ncol=15)
fields:::drape.plot(seq(0, 1, length=15), seq(0, 1, length=15), tmpy,
                    xlab="x1", ylab="x2", zlab="Predicted Values",
                    col=brewer.pal(11, "RdBu"))

## ----truex3, echo=F, fig.show='hide', fig.height=8, fig.width=8--------------
## plot the true (quadratic) effect of x3
s <- seq(0, 1, length=25)
plot(s, (s - .5)^2, type="l", ylab="Predicted Values", xlab="X3", main="X3")

## ----fice1, echo=T, fig.show='hide', results='hide'--------------------------
library(RColorBrewer)
cols <- brewer.pal(5, "Set1")
## make ICE plots for X1 from all four models and cluster them
fice1 <- ice(cart2, X=X, y=y, predictor="X1")
clusterICE(fice1, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----fice2, echo=T, fig.show='hide', results='hide'--------------------------
fice2 <- ice(rf2, X=X, y=y, predictor="X1")
clusterICE(fice2, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----fice3, echo=T, fig.show='hide', results='hide'--------------------------
fice3 <- ice(gbm2, X=Xm, y=y, predictor="X1")
clusterICE(fice3, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----fice4, echo=T, fig.show='hide', results='hide'--------------------------
fice4 <- ice(bart2, X=X, y=y, predictor="X1")
clusterICE(fice4, nClusters=5, plot_legend=TRUE, colorvec=cols)

## ----fpdp1, echo=T, fig.show='hide', results='hide'--------------------------
## make partial dependence plots for X1 from all models and plot them
fp1 <- partial(cart2, train=df, pred.var="X1")
plotPartial(fp1)

## ----fpdp2, echo=T, fig.show='hide', results='hide'--------------------------
fp2 <- partial(rf2, pred.var="X1")
plotPartial(fp2)

## ----fpdp3, echo=T, fig.show='hide', results='hide'--------------------------
xgb.plot.shap(model=gbm2, data=Xm, features=c("X1"))

## ----fpdp4, echo=T, fig.show='hide', results='hide'--------------------------
pd_plot(bart2, j="X1")
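## Because the Friedman function is known, we can check the X1 plots above
## against the truth (this check is an addition, not part of the original
## code). Averaging 10*sin(pi*x1*x2) over x2 ~ U(0,1) gives
## 10*(1 - cos(pi*x1))/(pi*x1), so up to an additive constant the true
## partial dependence curve for X1 can be drawn directly.
s1 <- seq(0.01, 1, length=50)
plot(s1, 10 * (1 - cos(pi * s1))/(pi * s1), type="l",
     xlab="X1", ylab="True partial dependence")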
## ----x1x2surfs, eval=F, echo=F-----------------------------------------------
## fake <- expand.grid(X1=seq(0,1,length=25), X2=seq(0,1,length=25),
##                     X3=.5, X4=.5, X5=.5)
## p.cart <- predict(cart2, newdata=fake)
## p.rf <- predict(rf2, newdata=fake)
## p.xgb <- predict(object=gbm2, newdata=as.matrix(fake))
## p.bart <- predict(bart2, new_data=fake)
##
## fields:::drape.plot(seq(0,1,length=25), seq(0,1,length=25),
##                     matrix(p.cart, ncol=25), xlab="x1", ylab="x2",
##                     zlab="Predicted Values", col=brewer.pal(11, "RdBu"),
##                     theta=-25)
## fields:::drape.plot(seq(0,1,length=25), seq(0,1,length=25),
##                     matrix(p.rf, ncol=25), xlab="x1", ylab="x2",
##                     zlab="Predicted Values", col=brewer.pal(11, "RdBu"),
##                     theta=-25)
## fields:::drape.plot(seq(0,1,length=25), seq(0,1,length=25),
##                     matrix(p.xgb, ncol=25), xlab="x1", ylab="x2",
##                     zlab="Predicted Values", col=brewer.pal(11, "RdBu"),
##                     theta=-25)
## fields:::drape.plot(seq(0,1,length=25), seq(0,1,length=25),
##                     matrix(p.bart, ncol=25), xlab="x1", ylab="x2",
##                     zlab="Predicted Values", col=brewer.pal(11, "RdBu"),
##                     theta=-75)