From e5ebfb1e75928a61730f8567accfcbdb2730c047 Mon Sep 17 00:00:00 2001 From: asardaes Date: Fri, 2 Dec 2016 19:59:07 +0100 Subject: [PATCH 1/3] Unload library with .onUnload function Best practice --- pkg/caret/R/aaa.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/caret/R/aaa.R b/pkg/caret/R/aaa.R index a04d1467..85aa7fd2 100644 --- a/pkg/caret/R/aaa.R +++ b/pkg/caret/R/aaa.R @@ -21,6 +21,9 @@ NULL +.onUnload <- function(libpath) { library.dynam.unload("caret", libpath) } + + ################################################################### ## Global Variables ################################################################### From 2adfd5e67fe10c6eace1078e579fa2db7a5178d7 Mon Sep 17 00:00:00 2001 From: asardaes Date: Fri, 2 Dec 2016 21:29:04 +0100 Subject: [PATCH 2/3] Minor adjustments in print.train --- pkg/caret/R/print.train.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/caret/R/print.train.R b/pkg/caret/R/print.train.R index 9d88c50b..fdc580e8 100644 --- a/pkg/caret/R/print.train.R +++ b/pkg/caret/R/print.train.R @@ -132,7 +132,7 @@ stringFunc <- function (x) { cat("Resampling results") if(dim(tuneAcc)[1] > 1) cat(" across tuning parameters") - if(showSD) cat(" (values above are 'mean (sd)')") + if(showSD) cat(" (values below are 'mean (sd)')") cat(":\n\n") if(dim(tuneAcc)[1] > 1) { @@ -264,7 +264,8 @@ stringFunc <- function (x) { cat(truncateText(met)) } - cat(truncateText(optString), "\n") + cat(truncateText(optString)) + if(nzchar(optString)) cat("\n") } else printMat <- NULL if(details) { From 78cc0d7066340e5700f49d390e07d098b56d0506 Mon Sep 17 00:00:00 2001 From: asardaes Date: Fri, 2 Dec 2016 22:28:14 +0100 Subject: [PATCH 3/3] Implemented optimism bootstrap For nominal workflow --- pkg/caret/R/confusionMatrix.R | 5 +- pkg/caret/R/print.train.R | 2 +- pkg/caret/R/train.default.R | 23 +++++--- pkg/caret/R/trainControl.R | 20 ++++++- pkg/caret/R/workflows.R | 127 ++++++++++++++++++++++++++++++++++++++---- 
pkg/caret/man/train.Rd | 2 +- pkg/caret/man/trainControl.Rd | 20 ++++++- 7 files changed, 170 insertions(+), 29 deletions(-) diff --git a/pkg/caret/R/confusionMatrix.R b/pkg/caret/R/confusionMatrix.R index 37304552..4d6ed993 100644 --- a/pkg/caret/R/confusionMatrix.R +++ b/pkg/caret/R/confusionMatrix.R @@ -572,7 +572,8 @@ resampName <- function(x, numbers = TRUE){ ifelse(x$control$fixedWindow, " a ", " no "), "fixed window)", sep = ""), oob = "Out of Bag Resampling", - boot =, boot632 = paste("Bootstrapped (", numResamp, " reps)", sep = ""), + boot =, optimism_boot =, boot_all =, + boot632 = paste("Bootstrapped (", numResamp, " reps)", sep = ""), cv = paste("Cross-Validated (", x$control$number, " fold)", sep = ""), repeatedcv = paste("Cross-Validated (", x$control$number, " fold, repeated ", x$control$repeats, " times)", sep = ""), @@ -593,6 +594,8 @@ resampName <- function(x, numbers = TRUE){ timeslice = "Rolling Forecasting Origin Resampling", oob = "Out of Bag Resampling", boot = "(Bootstrap)", + optimism_boot = "(Optimism Bootstrap)", + boot_all = "(Bootstrap All)", boot632 = "(Bootstrap 632 Rule)", cv = "(Cross-Validation)", repeatedcv = "(Repeated Cross-Validation)", diff --git a/pkg/caret/R/print.train.R b/pkg/caret/R/print.train.R index fdc580e8..1775a890 100644 --- a/pkg/caret/R/print.train.R +++ b/pkg/caret/R/print.train.R @@ -199,7 +199,7 @@ stringFunc <- function (x) { } else constString <- NULL } else constString <- NULL - tuneAcc <- tuneAcc[,!grepl("Apparent$", names(tuneAcc)),drop = FALSE] + tuneAcc <- tuneAcc[,!grepl("Apparent$|Optimism$", names(tuneAcc)), drop = FALSE] colnames(tuneAcc)[colnames(tuneAcc) == ".B"] <- "Resamples" nms <- names(tuneAcc)[names(tuneAcc) %in% params] sort_args <- vector(mode = "list", length = length(nms)) diff --git a/pkg/caret/R/train.default.R b/pkg/caret/R/train.default.R index ff367184..f97e1ed1 100644 --- a/pkg/caret/R/train.default.R +++ b/pkg/caret/R/train.default.R @@ -89,7 +89,7 @@ #' tuning parameters.} 
\item{bestTune }{a data frame with the final #' parameters.} #' -#' \item{call }{the (matched) function call with dots expanded} \item{dots}{a +#' \item{call}{the (matched) function call with dots expanded} \item{dots}{a #' list containing any ... values passed to the original call} \item{metric}{a #' string that specifies what summary metric will be used to select the optimal #' model.} \item{control}{the list of control parameters.} \item{preProcess @@ -359,7 +359,8 @@ train.default <- function(x, y, alt_cv =, cv = createFolds(y, trControl$number, returnTrain = TRUE), repeatedcv =, adaptive_cv = createMultiFolds(y, trControl$number, trControl$repeats), loocv = createFolds(y, n, returnTrain = TRUE), - boot =, boot632 =, adaptive_boot = createResample(y, trControl$number), + boot =, boot632 =, optimism_boot =, boot_all =, + adaptive_boot = createResample(y, trControl$number), test = createDataPartition(y, 1, trControl$p), adaptive_lgocv =, lgocv = createDataPartition(y, trControl$number, trControl$p), timeslice = createTimeSlices(seq(along = y), @@ -394,13 +395,16 @@ train.default <- function(x, y, stop('`savePredictions` should be either logical or "all", "final" or "none"') } - ## Create hold--out indicies - if(is.null(trControl$indexOut) & trControl$method != "oob"){ + ## Create holdout indices + if(is.null(trControl$indexOut) && trControl$method != "oob"){ if(tolower(trControl$method) != "timeslice") { y_index <- if(class(y)[1] == "Surv") 1:nrow(y) else seq(along = y) - trControl$indexOut <- lapply(trControl$index, - function(training, allSamples) allSamples[-unique(training)], - allSamples = y_index) + trControl$indexOut <- lapply(trControl$index, function(training) setdiff(y_index, training)) + if(trControl$method %in% c("optimism_boot", "boot_all")) { + trControl$indexExtra <- lapply(trControl$index, function(training) { + list(origIndex = y_index, bootIndex = training) + }) + } names(trControl$indexOut) <- prettySeq(trControl$indexOut) } else { 
trControl$indexOut <- createTimeSlices(seq(along = y), @@ -559,7 +563,7 @@ train.default <- function(x, y, num_rs <- if(trControl$method != "oob") length(trControl$index) else 1L - if(trControl$method == "boot632") num_rs <- num_rs + 1L + if(trControl$method %in% c("boot632", "optimism_boot", "boot_all")) num_rs <- num_rs + 1L ## Set or check the seeds when needed if(is.null(trControl$seeds) || all(is.na(trControl$seeds))) { seeds <- sample.int(n = 1000000L, size = num_rs * nrow(trainInfo$loop) + 1L) @@ -653,6 +657,9 @@ train.default <- function(x, y, } } } + + ## Remove extra indices + trControl$indexExtra <- NULL ## TODO we used to give resampled results for LOO if(!(trControl$method %in% c("LOOCV", "oob"))) { diff --git a/pkg/caret/R/trainControl.R b/pkg/caret/R/trainControl.R index fc044dc6..14371050 100644 --- a/pkg/caret/R/trainControl.R +++ b/pkg/caret/R/trainControl.R @@ -34,11 +34,22 @@ #' Bengio, 2012). See \url{http://topepo.github.io/caret/random-hyperparameter-search.html} for #' details and an example. #' -#' The \code{"boot632"} method uses the 0.632 estimator presented in Efron -#' (1983), not to be confused with the 0.632+ estimator proposed later by the -#' same author. +#' The supported bootstrap methods are: +#' +#' \itemize{ +#' \item \code{"boot"}: the usual bootstrap. +#' \item \code{"boot632"}: the 0.632 bootstrap estimator (Efron, 1983). +#' \item \code{"optimism_boot"}: the optimism bootstrap estimator +#' (Efron and Tibshirani, 1994). +#' \item \code{"boot_all"}: all of the above (for efficiency, +#' but "boot" will be used for calculations). +#' } +#' +#' The \code{"boot632"} method should not be confused with the 0.632+ +#' estimator proposed later by the same author.
#' #' @param method The resampling method: \code{"boot"}, \code{"boot632"}, +#' \code{"optimism_boot"}, \code{"boot_all"}, #' \code{"cv"}, \code{"repeatedcv"}, \code{"LOOCV"}, \code{"LGOCV"} (for #' repeated training/test splits), \code{"none"} (only fits one model to the #' entire training set), \code{"oob"} (only for random forest, bagged trees, @@ -126,6 +137,9 @@ #' improvement on cross-validation''. Journal of the American Statistical #' Association, 78(382):316-331 #' +#' Efron, B., & Tibshirani, R. J. (1994). ``An introduction to the bootstrap'', +#' pages 249-252. CRC press. +#' #' Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter #' Optimization'', Journal of Machine Learning Research, 13(Feb):281-305 #' diff --git a/pkg/caret/R/workflows.R b/pkg/caret/R/workflows.R index 47d4540e..095aa2eb 100644 --- a/pkg/caret/R/workflows.R +++ b/pkg/caret/R/workflows.R @@ -65,10 +65,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes ## fitting and predicting the full data set. 
resampleIndex <- ctrl$index - if(ctrl$method %in% c("boot632")) + if(ctrl$method %in% c("boot632", "optimism_boot", "boot_all")) { resampleIndex <- c(list("AllData" = rep(0, nrow(x))), resampleIndex) ctrl$indexOut <- c(list("AllData" = rep(0, nrow(x))), ctrl$indexOut) + if(!is.null(ctrl$indexExtra)) ctrl$indexExtra <- c(list("AllData" = NULL), ctrl$indexExtra) } `%op%` <- getOper(ctrl$allowParallel && getDoParWorkers() > 1) keep_pred <- isTRUE(ctrl$savePredictions) || ctrl$savePredictions %in% c("all", "final") @@ -94,6 +95,8 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes holdoutIndex <- modelIndex } + extraIndex <- if(is.null(ctrl$indexExtra)) NULL else ctrl$indexExtra[[iter]] + if(testing) cat("pre-model\n") if(!is.null(info$submodels[[parm]]) && nrow(info$submodels[[parm]]) > 0) { @@ -115,6 +118,7 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes if(testing) print(mod) + predictedExtra <- NULL if(class(mod)[1] != "try-error") { predicted <- try( @@ -153,6 +157,14 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes for(i in seq(along = predicted)) predicted[[i]] <- tmp rm(tmp) } + } else if(!is.null(extraIndex)) { + predictedExtra <- lapply(extraIndex, function(idx) { + try(predictionFunction(method = method, + modelFit = mod$fit, + newdata = x[idx, , drop = FALSE], + preProc = mod$preProc, + param = submod)) + }) } } else { wrn <- paste(colnames(printed[parm,,drop = FALSE]), @@ -258,6 +270,21 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes wts = wts[holdoutIndex], lv = lev, rows = holdoutIndex) + if(!is.null(predictedExtra)) + predictedExtra <- mapply(predictedExtra, extraIndex, + SIMPLIFY = FALSE, USE.NAMES = FALSE, + FUN = function(pred, rows) { + lapply(pred, function(x) { + y <- y[rows] + wts <- wts[rows] + + x <- outcome_conversion(x, lv = lev) + out <- data.frame(pred = x, obs = y, stringsAsFactors = FALSE) + 
if(!is.null(wts)) out$weights <- wts + out$rowIndex <- rows + out + }) + }) if(testing) print(head(predicted)) ## same for the class probabilities @@ -266,12 +293,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes for(k in seq(along = predicted)) predicted[[k]] <- cbind(predicted[[k]], probValues[[k]]) } - if(keep_pred) + if(keep_pred || (ctrl$method == "boot_all" && names(resampleIndex)[iter] == "AllData")) { tmpPred <- predicted for(modIndex in seq(along = tmpPred)) { - tmpPred[[modIndex]]$rowIndex <- holdoutIndex tmpPred[[modIndex]] <- merge(tmpPred[[modIndex]], allParam[modIndex,,drop = FALSE], all = TRUE) @@ -286,6 +312,26 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes lev = lev, model = method) if(testing) print(head(thisResample)) + + if(!is.null(predictedExtra)) { + thisResampleExtra <- lapply(predictedExtra, function(predicted) { + lapply(predicted, + ctrl$summaryFunction, + lev = lev, + model = method) + }) + thisResampleExtra[[1L]] <- lapply(thisResampleExtra[[1L]], function(res) { + names(res) <- paste0(names(res), "Orig") + res + }) + thisResampleExtra[[2L]] <- lapply(thisResampleExtra[[2L]], function(res) { + names(res) <- paste0(names(res), "Boot") + res + }) + thisResampleExtra <- do.call(cbind, lapply(thisResampleExtra, function(x) do.call(rbind, x))) + thisResampleExtra <- cbind(allParam, thisResampleExtra) + } else thisResampleExtra <- NULL + ## for classification, add the cell counts if(length(lev) > 1 && length(lev) <= 50) { @@ -308,7 +354,7 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes if(ctrl$classProbs) tmp <- cbind(tmp, probValues) tmp$rowIndex <- holdoutIndex - if(keep_pred) + if(keep_pred || (ctrl$method == "boot_all" && names(resampleIndex)[iter] == "AllData")) { tmpPred <- tmp tmpPred$rowIndex <- holdoutIndex @@ -328,6 +374,33 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes thisResample <- 
as.data.frame(t(thisResample)) thisResample <- cbind(thisResample, info$loop[parm,,drop = FALSE]) + ## for optimism bootstrap + if(!is.null(predictedExtra)) { + thisResampleExtra <- mapply(predictedExtra, extraIndex, + SIMPLIFY = FALSE, USE.NAMES = FALSE, + FUN = function(predicted, holdoutIndex) { + if(is.factor(y)) predicted <- outcome_conversion(predicted, lv = lev) + tmp <- data.frame(pred = predicted, + obs = y[holdoutIndex], + stringsAsFactors = FALSE) + ## Sometimes the code above does not coerce the first + ## column to be named "pred" so force it + names(tmp)[1] <- "pred" + if(!is.null(wts)) tmp$weights <- wts[holdoutIndex] + if(ctrl$classProbs && nrow(tmp) == nrow(probValues)) tmp <- cbind(tmp, probValues) + tmp$rowIndex <- holdoutIndex + ctrl$summaryFunction(tmp, lev = lev, model = method) + }) + + names(thisResampleExtra[[1L]]) <- paste0(names(thisResampleExtra[[1L]]), "Orig") + names(thisResampleExtra[[2L]]) <- paste0(names(thisResampleExtra[[2L]]), "Boot") + + thisResampleExtra <- unlist(thisResampleExtra, recursive = FALSE) + + thisResampleExtra <- cbind(as.data.frame(t(thisResampleExtra)), info$loop[parm, , drop = FALSE]) + + } else thisResampleExtra <- NULL + } thisResample$Resample <- names(resampleIndex)[iter] @@ -335,12 +408,13 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes names(resampleIndex), iter, FALSE) if(testing) print(thisResample) - list(resamples = thisResample, pred = tmpPred) + list(resamples = thisResample, pred = tmpPred, resamplesExtra = thisResampleExtra) } resamples <- rbind.fill(result[names(result) == "resamples"]) - pred <- if(keep_pred) rbind.fill(result[names(result) == "pred"]) else NULL - if(ctrl$method %in% c("boot632")) + pred <- rbind.fill(result[names(result) == "pred"]) + resamplesExtra <- rbind.fill(result[names(result) == "resamplesExtra"]) + if(ctrl$method %in% c("boot632", "optimism_boot", "boot_all")) { perfNames <- names(resamples) perfNames <- perfNames[!(perfNames %in%
c("Resample", as.character(method$parameters$parameter)))] @@ -355,6 +429,11 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes warning("There were missing values in the apparent performance measures.") } resamples <- subset(resamples, Resample != "AllData") + if(!is.null(pred)) + { + predHat <- subset(pred, Resample == "AllData") + pred <- subset(pred, Resample != "AllData") + } } names(resamples) <- gsub("^\\.", "", names(resamples)) @@ -369,12 +448,36 @@ nominalTrainWorkflow <- function(x, y, wts, info, method, ppOpts, ctrl, lev, tes MeanSD, exclude = gsub("^\\.", "", colnames(info$loop))) - if(ctrl$method %in% c("boot632")) { + if(ctrl$method %in% c("boot632", "boot_all")) { out <- merge(out, apparent) - for(p in seq(along = perfNames)) { - const <- 1-exp(-1) - out[, perfNames[p]] <- (const * out[, perfNames[p]]) + ((1-const) * out[, paste(perfNames[p],"Apparent", sep = "")]) - } + const <- 1 - exp(-1) + sapply(perfNames, function(perfName) { + perfOut <- if(ctrl$method == "boot_all") paste0(perfName, "_632") else perfName + out[, perfOut] <<- (const * out[, perfName]) + ((1-const) * out[, paste(perfName, "Apparent", sep = "")]) + NULL + }) + } + + if(ctrl$method %in% c("optimism_boot", "boot_all")) { + out <- merge(out, apparent) + out <- merge(out, ddply(resamplesExtra[, !grepl("Resample", colnames(resamplesExtra)), drop = FALSE], + colnames(info$loop), + function(df, exclude) { + colMeans(df[, setdiff(colnames(df), exclude), drop = FALSE]) + }, + exclude = colnames(info$loop))) + sapply(perfNames, function(perfName) { + optimism <- out[ , paste0(perfName, "Orig")] - out[ , paste0(perfName, "Boot")] + final_estimate <- out[ , paste0(perfName, "Apparent")] + optimism + ## Remove unnecessary values + out[ , paste0(perfName, "Orig")] <<- NULL + out[ , paste0(perfName, "Boot")] <<- NULL + perfOut <- if(ctrl$method == "boot_all") paste0(perfName, "_OptBoot") else perfName + ## Update estimates + out[ , paste0(perfName, "Optimism")] 
<<- optimism + out[ , perfOut] <<- final_estimate + NULL + }) } list(performance = out, resamples = resamples, predictions = if(keep_pred) pred else NULL) diff --git a/pkg/caret/man/train.Rd b/pkg/caret/man/train.Rd index 177b050f..4d64cef6 100644 --- a/pkg/caret/man/train.Rd +++ b/pkg/caret/man/train.Rd @@ -97,7 +97,7 @@ A list is returned of class \code{train} containing: \item{method tuning parameters.} \item{bestTune }{a data frame with the final parameters.} -\item{call }{the (matched) function call with dots expanded} \item{dots}{a +\item{call}{the (matched) function call with dots expanded} \item{dots}{a list containing any ... values passed to the original call} \item{metric}{a string that specifies what summary metric will be used to select the optimal model.} \item{control}{the list of control parameters.} \item{preProcess diff --git a/pkg/caret/man/trainControl.Rd b/pkg/caret/man/trainControl.Rd index 27acab66..11488239 100644 --- a/pkg/caret/man/trainControl.Rd +++ b/pkg/caret/man/trainControl.Rd @@ -19,6 +19,7 @@ trainControl(method = "boot", number = ifelse(grepl("cv", method), 10, 25), } \arguments{ \item{method}{The resampling method: \code{"boot"}, \code{"boot632"}, +\code{"optimism_boot"}, \code{"boot_all"}, \code{"cv"}, \code{"repeatedcv"}, \code{"LOOCV"}, \code{"LGOCV"} (for repeated training/test splits), \code{"none"} (only fits one model to the entire training set), \code{"oob"} (only for random forest, bagged trees, @@ -163,9 +164,19 @@ The option \code{search = "grid"} uses the default grid search routine. When Bengio, 2012). See \url{http://topepo.github.io/caret/random-hyperparameter-search.html} for details and an example. -The \code{"boot632"} method uses the 0.632 estimator presented in Efron -(1983), not to be confused with the 0.632+ estimator proposed later by the -same author. +The supported bootstrap methods are: + +\itemize{ + \item \code{"boot"}: the usual bootstrap. 
+ \item \code{"boot632"}: the 0.632 bootstrap estimator (Efron, 1983). + \item \code{"optimism_boot"}: the optimism bootstrap estimator + (Efron and Tibshirani, 1994). + \item \code{"boot_all"}: all of the above (for efficiency, + but "boot" will be used for calculations). +} + +The \code{"boot632"} method should not be confused with the 0.632+ +estimator proposed later by the same author. } \examples{ @@ -215,6 +226,9 @@ Efron (1983). ``Estimating the error rate of a prediction rule: improvement on cross-validation''. Journal of the American Statistical Association, 78(382):316-331 +Efron, B., & Tibshirani, R. J. (1994). ``An introduction to the bootstrap'', +pages 249-252. CRC press. + Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter Optimization'', Journal of Machine Learning Research, 13(Feb):281-305