From 9f375a1704e413d0806b73ab8891c7fadc39081c Mon Sep 17 00:00:00 2001 From: asardaes Date: Fri, 15 Jan 2016 21:04:23 +0100 Subject: [PATCH] Better seeds Additional safety checks and calling sample.int just once to avoid possible repetition of seeds. --- pkg/caret/R/train.default.R | 25 ++++++++++++++----------- pkg/caret/man/trainControl.Rd | 4 ++-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pkg/caret/R/train.default.R b/pkg/caret/R/train.default.R index 5975d1aa..f8f16d90 100644 --- a/pkg/caret/R/train.default.R +++ b/pkg/caret/R/train.default.R @@ -330,24 +330,27 @@ train.default <- function(x, y, num_rs <- length(trControl$index) - if(trControl$method == "boot632") num_rs <- num_rs + 1 + if(trControl$method == "boot632") num_rs <- num_rs + 1L ## Set or check the seeds when needed - if(is.null(trControl$seeds) | all(is.na(trControl$seeds))) { - seeds <- vector(mode = "list", length = num_rs) - seeds <- lapply(seeds, function(x) sample.int(n = 1000000, size = nrow(trainInfo$loop))) - seeds[[num_rs + 1]] <- sample.int(n = 1000000, size = 1) - trControl$seeds <- seeds + if(is.null(trControl$seeds) || all(is.na(trControl$seeds))) { + seeds <- sample.int(n = 1000000L, size = num_rs * nrow(trainInfo$loop) + 1L) + seeds <- lapply(seq(from = 1L, to = length(seeds), by = nrow(trainInfo$loop)), + function(x) { seeds[x:(x+nrow(trainInfo$loop)-1L)] }) + seeds[[num_rs + 1L]] <- seeds[[num_rs + 1L]][1L] + trControl$seeds <- seeds } else { if(!(length(trControl$seeds) == 1 && is.na(trControl$seeds))) { ## check versus number of tasks numSeeds <- unlist(lapply(trControl$seeds, length)) - badSeed <- (length(trControl$seeds) < num_rs + 1) || - (any(numSeeds[-length(numSeeds)] < nrow(trainInfo$loop))) + badSeed <- (length(trControl$seeds) < num_rs + 1L) || + (any(numSeeds[-length(numSeeds)] < nrow(trainInfo$loop))) || + (numSeeds[length(numSeeds)] < 1L) if(badSeed) stop(paste("Bad seeds: the seed object should be a list of length", - num_rs + 1, "with", + 
num_rs + 1, "with", num_rs, "integer vectors of size", - nrow(trainInfo$loop), "and the last list element having a", - "single integer")) + nrow(trainInfo$loop), "and the last list element having at least a", + "single integer")) + if(any(is.na(unlist(trControl$seeds)))) stop("At least one seed is missing (NA)") } } diff --git a/pkg/caret/man/trainControl.Rd b/pkg/caret/man/trainControl.Rd index 31ff34ce..0bd58d0b 100644 --- a/pkg/caret/man/trainControl.Rd +++ b/pkg/caret/man/trainControl.Rd @@ -52,7 +52,7 @@ trainControl(method = "boot", \item{indexOut}{a list (the same length as \code{index}) that dictates which data are held-out for each resample (as integers). If \code{NULL}, then the unique set of samples not contained in \code{index} is used.} \item{timingSamps}{the number of training set samples that will be used to measure the time for predicting samples (zero indicates that the prediction time should not be estimated.} \item{predictionBounds}{a logical or numeric vector of length 2 (regression only). If logical, the predictions can be constrained to be within the limit of the training set outcomes. For example, a value of \code{c(TRUE, FALSE)} would only constrain the lower end of predictions. If numeric, specific bounds can be used. For example, if \code{c(10, NA)}, values below 10 would be predicted as 10 (with no constraint in the upper side).} - \item{seeds}{an optional set of integers that will be used to set the seed at each resampling iteration. This is useful when the models are run in parallel. A value of \code{NA} will stop the seed from being set within the worker processes while a value of \code{NULL} will set the seeds using a random set of integers. Alternatively, a list can be used. The list should have \code{B+1} elements where \code{B} is the number of resamples. The first \code{B} elements of the list should be vectors of integers of length \code{M} where \code{M} is the number of models being evaluated. 
The last element of the list only needs to be a single integer (for the final model). See the Examples section below and the Details section. } + \item{seeds}{an optional set of integers that will be used to set the seed at each resampling iteration. This is useful when the models are run in parallel. A value of \code{NA} will stop the seed from being set within the worker processes while a value of \code{NULL} will set the seeds using a random set of integers. Alternatively, a list can be used. The list should have \code{B+1} elements where \code{B} is the number of resamples, unless \code{method} is \code{"boot632"}, in which case \code{B} is the number of resamples plus 1. The first \code{B} elements of the list should be vectors of integers of length \code{M} where \code{M} is the number of models being evaluated. The last element of the list only needs to be a single integer (for the final model). See the Examples section below and the Details section. } \item{adaptive}{a list used when \code{method} is \code{"adaptive_cv"}, \code{"adaptive_boot"} or \code{"adaptive_LGOCV"}. See Details below. } \item{trim}{a logical. If \code{TRUE} the final model in \code{object\$finalModel} may have some components of the object removed so reduce the size of the saved object. The \code{predict} method will still work, but some other features of the model may not work. \code{trim}ing will occur only for models where this feature has been implemented. } \item{allowParallel}{if a parallel backend is loaded and available, should the function use it?} @@ -76,7 +76,7 @@ Using adaptive resampling when \code{method} is either \code{"adaptive_cv"}, \co \item \code{complete}: if a single parameter value is found before the end of resampling, should the full set of resamples be computed for that parameter. ) } -The option \code{search = "grid"} uses the default grid search routine. When \code{search = "random"}, a random search procedure is used (Bergstra and Bengio, 2012). 
See \url{http://topepo.github.io/caret/random.html} for details and an example. +The option \code{search = "grid"} uses the default grid search routine. When \code{search = "random"}, a random search procedure is used (Bergstra and Bengio, 2012). See \url{http://topepo.github.io/caret/random.html} for details and an example. } \author{Max Kuhn}