diff --git a/pkg/caret/R/confusionMatrix.R b/pkg/caret/R/confusionMatrix.R index 4c4ab151..bd012d6e 100644 --- a/pkg/caret/R/confusionMatrix.R +++ b/pkg/caret/R/confusionMatrix.R @@ -145,7 +145,7 @@ confusionMatrix.table <- function(data, positive = NULL, prevalence = NULL, ...) overall = overall, byClass = tableStats, dots = list(...)), - class = "confusionMatrix") + class = "confusionMatrix") } @@ -170,17 +170,37 @@ as.matrix.confusionMatrix <- function(x, what = "xtabs", ...) out } -as.table.confusionMatrix <- function(x, ...) x$table +as.table.confusionMatrix <- function(x, ...) x$table confusionMatrix.train <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...) { - if(data$modelType == "Regression") stop("confusion matrices are only valid for classification models") - if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'") - if(data$control$method %in% c("oob", "LOOCV", "none")) stop("cannot compute confusion matrices for leave-one-out, out-of-bag resampling or no resampling") - if(!is.null(data$control$index)) - { + if (inherits(data, "train")) { + if(data$modelType == "Regression") + stop("confusion matrices are only valid for classification models") + if(data$control$method %in% c("oob", "LOOCV")) + stop("cannot compute confusion matrices for leave-one-out or out-of-bag resampling") + if(data$control$method == "none") + return(confusionMatrix(predict(data), data$trainingData$.outcome, dnn = dnn, ...)) + + lev <- levels(data) + + ## get only best tune + names(data$bestTune) <- gsub("^\\.", "", names(data$bestTune)) + resampledCM <- merge(data$bestTune, data$resampledCM) + + } else { + if(is.null(data$resampledCM)) + stop("resampled confusion matrices are not availible") + if(data$control$method %in% c("oob", "LOOCV")) + stop("cannot compute confusion matrices for leave-one-out or out-of-bag resampling") + + lev <- data$obsLevels + resampledCM <- data$resampledCM + } + + 
if(!is.null(data$control$index)) { resampleN <- unlist(lapply(data$control$index, length)) numResamp <- length(resampleN) resampText <- resampName(data) @@ -189,81 +209,57 @@ confusionMatrix.train <- function(data, norm = "overall", dnn = c("Prediction", numResamp <- 0 } - lev <- levels(data) - ## get only best tune - names(data$bestTune) <- gsub("^\\.", "", names(data$bestTune)) - resampledCM <- merge(data$bestTune, data$resampledCM) - counts <- as.matrix(resampledCM[,grep("^cell", colnames(resampledCM))]) - ## normalize by true class? + counts <- as.matrix(resampledCM[ , grep("^\\.?cell", colnames(resampledCM))]) + + ## normalize? + norm <- match.arg(norm, c("none", "overall", "average")) + + if(norm == "none") counts <- matrix(apply(counts, 2, sum), nrow = length(lev)) + else counts <- matrix(apply(counts, 2, mean), nrow = length(lev)) - if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x))) - if(norm == "average") counts <- counts/numResamp - overall <- matrix(apply(counts, 2, mean), nrow = length(lev)) - rownames(overall) <- colnames(overall) <- lev - if(norm != "none") overall <- overall*100 - names(dimnames(overall)) <- dnn + if(norm == "overall") counts <- counts / sum(counts) * 100 + ## names + rownames(counts) <- colnames(counts) <- lev + names(dimnames(counts)) <- dnn - out <- list(table = as.table(overall), + ## out + out <- list(table = as.table(counts), norm = norm, B = length(data$control$index), text = paste(resampText, "Confusion Matrix")) - class(out) <- "confusionMatrix.train" + class(out) <- paste0("confusionMatrix.", class(data)) out } +confusionMatrix.rfe <- confusionMatrix.train +confusionMatrix.sbf <- confusionMatrix.train print.confusionMatrix.train <- function(x, digits = 1, ...) 
{ cat(x$text, "\n") normText <- switch(x$norm, - none = "\n(entries are un-normalized counts)\n", - average = "\n(entries are cell counts per resample)\n", - overall = "\n(entries are percentages of table totals)\n", - byClass = "\n(entries are percentages within the reference class)\n", + none = "\n(entries are un-normalized aggregated counts)\n", + average = "\n(entries are average cell counts across resamples)\n", + overall = "\n(entries are percentual average cell counts across resamples)\n", "") cat(normText, "\n") - if(x$norm == "none" & x$B == 1) print(getFromNamespace("confusionMatrix.table", "caret")(x$table)) else print(round(x$table, digits)) - cat("\n") - invisible(x) -} - -confusionMatrix.rfe <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...) -{ - if(is.null(data$resampledCM)) stop("resampled confusion matrices are not availible") - if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'") - if(data$control$method %in% c("oob", "LOOCV")) stop("cannot compute confusion matrices for leave-one-out and out-of-bag resampling") - if(!is.null(data$control$index)) - { - resampleN <- unlist(lapply(data$control$index, length)) - numResamp <- length(resampleN) - resampText <- resampName(data) + if(x$norm == "none" & x$B == 1) { + print(getFromNamespace("confusionMatrix.table", "caret")(x$table)) } else { - resampText <- "" - numResamp <- 0 + print(round(x$table, digits)) + + out <- cbind("Accuracy (average)", ":", formatC(sum(diag(x$table) / sum(x$table)))) + + dimnames(out) <- list(rep("", nrow(out)), rep("", ncol(out))) + print(out, quote = FALSE) + cat("\n") } - - - resampledCM <- data$resampledCM - counts <- as.matrix(resampledCM[,grep("^\\.cell", colnames(resampledCM))]) - ## normalize by true class? 
- - if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x))) - if(norm == "average") counts <- counts/numResamp - overall <- matrix(apply(counts, 2, mean), nrow = length(data$obsLevels)) - rownames(overall) <- colnames(overall) <- data$obsLevels - overall <- overall*100 - names(dimnames(overall)) <- dnn - - - out <- list(table = overall, - norm = norm, - B = numResamp, - text = paste(resampText, "Confusion Matrix")) - class(out) <- "confusionMatrix.rfe" - out + invisible(x) } +print.confusionMatrix.rfe <- print.confusionMatrix.train +print.confusionMatrix.sbf <- print.confusionMatrix.train resampName <- function(x, numbers = TRUE) { @@ -288,10 +284,10 @@ resampName <- function(x, numbers = TRUE) loocv = "Leave-One-Out Cross-Validation", adaptive_boot = paste("Adaptively Bootstrapped (", numResamp, " reps)", sep = ""), adaptive_cv = paste("Adaptively Cross-Validated (", x$control$number, " fold, repeated ", - x$control$repeats, " times)", sep = ""), + x$control$repeats, " times)", sep = ""), adaptive_lgocv = paste("Adaptive Repeated Train/Test Splits Estimated (", numResamp, " reps, ", - round(x$control$p, 2), "%)", sep = "") - ) + round(x$control$p, 2), "%)", sep = "") + ) } else { out <- switch(tolower(x$control$method), none = "None", @@ -309,45 +305,6 @@ resampName <- function(x, numbers = TRUE) out } -confusionMatrix.sbf <- function(data, norm = "overall", dnn = c("Prediction", "Reference"), ...) 
-{ - if(is.null(data$resampledCM)) stop("resampled confusion matrices are not availible") - if(!norm %in% c("none", "overall", "average")) stop("values for norm should be 'none', 'overall', 'byClass' or 'average'") - if(data$control$method %in% c("oob", "LOOCV")) stop("cannot compute confusion matrices for leave-one-out and out-of-bag resampling") - if(!is.null(data$control$index)) - { - resampleN <- unlist(lapply(data$control$index, length)) - numResamp <- length(resampleN) - resampText <- resampName(data) - } else { - resampText <- "" - numResamp <- 0 - } - - resampledCM <- data$resampledCM - counts <- as.matrix(resampledCM[,grep("^\\.cell", colnames(resampledCM))]) - ## normalize by true class? - - if(norm == "overall") counts <- t(apply(counts, 1, function(x)x/sum(x))) - if(norm == "average") counts <- counts/numResamp - overall <- matrix(apply(counts, 2, mean), nrow = length(data$obsLevels)) - rownames(overall) <- colnames(overall) <- data$obsLevels - overall <- overall*100 - names(dimnames(overall)) <- dnn - - - out <- list(table = overall, - norm = norm, - B = numResamp, - text = paste(resampText, "Confusion Matrix")) - class(out) <- "confusionMatrix.sbf" - out -} - -print.confusionMatrix.rfe <- print.confusionMatrix.train -print.confusionMatrix.sbf <- print.confusionMatrix.train - - mcc <- function(tab, pos = colnames(tab)[1]) { diff --git a/pkg/caret/man/confusionMatrix.train.Rd b/pkg/caret/man/confusionMatrix.train.Rd index d013360e..b2101d68 100644 --- a/pkg/caret/man/confusionMatrix.train.Rd +++ b/pkg/caret/man/confusionMatrix.train.Rd @@ -17,15 +17,15 @@ Using a \code{\link{train}}, \code{\link{rfe}}, \code{\link{sbf}} object, determ } \arguments{ - \item{data}{an object of class \code{\link{train}}, \code{\link{rfe}}, \code{\link{sbf}} that did not use out-of-bag resampling or leave-one-out cross-validation.} - \item{norm}{a character string indicating how the table entries should be normalized. Valid values are "none", "overall" or "average". 
} - \item{dnn}{a character vector of dimnames for the table} + \item{data}{An object of class \code{\link{train}}, \code{\link{rfe}}, \code{\link{sbf}} that did not use out-of-bag resampling or leave-one-out cross-validation.} + \item{norm}{A character string indicating how the table entries should be normalized. Valid values are "none", "overall" or "average". } + \item{dnn}{A character vector of dimnames for the table} \item{\dots}{not used here} } \details{ When \code{\link{train}} is used for tuning a model, it tracks the confusion matrix cell entries for the hold-out samples. These can be aggregated and used for diagnostic purposes. For \code{\link{train}}, the matrix is estimated for the final model tuning parameters determined by \code{\link{train}}. For \code{\link{rfe}}, the matrix is associated with the optimal number of variables. -There are several ways to show the table entries. Using \code{norm = "none"} will show the frequencies of samples on each of the cells (across all resamples). \code{norm = "overall"} first divides the cell entries by the total number of data points in the table, then averages these percentages. \code{norm = "average"} takes the raw, aggregate cell counts across resamples and divides by the number of resamples (i.e. to yield an average count for each cell). +There are several ways to show the table entries. Using \code{norm = "none"} will show the aggregated counts of samples on each of the cells (across all resamples). For \code{norm = "average"}, the average number of cell counts across resamples is computed (this can help evaluate how many holdout samples there were on average). The default is \code{norm = "overall"}, which is equivalent to \code{"average"} but in percentages. 
} \value{ a list of class \code{confusionMatrix.train}, \code{confusionMatrix.rfe} or \code{confusionMatrix.sbf} with elements diff --git a/pkg/caret/man/trainControl.Rd b/pkg/caret/man/trainControl.Rd index 0bd58d0b..f68581bc 100644 --- a/pkg/caret/man/trainControl.Rd +++ b/pkg/caret/man/trainControl.Rd @@ -77,11 +77,15 @@ Using adaptive resampling when \code{method} is either \code{"adaptive_cv"}, \co } The option \code{search = "grid"} uses the default grid search routine. When \code{search = "random"}, a random search procedure is used (Bergstra and Bengio, 2012). See \url{http://topepo.github.io/caret/random.html} for details and an example. + +The \code{"boot632"} method uses the 0.632 estimator presented in Efron (1983), not to be confused with the 0.632+ estimator proposed later by the same author. } \author{Max Kuhn} \references{ +Efron (1983). ``Estimating the error rate of a prediction rule: improvement on cross-validation''. Journal of the American Statistical Association, 78(382):316-331 + Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter Optimization'', Journal of Machine Learning Research, 13(Feb):281-305 Kuhn (2014), ``Futility Analysis in the Cross-Validation of Machine Learning Models'' \url{http://arxiv.org/abs/1405.6974},