diff --git a/.travis.yml b/.travis.yml index d88e179c..6d6de5b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,7 +44,7 @@ before_install: - ./travis-tool.sh r_binary_install nnet - ./travis-tool.sh r_binary_install party - ./travis-tool.sh r_binary_install pls - - ./travis-tool.sh r_binary_install pROC + - ./travis-tool.sh r_install ModelMetrics - ./travis-tool.sh r_binary_install proxy - ./travis-tool.sh r_binary_install randomForest - ./travis-tool.sh r_binary_install RANN diff --git a/pkg/caret/DESCRIPTION b/pkg/caret/DESCRIPTION index 03b460f8..d788452f 100644 --- a/pkg/caret/DESCRIPTION +++ b/pkg/caret/DESCRIPTION @@ -5,7 +5,7 @@ Title: Classification and Regression Training Author: Max Kuhn. Contributions from Jed Wing, Steve Weston, Andre Williams, Chris Keefer, Allan Engelhardt, Tony Cooper, Zachary Mayer, Brenton Kenkel, the R Core Team, Michael Benesty, Reynald Lescarbeau, - Andrew Ziem, Luca Scrucca, Yuan Tang, and Can Candan. + Andrew Ziem, Luca Scrucca, Yuan Tang, Can Candan, and Tyler Hunt. Description: Misc functions for training and plotting classification and regression models. Maintainer: Max Kuhn @@ -20,6 +20,7 @@ Imports: foreach, methods, plyr, + ModelMetrics (>= 1.1.0), nlme, reshape2, stats, @@ -44,7 +45,6 @@ Suggests: nnet, party (>= 0.9-99992), pls, - pROC (>= 1.8), proxy, randomForest, RANN, diff --git a/pkg/caret/NAMESPACE b/pkg/caret/NAMESPACE index be73a939..d44dd11f 100644 --- a/pkg/caret/NAMESPACE +++ b/pkg/caret/NAMESPACE @@ -1,24 +1,27 @@ useDynLib(caret) +importFrom(ModelMetrics, auc) import(foreach, methods, plyr, reshape2, ggplot2, lattice, nlme) importFrom(car, powerTransform, yjPower) importFrom(grDevices, extendrange) -importFrom(stats, .checkMFClasses, .getXlevels, aggregate, anova, - approx, as.formula, binom.test, complete.cases, contrasts, - cor, cov, delete.response, dist, - fitted.values, loess, mahalanobis, - mcnemar.test, median, model.frame, model.matrix, - model.response, model.weights, na.fail, na.pass, optim, - predict, qnorm, quantile, rbinom, reshape, resid, - residuals, rnorm, runif, sd, t.test, terms, +importFrom(stats, .checkMFClasses, .getXlevels, aggregate, anova, + approx, as.formula, binom.test, complete.cases, contrasts, + cor, cov, delete.response, dist, + fitted.values, loess, mahalanobis, + mcnemar.test, median, model.frame, model.matrix, + model.response, model.weights, na.fail, na.pass, optim, + predict, qnorm, quantile, rbinom, reshape, resid, + residuals, rnorm, runif, sd, t.test, terms, toeplitz, var, na.omit, p.adjust, fitted, prcomp, hclust, - lm, model.extract, pt, update, binomial) -importFrom(stats4, coef) -importFrom(utils, capture.output, getFromNamespace, head, - install.packages, installed.packages, object.size, flush.console, menu, stack) + lm, model.extract, pt, update, binomial) +importFrom(stats4, coef) +importFrom(utils, capture.output, getFromNamespace, head, + install.packages, installed.packages, object.size, flush.console, menu, stack, + modifyList, combn + ) export(anovaScores, as.data.frame.resamples, - as.matrix.resamples, + as.matrix.resamples, avNNet, avNNet.default, bag, @@ -73,7 +76,7 @@ export(anovaScores, extractProb, F_meas, F_meas.default, - F_meas.table, + F_meas.table, featurePlot, filterVarImp, findCorrelation, @@ -87,7 +90,7 @@ export(anovaScores, gafs_spCrossover, gafs_raMutation, gafs, - gafs.default, + gafs.default, gafsControl, gamFormula, gamFuncs, @@ -143,7 +146,7 @@ export(anovaScores, nullModel, nullModel.default, oneSE, - panel.calibration, + panel.calibration, panel.lift, panel.lift2, panel.needle, @@ -196,7 +199,7 @@ export(anovaScores, R2, recall, recall.default, - recall.table, + recall.table, resampleHist, resamples, resamples.default, @@ -207,16 +210,16 @@ export(anovaScores, rfeControl, rfeIter, rfFuncs, - rfGA, + rfGA, rfSA, rfSBF, rfStats, RMSE, safs_initial, - safs_perturb, + safs_perturb, safs_prob, safs, - safs.default, + safs.default, safsControl, sbf, sbf.default, @@ -339,7 +342,7 @@ S3method(varImp, nnet) S3method(varImp, glmnet) S3method(varImp, gam) S3method(varImp, gafs) -S3method(varImp, safs) +S3method(varImp, safs) S3method(densityplot, train) S3method(histogram, train) @@ -377,7 +380,7 @@ S3method(plot, prcomp.resamples) S3method(plot, lift) S3method(plot, calibration) S3method(plot, gafs) -S3method(plot, safs) +S3method(plot, safs) S3method(confusionMatrix, train) S3method(confusionMatrix, rfe) @@ -418,7 +421,7 @@ S3method(print, lift) S3method(print, calibration) S3method(print, expoTrans) S3method(print, gafs) -S3method(print, safs) +S3method(print, safs) S3method(predict, plsda) S3method(predict, splsda) @@ -441,7 +444,7 @@ S3method(predict, dummyVars) S3method(predict, BoxCoxTrans) S3method(predict, expoTrans) S3method(predict, gafs) -S3method(predict, safs) +S3method(predict, safs) S3method(summary, bagEarth) S3method(summary, bagFDA) @@ -459,7 +462,7 @@ S3method(predictors, default) S3method(predictors, rfe) S3method(predictors, sbf) S3method(predictors, gafs) -S3method(predictors, safs) +S3method(predictors, safs) S3method(confusionMatrix, table) @@ -492,7 +495,7 @@ S3method(summary, diff.resamples) S3method(update, train) S3method(update, rfe) S3method(update, gafs) -S3method(update, safs) +S3method(update, safs) S3method(fitted, train) S3method(residuals, train) @@ -510,7 +513,7 @@ S3method(oob_pred, sbf) S3method(oob_pred, list) S3method(gafs, default) -S3method(safs, default) +S3method(safs, default) S3method(trim, train) diff --git a/pkg/caret/R/aaa.R b/pkg/caret/R/aaa.R index 8512d8e1..97375fd0 100644 --- a/pkg/caret/R/aaa.R +++ b/pkg/caret/R/aaa.R @@ -20,25 +20,25 @@ ################################################################### if(getRversion() >= "2.15.1"){ - + utils::globalVariables(c('Metric', 'Model')) - - + + ## densityplot(~ values|Metric, data = plotData, groups = ind, ## xlab = "", ...) - + utils::globalVariables(c('ind')) - + ## avPerf <- ddply(subset(results, Metric == metric[1] & X2 == "Estimate"), ## .(Model), ## function(x) c(Median = median(x$value, na.rm = TRUE))) - + utils::globalVariables(c('X2')) - + ## x[[i]]$resample <- subset(x[[i]]$resample, Variables == x[[i]]$bestSubset) - + utils::globalVariables(c('Variables')) - + ## calibCalc: no visible binding for global variable 'obs' ## calibCalc: no visible binding for global variable 'bin' ## @@ -47,9 +47,9 @@ if(getRversion() >= "2.15.1"){ ## binData <- data.frame(prob = x$calibProbVar, ## bin = cut(x$calibProbVar, (0:cuts)/cuts, include.lowest = TRUE), ## class = x$calibClassVar) - + utils::globalVariables(c('obs', 'bin')) - + ## ## checkConditionalX: no visible binding for global variable '.outcome' ## checkConditionalX <- function(x, y) @@ -57,9 +57,9 @@ if(getRversion() >= "2.15.1"){ ## x$.outcome <- y ## unique(unlist(dlply(x, .(.outcome), zeroVar))) ## } - + utils::globalVariables(c('.outcome')) - + ## classLevels.splsda: no visible global function definition for 'ilevels' ## ## classLevels.splsda <- function(x, ...) @@ -68,9 +68,9 @@ if(getRversion() >= "2.15.1"){ ## ## same class name, but this works for either ## ilevels(x$y) ## } - + utils::globalVariables(c('ilevels')) - + ## looRfeWorkflow: no visible binding for global variable 'iter' ## looSbfWorkflow: no visible binding for global variable 'iter' ## looTrainWorkflow: no visible binding for global variable 'parm' @@ -93,9 +93,9 @@ if(getRversion() >= "2.15.1"){ ## .errorhandling = "stop") %dopar% ## { ## - + utils::globalVariables(c('iter', 'parm', 'method', 'Resample', 'dat')) - + ## tuneScheme: no visible binding for global variable '.alpha' ## tuneScheme: no visible binding for global variable '.phi' ## tuneScheme: no visible binding for global variable '.lambda' @@ -103,9 +103,9 @@ if(getRversion() >= "2.15.1"){ ## seqParam[[i]] <- data.frame(.lambda = subset(grid, ## subset = .phi == loop$.phi[i] & ## .lambda < loop$.lambda[i])$.lambda) - + utils::globalVariables(c('.alpha', '.phi', '.lambda')) - + ## createGrid : somDims: no visible binding for global variable '.xdim' ## createGrid : somDims: no visible binding for global variable '.ydim' ## createGrid : lvqGrid: no visible binding for global variable '.k' @@ -114,9 +114,9 @@ if(getRversion() >= "2.15.1"){ ## out <- expand.grid(.xdim = 1:x, .ydim = 2:(x+1), ## .xweight = seq(.5, .9, length = len)) ## - + utils::globalVariables(c('.xdim', '.ydim', '.k', '.size')) - + ## createModel: possible error in rda(trainX, trainY, gamma = ## tuneValue$.gamma, lambda = tuneValue$.lambda, ...): unused ## argument(s) (gamma = tuneValue$.gamma, lambda = tuneValue$.lambda) @@ -144,54 +144,54 @@ if(getRversion() >= "2.15.1"){ ## ## $lambda ## [1] NA - + ## predictionFunction: no visible binding for global variable '.alpha' ## ## delta <- subset(param, .alpha == uniqueA[i])$.delta ## - + utils::globalVariables(c('.alpha')) - + ## predictors.gbm: no visible binding for global variable 'rel.inf' ## predictors.sda: no visible binding for global variable 'varIndex' ## predictors.smda: no visible binding for global variable 'varIndex' ## ## varUsed <- as.character(subset(relImp, rel.inf != 0)$var) - + utils::globalVariables(c('rel.inf', 'varIndex')) - + ## plotClassProbs: no visible binding for global variable 'Observed' ## ## out <- densityplot(form, data = stackProbs, groups = Observed, ...) - + utils::globalVariables(c('Observed')) - + ## plot.train: no visible binding for global variable 'parameter' ## ## paramLabs <- subset(modelInfo, parameter %in% params)$label - + utils::globalVariables(c('parameter')) - + ## plot.rfe: no visible binding for global variable 'Selected' ## ## out <- xyplot(plotForm, data = results, groups = Selected, panel = panel.profile, ...) - + utils::globalVariables(c('Selected')) - + ## icr.formula: no visible binding for global variable 'thresh' ## ## res <- icr.default(x, y, weights = w, thresh = thresh, ...) - + utils::globalVariables(c('thresh', 'probValues', 'min_prob', 'groups', 'trainData', 'j', 'x', '.B')) - + utils::globalVariables(c('model_id', 'player1', 'player2', 'playa', 'win1', 'win2', 'name')) - + utils::globalVariables(c('object', 'Iter', 'lvls', 'Mean', 'Estimate')) - - + + ## parse_sampling: no visible binding for global variable 'sampling_methods' utils::globalVariables(c('sampling_methods')) - + ## ggplot.calibration: no visible binding for global variable 'midpoint' ## ggplot.calibration: no visible binding for global variable 'Percent' ## ggplot.calibration: no visible binding for global variable 'Lower' @@ -206,10 +206,10 @@ altTrainWorkflow <- function(x) x best <- function(x, metric, maximize) { - + bestIter <- if(maximize) which.max(x[,metric]) else which.min(x[,metric]) - + bestIter } @@ -221,14 +221,15 @@ defaultSummary <- function(data, lev = NULL, model = NULL) twoClassSummary <- function (data, lev = NULL, model = NULL) { - if(length(levels(data$obs)) > 2) - stop(paste("Your outcome has", length(levels(data$obs)), + lvls <- levels(data$obs) + if(length(lvls) > 2) + stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate.")) - requireNamespaceQuietStop('pROC') - if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))) + requireNamespaceQuietStop('ModelMetrics') + if (!all(levels(data[, "pred"]) == lvls)) stop("levels of observed and predicted data do not match") - rocObject <- try(pROC::roc(data$obs, data[, lev[1]], direction = ">"), silent = TRUE) - rocAUC <- if(class(rocObject)[1] == "try-error") NA else rocObject$auc + data$y = as.numeric(data$obs == lvls[2]) + rocAUC <- ModelMetrics:::auc(ifelse(data$obs == lev[2], 0, 1), data[, lvls[1]]) out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2])) @@ -242,41 +243,38 @@ mnLogLoss <- function(data, lev = NULL, model = NULL){ stop("'data' should have columns consistent with 'lev'") if(!all(sort(lev) %in% sort(levels(data$obs)))) stop("'data$obs' should have levels consistent with 'lev'") - eps <- 1e-15 - probs <- as.matrix(data[, lev, drop = FALSE]) - probs[probs > 1 - eps] <- 1 - eps - probs[probs < eps] <- eps - inds <- match(data$obs, colnames(probs)) - probs <- probs[cbind(seq_len(nrow(probs)), inds)] - c(logLoss = -mean(log(probs), na.rm = TRUE)) + + dataComplete <- data[complete.cases(data),] + probs <- as.matrix(dataComplete[, lev, drop = FALSE]) + + inds <- match(dataComplete$obs, colnames(probs)) + ModelMetrics::mlogLoss(dataComplete$obs, probs) } multiClassSummary <- function (data, lev = NULL, model = NULL){ #Check data - if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))) + if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))) stop("levels of observed and predicted data do not match") has_class_probs <- all(lev %in% colnames(data)) if(has_class_probs) { ## Overall multinomial loss lloss <- mnLogLoss(data = data, lev = lev, model = model) - requireNamespaceQuietStop("pROC") + requireNamespaceQuietStop("ModelMetrics") #Calculate custom one-vs-all ROC curves for each class - prob_stats <- lapply(levels(data[, "pred"]), - function(class){ + prob_stats <- lapply(levels(data[, "pred"]), + function(x){ #Grab one-vs-all data for the class - obs <- ifelse(data[, "obs"] == class, 1, 0) - prob <- data[,class] - rocObject <- try(pROC::roc(obs, data[,class], direction = "<"), silent = TRUE) - prob_stats <- if (class(rocObject)[1] == "try-error") NA else rocObject$auc - names(prob_stats) <- c('ROC') - return(prob_stats) + obs <- ifelse(data[, "obs"] == x, 1, 0) + prob <- data[,x] + AUCs <- try(ModelMetrics::auc(obs, data[,x]), silent = TRUE) + return(AUCs) }) roc_stats <- mean(unlist(prob_stats)) } - + #Calculate confusion matrix-based statistics CM <- confusionMatrix(data[, "pred"], data[, "obs"]) - + #Aggregate and average class-wise stats #Todo: add weights # RES: support two classes here as well @@ -287,32 +285,32 @@ multiClassSummary <- function (data, lev = NULL, model = NULL){ class_stats <- colMeans(CM$byClass) names(class_stats) <- paste("Mean", names(class_stats)) } - + # Aggregate overall stats - overall_stats <- if(has_class_probs) - c(CM$overall, lloss, ROC = roc_stats) else CM$overall - if (length(levels(data[, "pred"])) > 2) - names(overall_stats)[names(overall_stats) == "ROC"] <- "Mean_ROC" - - - # Combine overall with class-wise stats and remove some stats we don't want + overall_stats <- if(has_class_probs) + c(CM$overall, logLoss = lloss, ROC = roc_stats) else CM$overall + if (length(levels(data[, "pred"])) > 2) + names(overall_stats)[names(overall_stats) == "ROC"] <- "Mean_AUC" + + + # Combine overall with class-wise stats and remove some stats we don't want stats <- c(overall_stats, class_stats) stats <- stats[! names(stats) %in% c('AccuracyNull', "AccuracyLower", "AccuracyUpper", - "AccuracyPValue", "McnemarPValue", + "AccuracyPValue", "McnemarPValue", 'Mean Prevalence', 'Mean Detection Prevalence')] - + # Clean names names(stats) <- gsub('[[:blank:]]+', '_', names(stats)) - + # Change name ordering to place most useful first # May want to remove some of these eventually - stat_list <- c("Accuracy", "Kappa", "Mean_Sensitivity", "Mean_Specificity", + stat_list <- c("Accuracy", "Kappa", "Mean_Sensitivity", "Mean_Specificity", "Mean_Pos_Pred_Value", "Mean_Neg_Pred_Value", "Mean_Detection_Rate", "Mean_Balanced_Accuracy") - if(has_class_probs) stat_list <- c("logLoss", "Mean_ROC", stat_list) + if(has_class_probs) stat_list <- c("logLoss", "Mean_AUC", stat_list) if (length(levels(data[, "pred"])) == 2) stat_list <- gsub("^Mean_", "", stat_list) - + stats <- stats[c(stat_list)] - + return(stats) } diff --git a/pkg/caret/R/aucRoc.R b/pkg/caret/R/aucRoc.R deleted file mode 100644 index 2820e846..00000000 --- a/pkg/caret/R/aucRoc.R +++ /dev/null @@ -1,15 +0,0 @@ -aucRoc <- function(object) -{ - warning("This function is deprecated a of 1/3/12. The computations now utilize the pROC package. This function will be removed in a few releases.") - - sens <- object[, "sensitivity"] - omspec <- 1 - object[, "specificity"] - newOrder <- order(omspec) - sens <- sens[newOrder] - omspec <- omspec[newOrder] - - rocArea <- sum(.5 *diff(omspec) * (sens[-1] + sens[-length(sens)])) - rocArea <- max(rocArea, 1 - rocArea) - rocArea -} - diff --git a/pkg/caret/R/confusionMatrix.R b/pkg/caret/R/confusionMatrix.R index a5ced4cc..308f6fb4 100644 --- a/pkg/caret/R/confusionMatrix.R +++ b/pkg/caret/R/confusionMatrix.R @@ -352,19 +352,3 @@ resampName <- function(x, numbers = TRUE){ out } - -mcc <- function(tab, pos = colnames(tab)[1]){ - if(nrow(tab) != 2 | ncol(tab) != 2) stop("A 2x2 table is needed") - neg <- colnames(tab)[colnames(tab) != pos] - tp <- tab[pos, pos] - tn <- tab[neg, neg] - fp <- tab[pos,neg] - fn <- tab[neg, pos] - d1 <- tp + fp - d2 <- tp + fn - d3 <- tn + fp - d4 <- tn + fn - if(d1 == 0 | d2 == 0 | d3 == 0 | d4 == 0) return(0) - ((tp * tn) - (fp * fn))/sqrt(d1*d2*d3*d4) -} - diff --git a/pkg/caret/R/filterVarImp.R b/pkg/caret/R/filterVarImp.R index 55dc2aa4..0bb82c08 100644 --- a/pkg/caret/R/filterVarImp.R +++ b/pkg/caret/R/filterVarImp.R @@ -1,134 +1,41 @@ -## todo start using foreach here - -oldfilterVarImp <- function(x, y, nonpara = FALSE, ...) -{ - { - notNumber <- unlist(lapply(x, function(x) !is.numeric(x))) - if(any(notNumber)) - { - for(i in which(notNumber)) x[,i] <- as.numeric(x[,i]) - } - } - - if(is.factor(y)) - { - classLevels <- levels(y) - - outStat <- matrix(NA, nrow = dim(x)[2], ncol = length(classLevels)) - for(i in seq(along = classLevels)) - { - otherLevels <- classLevels[classLevels != classLevels[i]] - - for(k in seq(along = otherLevels)) - { - tmpSubset <- as.character(y) %in% c(classLevels[i], otherLevels[k]) - tmpY <- factor(as.character(y)[tmpSubset]) - tmpX <- x[tmpSubset,] - - rocAuc <- apply( - tmpX, - 2, - function(x, class, pos) - { - isMissing <- is.na(x) | is.na(class) - if(any(isMissing)) - { - x <- x[!isMissing] - class <- class[!isMissing] - } - outResults <- if(length(unique(x)) > 200) roc(x, class = class, positive = pos) - else roc(x, class = class, dataGrid = FALSE, positive = pos) - aucRoc(outResults) - }, - class = tmpY, - pos = classLevels[i]) - outStat[, i] <- pmax(outStat[, i], rocAuc, na.rm = TRUE) - } - if(i ==1 & length(classLevels) == 2) - { - outStat[, 2] <- outStat[, 1] - break() - } - } - colnames(outStat) <- classLevels - rownames(outStat) <- dimnames(x)[[2]] - outStat <- data.frame(outStat) - } else { - paraFoo <- function(data, y) abs(coef(summary(lm(y ~ data, na.action = na.omit)))[2, "t value"]) - nonparaFoo <- function(x, y, ...) - { - meanMod <- sum((y - mean(y, rm.na = TRUE))^2) - nzv <- nearZeroVar(x, saveMetrics = TRUE) - - if(nzv$zeroVar) return(NA) - if(nzv$percentUnique < 20) - { - regMod <- lm(y~x, na.action = na.omit, ...) - } else { - regMod <- try(loess(y~x, na.action = na.omit, ...), silent = TRUE) - - if(class(regMod) == "try-error" | any(is.nan(regMod$residuals))) try(regMod <- lm(y~x, ...)) - if(class(regMod) == "try-error") return(NA) - } - - pR2 <- 1 - (sum(resid(regMod)^2)/meanMod) - if(pR2 < 0) pR2 <- 0 - pR2 - } - - testFunc <- if(nonpara) nonparaFoo else paraFoo - - outStat <- apply(x, 2, testFunc, y = y) - outStat <- data.frame(Overall = outStat) - } - outStat +rocPerCol <- function(dat, cls){ + auc(cls, dat) } - -rocPerCol <- function(dat, cls) { - loadNamespace("pROC") - pROC::roc(cls, dat, direction = "<")$auc +asNumeric <- function(data){ + fc <- sapply(data, is.factor) + modifyList(data, lapply(data[, fc], as.numeric)) } -filterVarImp <- function(x, y, nonpara = FALSE, ...) -{ - { - notNumber <- unlist(lapply(x, function(x) !is.numeric(x))) - if(any(notNumber)) - { - for(i in which(notNumber)) x[,i] <- as.numeric(x[,i]) - } - } - - if(is.factor(y)) - { +filterVarImp <- function(x, y, nonpara = FALSE, ...){ + # converting factors to numeric + notNumber <- sapply(x, function(x) !is.numeric(x)) + x = asNumeric(x) + + if(is.factor(y)){ classLevels <- levels(y) k <- length(classLevels) - - if(k > 2) - { - counter <- 1 - classIndex <- vector(mode = "list", length = k) - tmpStat <- matrix(NA, nrow = ncol(x), ncol = choose(k, 2)) - for(i in 1:k) - { - for(j in i:k) - { - if(i != j) - { - classIndex[[i]] <- c(classIndex[[i]], counter) - classIndex[[j]] <- c(classIndex[[j]], counter) - index <- which(y %in% c(classLevels[i], classLevels[j])) - tmpX <- x[index,,drop = FALSE] - tmpY <- factor(as.character(y[index]), levels = c(classLevels[i], classLevels[j])) - tmpStat[,counter] <- apply(tmpX, 2, rocPerCol, cls = tmpY) - counter <- counter + 1 - } - } - } - outStat <- matrix(NA, ncol(x), k) - for(i in 1:k) outStat[,i] <- apply(tmpStat[,classIndex[[i]]], 1, max) + + if(k > 2){ + + Combs <- combn(classLevels, 2) + CombsN <- combn(1:k, 2) + + lStat <- lapply(1:ncol(Combs), FUN = function(cc){ + yLevs <- as.character(y) %in% Combs[,cc] + tmpX <- x[yLevs,] + tmpY <- as.numeric(y[yLevs] == Combs[,cc][2]) + apply(tmpX, 2, rocPerCol, cls = tmpY) + }) + Stat = do.call("cbind", lStat) + + loutStat <- lapply(1:k, function(j){ + apply(Stat[,CombsN[,j]], 1, max) + }) + + outStat = do.call("cbind", loutStat) + } else { tmp <- apply(x, 2, rocPerCol, cls = y) outStat <- cbind(tmp, tmp) @@ -145,18 +52,18 @@ filterVarImp <- function(x, y, nonpara = FALSE, ...) { meanMod <- sum((y - mean(y, rm.na = TRUE))^2) nzv <- nearZeroVar(x, saveMetrics = TRUE) - + if(nzv$zeroVar) return(NA) if(nzv$percentUnique < 20) { regMod <- lm(y~x, na.action = na.omit, ...) } else { regMod <- try(loess(y~x, na.action = na.omit, ...), silent = TRUE) - + if(class(regMod) == "try-error" | any(is.nan(regMod$residuals))) try(regMod <- lm(y~x, ...)) if(class(regMod) == "try-error") return(NA) } - + pR2 <- 1 - (sum(resid(regMod)^2)/meanMod) if(pR2 < 0) pR2 <- 0 pR2 @@ -164,7 +71,7 @@ filterVarImp <- function(x, y, nonpara = FALSE, ...) testFunc <- if(nonpara) nonparaFoo else paraFoo - outStat <- apply(x, 2, testFunc, y = y) + outStat <- apply(x, 2, testFunc, y = y) outStat <- data.frame(Overall = outStat) } outStat diff --git a/pkg/caret/R/roc.R b/pkg/caret/R/roc.R deleted file mode 100644 index 104a67d1..00000000 --- a/pkg/caret/R/roc.R +++ /dev/null @@ -1,21 +0,0 @@ -roc <- function(data, class, dataGrid = TRUE, gridLength = 100, positive = levels(class)[1]) -{ - warning("This function is deprecated a of 1/3/12. The computations now utilize the pROC package. This function will be removed in a few releases.") - - if(!is.character(positive) | length(positive) != 1) stop("positive argument should be a single character value") - - if(!(positive %in% levels(class))) stop("wrong level specified") - if(length(levels(class)) != 2) stop("wrong number of levels") - if(dataGrid) cutoffDF <- data.frame(value = sort(unique(data))) - else cutoffDF <- data.frame(value = seq( - from = min(data, na.rm = TRUE), - to = max(data, na.rm = TRUE), - length = gridLength)) - numCuts <- dim(cutoffDF)[1] - out <- matrix(NA, ncol = 3, nrow = numCuts + 1) - - out[2:(numCuts + 1), ] <- t(apply(cutoffDF, 1, rocPoint, x = data, y = class, positive = positive)) - out[1, ] <- c(NA, 1, 0) - colnames(out) <- c("cutoff", "sensitivity", "specificity") - out -} diff --git a/pkg/caret/R/rocPoint.R b/pkg/caret/R/rocPoint.R deleted file mode 100644 index c4dca43f..00000000 --- a/pkg/caret/R/rocPoint.R +++ /dev/null @@ -1,19 +0,0 @@ -rocPoint <- function(cutoff, x, y, positive) -{ - warning("This function is deprecated a of 1/3/12. The computations now utilize the pROC package. This function will be removed in a few releases.") - classLevels <- levels(y) - negative <- classLevels[positive != classLevels] - newClass <- factor( - ifelse( - x <= cutoff, - negative, - positive), - levels = classLevels) - out <- c( - cutoff, - sensitivity(newClass, y, positive), - specificity(newClass, y, negative)) - names(out) <- c("cutoff", "sensitivity", "specificity") - out -} - diff --git a/pkg/caret/man/caret-internal.Rd b/pkg/caret/man/caret-internal.Rd index 0d3406dc..3b721c82 100644 --- a/pkg/caret/man/caret-internal.Rd +++ b/pkg/caret/man/caret-internal.Rd @@ -33,7 +33,6 @@ MeanSD(x, exclude = NULL) sortImp(object, top) resampleWrapper(x, ind) caretTheme() -rocPoint(cutoff, x, y, positive) ipredStats(x) rfStats(x) bagEarthStats(x) diff --git a/pkg/caret/tests/testthat/test_glmnet_varImp.R b/pkg/caret/tests/testthat/test_glmnet_varImp.R index cef38045..18331e3e 100644 --- a/pkg/caret/tests/testthat/test_glmnet_varImp.R +++ b/pkg/caret/tests/testthat/test_glmnet_varImp.R @@ -8,17 +8,17 @@ test_that('glmnet varImp returns non-negative values', { skip_if_not_installed('glmnet') set.seed(1) dat <- SLC14_1(200) - + reg <- train(y ~ ., data = dat, method = "glmnet", tuneGrid = data.frame(lambda = .1, alpha = .5), trControl = trainControl(method = "none")) - + # this checks that some coefficients are negative coefs <- predict(reg$finalModel, s=0.1, type="coef") - expect_less_than(0, sum(0 > coefs)) + expect_lt(0, sum(0 > coefs)) # now check that all elements of varImp are nonnegative, # in spite of negative coefficients vis <- varImp(reg, s=0.1, scale=F)$importance - expect_equal(0, sum(0 > vis)) + expect_true(all(vis >= 0)) }) diff --git a/pkg/caret/tests/testthat/test_mnLogLoss.R b/pkg/caret/tests/testthat/test_mnLogLoss.R index def12606..ae19d1f6 100644 --- a/pkg/caret/tests/testthat/test_mnLogLoss.R +++ b/pkg/caret/tests/testthat/test_mnLogLoss.R @@ -1,7 +1,5 @@ context('mnLogLoss') -eps <- 1e-15 - classes <- LETTERS[1:3] test_dat1 <- data.frame(obs = c("A", "A", "A", "B", "B", "C"), @@ -10,24 +8,21 @@ test_dat1 <- data.frame(obs = c("A", "A", "A", "B", "B", "C"), B = c(0, .05, .29, .8, .6, .3), C = c(0, .15, .20, .1, .2, .4)) -expected1 <- log(1-eps) + log(.8) + log(.51) + log(.8) + log(.6) + log(.4) -expected1 <- c(logLoss = -expected1/nrow(test_dat1)) -result1 <- mnLogLoss(test_dat1, lev = classes) +test_that("Multiclass logloss returns expected values", { + result1 <- mnLogLoss(test_dat1, classes) -test_dat2 <- test_dat1 -test_dat2$A[1] <- NA + test_dat2 <- test_dat1 + test_dat2$A[1] <- NA + result2 <- mnLogLoss(test_dat2, classes) -expected2 <- log(.8) + log(.51) + log(.8) + log(.6) + log(.4) -expected2 <- c(logLoss = -expected2/sum(complete.cases(test_dat2))) -result2 <- mnLogLoss(test_dat2, lev = classes) + test_dat3 <- test_dat1 + test_dat3 <- test_dat3[, rev(1:5)] + result3 <- mnLogLoss(test_dat3, classes) -test_dat3 <- test_dat1 -test_dat3 <- test_dat3[, rev(1:5)] -expected3 <- expected1 -result3 <- mnLogLoss(test_dat3, lev = classes[c(2, 3, 1)]) + expect_equal(result1, 0.424458, tolerance = .000001) + expect_equal(result2, 0.5093496, tolerance = .000001) + expect_equal(result3, 0.424458, tolerance = .000001) -expect_equal(result1, expected1) -expect_equal(result2, expected2) -expect_equal(result3, expected3) +}) diff --git a/pkg/caret/tests/testthat/test_models_bagEarth.R b/pkg/caret/tests/testthat/test_models_bagEarth.R index 7f8b1f6c..63489e41 100644 --- a/pkg/caret/tests/testthat/test_models_bagEarth.R +++ b/pkg/caret/tests/testthat/test_models_bagEarth.R @@ -2,6 +2,7 @@ # such as the bagEarth() not returning the right kind of object, that one of # the functions (bagEarth, format, predict) crash during normal usage, or that # bagEarth cannot model a simplistic kind of linear equation. +context("earth") test_that('bagEarth simple regression', { skip_on_cran() data <- data.frame(X = 1:100) diff --git a/pkg/caret/tests/testthat/test_pROC_direction.R b/pkg/caret/tests/testthat/test_pROC_direction.R deleted file mode 100644 index 8b0a7d8f..00000000 --- a/pkg/caret/tests/testthat/test_pROC_direction.R +++ /dev/null @@ -1,24 +0,0 @@ -library(caret) - -context('Testing pROC direction') - -test_that('rocPerCol returns AUC < 0.5 with direction = "<"', { - #skip_on_cran() - skip_if_not_installed('pROC') - - set.seed(42) - dat <- twoClassSim(200, linearVars = 1) - - auto.auc <- as.numeric(pROC::roc(dat$Class, dat$Linear1, direction = "auto")$auc) - fixed.auc <- as.numeric(pROC::roc(dat$Class, dat$Linear1, direction = "<")$auc) - tested.auc <- as.numeric(caret:::rocPerCol(dat$Linear1, dat$Class)) - - # tested.auc should equal tested.auc now - expect_equal(tested.auc, fixed.auc) - - # Also it has been hand-checked to be < 0.5 with this seed (0.4875) - expect_lt(tested.auc, 0.5) - - # And it should be lower than the "auto" auc - expect_lt(tested.auc, auto.auc) -}) diff --git a/pkg/caret/tests/testthat/test_sampling_options.R b/pkg/caret/tests/testthat/test_sampling_options.R index 12abe61c..e199c4a0 100644 --- a/pkg/caret/tests/testthat/test_sampling_options.R +++ b/pkg/caret/tests/testthat/test_sampling_options.R @@ -1,18 +1,19 @@ library(caret) library(testthat) -load(system.file("models", "sampling.RData", package = "caret")) +context("sampling options") +load(system.file("models", "sampling.RData", package = "caret")) test_that('check appropriate sampling calls by name', { skip_on_cran() arg_names <- c("up", "down", "rose", "smote") arg_funcs <- sampling_methods arg_first <- c(TRUE, FALSE) - + ## test that calling by string gives the right result for(i in arg_names) { out <- caret:::parse_sampling(i) - expected <- list(name = i, + expected <- list(name = i, func = sampling_methods[[i]], first = TRUE) expect_equivalent(out, expected) @@ -24,11 +25,11 @@ test_that('check appropriate sampling calls by function', { arg_names <- c("up", "down", "rose", "smote") arg_funcs <- sampling_methods arg_first <- c(TRUE, FALSE) - + ## test that calling by function gives the right result for(i in arg_names) { out <- caret:::parse_sampling(sampling_methods[[i]]) - expected <- list(name = "custom", + expected <- list(name = "custom", func = sampling_methods[[i]], first = TRUE) expect_equivalent(out, expected) @@ -39,26 +40,26 @@ test_that('check bad sampling name', { skip_on_cran() expect_error(caret:::parse_sampling("what?")) }) - + test_that('check bad first arg', { skip_on_cran() expect_error(caret:::parse_sampling(list(name = "yep", func = sampling_methods[["up"]], first = 2))) -}) +}) test_that('check bad func arg', { skip_on_cran() expect_error(caret:::parse_sampling(list(name = "yep", func = I, first = 2))) -}) - +}) + test_that('check incomplete list', { skip_on_cran() expect_error(caret:::parse_sampling(list(name = "yep"))) -}) +}) test_that('check call', { skip_on_cran() expect_error(caret:::parse_sampling(14)) -}) +}) ################################################################### ## @@ -82,4 +83,4 @@ test_that('check getting one method', { test_that('check missing method', { skip_on_cran() expect_error(getSamplingInfo("plum")) -}) \ No newline at end of file +}) diff --git a/pkg/caret/tests/testthat/test_twoClassSummary.R b/pkg/caret/tests/testthat/test_twoClassSummary.R new file mode 100644 index 00000000..579bd656 --- /dev/null +++ b/pkg/caret/tests/testthat/test_twoClassSummary.R @@ -0,0 +1,35 @@ + + +context('twoClassSummary') + + +test_that("twoClassSummary is calculating correctly", { + +library(caret) + +set.seed(1) +tr_dat <- twoClassSim(500) +te_dat <- tr_dat +tr_dat$Class = factor(tr_dat$Class, levels = rev(levels(te_dat$Class))) + +set.seed(35) +mod1 <- train(Class ~ ., data = tr_dat, + method = "fda", + tuneLength = 10, + metric = "ROC", + trControl = trainControl(classProbs = TRUE, + summaryFunction = twoClassSummary)) + +set.seed(35) +mod2 <- train(Class ~ ., data = te_dat, + method = "fda", + tuneLength = 10, + metric = "ROC", + trControl = trainControl(classProbs = TRUE, + summaryFunction = twoClassSummary)) + +expect_equal(mod1$resample$ROC, mod2$resample$ROC) +expect_equal(mod1$resample$Sens, mod2$resample$Spec) +expect_equal(mod1$resample$Spec, mod2$resample$Sens) + +})