Reproducing an Article from PLOS One Using the Given Data and Instructions: How Reproducible Is It?

## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

When I saw this article I thought, "Would it be possible to reproduce these results?" and "Can I get a higher AUC?" In this post I am going to use the data provided by the authors and try to reproduce the results using R and the caret package for machine learning. First we have to load the data.

library(rfigshare)
library(readxl)
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(dplyr)
## Warning: Installed Rcpp (0.12.12) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
trellis.par.set(caretTheme())  # use caret's lattice theme for the plots below
# download.file("https://ndownloader.figshare.com/files/7331495", "data/")
# read the supplementary table, drop incomplete rows and the id column
S1Table <- read_excel("../../data/S1Table.xlsx") %>%
    na.omit() %>%
    select(-id)
#save(list = ls(all.names = TRUE), file = "data/plos_article.RData")
load("../../data/plos_article.RData")
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 4)  # train the models on 4 cores in parallel

Let's summarise the data to see whether anything stands out at first glance.

html(contents(S1Table), maxlevels = 10, levelType='table')

Data frame: S1Table

6669 observations and 70 variables, maximum # NAs: 0

Name                             Storage
URGENCE                          double
deces                            double
SEXEPAT                          double
AGEPAT                           double
TABAC                            double
HTA                              double
DIABETE                          double
DIABETE_Insulino                 double
DYSLIPIDEMIE                     double
Coronarien                       double
Infarctus_90_jours               double
ATCDCHIRCARDIAQUE                double
NBINTERVENTIONS                  double
ARTERIOPATHIEPERIPH              double
ATCDINSUFCARDIAQUE               double
ATCDENDOCARDITE                  double
ATCDEMBOLIQUE                    double
ATCD_AVC_hemorragique            double
ATCD_AVC_ischemique              double
Poor_Mobility                    double
BPCO                             double
IRC_traitee                      double
ATCDULCERE                       double
NEOPLASIE                        double
ATCDRADIOTHERAPIE                double
CIRRHOSE                         double
DIALYSE                          double
DEFIMMUNITAIRE                   double
ARYTHMIEAURICULAIRE              double
ARYTHMIEVENTRICULAIRE            double
PACEMAKER                        double
poids                            double
taille                           double
imc                              double
RYTHMESFA                        character
CLASSENYHA                       double
ANGORCLASSECCS                   double
INSUFCARDIAQUE                   double
ENDOCARDITEAIGUE                 double
Critical_preoperative_state      double
ANTIAGREGANTS                    double
BETABLOQUANTS                    double
ANTIARYTHMIQUES                  double
STATINE                          double
DIURETIQUES                      double
INHIBCALCIQUES                   double
IEC                              double
NITRES                           double
HEMOGLOBINE                      double
PLAQUETTES                       double
TP                               double
CREATININEPREOP                  double
CLAIRANCECOCK                    double
CLAIRANCEMDRD                    double
FEVG                             double
PAPS                             double
CORONAROGRAPHIE_PREOP            double
NBTRONCSSTENOSES                 double
CORONAROPATHIE                   double
VALVULOPATHIE                    double
ITFONCTIONNELLE                  double
AORTEASCENDANTE                  double
CARDIOPATHIECONGENITALE          double
Nombre_de_geste                  double
CHIRCORONAIRE                    double
CHIRVALVULAIRE                   double
VALVEAORTIQUE                    double
VALVEMITRALE                     double
VALVETRISCUPIDE                  double
Chirurgie_sur_aorte_thoracique   double

# did have results='asis' above
d <- describe(S1Table)
# html(d, size=40, scroll=TRUE)

Now let's use the caret package for data splitting; I will also impute the missing data with the same package.

library(caret)
set.seed(123456)  # fix the seed so the split is reproducible
# recode the outcome (deces, i.e. death) as a factor with readable labels
S1Table$deces <- factor(S1Table$deces, labels = c("No", "Yes"))
trainIndex <- createDataPartition(S1Table$deces, p = .8, 
                                  list = FALSE, 
                                  times = 1)
dfTrain <- S1Table[ trainIndex,]  # 80% training set
dfTest  <- S1Table[-trainIndex,]  # 20% held-out test set
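
Since death (deces) is the rare class, it is worth checking how imbalanced the outcome is in each split before fitting anything; every Accuracy, Kappa, and Sensitivity figure below has to be read against this baseline. A minimal check on the objects created above:

# proportion of survivors ("No") and deaths ("Yes") in each split
prop.table(table(dfTrain$deces))
prop.table(table(dfTest$deces))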

Logistic Regression

fitControl <- trainControl(method = "repeatedcv", 
                           number=5, repeats=5,
                     summaryFunction = twoClassSummary, 
                     classProbs = TRUE)
glmFit1 <- train(deces ~ ., data = dfTrain, 
                 method = "glm",
                 metric = "ROC",
                 family = "binomial", 
                 trControl = fitControl
                 )
glmFit1
glm1Out <- predict(glmFit1, newdata = dfTest)
confusionMatrix(dfTest$deces, glm1Out)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1239    5
##        Yes   80    9
##                                         
##                Accuracy : 0.936         
##                  95% CI : (0.922, 0.949)
##     No Information Rate : 0.989         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.16          
##  Mcnemar's Test P-Value : 1e-15         
##                                         
##             Sensitivity : 0.939         
##             Specificity : 0.643         
##          Pos Pred Value : 0.996         
##          Neg Pred Value : 0.101         
##              Prevalence : 0.989         
##          Detection Rate : 0.929         
##    Detection Prevalence : 0.933         
##       Balanced Accuracy : 0.791         
##                                         
##        'Positive' Class : No            
## 
glm1Out_prob <- predict(object = glmFit1, dfTest, type='prob')
aucGlm <- roc(ifelse(dfTest[,"deces"] == "Yes",1,0), glm1Out_prob[[2]])
## Warning in roc.default(ifelse(dfTest[, "deces"] == "Yes", 1, 0),
## glm1Out_prob[[2]]): Deprecated use a matrix as response. Unexpected results
## may be produced, please pass a vector or factor.
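
The warning appears because dfTest is a tibble, so dfTest[, "deces"] is still a one-column data frame rather than a vector. A sketch of an equivalent call that avoids the warning (same objects as above; the AUC should be unchanged):

# equivalent to the call above, but passes a factor response and a plain numeric vector
aucGlm <- roc(response = dfTest$deces,
              predictor = glm1Out_prob[["Yes"]],
              levels = c("No", "Yes"))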

Gradient Boosting Machine

fitControl <- trainControl(method = "repeatedcv", 
                     summaryFunction = twoClassSummary, 
                     classProbs = TRUE)
gbmGrid <-  expand.grid(interaction.depth = c(1, 5, 9), 
                        n.trees = (1:30)*50, 
                        shrinkage = 0.1,
                        n.minobsinnode = 20)
gbmFit1 <- train(deces ~ ., data = dfTrain, 
                 method = "gbm",
                 metric = "ROC",
                 verbose = FALSE,                    
                 trControl = fitControl,
                 tuneGrid = gbmGrid
                 )
gbmFit1
trellis.par.set(caretTheme())
plot(gbmFit1) 

whichTwoPct <- tolerance(gbmFit1$results, metric = "ROC", 
                         tol = 2, maximize = TRUE)  
cat("best model within 2 pct of best:\n")
## best model within 2 pct of best:
gbmFit1$results[whichTwoPct,1:10]
##   shrinkage interaction.depth n.minobsinnode n.trees   ROC  Sens   Spec
## 1       0.1                 1             20      50 0.753 0.997 0.0306
##    ROCSD  SensSD SpecSD
## 1 0.0423 0.00252 0.0306
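
If you would rather keep the simpler model picked by the tolerance rule, caret can refit the final model with that parameter row via update(); a short sketch, assuming update.train() behaves as documented:

# refit the final GBM with the parameters chosen by tolerance()
simplerParam <- gbmFit1$results[whichTwoPct,
                                c("n.trees", "interaction.depth",
                                  "shrinkage", "n.minobsinnode")]
gbmFitSimple <- update(gbmFit1, param = simplerParam)
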
gbm1Out <- predict(gbmFit1, newdata = dfTest)
confusionMatrix(dfTest$deces, gbm1Out)
gbm1Out_prob <- predict(object = gbmFit1, dfTest, type='prob')
aucGbm <- roc(ifelse(dfTest[,"deces"] == "Yes",1,0), gbm1Out_prob[[2]])
gbmImp <- varImp(gbmFit1)
## Loading required package: gbm
## Loading required package: splines
## Loaded gbm 2.1.3
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
plot(gbmImp, top = 20)

Random Forest

control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs = TRUE)
tunegrid <- expand.grid(mtry = c(1:10))
rfFit1 <- train(deces ~ ., data = dfTrain, 
                 method = "rf",
                 metric = "Accuracy",
                 verbose = FALSE,                    
                 trControl = control,
                 tuneGrid = tunegrid
                 )
rfFit1
rf1Out <- predict(rfFit1, newdata = dfTest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:Hmisc':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
confusionMatrix(dfTest$deces, rf1Out)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1243    1
##        Yes   88    1
##                                         
##                Accuracy : 0.933         
##                  95% CI : (0.918, 0.946)
##     No Information Rate : 0.998         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.019         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.9339        
##             Specificity : 0.5000        
##          Pos Pred Value : 0.9992        
##          Neg Pred Value : 0.0112        
##              Prevalence : 0.9985        
##          Detection Rate : 0.9325        
##    Detection Prevalence : 0.9332        
##       Balanced Accuracy : 0.7169        
##                                         
##        'Positive' Class : No            
## 
rf1Out_prob <- predict(object = rfFit1, dfTest, type='prob')
aucRf <- roc(ifelse(dfTest[,"deces"] == "Yes",1,0), rf1Out_prob[[2]])
## Warning in roc.default(ifelse(dfTest[, "deces"] == "Yes", 1, 0),
## rf1Out_prob[[2]]): Deprecated use a matrix as response. Unexpected results
## may be produced, please pass a vector or factor.
rfImp <- varImp(rfFit1)
plot(rfImp, top = 20)
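
The random forest above is tuned on Accuracy, which says very little when roughly 93% of the patients survive. Since the point of the post is a higher AUC, here is a sketch of the same tuning run optimised on ROC instead (my own variation, using the same caret machinery with twoClassSummary):

# tune mtry on the area under the ROC curve rather than Accuracy
controlROC <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                           classProbs = TRUE, summaryFunction = twoClassSummary)
rfFitROC <- train(deces ~ ., data = dfTrain,
                  method = "rf",
                  metric = "ROC",
                  trControl = controlROC,
                  tuneGrid = tunegrid)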

Support Vector Machine

control <- trainControl(method="repeatedcv", number=5, repeats= 5, classProbs = TRUE)
tunegrid <- expand.grid(sigma = c(.01, .015, 0.2),
                        C = c(0.75, 0.9, 1, 1.1, 1.25))
svmFit1 <- train(deces ~ ., data = dfTrain, 
                 method = "svmRadial",
                 metric = "Accuracy",
                 verbose = FALSE,                    
                 trControl = control,
                 tuneGrid = tunegrid
                 )
svmFit1
svm1Out <- predict(svmFit1, newdata = dfTest)
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
confusionMatrix(dfTest$deces, svm1Out)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1243    1
##        Yes   84    5
##                                         
##                Accuracy : 0.936         
##                  95% CI : (0.922, 0.949)
##     No Information Rate : 0.995         
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.098         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.9367        
##             Specificity : 0.8333        
##          Pos Pred Value : 0.9992        
##          Neg Pred Value : 0.0562        
##              Prevalence : 0.9955        
##          Detection Rate : 0.9325        
##    Detection Prevalence : 0.9332        
##       Balanced Accuracy : 0.8850        
##                                         
##        'Positive' Class : No            
## 
svm1Out_prob <- predict(object = svmFit1, dfTest, type='prob')
aucSvm <- roc(ifelse(dfTest[,"deces"] == "Yes",1,0), svm1Out_prob[[2]])
## Warning in roc.default(ifelse(dfTest[, "deces"] == "Yes", 1, 0),
## svm1Out_prob[[2]]): Deprecated use a matrix as response. Unexpected results
## may be produced, please pass a vector or factor.
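
A radial-kernel SVM is sensitive to the scale of the predictors, and this data set mixes binary flags with variables such as weight and preoperative creatinine. caret can standardise the predictors inside each resampling fold through the preProcess argument; a sketch with centring and scaling added (my own addition, not something the original analysis specifies):

# same SVM fit, but with predictors centred and scaled within each fold
svmFitScaled <- train(deces ~ ., data = dfTrain,
                      method = "svmRadial",
                      metric = "Accuracy",
                      preProcess = c("center", "scale"),
                      trControl = control,
                      tuneGrid = tunegrid)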

Naive Bayes

control <- trainControl(method="repeatedcv", number=5, repeats= 5, classProbs = TRUE)
nbFit1 <- train(deces ~ ., data = dfTrain, 
                 method = "nb",
                 metric = "Accuracy",
                 verbose = FALSE,                    
                 trControl = control
                 )
nbFit1
nb1Out <- predict(nbFit1, newdata = dfTest)
## Loading required package: klaR
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
confusionMatrix(dfTest$deces, nb1Out)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1244    0
##        Yes   89    0
##                                         
##                Accuracy : 0.933         
##                  95% CI : (0.918, 0.946)
##     No Information Rate : 1             
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0             
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.933         
##             Specificity :    NA         
##          Pos Pred Value :    NA         
##          Neg Pred Value :    NA         
##              Prevalence : 1.000         
##          Detection Rate : 0.933         
##    Detection Prevalence : 0.933         
##       Balanced Accuracy :    NA         
##                                         
##        'Positive' Class : No            
## 
nb1Out_prob <- predict(object = nbFit1, dfTest, type='prob')
aucNb <- roc(ifelse(dfTest[,"deces"] == "Yes",1,0), nb1Out_prob[[2]])
## Warning in roc.default(ifelse(dfTest[, "deces"] == "Yes", 1, 0),
## nb1Out_prob[[2]]): Deprecated use a matrix as response. Unexpected results
## may be produced, please pass a vector or factor.
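
Naive Bayes ends up predicting "No" for every patient, which is exactly what the class imbalance pushes it towards. trainControl has a sampling argument that rebalances the classes within each resampling fold; a sketch with down-sampling, again my own addition rather than part of the original analysis:

# down-sample the majority class inside each fold and tune on ROC
controlDown <- trainControl(method = "repeatedcv", number = 5, repeats = 5,
                            classProbs = TRUE,
                            summaryFunction = twoClassSummary,
                            sampling = "down")
nbFitDown <- train(deces ~ ., data = dfTrain,
                   method = "nb",
                   metric = "ROC",
                   trControl = controlDown)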

ROC Curves for all models

plot(aucGlm, col="red", xlim = c(1,0), ylim = c(0,1));text(x = 0.2, y = 0.35, labels = paste("GLM with all",format(aucGlm$auc)))
lines(aucGbm, col="blue", xlim = c(1,0), ylim = c(0,1));text(x = 0.2, y = 0.30, labels = paste("GBM with all",format(aucGbm$auc)))
lines(aucRf, col="green", xlim = c(1,0), ylim = c(0,1));text(x = 0.2, y = 0.25, labels = paste("RF with all",format(aucRf$auc)))
lines(aucSvm, col="purple", xlim = c(1,0), ylim = c(0,1));text(x = 0.2, y = 0.20, labels = paste("SVM with all",format(aucSvm$auc)))
lines(aucNb, col="yellow", xlim = c(1,0), ylim = c(0,1));text(x = 0.2, y = 0.15, labels = paste("NB with all",format(aucNb$auc)))
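
The same comparison in numbers, sorted from best to worst (a small sketch using pROC's auc() on the objects above):

aucs <- c(GLM = auc(aucGlm), GBM = auc(aucGbm), RF = auc(aucRf),
          SVM = auc(aucSvm), NB = auc(aucNb))
sort(aucs, decreasing = TRUE)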

sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] splines   parallel  methods   stats     graphics  grDevices utils    
## [8] datasets  base     
## 
## other attached packages:
##  [1] klaR_0.6-12         MASS_7.3-47         kernlab_0.9-25     
##  [4] randomForest_4.6-12 plyr_1.8.4          gbm_2.1.3          
##  [7] doMC_1.3.4          iterators_1.0.8     foreach_1.4.3      
## [10] dplyr_0.7.1         pROC_1.10.0         caret_6.0-76       
## [13] readxl_1.0.0        rfigshare_0.3.7.100 Hmisc_4.0-3        
## [16] ggplot2_2.2.1.9000  Formula_1.2-1       survival_2.41-3    
## [19] lattice_0.20-35    
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.1          assertthat_0.2.0    stats4_3.3.2       
##  [4] latticeExtra_0.6-28 cellranger_1.1.0    yaml_2.1.14        
##  [7] backports_1.1.0     quantreg_5.33       glue_1.1.1         
## [10] digest_0.6.12       RColorBrewer_1.1-2  checkmate_1.8.3    
## [13] minqa_1.2.4         colorspace_1.3-2    htmltools_0.3.6    
## [16] httpuv_1.3.5        Matrix_1.2-10       XML_3.98-1.9       
## [19] pkgconfig_2.0.1     SparseM_1.77        bookdown_0.4.1     
## [22] scales_0.4.1.9002   lme4_1.1-13         MatrixModels_0.4-1 
## [25] combinat_0.0-8      htmlTable_1.9       tibble_1.3.3       
## [28] mgcv_1.8-17         car_2.1-4           nnet_7.3-12        
## [31] lazyeval_0.2.0      pbkrtest_0.4-7      RJSONIO_1.3-0      
## [34] magrittr_1.5        evaluate_0.10       nlme_3.1-131       
## [37] foreign_0.8-69      class_7.3-14        blogdown_0.0.53    
## [40] tools_3.3.2         data.table_1.10.4   stringr_1.2.0      
## [43] munsell_0.4.3       cluster_2.0.6       bindrcpp_0.2       
## [46] e1071_1.6-8         rlang_0.1.1         grid_3.3.2         
## [49] nloptr_1.0.4        htmlwidgets_0.9     base64enc_0.1-3    
## [52] rmarkdown_1.6       gtable_0.2.0        ModelMetrics_1.1.0 
## [55] codetools_0.2-15    reshape2_1.4.2      R6_2.2.2           
## [58] gridExtra_2.2.1     knitr_1.16          bindr_0.1          
## [61] rprojroot_1.2       stringi_1.1.5       Rcpp_0.12.12       
## [64] rpart_4.1-11        acepack_1.4.1