#install.packages("pacman")
library(pacman)
p_load(infotheo)
p_load(tidyverse)
p_load(ggplot2)
p_load(cowplot)
p_load(mlbench)
p_load(Metrics)
#remove.packages("rlang")
#install.packages("rlang", repos = "https://cloud.r-project.org")
set.seed(123)

1 Wisconsin Breast Cancer Dataset

BreastCancer Dataset
A data frame with 699 observations on 11 variables, one being a character variable, 9 being ordered or nominal, and 1 target class.

  1. Sample code number: id number
  2. Clump Thickness: 1 - 10
  3. Uniformity of Cell Size: 1 - 10
  4. Uniformity of Cell Shape: 1 - 10
  5. Marginal Adhesion: 1 - 10
  6. Single Epithelial Cell Size: 1 - 10
  7. Bare Nuclei: 1 - 10
  8. Bland Chromatin: 1 - 10
  9. Normal Nucleoli: 1 - 10
  10. Mitoses: 1 - 10
  11. Class: (benign, malignant)

Breast Cancer Wisconsin (Original) Data Set

“Multisurface method of pattern separation for medical diagnosis applied to breast cytology.”, Wolberg,W.H., Mangasarian,O.L. (1990). In Proceedings of the National Academy of Sciences, 87, 9193-9196.

Zhang,J. (1992). Selecting typical instances in instance-based learning. In Proceedings of the Ninth International Machine Learning Conference (pp. 470-479). Aberdeen, Scotland: Morgan Kaufmann.

1.1 Cleaning and documentation

# Load the BreastCancer data frame (mlbench is loaded above) into the
# workspace and print a compact column-by-column preview.
data(BreastCancer)
glimpse(BreastCancer)
Observations: 699
Variables: 11
$ Id              <chr> "1000025", "1002945", "1015425", "1016277", "1017023", "1017122", "1018099", "1018561", "1033078",...
$ Cl.thickness    <ord> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, 10, 6, 7, 10, 3, 8, 1, 5, 3, 5, 2, 1, 3, 2, ...
$ Cell.size       <ord> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1, 7, 1, 3, 5, 1, 4, 1, 2, 2, 1, 1, 1, 1, 1, 7...
$ Cell.shape      <ord> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1, 7, 1, 2, 5, 1, 5, 1, 3, 1, 1, 1, 3, 1, 1, 7...
$ Marg.adhesion   <ord> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6, 1, 10, 3, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, ...
$ Epith.c.size    <ord> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, 4, 2, 5, 6, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 8,...
$ Bare.nuclei     <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1, 10, 1, 10, 7, 1, NA, 1, 7, 1, 1, 1, 1, 1,...
$ Bl.cromatin     <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, 4, 3, 5, 7, 2, 7, 3, 3, 2, 2, 2, 1, 2, 3, 7,...
$ Normal.nucleoli <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, 1, 1, 4, 10, 1, 3, 1, 6, 1, 1, 1, 1, 1, 1, 4...
$ Mitoses         <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,...
$ Class           <fct> benign, benign, benign, benign, benign, malignant, benign, benign, benign, benign, benign, benign,...
# Class balance of the raw outcome: counts of benign vs. malignant samples.
summary(BreastCancer$Class)
   benign malignant 
      458       241 
# Binary outcome: a factor with levels "0" (benign) and "1" (malignant).
BreastCancer[["y"]] <- factor(as.integer(BreastCancer[["Class"]] == "malignant"))
# The original label and the sample identifier are no longer needed.
BreastCancer[["Class"]] <- NULL
BreastCancer[["Id"]] <- NULL
# Store the first five (ordered-factor) columns as plain numbers.
BreastCancer[1:5] <- lapply(BreastCancer[1:5], as.numeric)
summary(BreastCancer)
  Cl.thickness      Cell.size        Cell.shape     Marg.adhesion     Epith.c.size     Bare.nuclei   Bl.cromatin 
 Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   1      :402   2      :166  
 1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 2.000   10     :132   3      :165  
 Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.000   Median : 2.000   2      : 30   1      :152  
 Mean   : 4.418   Mean   : 3.134   Mean   : 3.207   Mean   : 2.807   Mean   : 3.216   5      : 30   7      : 73  
 3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000   3rd Qu.: 4.000   3      : 28   4      : 40  
 Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000   (Other): 61   5      : 34  
                                                                                      NA's   : 16   (Other): 69  
 Normal.nucleoli    Mitoses    y      
 1      :443     1      :579   0:458  
 10     : 61     2      : 35   1:241  
 3      : 44     3      : 33          
 2      : 36     10     : 14          
 8      : 24     4      : 12          
 6      : 22     7      :  9          
 (Other): 69     (Other): 17          
# Scatterplot matrix over every pair of columns, outcome included.
p_load(GGally)
ggpairs(data = BreastCancer, title = "Breast Cancer Dataset")

# Pairwise mutual information between all columns, estimated with the
# empirical ("emp") estimator and converted from nats to bits.
p_load(corrplot)
p_load(infotheo)
BreastCancer_mi <- mutinformation(BreastCancer, method = "emp") %>% natstobits()
# Optional rescaling to [0, 1]; left disabled, so the plot shows raw bits.
#BreastCancer_mi <- BreastCancer_mi/max(BreastCancer_mi)
# Largest off-diagonal value, used as the upper end of the colour scale; the
# diagonal is excluded here and zeroed on the next line.
mi_max <- max(BreastCancer_mi[lower.tri(BreastCancer_mi, diag = FALSE)])
diag(BreastCancer_mi) <- 0
# The title states the plotted unit: no normalisation is applied above, so the
# previous "Normalised" wording was misleading.
corrplot.mixed(BreastCancer_mi,
               cl.lim = c(0, mi_max),
               title = "Mutual Information (bits) Breast Cancer Dataset",
               mar = c(0, 0, 1, 0),
               lower = "ellipse",
               upper = "number",
               is.corr = FALSE,
               order = "hclust"
)

# Hierarchical clustering of the columns by mutual information.
p_load(infotheo)
BreastCancer_mi <- natstobits(mutinformation(BreastCancer, method = "emp"))
# Turn the similarity into a dissimilarity by subtracting from its maximum.
BreastCancer_mi_d <- as.dist(max(BreastCancer_mi) - BreastCancer_mi)
hc <- hclust(BreastCancer_mi_d, method = "ward.D2")
plot(hc)

There are 16 unexplained missing values in one of the features. We’re going to impute those values, being careful not to use the outcome as one of the predictors. This will allow us to make comparisons across methods that do not handle missing values well, and will also protect us when predicting on new test data, which might also have unexplained missingness.

MissForest—non-parametric missing value imputation for mixed-type data, Daniel J. Stekhoven Peter Bühlmann, Bioinformatics, Volume 28, Issue 1, 1 January 2012, Pages 112–118,

# Bare.nuclei (a factor) is the only column with missing values: 16 NAs in
# the summary above. Impute them with missForest from the other predictors
# only; the outcome y is dropped first so it cannot leak into the imputation.
p_load("missForest")
BreastCancer_imputed <- missForest(BreastCancer %>% select(-y), verbose = TRUE)$ximp
  missForest iteration 1 in progress...done!
    estimated error(s): 0 0.09553441 
    difference(s): 0 0.001430615 
    time: 1.23 seconds

  missForest iteration 2 in progress...done!
    estimated error(s): 0 0.08674963 
    difference(s): 0 0.0003576538 
    time: 1.19 seconds

  missForest iteration 3 in progress...done!
    estimated error(s): 0 0.09699854 
    difference(s): 0 0.0007153076 
    time: 1.02 seconds
# Re-attach the outcome, which was withheld from the imputation step.
BreastCancer_imputed$y <- BreastCancer$y

Convert categorical variables to ‘one-hot’ dummy variables

Making dummy variables with dummy_cols(), Jacob Kaplan, 2018-06-21

#install.packages('data.table')
p_load(fastDummies)
# Expand the four factor predictors into one 0/1 indicator column per level.
BreastCancer_onehot <- fastDummies::dummy_cols(
  BreastCancer_imputed,
  select_columns = c("Bare.nuclei", "Bl.cromatin", "Normal.nucleoli", "Mitoses")
)
# Drop the original factor columns now that their indicators exist.
BreastCancer_onehot[, c("Bare.nuclei", "Bl.cromatin", "Normal.nucleoli", "Mitoses")] <- NULL

2 Hold out a Test Set

The very first thing we’re going to do is pull 20% of the Breast Cancer dataset out as a test set, and we’re never going to touch it for any reason other than final model evaluation.

Immediately split off a test set that we will not touch until the very final evaluation.

# Draw one uniform number per row; rows below 0.8 (about 80%) go to training.
N <- nrow(BreastCancer)
condition_train <- runif(N) < 0.8
table(condition_train)
condition_train
FALSE  TRUE 
  127   572 
# Apply the same row mask to both encodings so the factor-coded and one-hot
# train/test sets contain exactly the same observations.
BreastCancer_train <- BreastCancer_imputed[condition_train,]
BreastCancer_test <- BreastCancer_imputed[!condition_train,]
BreastCancer_onehot_train <- BreastCancer_onehot[condition_train,]
BreastCancer_onehot_test <- BreastCancer_onehot[!condition_train,]

3 Supervised Learning

# Outcome against the nine predictors in their factor/numeric encoding.
formula <- y ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion +
  Epith.c.size + Bare.nuclei + Bl.cromatin + Normal.nucleoli + Mitoses

# Same model written against the one-hot indicator columns.
formula_onehot <- y ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion +
  Epith.c.size +
  Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 +
  Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 + Bare.nuclei_5 +
  Bare.nuclei_8 + Bare.nuclei_6 +
  Bl.cromatin_3 + Bl.cromatin_9 + Bl.cromatin_1 + Bl.cromatin_2 +
  Bl.cromatin_4 + Bl.cromatin_5 + Bl.cromatin_7 + Bl.cromatin_8 +
  Bl.cromatin_6 + Bl.cromatin_10 +
  Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
  Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
  Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
  Normal.nucleoli_8 +
  Mitoses_1 + Mitoses_5 + Mitoses_4 + Mitoses_2 + Mitoses_3 +
  Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6

Register a single back end for cross-validation

p_load(caret)
set.seed(123)
# Shared resampling setup: 10-fold CV scored with the two-class summary
# (ROC-based), keeping class probabilities and hold-out predictions.
cctrl1 <- trainControl(
  method = "cv",
  number = 10,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

4 Linear Models

p_load(glmnet)
set.seed(123)
# Probit regression on the one-hot encoded training set.
glm1 <- glm(
  formula_onehot,
  data = BreastCancer_onehot_train,
  family = binomial(link = "probit")
)
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
library(broom)
# Coefficient table of the probit fit, one row per model term.
tidy(glm1 ) #There are 44 features, counting dummified categorical variables

Out of sample accuracy?

FALSE [1] 0.9368907

5 Variable Selection

5.1 Feature Importance and P Values

Introduction to vimp, Brian D. Williamson, 2018-06-19

5.2 Regularization, e.g. Lasso/Ridge Regression

set.seed(123)
# Regularisation path of a binomial glmnet on the one-hot predictors.
glmnet1 <- glmnet(
  x = as.matrix(select(BreastCancer_onehot_train, -y)),
  y = as.factor(BreastCancer_onehot_train$y),
  family = "binomial"
)
plot(glmnet1)

# 5-fold cross-validation over the lambda path.
glmnet1_cv <- cv.glmnet(
  x = data.matrix(select(BreastCancer_onehot_train, -y)),
  y = as.factor(BreastCancer_onehot_train$y),
  family = "binomial",
  nfolds = 5
)
# Smallest model whose error is within one standard error of the minimum.
glmnet1_cv$lambda.1se
[1] 0.01817259
# Cross-validation curve of the lasso across the lambda path.
plot(glmnet1_cv)

# Coefficients of the cross-validated lasso at lambda.1se, largest first.
glmnet_lambda.1se_betas <- coef(glmnet1_cv, s = "lambda.1se") %>%
  as.matrix() %>%
  as.data.frame() %>%
  rename(beta = "1") %>%
  rownames_to_column() %>%
  arrange(desc(beta))
# Author's notes on the result: 44 candidate features, 14 of them with
# nonzero coefficients, and those coefficients are relatively small.

# Refit a single lasso at that lambda.
glmnet_lambda.1se <- glmnet(
  x = data.matrix(select(BreastCancer_onehot_train, -y)),
  y = BreastCancer_onehot_train$y,
  family = "binomial",
  lambda = glmnet1_cv$lambda.1se
)

# Cross-validate that one configuration with caret to estimate its
# out-of-sample ROC; caret needs outcome levels that are valid names, hence
# the "Outcome" prefix.
glmnet_lambda.1se_cv <- train(
  x = data.matrix(select(BreastCancer_onehot_train, -y)),
  y = as.factor(paste0("Outcome", BreastCancer_train$y)),
  method = "glmnet",
  trControl = cctrl1,
  metric = "ROC",
  tuneGrid = expand.grid(alpha = 1, lambda = glmnet1_cv$lambda.1se)
)
# Area under the curve is almost perfect despite the sparse model.
print(glmnet_lambda.1se_cv$results$ROC) # 0.99
[1] 0.9920091
p_load(plotROC)
# Cross-validated hold-out probabilities from the lasso, paired with the true
# labels through caret's rowIndex.
out_of_sample_predictions2 <- data.frame(
  y_hat = glmnet_lambda.1se_cv$pred$Outcome1,
  y = BreastCancer_train$y[glmnet_lambda.1se_cv$pred$rowIndex],
  model = "Lasso"
)
# NOTE(review): out_of_sample_predictions is not defined in the visible part
# of this file; presumably the GLM's hold-out predictions -- confirm.
basicplot <- ggplot(
  bind_rows(out_of_sample_predictions, out_of_sample_predictions2),
  aes(d = as.numeric(y), m = y_hat, color = model)
) +
  geom_roc(n.cuts = 0) +
  style_roc(theme = theme_grey, xlab = "1 - Specificity") +
  ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
# Render the ROC comparison built above.
basicplot

NA

5.3 Linear Expansions and Interaction Terms

We can put the same feature in a linear model multiple times with polynomials to capture nonlinear relationships.

set.seed(123)
library(dplyr)
# Cubic toy data with a straight-line fit and a quadratic fit alongside.
df <- data.frame(x = seq(0, 100)) %>%
  mutate(
    y = 0 + x + x^2 + x^3,
    pred_lm = lm(y ~ x)$fitted.values,
    pred_lm_quad = lm(y ~ x + I(x^2))$fitted.values
  )
library(ggplot2)
# Points are the data; red is the linear fit, blue the quadratic fit.
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_line(aes(y = pred_lm), col = "red") +
  geom_line(aes(y = pred_lm_quad), col = "blue")

5.4 Interaction Terms

Nonlinear Models * (ISLR) “Chapter 7 Moving Beyond Linearity” Linear_separability

set.seed(123)
# Design matrices holding every main effect plus all two-way interactions.
form <- ~ .^2
# Drop the outcome by name rather than by column position, so the design does
# not silently change if the column order ever does. (The former
# `y <- ...$Class_binary` line read a column that does not exist and only
# assigned NULL, so it is removed.)
BreastCancer_onehot_train_twoway <- model.matrix(
  form,
  data = BreastCancer_onehot_train[, setdiff(names(BreastCancer_onehot_train), "y")]
)
BreastCancer_onehot_test_twoway <- model.matrix(
  form,
  data = BreastCancer_onehot_test[, setdiff(names(BreastCancer_onehot_test), "y")]
)
dim(BreastCancer_onehot_train_twoway) # 991 terms
[1] 572 991
# Lasso path over the full two-way interaction design. (A leftover
# `condition` mask comparing column names with 'Class_binary' was removed: no
# such column exists and the mask was never read.)
glmnet_twoway <- glmnet(
  x = BreastCancer_onehot_train_twoway,
  y = as.factor(BreastCancer_onehot_train$y),
  family = "binomial"
)
plot(glmnet_twoway)

# 5-fold cross-validation of the interaction lasso over its lambda path.
glmnet_twoway_cv <- cv.glmnet(
  x = BreastCancer_onehot_train_twoway,
  y = as.factor(BreastCancer_onehot_train$y),
  family = "binomial",
  nfolds = 5
)
# Smallest model whose error is within one standard error of the minimum.
glmnet_twoway_cv$lambda.1se
[1] 0.01994439
# Cross-validation curve of the interaction lasso across the lambda path.
plot(glmnet_twoway_cv)

# Coefficients of the interaction lasso at its own lambda.1se, largest first.
# This previously read from glmnet1_cv (the main-effects model), so it only
# duplicated the earlier coefficient table.
glmnet_twoway_lambda.1se_betas <- coef(glmnet_twoway_cv, s = "lambda.1se") %>%
  as.matrix() %>%
  as.data.frame() %>%
  rename(beta = "1") %>%
  rownames_to_column() %>%
  arrange(desc(beta))

# Single model at that lambda, fitted on the interaction design matrix with
# the interaction model's lambda (not the main-effects inputs).
glmnet_twoway_lambda.1se <- glmnet(
  x = BreastCancer_onehot_train_twoway,
  y = BreastCancer_onehot_train$y,
  family = "binomial",
  lambda = glmnet_twoway_cv$lambda.1se
)

# Cross-validate that configuration with caret to estimate out-of-sample ROC.
# The tuning grid uses the interaction model's lambda.1se so the evaluated
# model is the one selected above.
glmnet_twoway_lambda.1se_cv <- train(
  x = BreastCancer_onehot_train_twoway,
  y = as.factor(paste0("Outcome", BreastCancer_train$y)),
  method = "glmnet",
  trControl = cctrl1,
  metric = "ROC",
  tuneGrid = expand.grid(alpha = 1, lambda = glmnet_twoway_cv$lambda.1se)
)
# Cross-validated area under the curve of the interaction lasso.
print(glmnet_twoway_lambda.1se_cv$results$ROC)
[1] 0.9912164
p_load(plotROC)
# Cross-validated hold-out probabilities from the interaction lasso, paired
# with the true labels through caret's rowIndex.
out_of_sample_predictions3 <- data.frame(
  y_hat = glmnet_twoway_lambda.1se_cv$pred$Outcome1,
  y = BreastCancer_train$y[glmnet_twoway_lambda.1se_cv$pred$rowIndex],
  model = "Lasso Interactions"
)
# One ROC curve per model, coloured by the model label.
basicplot <- ggplot(
  bind_rows(
    out_of_sample_predictions,
    out_of_sample_predictions2,
    out_of_sample_predictions3
  ),
  aes(d = as.numeric(y), m = y_hat, color = model)
) +
  geom_roc(n.cuts = 0) +
  style_roc(theme = theme_grey, xlab = "1 - Specificity") +
  ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
# Render the ROC comparison built above.
basicplot

Interpreting the model
There are some measures that unambiguously look bad for cancer outcomes.
There are certain interactions that are good news.
Bare.nuclei_8:Normal.nucleoli_2
Bare.nuclei_1:Mitoses_1
Bare.nuclei_7:Normal.nucleoli_8
Normal.nucleoli_1:Mitoses_1
Bare.nuclei_1:Normal.nucleoli_1

Bare.nuclei_1 by itself looks like good news, but in combination with something else it’s especially helpful.

# Author's notes: there are 991 candidate terms; by a miracle the lasso again
# keeps 14, and some of the coefficients are relatively small.
# Coefficients of the interaction lasso at lambda.1se, largest first.
glmnet_twoway_cv_betas <- coef(glmnet_twoway_cv, s = "lambda.1se") %>%
  as.matrix() %>%
  as.data.frame() %>%
  rename(beta = "1") %>%
  rownames_to_column() %>%
  arrange(desc(beta))
# Show only the terms the lasso kept.
filter(glmnet_twoway_cv_betas, beta != 0)

6 Decision Trees

set.seed(123)
p_load(party)
# Conditional inference tree on the factor-coded training data.
single_decision_tree <- ctree(formula = formula, data = BreastCancer_train)
plot(single_decision_tree)

Out of sample

Slightly worse but arguably an easier to interpret model.

set.seed(123)
# Cross-validate a ctree at a fixed mincriterion of 0.99 with the shared
# resampling setup; column 10 is left out of the predictors.
single_decision_tree_cv_model <- train(
  x = BreastCancer_train[, -10],
  y = as.factor(paste0("Outcome", BreastCancer_train$y)),
  method = "ctree",
  trControl = cctrl1,
  metric = "ROC",
  tuneGrid = expand.grid(mincriterion = 0.99)
)
# Cross-validated area under the curve of the single tree.
print(single_decision_tree_cv_model$results$ROC)
[1] 0.9656646
p_load(plotROC)
# Cross-validated hold-out probabilities from the tree, paired with the true
# labels through caret's rowIndex.
out_of_sample_predictions4 <- data.frame(
  y_hat = single_decision_tree_cv_model$pred$Outcome1,
  y = BreastCancer_train$y[single_decision_tree_cv_model$pred$rowIndex],
  model = "Tree"
)
# One ROC curve per model, coloured by the model label.
basicplot <- ggplot(
  bind_rows(
    out_of_sample_predictions,
    out_of_sample_predictions2,
    out_of_sample_predictions3,
    out_of_sample_predictions4
  ),
  aes(d = as.numeric(y), m = y_hat, color = model)
) +
  geom_roc(n.cuts = 0) +
  style_roc(theme = theme_grey, xlab = "1 - Specificity") +
  ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
# Render the ROC comparison built above.
basicplot

7 Overfitting

7.1 Bootstrapping Observations

7.2 Model Complexity/Parsimony

8 Curse of dimensionality

8.1 Feature Bagging/Subspace Methods

9 Random Forests

set.seed(123)
#install.packages('randomForest', dependencies=T)
p_load(randomForest)
# Random forest on the factor-coded training data; localImp = TRUE keeps the
# casewise importance measures.
forest <- randomForest(
  formula,
  data = BreastCancer_train,
  localImp = TRUE,
  na.action = na.omit
)
print(forest)

Call:
 randomForest(formula = formula, data = BreastCancer_train, localImp = TRUE,      na.action = na.omit) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 2.62%
Confusion matrix:
    0   1 class.error
0 363  13  0.03457447
1   2 194  0.01020408
set.seed(123)
p_load(randomForest)
# Cross-validate a random forest with the shared resampling setup; no tuning
# grid is supplied, so caret chooses the candidate mtry values itself.
forest_cv_model <- train(
  x = BreastCancer_train[, -10],
  y = as.factor(paste0("Outcome", BreastCancer_train$y)),
  method = "rf",
  trControl = cctrl1,
  metric = "ROC"
  #tuneGrid = expand.grid(alpha = 1,lambda = glmnet1_cv$lambda.1se)
)
print(forest_cv_model$results)
p_load(plotROC)
# The saved resampling predictions cover every candidate mtry. Keep only those
# made with the value caret selected (bestTune) instead of a hard-coded 5,
# which is not guaranteed to be the chosen, or even a tried, value.
condition <- forest_cv_model$pred$mtry == forest_cv_model$bestTune$mtry
out_of_sample_predictions5 <- data.frame(
  y_hat = forest_cv_model$pred$Outcome1[condition],
  y = BreastCancer_train$y[forest_cv_model$pred$rowIndex[condition]],
  model = "Forest"
)
# One ROC curve per model, coloured by the model label.
basicplot <- ggplot(
  bind_rows(
    out_of_sample_predictions,
    out_of_sample_predictions2,
    out_of_sample_predictions3,
    out_of_sample_predictions4,
    out_of_sample_predictions5
  ),
  aes(d = as.numeric(y), m = y_hat, color = model)
) +
  geom_roc(n.cuts = 0) +
  style_roc(theme = theme_grey, xlab = "1 - Specificity") +
  ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
# Render the ROC comparison built above.
basicplot

9.1 Depth

Understanding random forests with randomForestExplainer, Aleksandra Paluszyńska

set.seed(123)
#devtools::install_github("MI2DataLab/randomForestExplainer")
p_load(randomForestExplainer)
#install.packages('rlang')
# Minimal-depth distribution of each variable across the forest's trees.
min_depth_frame <- min_depth_distribution(forest)
# Write the result to disk and read it straight back; the .rda file lets the
# computation above be skipped on a later run.
save(min_depth_frame, file = "min_depth_frame.rda")
load("min_depth_frame.rda")
head(min_depth_frame, n = 10)
# plot_min_depth_distribution(forest) # gives the same result as below but takes longer
plot_min_depth_distribution(min_depth_frame)

Variable Importance: pay particular attention to “accuracy_decrease”, which is the drop in the classifier’s accuracy if that variable is shuffled, destroying its information.

# Per-variable importance measures computed from the fitted forest.
importance_frame <- measure_importance(forest)
importance_frame
# Compare several importance measures at once; point size = number of nodes.
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")

# Top five variables ranked by mean minimal depth and number of trees.
vars <- important_variables(
  importance_frame,
  k = 5,
  measures = c("mean_min_depth", "no_of_trees")
)
print(vars)
[1] "Bare.nuclei"     "Normal.nucleoli" "Cell.shape"      "Cell.size"       "Cl.thickness"   
# Conditional minimal depth of interactions involving the selected variables.
interactions_frame <- min_depth_interactions(forest, vars)
# Most frequently occurring interactions first.
head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ])
plot_min_depth_interactions(interactions_frame)

# Forest prediction over the grid of Cell.size x Cl.thickness; column 10 is
# left out of the supplied data.
plot_predict_interaction(forest, BreastCancer_train[,-c(10)], "Cell.size", "Cl.thickness")

Can even generate an automated report

# Generate the automated randomForestExplainer report, interactions included.
explain_forest(forest, interactions = TRUE, data = BreastCancer_train)

10 Compare out of Sample Accuracy

set.seed(123)
# Held-out test-set predictions from every fitted model, side by side with the
# true labels, one column per model.
df_predictions <- data.frame(
  y_true = BreastCancer_test$y,
  y_hat_glm = stats::predict.glm(glm1,
                                 newdata = BreastCancer_onehot_test,
                                 type = "response"),
  y_hat_lasso = predict(glmnet1_cv,
                        newx = BreastCancer_onehot_test %>%
                          select(-y) %>% data.matrix(),
                        s = c("lambda.1se"),
                        type = "response")[, 1],
  # Named with `=`, not `<-`: inside data.frame() an `<-` creates a stray
  # global and leaves the column with an auto-generated name that is only
  # reachable through `$` partial matching.
  y_hat_lasso_twoway = predict(glmnet_twoway_cv,
                               newx = BreastCancer_onehot_test_twoway %>%
                                 data.matrix(),
                               s = c("lambda.1se"),
                               type = "response")[, 1],
  # predict() returns one probability vector per row; stack them and take the
  # second column.
  y_hat_single_tree = predict(single_decision_tree,
                              newdata = BreastCancer_test,
                              type = "prob") %>%
    sapply(rbind) %>% t() %>% data.frame() %>% pull(X2),
  y_hat_forest = predict(forest, newdata = BreastCancer_test, type = "prob")[, "1"] #,
  #y_hat_nn = predict(NN, newdata=BreastCancer_test, type = "prob")
)
prediction from a rank-deficient fit may be misleading
p_load(MLmetrics)
# Test-set AUC of the probit GLM, rounded to three decimals.
AUC(df_predictions$y_hat_glm,df_predictions$y_true) %>% round(3)
[1] 0.935
# Test-set AUC of the lasso, rounded to three decimals.
AUC(df_predictions$y_hat_lasso,df_predictions$y_true) %>% round(3)
[1] 0.991
# Test-set AUC of the interaction lasso, rounded to three decimals.
AUC(df_predictions$y_hat_lasso_twoway,df_predictions$y_true) %>% round(3)
[1] 0.99
# Test-set AUC of the single decision tree, rounded to three decimals.
AUC(df_predictions$y_hat_single_tree,df_predictions$y_true) %>% round(3)
[1] 0.972
# Test-set AUC of the random forest, rounded to three decimals.
AUC(df_predictions$y_hat_forest,df_predictions$y_true) %>% round(3)
[1] 0.988
# Confusion table for the lasso at a 0.5 probability cut-off
# (rows: predicted probability above 0.5, columns: true class).
table(df_predictions$y_hat_lasso>.5,
      df_predictions$y_true)
       
         0  1
  FALSE 79  4
  TRUE   3 41

11 Neural Networks

p_load("neuralnet")
# Two-output formula: the outcome and its complement against every one-hot
# predictor.
formula_onehot_2 <- y + y_not ~ Cl.thickness + Cell.size + Cell.shape +
  Marg.adhesion + Epith.c.size +
  Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 +
  Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 + Bare.nuclei_5 +
  Bare.nuclei_8 + Bare.nuclei_6 +
  Bl.cromatin_3 + Bl.cromatin_9 + Bl.cromatin_1 + Bl.cromatin_2 +
  Bl.cromatin_4 + Bl.cromatin_5 + Bl.cromatin_7 + Bl.cromatin_8 +
  Bl.cromatin_6 + Bl.cromatin_10 +
  Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
  Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
  Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
  Normal.nucleoli_8 +
  Mitoses_1 + Mitoses_5 + Mitoses_4 + Mitoses_2 + Mitoses_3 +
  Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6

# Copies of the one-hot sets with an extra complement column: y_not is 1
# exactly where y is "0".
BreastCancer_onehot_train_2 <- BreastCancer_onehot_train
BreastCancer_onehot_train_2$y_not <- as.numeric(BreastCancer_onehot_train_2$y == "0")
BreastCancer_onehot_test_2 <- BreastCancer_onehot_test
BreastCancer_onehot_test_2$y_not <- as.numeric(BreastCancer_onehot_test_2$y == "0")
# Sanity check: the two columns must never agree.
table(BreastCancer_onehot_test_2$y_not, BreastCancer_onehot_test_2$y)
   
     0  1
  0  0 45
  1 82  0
# Single hidden layer of 10 units with a non-linear (classification) output.
# y is a factor, and data.matrix() replaces factors by their internal codes
# (1/2), which the bounded output cannot reach. Convert it to numeric 0/1
# first so both targets, y and y_not, are on the same 0/1 scale.
NN <- neuralnet(
  formula_onehot_2,
  data = BreastCancer_onehot_train_2 %>%
    mutate(y = as.numeric(as.character(y))) %>%
    data.matrix(),
  hidden = 10,
  linear.output = FALSE
)
# plot neural network
plot(NN)

13 Special Topics

13.1 Time

13.2 Text

14 Examples

15 Extras

15.1 Gradient Boosting

15.3 Nearest Neighbor

15.4 How the Sausage is Made

