# Package management: pacman::p_load() installs (when missing) and attaches
# each package in one call.
#install.packages("pacman")
library(pacman)
p_load(infotheo, tidyverse, ggplot2, cowplot, mlbench, Metrics)
#remove.packages("rlang")
#install.packages("rlang", repos = "https://cloud.r-project.org")
# Fix the RNG seed so the whole analysis is reproducible.
set.seed(123)
BreastCancer Dataset
A data frame with 699 observations on 11 variables, one being a character variable, 9 being ordered or nominal, and 1 target class.
Breast Cancer Wisconsin (Original) Data Set
“Multisurface method of pattern separation for medical diagnosis applied to breast cytology.”, Wolberg,W.H., Mangasarian,O.L. (1990). In Proceedings of the National Academy of Sciences, 87, 9193-9196.
Zhang,J. (1992). Selecting typical instances in instance-based learning. In Proceedings of the Ninth International Machine Learning Conference (pp. 470-479). Aberdeen, Scotland: Morgan Kaufmann.
# Load the Breast Cancer Wisconsin dataset shipped with {mlbench}
# (699 observations, 11 variables) and inspect its structure.
data(BreastCancer)
glimpse(BreastCancer)
Observations: 699
Variables: 11
$ Id <chr> "1000025", "1002945", "1015425", "1016277", "1017023", "1017122", "1018099", "1018561", "1033078",...
$ Cl.thickness <ord> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, 10, 6, 7, 10, 3, 8, 1, 5, 3, 5, 2, 1, 3, 2, ...
$ Cell.size <ord> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1, 7, 1, 3, 5, 1, 4, 1, 2, 2, 1, 1, 1, 1, 1, 7...
$ Cell.shape <ord> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1, 7, 1, 2, 5, 1, 5, 1, 3, 1, 1, 1, 3, 1, 1, 7...
$ Marg.adhesion <ord> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6, 1, 10, 3, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, ...
$ Epith.c.size <ord> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, 4, 2, 5, 6, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 8,...
$ Bare.nuclei <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1, 10, 1, 10, 7, 1, NA, 1, 7, 1, 1, 1, 1, 1,...
$ Bl.cromatin <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, 4, 3, 5, 7, 2, 7, 3, 3, 2, 2, 2, 1, 2, 3, 7,...
$ Normal.nucleoli <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, 1, 1, 4, 10, 1, 3, 1, 6, 1, 1, 1, 1, 1, 1, 4...
$ Mitoses <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,...
$ Class <fct> benign, benign, benign, benign, benign, malignant, benign, benign, benign, benign, benign, benign,...
summary(BreastCancer$Class)
benign malignant
458 241
# Recode the outcome as a 0/1 factor (1 = malignant) and drop columns that
# must not be used as predictors: the original Class label and the Id string.
BreastCancer$y <- factor(as.integer(BreastCancer$Class == "malignant"))
BreastCancer$Class <- NULL
BreastCancer$Id <- NULL
# The first five features are ordered factors with levels "1".."10";
# as.numeric() maps them onto their 1-10 scores.
BreastCancer[, 1:5] <- lapply(BreastCancer[, 1:5], as.numeric)
summary(BreastCancer)
Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin
Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000 1 :402 2 :166
1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 2.000 10 :132 3 :165
Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000 Median : 2.000 2 : 30 1 :152
Mean : 4.418 Mean : 3.134 Mean : 3.207 Mean : 2.807 Mean : 3.216 5 : 30 7 : 73
3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000 3rd Qu.: 4.000 3 : 28 4 : 40
Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000 (Other): 61 5 : 34
NA's : 16 (Other): 69
Normal.nucleoli Mitoses y
1 :443 1 :579 0:458
10 : 61 2 : 35 1:241
3 : 44 3 : 33
2 : 36 10 : 14
8 : 24 4 : 12
6 : 22 7 : 9
(Other): 69 (Other): 17
# Pairwise plots of every feature against every other feature and the outcome.
p_load(GGally)
ggpairs(BreastCancer, title = "Breast Cancer Dataset")
p_load(corrplot)
p_load(infotheo)
# Empirical mutual information between every pair of columns, converted from
# nats to bits. NOTE(review): mutinformation() expects discrete data; the
# numeric columns here take integer values 1-10, so this is acceptable.
BreastCancer_mi <- mutinformation(BreastCancer, method="emp") %>% natstobits()
#BreastCancer_mi <- BreastCancer_mi/max(BreastCancer_mi)
# Largest off-diagonal MI, used to scale the colour legend.
mi_max <- max( BreastCancer_mi[lower.tri(BreastCancer_mi, diag = FALSE)])
# Zero the diagonal: self-information would otherwise dominate the plot.
diag(BreastCancer_mi) <-0
# NOTE(review): cl.lim was renamed col.lim in corrplot >= 0.90 -- confirm
# the installed version if this call errors.
corrplot.mixed(BreastCancer_mi,
cl.lim = c(0,mi_max),
title = "Normalised Mutual Information Breast Cancer Dataset",
mar=c(0,0,1,0),
lower = "ellipse",
upper="number",
is.corr = FALSE,
order = "hclust"
)
p_load(infotheo)
# Recompute the pairwise MI matrix (in bits) and turn it into a distance:
# strongly dependent variables end up close together.
BreastCancer_mi <- natstobits(mutinformation(BreastCancer, method = "emp"))
BreastCancer_mi_d <- as.dist(max(BreastCancer_mi) - BreastCancer_mi)
# Ward clustering on the MI distance; the dendrogram groups related features.
hc <- hclust(BreastCancer_mi_d, method = "ward.D2")
plot(hc)
There are 16 unexplained missing values on one of the features. We’re going to impute those values, being careful to not use the outcome as one of the predictors. This will allow us to make comparisons across methods that do not handle missing values well, and it also protects us when predicting onto new test data, which might also have unexplained missingness.
MissForest—non-parametric missing value imputation for mixed-type data, Daniel J. Stekhoven Peter Bühlmann, Bioinformatics, Volume 28, Issue 1, 1 January 2012, Pages 112–118,
# There are 16 missing values in Bare.nuclei; impute them with missForest.
# The outcome y is excluded from the imputation model so that labels can
# never leak into the imputed predictor values.
# FIX(review): removed a dead store (`BreastCancer_imputed <- BreastCancer`)
# that was immediately overwritten by the line below.
p_load("missForest")
BreastCancer_imputed <- missForest(BreastCancer %>% select(-y), verbose = TRUE)$ximp
missForest iteration 1 in progress...done!
estimated error(s): 0 0.09553441
difference(s): 0 0.001430615
time: 1.23 seconds
missForest iteration 2 in progress...done!
estimated error(s): 0 0.08674963
difference(s): 0 0.0003576538
time: 1.19 seconds
missForest iteration 3 in progress...done!
estimated error(s): 0 0.09699854
difference(s): 0 0.0007153076
time: 1.02 seconds
BreastCancer_imputed$y <- BreastCancer$y
Convert categorical variables to ‘one-hot’ dummy variables
Making dummy variables with dummy_cols(), Jacob Kaplan, 2018-06-21
#install.packages('data.table')
p_load(fastDummies)
# One-hot encode the remaining factor features: dummy_cols() appends one
# 0/1 indicator column per factor level.
BreastCancer_onehot <- fastDummies::dummy_cols(
  BreastCancer_imputed,
  select_columns = c("Bare.nuclei", "Bl.cromatin", "Normal.nucleoli", "Mitoses")
)
# Drop the original factor columns now that the indicators exist.
BreastCancer_onehot[, c('Bare.nuclei', 'Bl.cromatin', 'Normal.nucleoli', 'Mitoses')] <- NULL
The very first thing we’re going to do is pull 20% of the Breast Cancer dataset out as a test set, and we’re never going to touch it for any reason other than final model evaluation.
Immediately split off a test set that we will not touch until the very final evaluation.
# Randomly assign ~80% of rows to training; the held-out ~20% is only used
# for the final evaluation.
N <- nrow(BreastCancer)
condition_train <- runif(N) < .8; table(condition_train)
condition_train
FALSE TRUE
127 572
# Split both views of the data on the same row mask so that models trained
# on the raw and one-hot frames are directly comparable.
BreastCancer_train <- BreastCancer_imputed[condition_train, ]
BreastCancer_test <- BreastCancer_imputed[!condition_train, ]
BreastCancer_onehot_train <- BreastCancer_onehot[condition_train, ]
BreastCancer_onehot_test <- BreastCancer_onehot[!condition_train, ]
# Model formula on the original (factor) features.
formula <- y ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion +
  Epith.c.size + Bare.nuclei + Bl.cromatin + Normal.nucleoli + Mitoses
#One Hot formula dummies
# Equivalent formula spelled out over the one-hot indicator columns.
formula_onehot <- y ~
  Cl.thickness + Cell.size + Cell.shape + Marg.adhesion + Epith.c.size +
  Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 +
  Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 + Bare.nuclei_5 +
  Bare.nuclei_8 + Bare.nuclei_6 +
  Bl.cromatin_3 + Bl.cromatin_9 + Bl.cromatin_1 + Bl.cromatin_2 +
  Bl.cromatin_4 + Bl.cromatin_5 + Bl.cromatin_7 + Bl.cromatin_8 +
  Bl.cromatin_6 + Bl.cromatin_10 +
  Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
  Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
  Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
  Normal.nucleoli_8 +
  Mitoses_1 + Mitoses_5 + Mitoses_4 + Mitoses_2 + Mitoses_3 +
  Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6
Register a single back end for cross-validation
# Shared caret cross-validation backend: 10-fold CV, keeping class
# probabilities and per-fold predictions so ROC curves can be drawn later.
p_load(caret)
set.seed(123)
cctrl1 <- trainControl(method="cv",
number=10,
returnResamp="all",
classProbs=TRUE,
summaryFunction=twoClassSummary,
savePredictions=TRUE
)
p_load(glmnet)
set.seed(123)
# Unpenalised probit GLM on all 44 one-hot columns.
# NOTE(review): the console reports "algorithm did not converge" and
# "fitted probabilities numerically 0 or 1 occurred" -- the classes are
# (near-)separable in this feature space, so these coefficients are
# unstable; the penalised fits below are the ones to trust.
glm1 <- glm(formula_onehot ,
data=BreastCancer_onehot_train ,
family=binomial(link='probit')
)
glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
# Tidy coefficient table for the probit fit.
library(broom)
tidy(glm1 ) #There are 44 features, counting dummified categorical variables
Out of sample accuracy?
FALSE [1] 0.9368907
“bounceR”, R Package
Introduction to vimp, Brian D. Williamson, 2018-06-19
set.seed(123)
# Lasso path over the full one-hot design.
# FIX(review): the original mixed as.matrix() here with data.matrix() in the
# CV call below; both give the same numeric matrix for this all-numeric
# frame, but data.matrix() is now used in both places for consistency.
glmnet1 <- glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
y=as.factor(BreastCancer_onehot_train$y),
family="binomial"
)
plot(glmnet1)
# 5-fold CV to pick the penalty strength.
glmnet1_cv <- cv.glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
y=as.factor(BreastCancer_onehot_train$y),
family="binomial",
nfolds=5)
glmnet1_cv$lambda.1se #smallest model with error within 1se error of the minimum ever observed
[1] 0.01817259
plot(glmnet1_cv)
# Coefficients at lambda.1se, sorted from largest to smallest beta.
glmnet_lambda.1se_betas <- coef(glmnet1_cv, s = "lambda.1se") %>%
  as.matrix() %>%
  as.data.frame() %>%
  rename(beta = '1') %>%
  rownames_to_column() %>%
  arrange(desc(beta))
#There are 44 features
#14 have been set to nonzero coefficients
#the coefficients are relatively small
#Design a single model around that optimal lambda
glmnet_lambda.1se <- glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
y=BreastCancer_onehot_train$y,
family="binomial",
lambda=glmnet1_cv$lambda.1se
)
#cross validate that model to get estimate of accuracy on the test set
# caret requires syntactically valid class-level names, hence the
# "Outcome" prefix on the 0/1 labels. BreastCancer_train$y is row-aligned
# with the one-hot frame because both were split on condition_train.
glmnet_lambda.1se_cv <- train(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
y=as.factor(paste0('Outcome',BreastCancer_train$y)),
method = "glmnet",
trControl = cctrl1,
metric = "ROC",
tuneGrid = expand.grid(alpha = 1,lambda = glmnet1_cv$lambda.1se))
#Area Under the Curve Almost Perfect Now despite using only 14 of the 44 features
print(glmnet_lambda.1se_cv$results$ROC) #0.99
[1] 0.9920091
p_load(plotROC)
# Cross-validated (held-out fold) predictions for the lasso, matched back
# to the true labels via the per-fold row indices.
out_of_sample_predictions2 <- data.frame(y_hat=glmnet_lambda.1se_cv$pred$Outcome1,
y=BreastCancer_train$y[glmnet_lambda.1se_cv$pred$rowIndex],
model="Lasso")
# NOTE(review): out_of_sample_predictions (model 1) is not defined anywhere
# in this file -- presumably created in an earlier chunk; verify before running.
basicplot <- ggplot(bind_rows(out_of_sample_predictions,
out_of_sample_predictions2),
aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
basicplot
NA
We can put the same feature in a linear multiple times with polynomials to capture nonlinear relationships.
set.seed(123)
library(dplyr)
# Toy illustration: a cubic signal fitted with a straight line (red) versus
# a quadratic polynomial (blue). Repeating the *same* feature as polynomial
# terms lets a linear model capture curvature.
df <- data.frame(x = seq(0, 100)) %>%
  mutate(y = 0 + x + x^2 + x^3) %>%
  mutate(pred_lm = lm(y ~ x)$fitted.values) %>%
  mutate(pred_lm_quad = lm(y ~ x + I(x^2))$fitted.values)
library(ggplot2)
ggplot(df, aes(x, y)) +
  geom_point(aes(x, y)) +
  geom_line(aes(x = x, y = pred_lm), col = 'red') +
  geom_line(aes(x = x, y = pred_lm_quad), col = 'blue')
Nonlinear Models * (ISLR) “Chapter 7 Moving Beyond Linearity” Linear_separability
set.seed(123)
# Expand the one-hot design to every two-way interaction (.^2).
form <- ~ .^2
# FIX(review): the original assigned y <- BreastCancer_onehot_train$Class_binary,
# but no Class_binary column exists (the outcome column is y), so it silently
# produced NULL; the value was never used, so the dead assignment is removed.
# Column 6 of the one-hot frame is the outcome y; exclude it from the design.
BreastCancer_onehot_train_twoway <- model.matrix(form, data = BreastCancer_onehot_train[,-c(6)])
BreastCancer_onehot_test_twoway <- model.matrix(form, data = BreastCancer_onehot_test[,-c(6)])
dim(BreastCancer_onehot_train_twoway)#991 terms
[1] 572 991
# FIX(review): a dead `condition` lookup of the nonexistent Class_binary
# column (always all-FALSE, never used) was removed here as well.
glmnet_twoway <- glmnet(x=BreastCancer_onehot_train_twoway ,
y=as.factor(BreastCancer_onehot_train$y),
family="binomial"
)
plot(glmnet_twoway)
# 5-fold CV over the interaction design to choose the penalty strength.
glmnet_twoway_cv <- cv.glmnet(x=BreastCancer_onehot_train_twoway,
y=as.factor(BreastCancer_onehot_train$y),
family="binomial",
nfolds=5)
glmnet_twoway_cv$lambda.1se #smallest model with error within 1se error of the minimum ever observed
[1] 0.01994439
plot(glmnet_twoway_cv)
# BUG FIX(review): the three statements below previously referenced
# glmnet1_cv and the main-effects matrix (copy-paste from the earlier
# chunk) instead of the two-way interaction model. They now use
# glmnet_twoway_cv and the interaction design throughout.
glmnet_twoway_lambda.1se_betas <- coef(glmnet_twoway_cv,s="lambda.1se") %>% as.matrix() %>% as.data.frame() %>%
rename(beta='1') %>%
rownames_to_column() %>% arrange(desc(beta) )
#Design a single model around that optimal lambda
glmnet_twoway_lambda.1se <- glmnet(x=BreastCancer_onehot_train_twoway,
y=BreastCancer_onehot_train$y,
family="binomial",
lambda=glmnet_twoway_cv$lambda.1se
)
#cross validate that model to get estimate of accuracy on the test set
glmnet_twoway_lambda.1se_cv <- train(x=BreastCancer_onehot_train_twoway,
y=as.factor(paste0('Outcome',BreastCancer_train$y)),
method = "glmnet",
trControl = cctrl1,
metric = "ROC",
tuneGrid = expand.grid(alpha = 1,lambda = glmnet_twoway_cv$lambda.1se))
#AUC nearly as good as the main-effects lasso despite the 991-term design
print(glmnet_twoway_lambda.1se_cv$results$ROC) #0.991
[1] 0.9912164
p_load(plotROC)
# Cross-validated predictions for the interaction lasso.
out_of_sample_predictions3 <- data.frame(y_hat=glmnet_twoway_lambda.1se_cv$pred$Outcome1,
y=BreastCancer_train$y[glmnet_twoway_lambda.1se_cv$pred$rowIndex],
model="Lasso Interactions")
# NOTE(review): out_of_sample_predictions (model 1) is not defined in this
# file -- presumably from an earlier chunk.
basicplot <- ggplot(bind_rows(out_of_sample_predictions,
out_of_sample_predictions2,
out_of_sample_predictions3),
aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
basicplot
Interpreting the model
There are some measures that unambiguously look bad for cancer outcomes.
There are certain interactions that are good news.
Bare.nuclei_8:Normal.nucleoli_2
Bare.nuclei_1:Mitoses_1
Bare.nuclei_7:Normal.nucleoli_8
Normal.nucleoli_1:Mitoses_1
Bare.nuclei_1:Normal.nucleoli_1
Bare.nuclei_1 by itself looks like good news, but in combination with something else it’s especially helpful.
#There are 991 terms
#By a miracle, also 14 chosen
#Some of the coefficients are relatively small
# Extract and inspect the nonzero coefficients at lambda.1se.
glmnet_twoway_cv_betas <- coef(glmnet_twoway_cv,s="lambda.1se") %>%
as.matrix() %>% as.data.frame() %>%
rename(beta='1') %>%
rownames_to_column() %>% arrange(desc(beta) )
glmnet_twoway_cv_betas %>% filter(beta!=0)
set.seed(123)
p_load(party)
# Single conditional-inference tree on the (non-dummied) training data.
single_decision_tree <- ctree(formula, data = BreastCancer_train)
plot(single_decision_tree)
Out of sample
Slightly worse but arguably an easier to interpret model.
set.seed(123)
# 10-fold CV estimate for the single tree. Column 10 of the training frame
# is the outcome y, dropped from the predictor matrix.
single_decision_tree_cv_model <- train(x=BreastCancer_train[,-c(10)],
y=as.factor(paste0('Outcome',BreastCancer_train$y)),
method = "ctree",
trControl = cctrl1,
metric = "ROC",
tuneGrid = expand.grid(mincriterion = 0.99)
)
print(single_decision_tree_cv_model$results$ROC) #0.9668608
[1] 0.9656646
p_load(plotROC)
# Cross-validated predictions for the single tree.
out_of_sample_predictions4 <- data.frame(y_hat=single_decision_tree_cv_model$pred$Outcome1,
y=BreastCancer_train$y[single_decision_tree_cv_model$pred$rowIndex],
model="Tree")
# NOTE(review): out_of_sample_predictions (model 1) comes from an earlier,
# unseen chunk.
basicplot <- ggplot(bind_rows(out_of_sample_predictions,
out_of_sample_predictions2,
out_of_sample_predictions3,
out_of_sample_predictions4),
aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
basicplot
set.seed(123)
#install.packages('randomForest', dependencies=T)
p_load(randomForest)
# Random forest with per-observation (local) importance, needed by
# randomForestExplainer below. na.action=na.omit is defensive only --
# the training data were already imputed.
forest <- randomForest(formula,
data = BreastCancer_train,
localImp = TRUE,
na.action=na.omit)
print(forest)
Call:
randomForest(formula = formula, data = BreastCancer_train, localImp = TRUE, na.action = na.omit)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 2.62%
Confusion matrix:
0 1 class.error
0 363 13 0.03457447
1 2 194 0.01020408
set.seed(123)
p_load(randomForest)
# Cross-validated forest via caret; mtry is tuned over caret's default grid.
# Column 10 of the training frame is the outcome y.
forest_cv_model <- train(
  x = BreastCancer_train[, -c(10)],
  y = as.factor(paste0('Outcome', BreastCancer_train$y)),
  method = "rf",
  trControl = cctrl1,
  metric = "ROC"
  #tuneGrid = expand.grid(alpha = 1,lambda = glmnet1_cv$lambda.1se)
)
print(forest_cv_model$results)
p_load(plotROC)
# Keep only the resamples for a single tuned mtry so each row appears once.
# NOTE(review): assumes 5 is among the mtry values caret tried -- confirm
# against forest_cv_model$results.
condition <- forest_cv_model$pred$mtry==5
out_of_sample_predictions5 <- data.frame(y_hat=forest_cv_model$pred$Outcome1[condition] ,
y=BreastCancer_train$y[forest_cv_model$pred$rowIndex[condition]] ,
model="Forest")
basicplot <- ggplot(bind_rows(out_of_sample_predictions,
out_of_sample_predictions2,
out_of_sample_predictions3,
out_of_sample_predictions4,
out_of_sample_predictions5),
aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
basicplot
Understanding random forests with randomForestExplainer, Aleksandra Paluszyńska
set.seed(123)
#devtools::install_github("MI2DataLab/randomForestExplainer")
p_load(randomForestExplainer)
#install.packages('rlang')
# Minimal depth of each variable across all trees: smaller depth means the
# variable is used earlier (and hence matters more) for splitting.
min_depth_frame <- min_depth_distribution(forest)
# Cache to disk; the save/load round-trip lets later runs skip recomputation.
save(min_depth_frame, file = "min_depth_frame.rda")
load("min_depth_frame.rda")
head(min_depth_frame, n = 10)
# plot_min_depth_distribution(forest) # gives the same result as below but takes longer
plot_min_depth_distribution(min_depth_frame)
Variable Importance Pay particular attention to “accuracy_decrease” which is the drop in the classifier’s accuracy if that variable is shuffled destroying its information.
# Multi-way variable importance; accuracy_decrease is the permutation
# importance highlighted in the note above.
importance_frame <- measure_importance(forest)
importance_frame
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")
# Top-5 variables by mean minimal depth and tree frequency.
(vars <- important_variables(importance_frame, k = 5, measures = c("mean_min_depth", "no_of_trees")))
[1] "Bare.nuclei" "Normal.nucleoli" "Cell.shape" "Cell.size" "Cl.thickness"
# Conditional minimal depth of variable pairs: which pairs interact.
interactions_frame <- min_depth_interactions(forest, vars)
head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ])
plot_min_depth_interactions(interactions_frame)
# Joint prediction surface over two of the strongest features.
plot_predict_interaction(forest, BreastCancer_train[,-c(10)], "Cell.size", "Cl.thickness")
Can even generate an automated report
explain_forest(forest, interactions = TRUE, data = BreastCancer_train)
set.seed(123)
# Final held-out test-set predictions from every model, one column each.
# BUG FIX(review): y_hat_lasso_twoway was assigned with `<-` inside the
# data.frame() call, which passes the value as an *unnamed* argument; the
# column received a mangled deparse-based name and was only reachable via
# `$` partial matching. It now uses `=` like the other columns.
df_predictions <- data.frame(
  y_true = BreastCancer_test$y,
  y_hat_glm = stats::predict.glm(glm1, newdata = BreastCancer_onehot_test, type = "response"),
  y_hat_lasso = predict(glmnet1_cv,
                        newx = BreastCancer_onehot_test %>% select(-y) %>% data.matrix(),
                        s = c("lambda.1se"), type = "response")[, 1],
  y_hat_lasso_twoway = predict(glmnet_twoway_cv,
                               newx = BreastCancer_onehot_test_twoway %>% data.matrix(),
                               s = c("lambda.1se"), type = "response")[, 1],
  y_hat_single_tree = predict(single_decision_tree, newdata = BreastCancer_test,
                              type = "prob") %>% sapply(rbind) %>% t() %>%
    data.frame() %>% pull(X2),
  y_hat_forest = predict(forest, newdata = BreastCancer_test, type = "prob")[, '1']#,
  #y_hat_nn = predict(NN, newdata=BreastCancer_test, type = "prob")
)
prediction from a rank-deficient fit may be misleading
p_load(MLmetrics)
# Test-set AUC per model; MLmetrics::AUC takes predictions first, labels second.
AUC(df_predictions$y_hat_glm,df_predictions$y_true) %>% round(3)
[1] 0.935
AUC(df_predictions$y_hat_lasso,df_predictions$y_true) %>% round(3)
[1] 0.991
AUC(df_predictions$y_hat_lasso_twoway,df_predictions$y_true) %>% round(3)
[1] 0.99
AUC(df_predictions$y_hat_single_tree,df_predictions$y_true) %>% round(3)
[1] 0.972
AUC(df_predictions$y_hat_forest,df_predictions$y_true) %>% round(3)
[1] 0.988
# Confusion counts for the lasso at a 0.5 probability threshold.
table(df_predictions$y_hat_lasso>.5,
df_predictions$y_true)
0 1
FALSE 79 4
TRUE 3 41
p_load("neuralnet")
# neuralnet is given one output node per class, so the formula has two LHS
# targets: y (malignant) and its complement y_not (benign), built below.
formula_onehot_2 = y + y_not ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion + Epith.c.size +
Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 +
Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 + Bare.nuclei_5 +
Bare.nuclei_8 + Bare.nuclei_6 + Bl.cromatin_3 + Bl.cromatin_9 +
Bl.cromatin_1 + Bl.cromatin_2 + Bl.cromatin_4 + Bl.cromatin_5 +
Bl.cromatin_7 + Bl.cromatin_8 + Bl.cromatin_6 + Bl.cromatin_10 +
Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
Normal.nucleoli_8 + Mitoses_1 + Mitoses_5 + Mitoses_4 + Mitoses_2 +
Mitoses_3 + Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6
# Add the complementary indicator: y_not is 1 exactly when y is 0.
# y is a 0/1 factor, so its integer codes are 1/2; (code - 1) recovers 0/1
# and 1 - that flips it.
BreastCancer_onehot_train_2 <- BreastCancer_onehot_train
BreastCancer_onehot_train_2$y_not <- 1 - (as.numeric(BreastCancer_onehot_train_2$y) - 1)
BreastCancer_onehot_test_2 <- BreastCancer_onehot_test
BreastCancer_onehot_test_2$y_not <- 1 - (as.numeric(BreastCancer_onehot_test_2$y) - 1)
# Sanity check: y_not must be the exact complement of y (off-diagonal zeros).
table(BreastCancer_onehot_test_2$y_not, BreastCancer_onehot_test_2$y)
0 1
0 0 45
1 82 0
# Single-hidden-layer network (10 units) on the one-hot features.
# linear.output = FALSE applies the activation function to the output
# layer, so the two outputs behave as class scores.
# FIX(review): spelled out FALSE instead of the reassignable shorthand F.
NN = neuralnet(formula_onehot_2,
data= BreastCancer_onehot_train_2 %>% data.matrix(),
hidden = 10 ,
linear.output = FALSE
)
# plot neural network
plot(NN)
Principal_component_analysis Multiple correspondence analysis