UIRF-1-year.Rmd

---
title: "UIRF_RFE-1-year"
author: "Hajar"
date: '2022-08-02'
output: html_document
---

```{r setup, include=FALSE}
library(classifierplots)
library(caret)
library(ROCR)
library(pROC)

library(pdp)
library(dplyr)
library(mlbench)

library(caret)
library(randomForest)

# Echo the source of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

# Work from the project root so the relative paths used in this chunk resolve.
# NOTE(review): knitr may restore the working directory once this chunk ends;
# later chunks read their files through absolute paths -- confirm.
setwd("C:/Users/hajar.hasannejadasl/Documents/prospect2022/")

# Source the importer script and call it on the project's scripts folder.
scriptsLocations <- paste0(getwd(), "/scripts/")
source("scripts/importAllHandmadeFunctions.R")
importAllHandmadeFunctions(scriptsLocations)

# Build a 1-year prediction model for urine loss with random forest.

# Load the data.
allData1year <- readRDS("data/allData1year")
```



```{r} 

# The first element of allData1year becomes the training set and the second
# the test set; only the first 38 columns of each are kept.
training <- data.frame(allData1year[1])[, 1:38]
test <- data.frame(allData1year[2])[, 1:38]

```
Before we start, we’ll remove the variables related to hot flashes and sensitive breasts, because our collaborators asked us to exclude them.
```{r}

# Drop the hot-flash and sensitive-breast items from both data sets.
# A logical mask is used instead of `-grep(...)`: when a pattern matches no
# column, `-integer(0)` selects zero columns and silently empties the data
# frame, whereas `!grepl(...)` then simply keeps every column.
excludedItems <- c("epic26_22_opvliegers1", "epic26_23_gevoeligeborsten1")
excludedPattern <- paste(excludedItems, collapse = "|")

training <- training[, !grepl(excludedPattern, names(training)), drop = FALSE]
test <- test[, !grepl(excludedPattern, names(test)), drop = FALSE]

```
First we recreate the training and test sets with only the input variables and the outcome we want, in this case `epic26_1_urineverlies2`. We’ll also remove the outcome data that we’re not interested in right now from the training and test sets.


```{r}
# Pull the outcome column from the training (element 1) and test (element 2)
# data.
outcomeTraining <- allData1year[[1]][["epic26_1_urineverlies2"]]
outcomeTest <- allData1year[[2]][["epic26_1_urineverlies2"]]
```
The test set contains one sample with answer 0 (which means NA). We’ll remove that one first.

```{r}
# Drop the test samples whose outcome is coded 0.
# A logical mask replaces `-which(...)`: if no sample were coded 0,
# `-integer(0)` would select nothing and wipe out the whole test set.
# Samples whose outcome is NA are kept, exactly as `which()` kept them.
isUnanswered <- !is.na(outcomeTest) & outcomeTest == 0
test <- test[!isUnanswered, , drop = FALSE]
outcomeTest <- outcomeTest[!isUnanswered]

```
Now we’ll make our binary data set


```{r}

# Dichotomise the outcome: scores below 5 are coded 1, all other scores 0.
# The training outcome is stored as numeric (it is turned into a factor
# later, when the upsampling data set is built); the test outcome is stored
# as a factor straight away.
BinaryoutcomeTraining <- as.numeric(ifelse(outcomeTraining < 5, "1", "0"))
BinaryoutcomeTest <- as.factor(ifelse(outcomeTest < 5, "1", "0"))

```

Prediction after upsampling the smaller class.
SMOTE upsampling is applied to class 2 (patients with problems) in the training data.


```{r}

# Upsampling with SMOTE

library(smotefamily)

# Attach the binary outcome to the predictors and store it as a factor.
trainingSet2 <- data.frame(cbind(training, BinaryoutcomeTraining))
trainingSet2$BinaryoutcomeTraining <- as.factor(trainingSet2$BinaryoutcomeTraining)

# Run SMOTE on the predictors only (K = 5 neighbours, dup_size = 1).
# NOTE(review): no seed is set before this call, so the synthetic samples
# presumably differ from run to run -- confirm whether a set.seed() is wanted.
isOutcomeColumn <- colnames(trainingSet2) == "BinaryoutcomeTraining"
smote <- smotefamily::SMOTE(trainingSet2[, !isOutcomeColumn],
                            trainingSet2$BinaryoutcomeTraining,
                            K = 5, dup_size = 1)
datasmote <- smote$data
table(datasmote$class)

# Separate predictors and outcome by the name of the `class` column rather
# than by the fixed position 37, so the split stays correct if the number of
# predictor columns ever changes.
isClassColumn <- names(datasmote) == "class"
SMOTE2TrainingSet <- datasmote[, !isClassColumn, drop = FALSE]
SMOTE2BinaryOutcomeTraining <- as.factor(datasmote$class)
# Second copy of the same outcome factor, kept under its original name.
SMOTE2OutcomeTraining <- as.factor(SMOTE2BinaryOutcomeTraining)

```


The SMOTE function has generated 214 synthetic samples. The dataset is now almost fully balanced.





```{r}

# Recursive feature elimination (RFE) on the upsampled training data.
# The search itself is kept below for reference but is not executed; the
# stored result is read back from disk instead.
#
# control <- rfeControl(functions = rfFuncs,   # random forest
#                       method = "repeatedcv", # repeated cv
#                       repeats = 5,           # number of repeats
#                       number = 10)           # number of folds
#
# result_rfeupsampling <- rfe(x = SMOTE2TrainingSet,
#                             y = SMOTE2BinaryOutcomeTraining,
#                             sizes = c(1:36),
#                             rfeControl = control)
# saveRDS(result_rfeupsampling, paste(getwd(), "/rfeResults_RandomForestUIUpsampling-1year.rds", sep = ""))
#
# NOTE(review): the commented saveRDS() writes under getwd(), while the
# result is read from a "TestRF" sub-folder -- confirm which file is current.
rfeResultsPath <- "C:/Users/hajar.hasannejadasl/Documents/prospect2022//TestRF/rfeResults_RandomForestUIUpsampling-1year.rds"
result_rfeupsampling <- readRDS(rfeResultsPath)

# Print the RFE result.
result_rfeupsampling

# The selected features can be listed with:
# predictors(result_rfeupsampling)

# Plot the RFE result, once per metric.
ggplot(data = result_rfeupsampling, metric = "Accuracy") + theme_bw()
ggplot(data = result_rfeupsampling, metric = "Kappa") + theme_bw()
```

```{r}

# Chosen model: rename the two levels of the upsampled training outcome and
# take the first ten entries of the RFE result's variable list.
levels(SMOTE2BinaryOutcomeTraining) <- c("firstclass", "secondclass")
chosenmodelvariables <- result_rfeupsampling[["optVariables"]][seq_len(10)]
chosenmodelvariables
```
 

```{r}
# Restrict the upsampled training data to the chosen variables and append the
# outcome as the last column.
newdataset <- SMOTE2TrainingSet[chosenmodelvariables]
combined <- cbind(newdataset, SMOTE2BinaryOutcomeTraining)

# Settings for training with the chosen variables.
set.seed(350)

# 10-fold cross-validation, repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Models are compared on accuracy.
metric <- "Accuracy"
set.seed(123)

# mtry is the number of randomly selected variables.
# NOTE(review): ncol(combined) also counts the outcome column -- confirm that
# this is intended.
mtry <- sqrt(ncol(combined))
tunegrid <- expand.grid(.mtry = mtry)

# The fit is kept below for reference but is not executed; the stored model
# is read back from disk instead.
#
# rf_default <- train(SMOTE2BinaryOutcomeTraining ~ .,
#                     data = combined,
#                     method = "rf",
#                     metric = "Accuracy",
#                     tuneGrid = tunegrid,
#                     trControl = control)
#
# rf_default$coef
# saveRDS(rf_default, "C:/Users/hajar.hasannejadasl/Documents/prospect2022/TestRF_RFUI1year.rds")

finalmodel <- readRDS("C:/Users/hajar.hasannejadasl/Documents/prospect2022/TestRF_RFUI1year.rds")

```



```{r}
# Performance metrics on the test set

# Keep a copy of the test outcome, then give it the same level names that
# were assigned to the training outcome.
outcometest <- BinaryoutcomeTest
levels(BinaryoutcomeTest) <- c("firstclass", "secondclass")

# Predicted classes and confusion matrix.
predict_RF <- predict(finalmodel, test, na.action = na.pass)
confusionMatrix(table(predict_RF, BinaryoutcomeTest))

# ROC plot
# The probabilities are requested with the same na.action as the class
# predictions above, so both calls treat incomplete rows identically and the
# result stays aligned row-for-row with BinaryoutcomeTest.
test_probRF <- predict(finalmodel, test, type = "prob", na.action = na.pass)

# The first probability column is used as the predictor; `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
roc_RF_one <- roc(BinaryoutcomeTest, as.vector(test_probRF[, 1]), ci = TRUE)

plot(roc_RF_one, col = "darkgreen", lty = 1, cex.lab = 1.5, cex.axis = 1.5,
     lwd = 3, print.auc = TRUE)




#######

```

```{r}


# Build a data frame of variable importance from the fitted random forest
# (varImp is computed once and reused for both columns).
importanceScores <- varImp(finalmodel$finalModel)
varimp_data <- data.frame(feature = row.names(importanceScores),
                          importance = importanceScores[, 1])

# Sort from most to least important.
varimp_data <- varimp_data[order(varimp_data$importance, decreasing = TRUE), ]

# Replace the raw variable names with display labels.
# NOTE(review): the labels are assigned by rank position, so they are only
# correct while the importance ordering matches this list -- verify against
# the raw names whenever the model changes.
varimp_data$feature <- c("Treatments","PSA","Age","Leaking urine","Sexual functioning","Urinary control", "sCT","Urinary function","Urine loss", "CVD")

# Horizontal bar chart of the importances.
# The original closing parenthesis of labs() was misplaced, so
# `+ theme_bw() + theme(...)` was evaluated inside `title = ...`; that
# expression yields NULL and the title was silently dropped. The title is now
# passed on its own. theme_classic() is kept as the base theme because it is
# the one that was actually being rendered (theme_bw() was never applied).
ggplot(varimp_data, aes(x = reorder(feature, importance),
                        y = importance)) +
  geom_bar(stat = "identity", fill = "#229954") +
  geom_text(aes(label = round(importance, 1)),
            vjust = 0, hjust = 0, color = "black", size = 3) +
  coord_flip() +
  labs(
    x     = "Feature",
    y     = "Importance",
    title = "Feature Importance random forest 1-year"
  ) +
  theme_classic() +
  theme(legend.position = "none")
  
  
  
  
  
  
  
```