Model Evaluation
Load processed flu data file
data_location <- here::here("fluanalysis", "processed_data", "Processed_data.Rds")
Processed_data <- readRDS(data_location)
# Load necessary packages
library(rsample) #for data splitting
library(workflows) #for combining recipes and models
library(tidymodels) # for the parsnip package, along with the rest of the tidymodels helper packages
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom 1.0.1 ✔ purrr 0.3.4
✔ dials 1.1.0 ✔ recipes 1.0.5
✔ dplyr 1.1.0 ✔ tibble 3.1.8
✔ ggplot2 3.4.0 ✔ tidyr 1.2.0
✔ infer 1.0.4 ✔ tune 1.0.1
✔ modeldata 1.1.0 ✔ workflowsets 1.0.0
✔ parsnip 1.0.4 ✔ yardstick 1.1.0
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ recipes::step() masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(readr) # for importing data
Attaching package: 'readr'
The following object is masked from 'package:yardstick':
spec
The following object is masked from 'package:scales':
col_factor
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker) # for visualizing regression results
library(dplyr)
# Split data into training dataset and test dataset
set.seed(222)
# select 3/4 of the data and save into training dataset
data_split <- initial_split(Processed_data, prop = 3/4)
# now create the two data frames based on the above parameters
train_data <- training(data_split)
test_data <- testing(data_split)
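As a quick sanity check (not part of the original analysis), we can confirm the split produced the expected row counts:
nrow(train_data) # 547 rows, roughly 3/4 of the 730 observations
nrow(test_data)  # 183 rows, matching the prediction output below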
glimpse(Processed_data)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y…
$ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, …
$ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ CoughYN <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, …
$ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, …
$ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes…
$ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes…
$ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi…
$ WeaknessYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve…
$ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes…
$ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, …
$ MyalgiaYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No…
$ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N…
$ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, …
$ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,…
$ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye…
$ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y…
$ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,…
$ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y…
$ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y…
$ Hearing <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,…
$ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, …
$ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, …
$ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N…
$ Vision <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, …
$ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N…
$ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N…
$ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, …
Creating Model 1
Creating a recipe
flu_rec <- recipe(Nausea ~ ., data = train_data)
Creating a model
lr_mod <- logistic_reg() %>%
  set_engine("glm")
Combining the recipe and the model
flu_wflow <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(flu_rec)
flu_wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
flu_fit <- flu_wflow %>%
  fit(data = train_data)
# to view your model details
flu_fit %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 38 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 1.63 9.40 0.173 0.862
2 SwollenLymphNodesYes -0.241 0.232 -1.04 0.298
3 ChestCongestionYes 0.219 0.257 0.853 0.394
4 ChillsSweatsYes 0.115 0.332 0.346 0.729
5 NasalCongestionYes 0.560 0.311 1.80 0.0713
6 CoughYNYes -0.705 0.611 -1.15 0.249
7 SneezeYes 0.117 0.248 0.473 0.636
8 FatigueYes 0.177 0.438 0.403 0.687
9 SubjectiveFeverYes 0.229 0.264 0.868 0.385
10 HeadacheYes 0.435 0.352 1.24 0.216
# … with 28 more rows
# ℹ Use `print(n = ...)` to see more rows
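The estimates above are on the log-odds scale. As an optional sketch (assuming tidy() forwards the exponentiate argument to broom's method for glm fits), they can be converted to odds ratios:
flu_fit %>%
  extract_fit_parsnip() %>%
  tidy(exponentiate = TRUE) # estimates become odds ratios instead of log-odds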
Model flu_fit evaluation
predict(flu_fit,test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# A tibble: 183 × 1
.pred_class
<fct>
1 No
2 No
3 No
4 No
5 No
6 Yes
7 Yes
8 No
9 No
10 Yes
# … with 173 more rows
# ℹ Use `print(n = ...)` to see more rows
flu_aug <- augment(flu_fit, test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
flu_aug %>%
  select(Nausea, RunnyNose, Fatigue, .pred_class, .pred_Yes)
# A tibble: 183 × 5
Nausea RunnyNose Fatigue .pred_class .pred_Yes
<fct> <fct> <fct> <fct> <dbl>
1 No No Yes No 0.0377
2 Yes Yes Yes No 0.283
3 No Yes Yes No 0.320
4 Yes Yes Yes No 0.442
5 No No Yes No 0.170
6 Yes No Yes Yes 0.812
7 Yes Yes Yes Yes 0.746
8 No No Yes No 0.275
9 No No Yes No 0.280
10 Yes No Yes Yes 0.719
# … with 173 more rows
# ℹ Use `print(n = ...)` to see more rows
flu_aug %>%
  roc_curve(truth = Nausea, .pred_No) %>%
  autoplot()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
ℹ The deprecated feature was likely used in the yardstick package.
Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
flu_aug %>%
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.724
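Beyond the ROC curve and AUC, a confusion matrix of the hard class predictions gives a complementary view of performance (a supplementary check, not part of the original assignment):
flu_aug %>%
  conf_mat(truth = Nausea, estimate = .pred_class) # cross-tabulates truth vs. predicted class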
Alternative Model
In this model we fit only one predictor, our main predictor of interest: the variable RunnyNose.
Creating a recipe
flu_recA <- recipe(Nausea ~ RunnyNose, data = train_data)
Creating a model
lr_mod <- logistic_reg() %>%
  set_engine("glm")
Combining the recipe and the model
flu_wflowA <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(flu_recA)
flu_wflowA
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
flu_fitA <- flu_wflowA %>%
  fit(data = train_data)
# to view your model details
flu_fitA %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.790 0.172 -4.59 0.00000447
2 RunnyNoseYes 0.188 0.202 0.930 0.352
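With a single binary predictor, the fitted probabilities follow directly from the two coefficients above via the inverse logit; a quick worked check with base R's plogis():
plogis(-0.790)          # P(Nausea = Yes | RunnyNose = No), about 0.312
plogis(-0.790 + 0.188)  # P(Nausea = Yes | RunnyNose = Yes), about 0.354
These match the .pred_Yes values in the augmented table below.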
Model flu_fitA evaluation
predict(flu_fitA,test_data)
# A tibble: 183 × 1
.pred_class
<fct>
1 No
2 No
3 No
4 No
5 No
6 No
7 No
8 No
9 No
10 No
# … with 173 more rows
# ℹ Use `print(n = ...)` to see more rows
flu_augA <- augment(flu_fitA, test_data)
flu_augA %>%
  select(Nausea, RunnyNose, Fatigue, .pred_class, .pred_Yes)
# A tibble: 183 × 5
Nausea RunnyNose Fatigue .pred_class .pred_Yes
<fct> <fct> <fct> <fct> <dbl>
1 No No Yes No 0.312
2 Yes Yes Yes No 0.354
3 No Yes Yes No 0.354
4 Yes Yes Yes No 0.354
5 No No Yes No 0.312
6 Yes No Yes No 0.312
7 Yes Yes Yes No 0.354
8 No No Yes No 0.312
9 No No Yes No 0.312
10 Yes No Yes No 0.312
# … with 173 more rows
# ℹ Use `print(n = ...)` to see more rows
flu_augA %>%
  roc_curve(truth = Nausea, .pred_No) %>%
  autoplot()
flu_augA %>%
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.466
Conclusion
From this model evaluation exercise, the first model, which uses all predictors, is the better fit; its roc_auc of 0.724 is well above 0.5.
The alternative (second) model, which uses only the RunnyNose variable as a predictor, does not perform well. Its roc_auc of 0.466 is below 0.5, and the ROC curve suggests the model is underfitting.
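For context, a roc_auc of 0.5 is the no-skill baseline: a classifier that assigns every observation the same score earns exactly 0.5. A minimal sketch of that baseline (the .pred_null column is hypothetical, not part of the original analysis):
flu_aug %>%
  mutate(.pred_null = 0.5) %>% # constant, uninformative prediction (hypothetical column)
  roc_auc(truth = Nausea, .pred_null)
The .estimate comes out at 0.5, the reference point both models are judged against.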
This section added by VIJAYPANTHAYI
I will repeat the previous steps, but this time fit linear models to the continuous variable, body temperature. To do this, the metric needs to be changed to RMSE (root-mean-square error).
#Create the training and test data sets from the original data set
set.seed(223)
data_split_linear <- initial_split(Processed_data, prop = 3/4)
training_data_linear <- training(data_split_linear)
test_data_linear <- testing(data_split_linear)
Workflow Creation and Model Fitting
We will use tidymodels to generate our linear regression model, and recipe() and workflow() to create the workflow.
# Initiate a new recipe
linear_recipe <- recipe(BodyTemp ~ ., data = training_data_linear)
# Create the linear regression model
linear <- linear_reg() %>%
  set_engine("lm")
# Create the workflow
flu_wflow_linear <- workflow() %>%
  add_model(linear) %>%
  add_recipe(linear_recipe)
flu_wflow_linear
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Computational engine: lm
Model 2 Evaluation
Now that we have created the workflow, we can fit the model to the training and test data sets created previously.
training_fit_linear <- flu_wflow_linear %>%
  fit(data = training_data_linear)
test_fit_linear <- flu_wflow_linear %>%
  fit(data = test_data_linear)
The next step is to use the trained workflow, training_fit_linear, to predict with unseen test data.
predict(training_fit_linear, test_data_linear)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# A tibble: 183 × 1
.pred
<dbl>
1 99.4
2 98.8
3 99.8
4 98.8
5 98.4
6 99.9
7 99.2
8 98.9
9 99.1
10 99.4
# … with 173 more rows
# ℹ Use `print(n = ...)` to see more rows
We now want to compare the estimates. To do this, we use augment().
training_aug_linear <- augment(training_fit_linear, training_data_linear)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
test_aug_linear <- augment(test_fit_linear, test_data_linear)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
If we want to assess how well the model makes predictions, we can use RMSE (root-mean-square error) on training_data_linear and test_data_linear. RMSE is commonly used to evaluate the quality of predictions by measuring how far predictions fall from the measured true values. The lower the RMSE, the better the model.
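Concretely, RMSE is the square root of the mean squared difference between the true values and the predictions. A minimal sketch of what yardstick's rmse() computes, applied to the augmented training data from above:
# manual RMSE: square the residuals, average, take the square root
sqrt(mean((training_aug_linear$BodyTemp - training_aug_linear$.pred)^2))
This should reproduce the rmse() estimate reported below.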
training_aug_linear %>%
  rmse(BodyTemp, .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 1.11
Now, the same for test_data_linear.
test_aug_linear %>%
  rmse(BodyTemp, .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 1.07
The RMSE value is 1.11 for the training data set and 1.07 for the test data set. Predictions are off by roughly one degree Fahrenheit on average, which suggests the model is not especially well optimized even with all predictors included.
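To judge whether an RMSE of about 1.1 is meaningful, one can compare it against a naive baseline (a sketch, not part of the original analysis): predicting the training-set mean body temperature for every test observation.
# RMSE of a mean-only baseline; if the model's RMSE is not much lower
# than this, the predictors add little information about BodyTemp
sqrt(mean((test_data_linear$BodyTemp - mean(training_data_linear$BodyTemp))^2))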
Alternative Model
Now, let's repeat these steps with only one predictor.
linear_recipe1 <- recipe(BodyTemp ~ RunnyNose, data = training_data_linear)

flu_wflow_linear1 <- workflow() %>%
  add_model(linear) %>%
  add_recipe(linear_recipe1)

training_fit_linear1 <- flu_wflow_linear1 %>%
  fit(data = training_data_linear)
test_fit_linear1 <- flu_wflow_linear1 %>%
  fit(data = test_data_linear)

training_aug_linear1 <- augment(training_fit_linear1, training_data_linear)
test_aug_linear1 <- augment(test_fit_linear1, test_data_linear)
Now, let’s find the RMSE for the training and test data sets.
training_aug_linear1 %>%
  rmse(BodyTemp, .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 1.19
Now, the same for test_data_linear.
test_aug_linear1 %>%
  rmse(BodyTemp, .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 1.19
The RMSE value is about 1.19 for both the training and test data sets, slightly worse than the full-predictor model, so RunnyNose alone explains little of the variation in body temperature.
Overall, it appears that Model 1 predicts the categorical outcome (Nausea) better than Model 2 predicts the continuous one (BodyTemp).