prep for quarto

hfrick · hfrick · commit d9d23dde5906 · 2025-04-23T18:29:46.000+01:00
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
@@ -1,2 +1,4 @@
 *.html
 *.R
+
+/.quarto/
diff --git a/vignettes/Common_Patterns.Rmd b/vignettes/Common_Patterns.Rmd
@@ -7,7 +7,8 @@ vignette: >
   %\VignetteEncoding{UTF-8}
 ---
 
-```{r, include = FALSE}
+```{r}
+#| include: false
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
@@ -19,7 +20,8 @@ The rsample package provides a number of resampling methods which are broadly ap
 
 Let's go ahead and load rsample now:
 
-```{r setup}
+```{r}
+#| label: setup
 library(rsample)
 ```
 
diff --git a/vignettes/Working_with_rsets.Rmd b/vignettes/Working_with_rsets.Rmd
@@ -8,7 +8,9 @@ output:
     toc: yes
 ---
 
-```{r ex_setup, include=FALSE}
+```{r}
+#| label: ex_setup
+#| include: false
 knitr::opts_chunk$set(
   message = FALSE,
   digits = 3,
@@ -19,7 +21,9 @@ knitr::opts_chunk$set(
 options(digits = 3, width = 90)
 ```
 
-```{r ggplot2_setup, include = FALSE}
+```{r}
+#| label: ggplot2_setup
+#| include: false
 library(ggplot2)
 theme_set(theme_bw())
 ```
@@ -35,7 +39,9 @@ Let's use the `attrition` data set. From its documentation:
 
 The data can be accessed using 
 
-```{r attrition, message=FALSE}
+```{r}
+#| label: attrition
+#| message: false
 library(rsample)
 data("attrition", package = "modeldata")
 names(attrition)
@@ -55,14 +61,18 @@ glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome,
 
 For convenience, we'll create a formula object that will be used later:
 
-```{r form, message=FALSE}
+```{r}
+#| label: form
+#| message: false
 mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome)
 ```
 
 To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model. 
 
 First, let's make the splits of the data:
-```{r model_vfold, message=FALSE}
+```{r}
+#| label: model_vfold
+#| message: false
 library(rsample)
 set.seed(4622)
 rs_obj <- vfold_cv(attrition, v = 10, repeats = 10)
@@ -77,7 +87,8 @@ Now let's write a function that will, for each resample:
 
 Here is our function:
 
-```{r lm_func}
+```{r}
+#| label: lm_func
 ## splits will be the `rsplit` object with the 90/10 partition
 holdout_results <- function(splits, ...) {
   # Fit the model to the 90%
@@ -99,7 +110,9 @@ holdout_results <- function(splits, ...) {
 
 For example: 
 
-```{r onefold, warning = FALSE}
+```{r}
+#| label: onefold
+#| warning: false
 example <- holdout_results(rs_obj$splits[[1]],  mod_form)
 dim(example)
 dim(assessment(rs_obj$splits[[1]]))
@@ -111,7 +124,9 @@ For this model, the `.fitted` value is the linear predictor in log-odds units.
 
 To compute this data set for each of the 100 resamples, we'll use the `map()` function from the purrr package:
 
-```{r model_purrr, warning=FALSE}
+```{r}
+#| label: model_purrr
+#| warning: false
 library(purrr)
 rs_obj$results <- map(rs_obj$splits,
                       holdout_results,
@@ -121,7 +136,8 @@ rs_obj
 
 Now we can compute the accuracy values for all of the assessment data sets: 
 
-```{r model_acc}
+```{r}
+#| label: model_acc
 rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct))
 summary(rs_obj$accuracy)
 ```
@@ -134,15 +150,20 @@ Traditionally, the bootstrap has been primarily used to empirically determine th
 
 For example, are there differences in the median monthly income between genders? 
 
-```{r type_plot, fig.alt = "Two boxplots of monthly income separated by gender, showing a slight difference in median but largely overlapping boxes."}
+```{r}
+#| label: type_plot
+#| fig.alt: > 
+#|   Two boxplots of monthly income separated by gender, showing a slight
+#|   difference in median but largely overlapping boxes.
 ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + 
   geom_boxplot() + 
   scale_y_log10()
 ```
 
 If we wanted to compare the genders, we could conduct a _t_-test or rank-based test. Instead, let's use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample:
 
-```{r mean_diff}
+```{r}
+#| label: mean_diff
 median_diff <- function(splits) {
   x <- analysis(splits)
   median(x$MonthlyIncome[x$Gender == "Female"]) - 
@@ -152,28 +173,35 @@ median_diff <- function(splits) {
 
 Now we would create a large number of bootstrap samples (say 2000+). For illustration, we'll only do 500 in this document. 
 
-```{r boot_mean_diff}
+```{r}
+#| label: boot_mean_diff
 set.seed(353)
 bt_resamples <- bootstraps(attrition, times = 500)
 ```
 
 This function is then computed across each resample:
 
-```{r stats}
+```{r}
+#| label: stats
 bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff)
 ```
 
 The bootstrap distribution of this statistic is slightly bimodal and skewed:
 
-```{r stats_plot, fig.alt = "The bootstrap distribution of the differences in median monthly income: it is slightly bimodal and left-skewed."}
+```{r}
+#| label: stats_plot
+#| fig.alt: > 
+#|   The bootstrap distribution of the differences in median monthly income:
+#|   it is slightly bimodal and left-skewed.
 ggplot(bt_resamples, aes(x = wage_diff)) + 
   geom_line(stat = "density", adjust = 1.25) + 
   xlab("Difference in Median Monthly Income (Female - Male)")
 ```
 
 The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the medians would be:
 
-```{r ci}
+```{r}
+#| label: ci
 quantile(bt_resamples$wage_diff, 
          probs = c(0.025, 0.975))
 ```
@@ -184,7 +212,8 @@ The calculated 95% confidence interval contains zero, so we don't have evidence
 
 Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The [broom package](https://cran.r-project.org/package=broom) has a `tidy()` function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, `purrr::map()` can be used to estimate and save these values for each split.
 
-```{r coefs}
+```{r}
+#| label: coefs
 glm_coefs <- function(splits, ...) {
   ## use `analysis` or `as.data.frame` to get the analysis data
   mod <- glm(..., data = analysis(splits), family = binomial)
@@ -201,15 +230,17 @@ bt_resamples$betas[[1]]
 
 As previously mentioned, the [broom package](https://cran.r-project.org/package=broom) contains a generic function called `tidy()` that creates representations of objects that can be easily used for analysis, plotting, etc. rsample contains `tidy` methods for `rset` and `rsplit` objects. For example: 
 
-```{r tidy_rsplit}
+```{r}
+#| label: tidy_rsplit
 first_resample <- bt_resamples$splits[[1]]
 class(first_resample)
 tidy(first_resample)
 ```
 
 and
 
-```{r tidy_rset}
+```{r}
+#| label: tidy_rset
 class(bt_resamples)
 tidy(bt_resamples)
 ```
diff --git a/vignettes/rsample.Rmd b/vignettes/rsample.Rmd
@@ -8,7 +8,9 @@ output:
     toc: yes
 ---
 
-```{r ex_setup, include=FALSE}
+```{r}
+#| label: ex_setup
+#| include: false
 knitr::opts_chunk$set(
   message = FALSE,
   digits = 3,
@@ -28,7 +30,9 @@ The main class in the package (`rset`) is for a _set_ or _collection_ of resampl
 
 Like [modelr](https://cran.r-project.org/package=modelr), the resamples are stored in a data-frame-like `tibble` object. As a simple example, here is a small set of bootstraps of the `mtcars` data:
 
-```{r mtcars_bt, message=FALSE}
+```{r}
+#| label: mtcars_bt
+#| message: false
 library(rsample)
 set.seed(8584)
 bt_resamples <- bootstraps(mtcars, times = 3)
@@ -48,14 +52,16 @@ In this package we use the following terminology for the two partitions that com
 (Aside: While some might use the term "training" and "testing" for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done _before_ resampling. The training/test split can be conducted using the `initial_split()` function in this package.)
 
 Let's look at one of the `rsplit` objects:
-```{r rsplit}
+```{r}
+#| label: rsplit
 first_resample <- bt_resamples$splits[[1]]
 first_resample
 ```
 This indicates that there were `r dim(bt_resamples$splits[[1]])["analysis"]` data points in the analysis set, `r dim(bt_resamples$splits[[1]])["assessment"]` instances were in the assessment set, and that the original data contained `r dim(bt_resamples$splits[[1]])["n"]` data points. These results can also be determined using the `dim` function on an `rsplit` object. 
 
 To obtain either of these data sets from an `rsplit`, the `as.data.frame()` function can be used. By default, the analysis set is returned but the `data` option can be used to return the assessment data: 
-```{r rsplit_df}
+```{r}
+#| label: rsplit_df
 head(as.data.frame(first_resample))
 as.data.frame(first_resample, data = "assessment")
 ```

-Original file line number
+Diff line change
@@ @@ -1,2 +1,4 @@ @@
 *.html
 *.R
++
 +/.quarto/