easystats · etiennebacher · May 18, 2024 · May 16, 2024 · May 16, 2024 · May 16, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.10.0.3
+Version: 0.10.0.4
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,12 @@
 # datawizard 0.10.1
 
+BREAKING CHANGES
+
+* Arguments named `group` or `group_by` will be deprecated in a future release.
+  Please use `by` instead. This affects the following functions in *datawizard*:
+
+  * `data_partition()`
+
 CHANGES
 
 * `recode_into()` is more relaxed regarding checking the type of `NA` values.

diff --git a/R/data_codebook.R b/R/data_codebook.R
@@ -232,9 +232,9 @@
 
     # add proportions, but not for ranges, since these are always 100%
     if (is_range) {
-      proportions <- ""
+      frq_proportions <- ""
     } else {
-      proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
+      frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
     }
 
     # make sure we have not too long rows, e.g. for variables that
@@ -245,9 +245,9 @@
     }
     if (length(frq) > max_values) {
       frq <- frq[1:max_values]
-      proportions <- proportions[1:max_values]
+      frq_proportions <- frq_proportions[1:max_values]
       frq[max_values] <- NA
-      proportions[max_values] <- NA
+      frq_proportions[max_values] <- NA
     }
     if (length(values) > max_values) {
       values <- values[1:max_values]
@@ -273,7 +273,7 @@
       values,
       value_labels,
       frq,
-      proportions,
+      proportions = frq_proportions,
       stringsAsFactors = FALSE
     ))
 
@@ -347,12 +347,12 @@
     x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
     # align only for text format
     if (identical(format, "text")) {
-      x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right")
+      x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
     }
-    x[["N"]][x$Prop != ""] <- sprintf(
+    x[["N"]][x$Prop != ""] <- sprintf( # nolint
       "%s (%s)",
-      as.character(x[["N"]][x$Prop != ""]),
-      x$Prop[x$Prop != ""]
+      as.character(x[["N"]][x$Prop != ""]), # nolint
+      x$Prop[x$Prop != ""] # nolint
     )
     x$Prop <- NULL
   }
@@ -388,7 +388,7 @@
   # since we have each value at its own row, the HTML table contains
   # horizontal borders for each cell/row. We want to remove those borders
   # from rows that actually belong to one variable
-  separator_lines <- which(duplicated(x$.row_id) & x$N == "")
+  separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
   # remove separator lines, as we don't need these for HTML tables
   x <- x[-separator_lines, ]
   # check row IDs, and find odd rows
@@ -405,7 +405,7 @@
   out <- gt::tab_style(
     out,
     style = list(gt::cell_borders(sides = "top", style = "hidden")),
-    locations = gt::cells_body(rows = which(x$ID == ""))
+    locations = gt::cells_body(rows = which(x$ID == "")) # nolint
   )
   # highlight odd rows
   if (!is.null(row_color)) {
@@ -466,5 +466,5 @@
    N = "r"
  )
  align <- align[colnames(x)]
  paste0(unname(align), collapse = "")
 }
diff --git a/R/data_partition.R b/R/data_partition.R
@@ -2,19 +2,20 @@
 #'
 #' Creates data partitions (for instance, a training and a test set) based on a
 #' data frame that can also be stratified (i.e., evenly spread a given factor)
-#' using the `group` argument.
+#' using the `by` argument.
 #'
 #' @inheritParams data_rename
 #' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
 #'   proportion(s) of the training set(s). The sum of `proportion` must not be
 #'   greater than 1. The remaining part will be used for the test set.
-#' @param group A character vector indicating the name(s) of the column(s) used
+#' @param by A character vector indicating the name(s) of the column(s) used
 #'   for stratified partitioning.
 #' @param seed A random number generator seed. Enter an integer (e.g. 123) so
 #'   that the random sampling will be the same each time you run the function.
 #' @param row_id Character string, indicating the name of the column that
 #'   contains the row-id's.
 #' @param verbose Toggle messages and warnings.
+#' @param group Deprecated. Use `by` instead.
 #'
 #' @return A list of data frames. The list includes one training set per given
 #'   proportion and the remaining data as test set. List elements of training
@@ -28,7 +29,7 @@
 #' nrow(out$p_0.9)
 #'
 #' # Stratify by group (equal proportions of each species)
-#' out <- data_partition(iris, proportion = 0.9, group = "Species")
+#' out <- data_partition(iris, proportion = 0.9, by = "Species")
 #' out$test
 #'
 #' # Create multiple partitions
@@ -38,21 +39,27 @@
 #' # Create multiple partitions, stratified by group - 30% equally sampled
 #' # from species in first training set, 50% in second training set and
 #' # remaining 20% equally sampled from each species in test set.
-#' out <- data_partition(iris, proportion = c(0.3, 0.5), group = "Species")
+#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
 #' lapply(out, function(i) table(i$Species))
 #'
 #' @inherit data_rename seealso
 #' @export
 data_partition <- function(data,
                            proportion = 0.7,
-                           group = NULL,
+                           by = NULL,
                            seed = NULL,
                            row_id = ".row_id",
                            verbose = TRUE,
+                           group = NULL,
                            ...) {
   # validation checks
   data <- .coerce_to_dataframe(data)
 
+  ## TODO: deprecate `group` later; until then, a non-NULL `group` overrides `by`
+  if (!is.null(group)) {
+    by <- group
+  }
+
   if (sum(proportion) > 1) {
     insight::format_error("Sum of `proportion` cannot be higher than 1.")
   }
@@ -91,12 +98,12 @@ data_partition <- function(data,
 
   # Create list of data groups. We generally lapply over list of
   # sampled row-id's by group, thus, we even create a list if not grouped.
-  if (is.null(group)) {
+  if (is.null(by)) {
     indices_list <- list(seq_len(nrow(data)))
   } else {
     # else, split by group(s) and extract row-ids per group
     indices_list <- lapply(
-      split(data, data[group]),
+      split(data, data[by]),
       data_extract,
       select = row_id,
       as_data_frame = FALSE
@@ -130,7 +137,7 @@ data_partition <- function(data,
   })
 
   # we need to move all list elements one level higher.
-  if (is.null(group)) {
+  if (is.null(by)) {
     training_sets <- training_sets[[1]]
   } else {
     # for grouped training sets, we need to row-bind all sampled training

diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
@@ -229,6 +229,9 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
     x$Group <- NULL
   }
 
+
+  ## FIXME: change group_by argument later
+
   # print table
   insight::export_table(
     format(x, big_mark = big_mark, format = "html", ...),
@@ -265,6 +268,9 @@ print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
 
     out <- do.call(rbind, x)
 
+
+    ## FIXME: change group_by argument later
+
     # print table
     insight::export_table(
       out,

diff --git a/man/data_partition.Rd b/man/data_partition.Rd
diff --git a/tests/testthat/_snaps/data_partition.md b/tests/testthat/_snaps/data_partition.md
@@ -88,7 +88,7 @@
 ---
 
     Code
-      str(data_partition(iris, proportion = 0.7, group = "Species", seed = 123))
+      str(data_partition(iris, proportion = 0.7, by = "Species", seed = 123))
     Output
       List of 2
        $ p_0.7:'data.frame':	105 obs. of  6 variables:
@@ -109,7 +109,7 @@
 ---
 
     Code
-      str(data_partition(iris, proportion = c(0.2, 0.5), group = "Species", seed = 123))
+      str(data_partition(iris, proportion = c(0.2, 0.5), by = "Species", seed = 123))
     Output
       List of 3
        $ p_0.2:'data.frame':	30 obs. of  6 variables:

diff --git a/tests/testthat/test-data_partition.R b/tests/testthat/test-data_partition.R
@@ -53,8 +53,8 @@ test_that("data_partition works as expected", {
   data(iris)
   expect_snapshot(str(data_partition(iris, proportion = 0.7, seed = 123)))
   expect_snapshot(str(data_partition(iris, proportion = c(0.2, 0.5), seed = 123)))
-  expect_snapshot(str(data_partition(iris, proportion = 0.7, group = "Species", seed = 123)))
-  expect_snapshot(str(data_partition(iris, proportion = c(0.2, 0.5), group = "Species", seed = 123)))
+  expect_snapshot(str(data_partition(iris, proportion = 0.7, by = "Species", seed = 123)))
+  expect_snapshot(str(data_partition(iris, proportion = c(0.2, 0.5), by = "Species", seed = 123)))
 })
 
 test_that("data_partition warns if no testing set", {