Rename group/group_by arguments into by #502

Merged May 18, 2024 (27 commits)
Changes from 9 commits
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.3
Version: 0.10.0.4
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,5 +1,15 @@
# datawizard 0.10.1

BREAKING CHANGES

* Arguments named `group` or `group_by` will be deprecated in a future release.
  Please use `by` instead. This affects the following functions in *datawizard*:

* `data_partition()`
* `demean()` and `degroup()`
* `means_by_group()`
* `rescale_weights()`

CHANGES

* `recode_into()` is more relaxed regarding checking the type of `NA` values.
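A minimal before/after sketch of the rename described in this NEWS entry, based on the example calls in the updated documentation below; old calls using `group` are assumed to keep working until the deprecation takes effect:

```r
library(datawizard)

# old call (argument scheduled for deprecation)
out <- data_partition(iris, proportion = 0.9, group = "Species")

# new call
out <- data_partition(iris, proportion = 0.9, by = "Species")

# same pattern for demean()/degroup()
iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake grouping ID
head(demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID"))
```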
22 changes: 11 additions & 11 deletions R/data_codebook.R
@@ -232,9 +232,9 @@

# add proportions, but not for ranges, since these are always 100%
if (is_range) {
proportions <- ""
frq_proportions <- ""
} else {
proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
}

# make sure we have not too long rows, e.g. for variables that
@@ -245,9 +245,9 @@
}
if (length(frq) > max_values) {
frq <- frq[1:max_values]
proportions <- proportions[1:max_values]
frq_proportions <- frq_proportions[1:max_values]
frq[max_values] <- NA
proportions[max_values] <- NA
frq_proportions[max_values] <- NA
}
if (length(values) > max_values) {
values <- values[1:max_values]
@@ -273,7 +273,7 @@
values,
value_labels,
frq,
proportions,
proportions = frq_proportions,
stringsAsFactors = FALSE
))

@@ -347,12 +347,12 @@
x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
# align only for text format
if (identical(format, "text")) {
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right")
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
}
x[["N"]][x$Prop != ""] <- sprintf(
x[["N"]][x$Prop != ""] <- sprintf( # nolint
"%s (%s)",
as.character(x[["N"]][x$Prop != ""]),
x$Prop[x$Prop != ""]
as.character(x[["N"]][x$Prop != ""]), # nolint
x$Prop[x$Prop != ""] # nolint
)
x$Prop <- NULL
}
@@ -388,7 +388,7 @@
# since we have each value at its own row, the HTML table contains
# horizontal borders for each cell/row. We want to remove those borders
# from rows that actually belong to one variable
separator_lines <- which(duplicated(x$.row_id) & x$N == "")
separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
# remove separator lines, as we don't need these for HTML tables
x <- x[-separator_lines, ]
# check row IDs, and find odd rows
@@ -405,7 +405,7 @@
out <- gt::tab_style(
out,
style = list(gt::cell_borders(sides = "top", style = "hidden")),
locations = gt::cells_body(rows = which(x$ID == ""))
locations = gt::cells_body(rows = which(x$ID == "")) # nolint
)
# highlight odd rows
if (!is.null(row_color)) {
@@ -466,5 +466,5 @@
N = "r"
)
align <- align[colnames(x)]
paste0(unname(align), collapse = "")

Check warning on line 469 in R/data_codebook.R (GitHub Actions / lint and lint-changed-files):
file=R/data_codebook.R,line=469,col=3,[paste_linter] Use paste(), not paste0(), to collapse a character vector when sep= is not used.
}
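For reference, a small sketch of what the paste_linter warning above suggests (hypothetical values, not part of the diff): when collapsing a character vector and `sep =` is not used, `paste()` and `paste0()` return the same result, so the simpler `paste()` call is preferred.

```r
align <- c(ID = "l", Name = "l", N = "r")

paste0(unname(align), collapse = "") # "llr"
paste(unname(align), collapse = "")  # "llr", preferred by the linter
```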
23 changes: 15 additions & 8 deletions R/data_partition.R
@@ -2,19 +2,20 @@
#'
#' Creates data partitions (for instance, a training and a test set) based on a
#' data frame that can also be stratified (i.e., evenly spread a given factor)
#' using the `group` argument.
#' using the `by` argument.
#'
#' @inheritParams data_rename
#' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
#' proportion(s) of the training set(s). The sum of `proportion` must not be
#' greater than 1. The remaining part will be used for the test set.
#' @param group A character vector indicating the name(s) of the column(s) used
#' @param by A character vector indicating the name(s) of the column(s) used
#' for stratified partitioning.
#' @param seed A random number generator seed. Enter an integer (e.g. 123) so
#' that the random sampling will be the same each time you run the function.
#' @param row_id Character string, indicating the name of the column that
#' contains the row-id's.
#' @param verbose Toggle messages and warnings.
#' @param group Deprecated. Use `by` instead.
#'
#' @return A list of data frames. The list includes one training set per given
#' proportion and the remaining data as test set. List elements of training
@@ -28,7 +29,7 @@
#' nrow(out$p_0.9)
#'
#' # Stratify by group (equal proportions of each species)
#' out <- data_partition(iris, proportion = 0.9, group = "Species")
#' out <- data_partition(iris, proportion = 0.9, by = "Species")
#' out$test
#'
#' # Create multiple partitions
@@ -38,21 +39,27 @@
#' # Create multiple partitions, stratified by group - 30% equally sampled
#' # from species in first training set, 50% in second training set and
#' # remaining 20% equally sampled from each species in test set.
#' out <- data_partition(iris, proportion = c(0.3, 0.5), group = "Species")
#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
#' lapply(out, function(i) table(i$Species))
#'
#' @inherit data_rename seealso
#' @export
data_partition <- function(data,
proportion = 0.7,
group = NULL,
by = NULL,
seed = NULL,
row_id = ".row_id",
verbose = TRUE,
group = NULL,
...) {
# validation checks
data <- .coerce_to_dataframe(data)

## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

if (sum(proportion) > 1) {
insight::format_error("Sum of `proportion` cannot be higher than 1.")
}
@@ -91,12 +98,12 @@ data_partition <- function(data,

# Create list of data groups. We generally lapply over list of
# sampled row-id's by group, thus, we even create a list if not grouped.
if (is.null(group)) {
if (is.null(by)) {
indices_list <- list(seq_len(nrow(data)))
} else {
# else, split by group(s) and extract row-ids per group
indices_list <- lapply(
split(data, data[group]),
split(data, data[by]),
data_extract,
select = row_id,
as_data_frame = FALSE
Expand Down Expand Up @@ -130,7 +137,7 @@ data_partition <- function(data,
})

# we need to move all list elements one level higher.
if (is.null(group)) {
if (is.null(by)) {
training_sets <- training_sets[[1]]
} else {
# for grouped training sets, we need to row-bind all sampled training
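The shim near the top of `data_partition()` only aliases `group` to `by` for now; the `## TODO: deprecate later` comment defers the formal deprecation. A possible later version of that check is sketched below as an assumption, not as part of this PR; it presumes `insight::format_warning()` is available, analogous to the `insight::format_error()` call already used in this function.

```r
## TODO: deprecate later
if (!is.null(group)) {
  if (verbose) {
    # assumed helper, analogous to insight::format_error() used above
    insight::format_warning(
      "Argument `group` is deprecated and will be removed in a future release.",
      "Please use `by` instead."
    )
  }
  by <- group
}
```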
6 changes: 6 additions & 0 deletions R/data_xtabulate.R
@@ -229,6 +229,9 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
x$Group <- NULL
}


## FIXME: change group_by argument later

# print table
insight::export_table(
format(x, big_mark = big_mark, format = "html", ...),
@@ -265,6 +268,9 @@ print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {

out <- do.call(rbind, x)


## FIXME: change group_by argument later

# print table
insight::export_table(
out,
74 changes: 37 additions & 37 deletions R/demean.R
@@ -11,7 +11,7 @@
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#' that should be group- and de-meaned.
#' @param group Character vector (or formula) with the name of the variable that
#' @param by Character vector (or formula) with the name of the variable that
#' indicates the group- or cluster-ID.
#' @param center Method for centering. `demean()` always performs
#' mean-centering, while `degroup()` can use `center = "median"` or
Expand All @@ -25,6 +25,7 @@
#' attributes to indicate the within- and between-effects. This is only
#' relevant when printing `model_parameters()` - in such cases, the
#' within- and between-effects are printed in separated blocks.
#' @param group Deprecated. Use `by` instead.
#' @inheritParams center
#'
#' @return
@@ -92,7 +93,7 @@
#'
#' \subsection{Terminology}{
#' The group-meaned variable is simply the mean of an independent variable
#' within each group (or id-level or cluster) represented by `group`.
#' within each group (or id-level or cluster) represented by `by`.
#' It represents the cluster-mean of an independent variable. The regression
#' coefficient of a group-meaned variable is the *between-subject-effect*.
#' The de-meaned variable is then the centered version of the group-meaned
@@ -199,10 +200,10 @@
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
#' head(x)
#'
#'
@@ -213,23 +214,29 @@
#' y = c(1, 2, 1, 2, 4, 3, 2, 1),
#' ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), group = "ID")
#' demean(dat, select = c("a", "x*y"), by = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, group = ~ID)
#' demean(dat, select = ~ a + x * y, by = ~ID)
#'
#' @export
demean <- function(x,
select,
group,
by,
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

degroup(
x = x,
select = select,
group = group,
by = by,
center = "mean",
suffix_demean = suffix_demean,
suffix_groupmean = suffix_groupmean,
@@ -247,12 +254,18 @@ demean <- function(x,
#' @export
degroup <- function(x,
select,
group,
by,
center = "mean",
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

# ugly tibbles again...
x <- .coerce_to_dataframe(x)

Expand All @@ -266,8 +279,8 @@ degroup <- function(x,
))
}

if (inherits(group, "formula")) {
group <- all.vars(group)
if (inherits(by, "formula")) {
by <- all.vars(by)
}

interactions_no <- select[!grepl("(\\*|\\:)", select)]
@@ -296,7 +309,7 @@
select <- intersect(colnames(x), select)

# get data to demean...
dat <- x[, c(select, group)]
dat <- x[, c(select, by)]


# find categorical predictors that are coded as factors
@@ -344,31 +357,18 @@
# for variables within each group (the group means). assign
# mean values to a vector of same length as the data

if (center == "mode") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
})
} else if (center == "median") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
})
} else if (center == "min") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
})
} else if (center == "max") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
})
} else {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
})
}

gm_fun <- switch(center,
mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
median = function(.gm) stats::median(.gm, na.rm = TRUE),
min = function(.gm) min(.gm, na.rm = TRUE),
max = function(.gm) max(.gm, na.rm = TRUE),
function(.gm) mean(.gm, na.rm = TRUE)
)
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
})
names(x_gm_list) <- select


# create de-meaned variables by subtracting the group mean from each individual value

x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
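For illustration, a self-contained sketch (hypothetical data, not part of the diff) of what the refactored `switch()`/`stats::ave()` combination computes: the group summary is broadcast back to every row of its group, and the de-meaned values are the per-row differences from that summary.

```r
dat <- data.frame(
  x  = c(1, 2, 3, 10, 20, 30),
  ID = c("a", "a", "a", "b", "b", "b")
)

# group means, repeated for every row of the respective group
gm <- stats::ave(dat$x, dat$ID, FUN = function(.gm) mean(.gm, na.rm = TRUE))
gm
#> [1]  2  2  2 20 20 20

# de-meaned (within-group) values
dat$x - gm
#> [1]  -1   0   1 -10   0  10
```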