Skip to content

Commit

Permalink
Rename group/group_by arguments into by (#502)
Browse files Browse the repository at this point in the history
* easystats/easystats#404

* update snapshots

* lintr, comments

* fix demean()

* fix means_by_group

* fix

* update news

* fix rescale_weights

* silence tests

* Update NEWS.md

Co-authored-by: Etienne Bacher <[email protected]>

* deprecation warnings

* use insight remotes

* Update NEWS.md

Co-authored-by: Etienne Bacher <[email protected]>

* Also address #265

* update docs

* update docs and tests

* Update extract_column_names.R

* update readme

* trigger CI

* revert commits related to aliases

* version

* news

* other remnants

* fix

* do not use devel pkgdown

* lintr

* same

---------

Co-authored-by: Etienne Bacher <[email protected]>
  • Loading branch information
strengejacke and etiennebacher committed May 18, 2024
1 parent d74a225 commit d9a40dc
Show file tree
Hide file tree
Showing 30 changed files with 284 additions and 199 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.3
Version: 0.10.0.4
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down Expand Up @@ -69,7 +69,7 @@ Suggests:
tidyr,
withr
Remotes:
easystats/modelbased
easystats/modelbased, easystats/insight
VignetteBuilder:
knitr
Encoding: UTF-8
Expand Down
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# datawizard 0.10.1

BREAKING CHANGES

* Arguments named `group` or `group_by` are deprecated and will be removed
in a future release. Please use `by` instead. This affects the following
functions in *datawizard* (#502).

* `data_partition()`
* `demean()` and `degroup()`
* `means_by_group()`
* `rescale_weights()`

CHANGES

* `recode_into()` is more relaxed regarding checking the type of `NA` values.
Expand Down
24 changes: 12 additions & 12 deletions R/data_codebook.R
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,9 @@ data_codebook <- function(data,

# add proportions, but not for ranges, since these are always 100%
if (is_range) {
proportions <- ""
frq_proportions <- ""
} else {
proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
}

# make sure we have not too long rows, e.g. for variables that
Expand All @@ -245,9 +245,9 @@ data_codebook <- function(data,
}
if (length(frq) > max_values) {
frq <- frq[1:max_values]
proportions <- proportions[1:max_values]
frq_proportions <- frq_proportions[1:max_values]
frq[max_values] <- NA
proportions[max_values] <- NA
frq_proportions[max_values] <- NA
}
if (length(values) > max_values) {
values <- values[1:max_values]
Expand All @@ -273,7 +273,7 @@ data_codebook <- function(data,
values,
value_labels,
frq,
proportions,
proportions = frq_proportions,
stringsAsFactors = FALSE
))

Expand Down Expand Up @@ -347,12 +347,12 @@ format.data_codebook <- function(x, format = "text", ...) {
x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
# align only for text format
if (identical(format, "text")) {
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right")
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
}
x[["N"]][x$Prop != ""] <- sprintf(
x[["N"]][x$Prop != ""] <- sprintf( # nolint
"%s (%s)",
as.character(x[["N"]][x$Prop != ""]),
x$Prop[x$Prop != ""]
as.character(x[["N"]][x$Prop != ""]), # nolint
x$Prop[x$Prop != ""] # nolint
)
x$Prop <- NULL
}
Expand Down Expand Up @@ -388,7 +388,7 @@ print_html.data_codebook <- function(x,
# since we have each value at its own row, the HTML table contains
# horizontal borders for each cell/row. We want to remove those borders
# from rows that actually belong to one variable
separator_lines <- which(duplicated(x$.row_id) & x$N == "")
separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
# remove separator lines, as we don't need these for HTML tables
x <- x[-separator_lines, ]
# check row IDs, and find odd rows
Expand All @@ -405,7 +405,7 @@ print_html.data_codebook <- function(x,
out <- gt::tab_style(
out,
style = list(gt::cell_borders(sides = "top", style = "hidden")),
locations = gt::cells_body(rows = which(x$ID == ""))
locations = gt::cells_body(rows = which(x$ID == "")) # nolint
)
# highlight odd rows
if (!is.null(row_color)) {
Expand Down Expand Up @@ -466,5 +466,5 @@ print_md.data_codebook <- function(x, ...) {
N = "r"
)
align <- align[colnames(x)]
paste0(unname(align), collapse = "")
paste(unname(align), collapse = "")
}
24 changes: 16 additions & 8 deletions R/data_partition.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
#'
#' Creates data partitions (for instance, a training and a test set) based on a
#' data frame that can also be stratified (i.e., evenly spread a given factor)
#' using the `group` argument.
#' using the `by` argument.
#'
#' @inheritParams data_rename
#' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
#' proportion(s) of the training set(s). The sum of `proportion` must not be
#' greater than 1. The remaining part will be used for the test set.
#' @param group A character vector indicating the name(s) of the column(s) used
#' @param by A character vector indicating the name(s) of the column(s) used
#' for stratified partitioning.
#' @param seed A random number generator seed. Enter an integer (e.g. 123) so
#' that the random sampling will be the same each time you run the function.
#' @param row_id Character string, indicating the name of the column that
#' contains the row-id's.
#' @param verbose Toggle messages and warnings.
#' @param group Deprecated. Use `by` instead.
#'
#' @return A list of data frames. The list includes one training set per given
#' proportion and the remaining data as test set. List elements of training
Expand All @@ -28,7 +29,7 @@
#' nrow(out$p_0.9)
#'
#' # Stratify by group (equal proportions of each species)
#' out <- data_partition(iris, proportion = 0.9, group = "Species")
#' out <- data_partition(iris, proportion = 0.9, by = "Species")
#' out$test
#'
#' # Create multiple partitions
Expand All @@ -38,21 +39,28 @@
#' # Create multiple partitions, stratified by group - 30% equally sampled
#' # from species in first training set, 50% in second training set and
#' # remaining 20% equally sampled from each species in test set.
#' out <- data_partition(iris, proportion = c(0.3, 0.5), group = "Species")
#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
#' lapply(out, function(i) table(i$Species))
#'
#' @inherit data_rename seealso
#' @export
data_partition <- function(data,
proportion = 0.7,
group = NULL,
by = NULL,
seed = NULL,
row_id = ".row_id",
verbose = TRUE,
group = NULL,
...) {
# validation checks
data <- .coerce_to_dataframe(data)

## TODO: remove warning in future release
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

if (sum(proportion) > 1) {
insight::format_error("Sum of `proportion` cannot be higher than 1.")
}
Expand Down Expand Up @@ -91,12 +99,12 @@ data_partition <- function(data,

# Create list of data groups. We generally lapply over list of
# sampled row-id's by group, thus, we even create a list if not grouped.
if (is.null(group)) {
if (is.null(by)) {
indices_list <- list(seq_len(nrow(data)))
} else {
# else, split by group(s) and extract row-ids per group
indices_list <- lapply(
split(data, data[group]),
split(data, data[by]),
data_extract,
select = row_id,
as_data_frame = FALSE
Expand Down Expand Up @@ -130,7 +138,7 @@ data_partition <- function(data,
})

# we need to move all list elements one level higher.
if (is.null(group)) {
if (is.null(by)) {
training_sets <- training_sets[[1]]
} else {
# for grouped training sets, we need to row-bind all sampled training
Expand Down
4 changes: 2 additions & 2 deletions R/data_xtabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
format(x, big_mark = big_mark, format = "html", ...),
missing = "(NA)",
format = "html",
group_by = "groups"
by = "groups"
)
}

Expand Down Expand Up @@ -270,7 +270,7 @@ print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
out,
missing = "(NA)",
format = "html",
group_by = "groups"
by = "groups"
)
}
}
Expand Down
76 changes: 39 additions & 37 deletions R/demean.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#' that should be group- and de-meaned.
#' @param group Character vector (or formula) with the name of the variable that
#' @param by Character vector (or formula) with the name of the variable that
#' indicates the group- or cluster-ID.
#' @param center Method for centering. `demean()` always performs
#' mean-centering, while `degroup()` can use `center = "median"` or
Expand All @@ -25,6 +25,7 @@
#' attributes to indicate the within- and between-effects. This is only
#' relevant when printing `model_parameters()` - in such cases, the
#' within- and between-effects are printed in separated blocks.
#' @param group Deprecated. Use `by` instead.
#' @inheritParams center
#'
#' @return
Expand Down Expand Up @@ -92,7 +93,7 @@
#'
#' \subsection{Terminology}{
#' The group-meaned variable is simply the mean of an independent variable
#' within each group (or id-level or cluster) represented by `group`.
#' within each group (or id-level or cluster) represented by `by`.
#' It represents the cluster-mean of an independent variable. The regression
#' coefficient of a group-meaned variable is the *between-subject-effect*.
#' The de-meaned variable is then the centered version of the group-meaned
Expand Down Expand Up @@ -199,10 +200,10 @@
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
#' head(x)
#'
#'
Expand All @@ -213,23 +214,30 @@
#' y = c(1, 2, 1, 2, 4, 3, 2, 1),
#' ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), group = "ID")
#' demean(dat, select = c("a", "x*y"), by = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, group = ~ID)
#' demean(dat, select = ~ a + x * y, by = ~ID)
#'
#' @export
demean <- function(x,
select,
group,
by,
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: remove warning in future release
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

degroup(
x = x,
select = select,
group = group,
by = by,
center = "mean",
suffix_demean = suffix_demean,
suffix_groupmean = suffix_groupmean,
Expand All @@ -247,12 +255,19 @@ demean <- function(x,
#' @export
degroup <- function(x,
select,
group,
by,
center = "mean",
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: remove warning later
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

# ugly tibbles again...
x <- .coerce_to_dataframe(x)

Expand All @@ -266,8 +281,8 @@ degroup <- function(x,
))
}

if (inherits(group, "formula")) {
group <- all.vars(group)
if (inherits(by, "formula")) {
by <- all.vars(by)
}

interactions_no <- select[!grepl("(\\*|\\:)", select)]
Expand Down Expand Up @@ -296,7 +311,7 @@ degroup <- function(x,
select <- intersect(colnames(x), select)

# get data to demean...
dat <- x[, c(select, group)]
dat <- x[, c(select, by)]


# find categorical predictors that are coded as factors
Expand Down Expand Up @@ -344,31 +359,18 @@ degroup <- function(x,
# for variables within each group (the group means). assign
# mean values to a vector of same length as the data

if (center == "mode") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
})
} else if (center == "median") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
})
} else if (center == "min") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
})
} else if (center == "max") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
})
} else {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
})
}

gm_fun <- switch(center,
mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
median = function(.gm) stats::median(.gm, na.rm = TRUE),
min = function(.gm) min(.gm, na.rm = TRUE),
max = function(.gm) max(.gm, na.rm = TRUE),
function(.gm) mean(.gm, na.rm = TRUE)
)
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
})
names(x_gm_list) <- select


# create de-meaned variables by subtracting the group mean from each individual value

x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
Expand Down
Loading

0 comments on commit d9a40dc

Please sign in to comment.