Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename group/group_by arguments into by #502

Merged
merged 27 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.3
Version: 0.10.0.4
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down Expand Up @@ -69,7 +69,7 @@ Suggests:
tidyr,
withr
Remotes:
easystats/modelbased
easystats/modelbased, easystats/insight
VignetteBuilder:
knitr
Encoding: UTF-8
Expand Down
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# datawizard 0.10.1

BREAKING CHANGES

* Arguments named `group` or `group_by` are deprecated and will be removed
in a future release. Please use `by` instead. This affects the following
functions in *datawizard* (#502).

* `data_partition()`
* `demean()` and `degroup()`
* `means_by_group()`
* `rescale_weights()`

CHANGES

* `recode_into()` is more relaxed regarding checking the type of `NA` values.
Expand Down
24 changes: 12 additions & 12 deletions R/data_codebook.R
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,9 @@ data_codebook <- function(data,

# add proportions, but not for ranges, since these are always 100%
if (is_range) {
proportions <- ""
frq_proportions <- ""
} else {
proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
}

# make sure we have not too long rows, e.g. for variables that
Expand All @@ -245,9 +245,9 @@ data_codebook <- function(data,
}
if (length(frq) > max_values) {
frq <- frq[1:max_values]
proportions <- proportions[1:max_values]
frq_proportions <- frq_proportions[1:max_values]
frq[max_values] <- NA
proportions[max_values] <- NA
frq_proportions[max_values] <- NA
}
if (length(values) > max_values) {
values <- values[1:max_values]
Expand All @@ -273,7 +273,7 @@ data_codebook <- function(data,
values,
value_labels,
frq,
proportions,
proportions = frq_proportions,
stringsAsFactors = FALSE
))

Expand Down Expand Up @@ -347,12 +347,12 @@ format.data_codebook <- function(x, format = "text", ...) {
x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
# align only for text format
if (identical(format, "text")) {
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right")
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
}
x[["N"]][x$Prop != ""] <- sprintf(
x[["N"]][x$Prop != ""] <- sprintf( # nolint
"%s (%s)",
as.character(x[["N"]][x$Prop != ""]),
x$Prop[x$Prop != ""]
as.character(x[["N"]][x$Prop != ""]), # nolint
x$Prop[x$Prop != ""] # nolint
)
x$Prop <- NULL
}
Expand Down Expand Up @@ -388,7 +388,7 @@ print_html.data_codebook <- function(x,
# since we have each value at its own row, the HTML table contains
# horizontal borders for each cell/row. We want to remove those borders
# from rows that actually belong to one variable
separator_lines <- which(duplicated(x$.row_id) & x$N == "")
separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
# remove separator lines, as we don't need these for HTML tables
x <- x[-separator_lines, ]
# check row IDs, and find odd rows
Expand All @@ -405,7 +405,7 @@ print_html.data_codebook <- function(x,
out <- gt::tab_style(
out,
style = list(gt::cell_borders(sides = "top", style = "hidden")),
locations = gt::cells_body(rows = which(x$ID == ""))
locations = gt::cells_body(rows = which(x$ID == "")) # nolint
)
# highlight odd rows
if (!is.null(row_color)) {
Expand Down Expand Up @@ -466,5 +466,5 @@ print_md.data_codebook <- function(x, ...) {
N = "r"
)
align <- align[colnames(x)]
paste0(unname(align), collapse = "")
paste(unname(align), collapse = "")
}
24 changes: 16 additions & 8 deletions R/data_partition.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
#'
#' Creates data partitions (for instance, a training and a test set) based on a
#' data frame that can also be stratified (i.e., evenly spread a given factor)
#' using the `group` argument.
#' using the `by` argument.
#'
#' @inheritParams data_rename
#' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
#' proportion(s) of the training set(s). The sum of `proportion` must not be
#' greater than 1. The remaining part will be used for the test set.
#' @param group A character vector indicating the name(s) of the column(s) used
#' @param by A character vector indicating the name(s) of the column(s) used
#' for stratified partitioning.
#' @param seed A random number generator seed. Enter an integer (e.g. 123) so
#' that the random sampling will be the same each time you run the function.
#' @param row_id Character string, indicating the name of the column that
#' contains the row-id's.
#' @param verbose Toggle messages and warnings.
#' @param group Deprecated. Use `by` instead.
#'
#' @return A list of data frames. The list includes one training set per given
#' proportion and the remaining data as test set. List elements of training
Expand All @@ -28,7 +29,7 @@
#' nrow(out$p_0.9)
#'
#' # Stratify by group (equal proportions of each species)
#' out <- data_partition(iris, proportion = 0.9, group = "Species")
#' out <- data_partition(iris, proportion = 0.9, by = "Species")
#' out$test
#'
#' # Create multiple partitions
Expand All @@ -38,21 +39,28 @@
#' # Create multiple partitions, stratified by group - 30% equally sampled
#' # from species in first training set, 50% in second training set and
#' # remaining 20% equally sampled from each species in test set.
#' out <- data_partition(iris, proportion = c(0.3, 0.5), group = "Species")
#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
#' lapply(out, function(i) table(i$Species))
#'
#' @inherit data_rename seealso
#' @export
data_partition <- function(data,
proportion = 0.7,
group = NULL,
by = NULL,
seed = NULL,
row_id = ".row_id",
verbose = TRUE,
group = NULL,
...) {
# validation checks
data <- .coerce_to_dataframe(data)

## TODO: remove warning in future release
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

if (sum(proportion) > 1) {
insight::format_error("Sum of `proportion` cannot be higher than 1.")
}
Expand Down Expand Up @@ -91,12 +99,12 @@ data_partition <- function(data,

# Create list of data groups. We generally lapply over list of
# sampled row-id's by group, thus, we even create a list if not grouped.
if (is.null(group)) {
if (is.null(by)) {
indices_list <- list(seq_len(nrow(data)))
} else {
# else, split by group(s) and extract row-ids per group
indices_list <- lapply(
split(data, data[group]),
split(data, data[by]),
data_extract,
select = row_id,
as_data_frame = FALSE
Expand Down Expand Up @@ -130,7 +138,7 @@ data_partition <- function(data,
})

# we need to move all list elements one level higher.
if (is.null(group)) {
if (is.null(by)) {
training_sets <- training_sets[[1]]
} else {
# for grouped training sets, we need to row-bind all sampled training
Expand Down
4 changes: 2 additions & 2 deletions R/data_xtabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
format(x, big_mark = big_mark, format = "html", ...),
missing = "(NA)",
format = "html",
group_by = "groups"
by = "groups"
)
}

Expand Down Expand Up @@ -270,7 +270,7 @@ print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
out,
missing = "(NA)",
format = "html",
group_by = "groups"
by = "groups"
)
}
}
Expand Down
76 changes: 39 additions & 37 deletions R/demean.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#' that should be group- and de-meaned.
#' @param group Character vector (or formula) with the name of the variable that
#' @param by Character vector (or formula) with the name of the variable that
#' indicates the group- or cluster-ID.
#' @param center Method for centering. `demean()` always performs
#' mean-centering, while `degroup()` can use `center = "median"` or
Expand All @@ -25,6 +25,7 @@
#' attributes to indicate the within- and between-effects. This is only
#' relevant when printing `model_parameters()` - in such cases, the
#' within- and between-effects are printed in separated blocks.
#' @param group Deprecated. Use `by` instead.
#' @inheritParams center
#'
#' @return
Expand Down Expand Up @@ -92,7 +93,7 @@
#'
#' \subsection{Terminology}{
#' The group-meaned variable is simply the mean of an independent variable
#' within each group (or id-level or cluster) represented by `group`.
#' within each group (or id-level or cluster) represented by `by`.
#' It represents the cluster-mean of an independent variable. The regression
#' coefficient of a group-meaned variable is the *between-subject-effect*.
#' The de-meaned variable is then the centered version of the group-meaned
Expand Down Expand Up @@ -199,10 +200,10 @@
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
#' head(x)
#'
#'
Expand All @@ -213,23 +214,30 @@
#' y = c(1, 2, 1, 2, 4, 3, 2, 1),
#' ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), group = "ID")
#' demean(dat, select = c("a", "x*y"), by = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, group = ~ID)
#' demean(dat, select = ~ a + x * y, by = ~ID)
#'
#' @export
demean <- function(x,
select,
group,
by,
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: remove warning in future release
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

degroup(
x = x,
select = select,
group = group,
by = by,
center = "mean",
suffix_demean = suffix_demean,
suffix_groupmean = suffix_groupmean,
Expand All @@ -247,12 +255,19 @@ demean <- function(x,
#' @export
degroup <- function(x,
select,
group,
by,
center = "mean",
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: remove warning later
if (!is.null(group)) {
by <- group
insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
}

# ugly tibbles again...
x <- .coerce_to_dataframe(x)

Expand All @@ -266,8 +281,8 @@ degroup <- function(x,
))
}

if (inherits(group, "formula")) {
group <- all.vars(group)
if (inherits(by, "formula")) {
by <- all.vars(by)
}

interactions_no <- select[!grepl("(\\*|\\:)", select)]
Expand Down Expand Up @@ -296,7 +311,7 @@ degroup <- function(x,
select <- intersect(colnames(x), select)

# get data to demean...
dat <- x[, c(select, group)]
dat <- x[, c(select, by)]


# find categorical predictors that are coded as factors
Expand Down Expand Up @@ -344,31 +359,18 @@ degroup <- function(x,
# for variables within each group (the group means). assign
# mean values to a vector of same length as the data

if (center == "mode") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
})
} else if (center == "median") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
})
} else if (center == "min") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
})
} else if (center == "max") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
})
} else {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
})
}

gm_fun <- switch(center,
mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
median = function(.gm) stats::median(.gm, na.rm = TRUE),
min = function(.gm) min(.gm, na.rm = TRUE),
max = function(.gm) max(.gm, na.rm = TRUE),
function(.gm) mean(.gm, na.rm = TRUE)
)
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
})
names(x_gm_list) <- select


# create de-meaned variables by subtracting the group mean from each individual value

x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
Expand Down
Loading
Loading