Rename group/group_by arguments into by #502

Merged May 18, 2024 (27 commits)
Changes from 9 commits
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.10.0.3
Version: 0.10.0.4
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,5 +1,15 @@
# datawizard 0.10.1

BREAKING CHANGES

* Arguments named `group` or `group_by` will be deprecated in a future release.
  Please use `by` instead. This affects the following functions in *datawizard*:

* `data_partition()`
* `demean()` and `degroup()`
* `means_by_group()`
* `rescale_weights()`

CHANGES

* `recode_into()` is more relaxed regarding checking the type of `NA` values.
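A minimal before/after sketch of the rename described in this NEWS entry, based on the example calls in the updated documentation below; old calls using `group` are assumed to keep working until the deprecation takes effect:

```r
library(datawizard)

# old call (argument scheduled for deprecation)
out <- data_partition(iris, proportion = 0.9, group = "Species")

# new call
out <- data_partition(iris, proportion = 0.9, by = "Species")

# same pattern for demean()/degroup()
iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake grouping ID
head(demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID"))
```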
22 changes: 11 additions & 11 deletions R/data_codebook.R
@@ -232,9 +232,9 @@

# add proportions, but not for ranges, since these are always 100%
if (is_range) {
proportions <- ""
frq_proportions <- ""
} else {
proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
frq_proportions <- sprintf("%.1f%%", round(100 * (frq / sum(frq)), 1))
}

# make sure we have not too long rows, e.g. for variables that
@@ -245,9 +245,9 @@
}
if (length(frq) > max_values) {
frq <- frq[1:max_values]
proportions <- proportions[1:max_values]
frq_proportions <- frq_proportions[1:max_values]
frq[max_values] <- NA
proportions[max_values] <- NA
frq_proportions[max_values] <- NA
}
if (length(values) > max_values) {
values <- values[1:max_values]
@@ -273,7 +273,7 @@
values,
value_labels,
frq,
proportions,
proportions = frq_proportions,
stringsAsFactors = FALSE
))

@@ -347,12 +347,12 @@
x$Prop[x$Prop == "NA" | is.na(x$Prop)] <- ""
# align only for text format
if (identical(format, "text")) {
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right")
x$Prop[x$Prop != ""] <- format(x$Prop[x$Prop != ""], justify = "right") # nolint
}
x[["N"]][x$Prop != ""] <- sprintf(
x[["N"]][x$Prop != ""] <- sprintf( # nolint
"%s (%s)",
as.character(x[["N"]][x$Prop != ""]),
x$Prop[x$Prop != ""]
as.character(x[["N"]][x$Prop != ""]), # nolint
x$Prop[x$Prop != ""] # nolint
)
x$Prop <- NULL
}
@@ -388,7 +388,7 @@
# since we have each value at its own row, the HTML table contains
# horizontal borders for each cell/row. We want to remove those borders
# from rows that actually belong to one variable
separator_lines <- which(duplicated(x$.row_id) & x$N == "")
separator_lines <- which(duplicated(x$.row_id) & x$N == "") # nolint
# remove separator lines, as we don't need these for HTML tables
x <- x[-separator_lines, ]
# check row IDs, and find odd rows
@@ -405,7 +405,7 @@
out <- gt::tab_style(
out,
style = list(gt::cell_borders(sides = "top", style = "hidden")),
locations = gt::cells_body(rows = which(x$ID == ""))
locations = gt::cells_body(rows = which(x$ID == "")) # nolint
)
# highlight odd rows
if (!is.null(row_color)) {
@@ -466,5 +466,5 @@
N = "r"
)
align <- align[colnames(x)]
paste0(unname(align), collapse = "")

Check warning on line 469 in R/data_codebook.R (GitHub Actions / lint and lint-changed-files):
file=R/data_codebook.R,line=469,col=3,[paste_linter] Use paste(), not paste0(), to collapse a character vector when sep= is not used.
}
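For reference, a small sketch of what the paste_linter warning above suggests (hypothetical values, not part of the diff): when collapsing a character vector and `sep =` is not used, `paste()` and `paste0()` return the same result, so the simpler `paste()` call is preferred.

```r
align <- c(ID = "l", Name = "l", N = "r")

paste0(unname(align), collapse = "") # "llr"
paste(unname(align), collapse = "")  # "llr", preferred by the linter
```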
23 changes: 15 additions & 8 deletions R/data_partition.R
@@ -2,19 +2,20 @@
#'
#' Creates data partitions (for instance, a training and a test set) based on a
#' data frame that can also be stratified (i.e., evenly spread a given factor)
#' using the `group` argument.
#' using the `by` argument.
#'
#' @inheritParams data_rename
#' @param proportion Scalar (between 0 and 1) or numeric vector, indicating the
#' proportion(s) of the training set(s). The sum of `proportion` must not be
#' greater than 1. The remaining part will be used for the test set.
#' @param group A character vector indicating the name(s) of the column(s) used
#' @param by A character vector indicating the name(s) of the column(s) used
#' for stratified partitioning.
#' @param seed A random number generator seed. Enter an integer (e.g. 123) so
#' that the random sampling will be the same each time you run the function.
#' @param row_id Character string, indicating the name of the column that
#' contains the row-id's.
#' @param verbose Toggle messages and warnings.
#' @param group Deprecated. Use `by` instead.
#'
#' @return A list of data frames. The list includes one training set per given
#' proportion and the remaining data as test set. List elements of training
@@ -28,7 +29,7 @@
#' nrow(out$p_0.9)
#'
#' # Stratify by group (equal proportions of each species)
#' out <- data_partition(iris, proportion = 0.9, group = "Species")
#' out <- data_partition(iris, proportion = 0.9, by = "Species")
#' out$test
#'
#' # Create multiple partitions
@@ -38,21 +39,27 @@
#' # Create multiple partitions, stratified by group - 30% equally sampled
#' # from species in first training set, 50% in second training set and
#' # remaining 20% equally sampled from each species in test set.
#' out <- data_partition(iris, proportion = c(0.3, 0.5), group = "Species")
#' out <- data_partition(iris, proportion = c(0.3, 0.5), by = "Species")
#' lapply(out, function(i) table(i$Species))
#'
#' @inherit data_rename seealso
#' @export
data_partition <- function(data,
proportion = 0.7,
group = NULL,
by = NULL,
seed = NULL,
row_id = ".row_id",
verbose = TRUE,
group = NULL,
...) {
# validation checks
data <- .coerce_to_dataframe(data)

## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

if (sum(proportion) > 1) {
insight::format_error("Sum of `proportion` cannot be higher than 1.")
}
@@ -91,12 +98,12 @@ data_partition <- function(data,

# Create list of data groups. We generally lapply over list of
# sampled row-id's by group, thus, we even create a list if not grouped.
if (is.null(group)) {
if (is.null(by)) {
indices_list <- list(seq_len(nrow(data)))
} else {
# else, split by group(s) and extract row-ids per group
indices_list <- lapply(
split(data, data[group]),
split(data, data[by]),
data_extract,
select = row_id,
as_data_frame = FALSE
Expand Down Expand Up @@ -130,7 +137,7 @@ data_partition <- function(data,
})

# we need to move all list elements one level higher.
if (is.null(group)) {
if (is.null(by)) {
training_sets <- training_sets[[1]]
} else {
# for grouped training sets, we need to row-bind all sampled training
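The shim near the top of `data_partition()` only aliases `group` to `by` for now; the `## TODO: deprecate later` comment defers the formal deprecation. A possible later version of that check is sketched below as an assumption, not as part of this PR; it presumes `insight::format_warning()` is available, analogous to the `insight::format_error()` call already used in this function.

```r
## TODO: deprecate later
if (!is.null(group)) {
  if (verbose) {
    # assumed helper, analogous to insight::format_error() used above
    insight::format_warning(
      "Argument `group` is deprecated and will be removed in a future release.",
      "Please use `by` instead."
    )
  }
  by <- group
}
```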
6 changes: 6 additions & 0 deletions R/data_xtabulate.R
@@ -229,6 +229,9 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
x$Group <- NULL
}


## FIXME: change group_by argument later

# print table
insight::export_table(
format(x, big_mark = big_mark, format = "html", ...),
@@ -265,6 +268,9 @@ print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {

out <- do.call(rbind, x)


## FIXME: change group_by argument later

# print table
insight::export_table(
out,
74 changes: 37 additions & 37 deletions R/demean.R
@@ -11,7 +11,7 @@
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#' that should be group- and de-meaned.
#' @param group Character vector (or formula) with the name of the variable that
#' @param by Character vector (or formula) with the name of the variable that
#' indicates the group- or cluster-ID.
#' @param center Method for centering. `demean()` always performs
#' mean-centering, while `degroup()` can use `center = "median"` or
Expand All @@ -25,6 +25,7 @@
#' attributes to indicate the within- and between-effects. This is only
#' relevant when printing `model_parameters()` - in such cases, the
#' within- and between-effects are printed in separated blocks.
#' @param group Deprecated. Use `by` instead.
#' @inheritParams center
#'
#' @return
@@ -92,7 +93,7 @@
#'
#' \subsection{Terminology}{
#' The group-meaned variable is simply the mean of an independent variable
#' within each group (or id-level or cluster) represented by `group`.
#' within each group (or id-level or cluster) represented by `by`.
#' It represents the cluster-mean of an independent variable. The regression
#' coefficient of a group-meaned variable is the *between-subject-effect*.
#' The de-meaned variable is then the centered version of the group-meaned
@@ -199,10 +200,10 @@
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), by = "ID")
#' head(x)
#'
#'
@@ -213,23 +214,29 @@
#' y = c(1, 2, 1, 2, 4, 3, 2, 1),
#' ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), group = "ID")
#' demean(dat, select = c("a", "x*y"), by = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, group = ~ID)
#' demean(dat, select = ~ a + x * y, by = ~ID)
#'
#' @export
demean <- function(x,
select,
group,
by,
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

degroup(
x = x,
select = select,
group = group,
by = by,
center = "mean",
suffix_demean = suffix_demean,
suffix_groupmean = suffix_groupmean,
@@ -247,12 +254,18 @@ demean <- function(x,
#' @export
degroup <- function(x,
select,
group,
by,
center = "mean",
suffix_demean = "_within",
suffix_groupmean = "_between",
add_attributes = TRUE,
verbose = TRUE) {
verbose = TRUE,
group = NULL) {
## TODO: deprecate later
if (!is.null(group)) {
by <- group
}

# ugly tibbles again...
x <- .coerce_to_dataframe(x)

Expand All @@ -266,8 +279,8 @@ degroup <- function(x,
))
}

if (inherits(group, "formula")) {
group <- all.vars(group)
if (inherits(by, "formula")) {
by <- all.vars(by)
}

interactions_no <- select[!grepl("(\\*|\\:)", select)]
@@ -296,7 +309,7 @@
select <- intersect(colnames(x), select)

# get data to demean...
dat <- x[, c(select, group)]
dat <- x[, c(select, by)]


# find categorical predictors that are coded as factors
@@ -344,31 +357,18 @@
# for variables within each group (the group means). assign
# mean values to a vector of same length as the data

if (center == "mode") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
})
} else if (center == "median") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
})
} else if (center == "min") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
})
} else if (center == "max") {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
})
} else {
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
})
}

gm_fun <- switch(center,
mode = function(.gm) distribution_mode(stats::na.omit(.gm)),
median = function(.gm) stats::median(.gm, na.rm = TRUE),
min = function(.gm) min(.gm, na.rm = TRUE),
max = function(.gm) max(.gm, na.rm = TRUE),
function(.gm) mean(.gm, na.rm = TRUE)
)
x_gm_list <- lapply(select, function(i) {
stats::ave(dat[[i]], dat[[by]], FUN = gm_fun)
})
names(x_gm_list) <- select


# create de-meaned variables by subtracting the group mean from each individual value

x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
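For illustration, a self-contained sketch (hypothetical data, not part of the diff) of what the refactored `switch()`/`stats::ave()` combination computes: the group summary is broadcast back to every row of its group, and the de-meaned values are the per-row differences from that summary.

```r
dat <- data.frame(
  x  = c(1, 2, 3, 10, 20, 30),
  ID = c("a", "a", "a", "b", "b", "b")
)

# group means, repeated for every row of the respective group
gm <- stats::ave(dat$x, dat$ID, FUN = function(.gm) mean(.gm, na.rm = TRUE))
gm
#> [1]  2  2  2 20 20 20

# de-meaned (within-group) values
dat$x - gm
#> [1]  -1   0   1 -10   0  10
```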