add row_min, row_max, minor cleanup elsewhere

m-clark · Jul 13, 2020 · 87df6da · 87df6da
1 parent 4ad02fe
commit 87df6da
Show file tree

Hide file tree

Showing 58 changed files with 1,617 additions and 1,577 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tidyext
 Type: Package
 Title: Tidy Extensions for Data Processing
-Version: 0.3.5
+Version: 0.3.6
 Authors@R: person("Michael", "Clark", role = c("aut", "cre"), email = "[email protected]")
 Maintainer: Michael Clark <[email protected]>
 Description: Common data processing and summary functions to extend your tidy ways.
@@ -16,10 +16,8 @@ Depends:
 Imports:
     dplyr (>= 1.0.0),
     purrr,
-    magrittr,
     rlang,
     scales,
-    tibble,
     tidyr (>= 1.0.0)
 Suggests:
     ggplot2,
@@ -28,5 +26,6 @@ Suggests:
     stringi,
     stringr,
     testthat,
+    tibble,
     covr
 RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
-export("%>%")
 export(cat_by)
 export(combn_2_col)
 export(create_prediction_data)
@@ -15,17 +14,16 @@ export(onehot)
 export(pre_process)
 export(rnd)
 export(row_apply)
+export(row_max)
 export(row_means)
+export(row_min)
 export(row_sums)
 export(spread2)
 export(sum_NA)
 export(sum_NaN)
 export(sum_blank)
-export(vars)
 import(dplyr)
 importFrom(dplyr,quo_name)
-importFrom(dplyr,vars)
-importFrom(magrittr,"%>%")
 importFrom(purrr,map)
 importFrom(purrr,map_df)
 importFrom(purrr,map_dfr)
@@ -39,7 +37,6 @@ importFrom(stats,model.matrix)
 importFrom(stats,na.omit)
 importFrom(stats,quantile)
 importFrom(stats,sd)
-importFrom(tibble,rowid_to_column)
 importFrom(tidyr,gather)
 importFrom(tidyr,nest)
 importFrom(tidyr,unnest)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,10 @@
+# tidyext  0.3.6
+
+Added `row_min` and `row_max`. Removed some dependencies.
+
 # tidyext  0.3.5
 
-Cleanup and update for R 4.0 and dplyr 1.0. Deprecate gather_multi and spread2.
+Cleanup and update for R 4.0 and dplyr 1.0. Deprecate `gather_multi` and `spread2`.
 
 # tidyext  0.3.1
 

diff --git a/R/combn_2_col.R b/R/combn_2_col.R
@@ -47,92 +47,57 @@
 #'   only the indicator columns.
 #' @examples
 #' library(tidyext)
-#' d = data.frame(id = 1:4,
-#'                labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E'))
-#' test = combn_2_col(data=d, var='labs', max_m=3)
+#'
+#' d = data.frame(id = 1:4, labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E'))
+#' test = combn_2_col(data = d, var = 'labs', max_m = 3)
 #' test
 #' str(test)
+#'
 #' d$labs =  c('A B', 'B C D E', 'A E', 'D E')
-#' combn_2_col(data=d, var='labs', max_m=1)
+#' combn_2_col(data = d, var = 'labs', max_m = 1)
+#'
 #' d$labs =  c('Tom, Dick & Harriet', "J'Sean", "OBG, Andreas", NA)
-#' combn_2_col(data=d, var='labs', sep=',', max_m=2, collapse='-')
-#'
-#' \dontrun{
-#' # requires at least tidytext
-#' tidy_dtm <- function(data, var, sep='-', max_m=3) {
-#'   init = stringr::str_split(data[[var]], pattern = sep) # creates a list of separated letters
-#'
-#'   # the following gets the combos with a dot separating drugs in a given combo
-#'   # this first lapply could be parallelized if need be and is probably slowest
-#'   # probably want to change to m = min(c(4, m)) so as to only limit to 4
-#'   # see also, combinat::combn which is slightly faster than base R below
-#'   observation_combos = init %>%
-#'     lapply(function(x)
-#'       sapply(seq_along(x), function(m)
-#'         utils::combn(x,  min(max_m, m), FUN=paste, collapse = '_')))
-#'
-#'   # now we have a standard text analysis problem in need of a document term
-#'   matrix
-#'   documents = observation_combos %>% lapply(unlist)
-#'
-#'   # create a 'tidy' form of documents and terms; each term (i.e. combo) only
-#'   occurs once in a document
-#'   doc_df = data.frame(id=rep(data$id, sapply(documents, length)),
-#'                       combos=unlist(documents),
-#'                       count=1)  # each term only occurs once in the document
-#'   doc_df %>%
-#'     tidytext::cast_dfm(document=id, term=combos, value=count)
-#'   }
-#'
-#' # requires at least text2vec
-#' ttv <- function(data, var, sep='-', max_m=3) {
-#'   docs = sapply(stringr::str_split(data[[var]], pattern=sep),
-#'                 function(str_vec)
-#'                   sapply(seq_along(str_vec),
-#'                          function(m)
-#'                            combn(str_vec,
-#'                                  m = min(max_m, m),
-#'                                  FUN = paste,
-#'                                  collapse = '_')
-#'                   ) %>% unlist()
-#'   )
-#'
-#'   toks = itoken(docs, progressbar = FALSE)
-#'   vocab = create_vocabulary(toks)
-#'   create_dtm(toks, vectorizer = vocab_vectorizer(vocab), progressbar = FALSE) %>%
-#'     as.matrix() %>%
-#'     cbind(data,.)
-#' }
-#'
-#' }
+#'
+#' combn_2_col(
+#' data = d,
+#' var = 'labs',
+#' sep = ',',
+#' max_m = 2,
+#' collapse = '-'
+#' )
+#'
 #'
 #'
 #'
 #' @export
-combn_2_col <- function(data,
-                        var,
-                        sep='[^[:alnum:]]+',
-                        max_m=1,
-                        collapse = '_',
-                        toInteger=FALSE,
-                        sparse=FALSE) {
+combn_2_col <- function(
+  data,
+  var,
+  sep = '[^[:alnum:]]+',
+  max_m = 1,
+  collapse = '_',
+  toInteger = FALSE,
+  sparse = FALSE
+) {
 
   if (is.null(data) | is.null(var))
     stop('Need data and variable name to continue.')
 
   if (max_m < 1) stop('Need positive value for max_m.')
 
   data$combo <-
-    map(stringr::str_split(data[[var]], pattern=sep),
-           function(str_vec)
-             map(seq_along(str_vec),
-                    function(m)
-                      combn(str_vec,
-                            m = min(max_m, m),
-                            FUN = paste,
-                            collapse = collapse)
-             ) %>% unlist()
+    map(stringr::str_split(data[[var]], pattern = sep),
+        function(str_vec)
+          map(seq_along(str_vec),
+              function(m)
+                combn(str_vec,
+                      m = min(max_m, m),
+                      FUN = paste,
+                      collapse = collapse)
+          ) %>%
+          unlist()
     )
+
   combo_cols <- unique(unlist(data$combo))
 
   if (sparse) {
@@ -142,7 +107,7 @@ combn_2_col <- function(data,
         do.call(rbind,.) %>%
         Matrix::Matrix(sparse = TRUE,
                        dimnames = list(rownames(data), combo_cols))
-      )
+    )
   }
 
   if (toInteger) {
@@ -156,6 +121,7 @@ combn_2_col <- function(data,
       map(function(x) combo_cols %in% x) %>%
       do.call(rbind,.)
   }
+
   data
 }
 
diff --git a/R/gather_multi.R b/R/gather_multi.R
@@ -132,7 +132,7 @@ gather_multi <- function(data,
                   na.rm = na.rm,
                   convert = convert,
                   factor_key = factor_key) %>%
-    rowid_to_column()
+    mutate(rowid = 1:nrow(.)) %>%  # changed to get rid of tibble requirement while deprecated
 
   for (i in 2:length(varlist)) {
     data_long <- data %>%
@@ -144,7 +144,7 @@ gather_multi <- function(data,
                     na.rm = na.rm,
                     convert = convert,
                     factor_key = factor_key) %>%
-      rowid_to_column()%>%
+      mutate(rowid = 1:nrow(.)) %>%  # changed to get rid of tibble requirement while deprecated
       select(rowid, !!values[[i]]) %>%
       left_join(data_long, ., by='rowid')
   }

diff --git a/R/globals.R b/R/globals.R
@@ -3,4 +3,4 @@ utils::globalVariables(c(".", 'Max', 'Mean', 'Median', 'Min', 'Missing', 'N',
                          'Q1', 'Q3', 'SD', 'Variable', 'X1st.Qu.', 'X3rd.Qu.',
                          'perc', 'result', 'results', 'target', 'value',
                          'x.Freq', 'x.x', 'y.Freq', 'y.x', '%', 'rowid',
-                         'Group', 'Frequency'))
+                         'Group', 'Frequency', 'rn'))
diff --git a/R/head_tail.R b/R/head_tail.R
@@ -11,8 +11,8 @@
 #' @examples
 #'
 #' library(tidyext)
-#' as.matrix(mtcars) %>%
-#' head_tail(6)
+#'
+#' head_tail(mtcars)
 #'
 head_tail = function(data, n_slice = 6) {
   # initial checks

diff --git a/R/num_summary.R b/R/num_summary.R
@@ -35,7 +35,7 @@ num_summary <- function(x, digits = 1, extra = FALSE) {
 
   x <- as.numeric(x)
 
-  d <- tibble(
+  d <- dplyr::tibble(
     N = length(na.omit(x)),
     data.frame(t(c(summary(x)))),
     SD = sd(x, na.rm = TRUE),

diff --git a/R/pre_process.R b/R/pre_process.R
@@ -33,14 +33,17 @@
 #'   them.
 #' @importFrom scales rescale
 #' @return A data frame that has been processed
-#' @export
 #'
 #' @examples
 #' library(tidyext)
+#' library(dplyr)
+#'
 #' pre_process(mtcars)
-#' pre_process(mtcars, log_vars=vars(mpg, wt))
-#' pre_process(mtcars, zero_start=vars(cyl, gear))
-#' pre_process(mtcars, zero_one=vars(mpg))
+#' pre_process(mtcars, log_vars = vars(mpg, wt))
+#' pre_process(mtcars, zero_start = vars(cyl, gear))
+#' pre_process(mtcars, zero_one = vars(mpg))
+#'
+#' @export
 pre_process <- function(
   data,
   std = TRUE,

diff --git a/R/row_sums.R b/R/row_sums.R
@@ -7,7 +7,7 @@
 #' @param ... The columns to sum, take the mean of, etc. \emph{Required}.
 #' @param .fun The function to apply.
 #' @param na_rm Whether to remove \code{NA} values or not. Default is \code{FALSE}.
-#' @param varname The column name of the sums means etc.
+#' @param varname The column name of the sums means etc. as a character string.
 #'
 #' @details Simple wrappers for applying rowwise operations only for selected
 #'   columns within the tidyverse approach to data processing.  The
@@ -28,6 +28,9 @@
 #' d  %>%
 #'   row_means(matches('x|z'))
 #'
+#' d  %>%
+#'   row_max(matches('x|y'))
+#'
 #' row_apply(
 #'   d ,
 #'   everything(),
@@ -37,6 +40,8 @@
 #'
 #' @export
 row_sums <- function(data, ..., na_rm = FALSE, varname = 'sum') {
+  # note: dplyr 1.0 included rowwise operations, but it wasn't obvious what
+  # advantage there would be for these functions except for min and max
   dplyr::mutate(data, !!varname := rowSums(select(data, ...), na.rm = na_rm))
 }
 
@@ -46,6 +51,28 @@ row_means <- function(data, ..., na_rm = FALSE, varname = 'mean') {
   dplyr::mutate(data, !!varname := rowMeans(select(data, ...), na.rm = na_rm))
 }
 
+#' @export
+#' @rdname row_sums
+row_min <- function(data, ..., na_rm = FALSE, varname = 'min') {
+  dplyr::select(data, ...) %>%
+    dplyr::mutate(rn = 1:nrow(.)) %>%
+    dplyr::rowwise(rn) %>%
+    dplyr::mutate(!!varname := min(dplyr::c_across(...), na.rm = na_rm)) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-rn)
+}
+
+#' @export
+#' @rdname row_sums
+row_max <- function(data, ..., na_rm = FALSE, varname = 'max') {
+  dplyr::select(data, ...) %>%
+    dplyr::mutate(rn = 1:nrow(.)) %>%
+    dplyr::rowwise(rn) %>%
+    dplyr::mutate(!!varname := max(dplyr::c_across(...), na.rm = na_rm)) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-rn)
+}
+
 #' @export
 #' @rdname row_sums
 row_apply <- function(data, ..., .fun, varname = 'var') {

diff --git a/R/spread2.R b/R/spread2.R
@@ -29,7 +29,6 @@
 #'
 #' @return A data frame with 'wide' format.
 #' @seealso \code{\link[tidyr]{spread}}
-#' @importFrom tibble rowid_to_column
 #' @examples
 #' \dontrun{
 #' library(tidyext); library(tidyr)
@@ -90,7 +89,7 @@ spread2 <- function(data,
   } else {
     data <- data %>%
       bind_cols(data %>%
-                  tibble::rowid_to_column() %>%
+                  mutate(rowid = 1:nrow(.)) %>%  # changed to get rid of tibble requirement while deprecated
                   select(rowid)
                 )
   }

diff --git a/R/utils.R b/R/utils.R
@@ -1,9 +1 @@
-#' @importFrom magrittr %>%
-#' @export
-magrittr::`%>%`
-
-
-
-#' @importFrom dplyr vars
-#' @export
-dplyr::vars
+#' @importFrom dplyr vars `%>%` tibble
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -40,14 +40,16 @@ reference:
       - create_prediction_data
       - gather_multi
       - onehot
-      # - pre_process
+      - pre_process
       # - spread2
       - row_sums
   - title: "Miscellaneous"
     desc: >
       Miscellaneous functions.
     contents:
       - select_not
+      - head_tail
+      - rnd
       - tidyext
 figures:
   dev: svglite::svglite