Fix converting to factors when more than 9 levels (#367)

* Fix converting to factors when more than 9 levels * version, news * Apply automatic changes * drop haven class attr * rd * check if the behaviour is reproducible * Apply automatic changes * add test * add test * fix * fix strict test * fix lints --------- Co-authored-by: strengejacke <[email protected]> Co-authored-by: Indrajeet Patil <[email protected]> Co-authored-by: IndrajeetPatil <[email protected]>
easystats · Mar 2, 2023 · fe74c96 · fe74c96
1 parent 9f101d1
commit fe74c96
Show file tree

Hide file tree

Showing 6 changed files with 373 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.6.5.7
+Version: 0.6.5.8
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -34,6 +34,8 @@ BUG FIXES
 * Select helpers now work in custom functions when argument is called `select`
   (#356).
 * Fix unexpected warning in `convert_na_to()` when `select` is a list (#352).
+* Fixed issue with correct labelling of numeric variables with more than nine
+  unique values and associated value labels.
 
 # datawizard 0.6.5
 

diff --git a/R/data_read.R b/R/data_read.R
@@ -149,7 +149,11 @@ data_read <- function(path,
         if (!is.character(i)) {
           # if all values are labelled, we assume factor. Use labels as levels
           if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
-            i <- factor(as.character(i), labels = names(value_labels))
+            if (is.numeric(i)) {
+              i <- factor(i, labels = names(value_labels))
+            } else {
+              i <- factor(as.character(i), labels = names(value_labels))
+            }
             value_labels <- NULL
           } else {
             # else, fall back to numeric
@@ -171,6 +175,13 @@ data_read <- function(path,
       }
       i
     })
+  } else {
+    # drop haven class attributes
+    x[] <- lapply(x, function(i) {
+      # save labels
+      class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
+      i
+    })
   }
 
   class(x) <- "data.frame"

diff --git a/tests/testthat/test-convert_to_na.R b/tests/testthat/test-convert_to_na.R
@@ -16,7 +16,7 @@ test_that("convert_to_na-factor", {
   expect_identical(as.vector(table(x)), c(50L, 50L))
 
   expect_message(
-    x <- convert_to_na(iris$Species, na = 2),
+    x <- convert_to_na(iris$Species, na = 2), # nolint
     "for a factor or character variable"
   )
   expect_identical(sum(is.na(x)), 0L)
@@ -39,7 +39,7 @@ test_that("convert_to_na-numeric", {
 
 test_that("convert_to_na-df", {
   expect_message(
-    x <- convert_to_na(iris, na = 5),
+    x <- convert_to_na(iris, na = 5), # nolint
     "needs to be a character vector"
   )
   expect_identical(sum(is.na(x)), sum(vapply(iris, function(i) sum(i == 5), FUN.VALUE = integer(1L))))
@@ -49,7 +49,7 @@ test_that("convert_to_na-df", {
 
   data(iris)
   expect_message(
-    x <- convert_to_na(iris, na = 3),
+    x <- convert_to_na(iris, na = 3), # nolint
     "needs to be a character vector"
   )
   expect_identical(sum(is.na(x)), sum(vapply(iris, function(i) {
@@ -76,7 +76,7 @@ test_that("convert_to_na other classes", {
 
   x <- convert_to_na(d$a, na = 3)
   expect_equal(x, c(1, 2, NA, 4, 5), tolerance = 1e-3, ignore_attr = TRUE)
-  expect_message(x <- convert_to_na(d$a, na = "c"), "needs to be a numeric vector")
+  expect_message(x <- convert_to_na(d$a, na = "c"), "needs to be a numeric vector") # nolint
   expect_equal(x, 1:5, tolerance = 1e-3, ignore_attr = TRUE)
 
   x <- convert_to_na(d$b, na = "c")
@@ -109,7 +109,7 @@ test_that("convert_to_na other classes", {
   x <- convert_to_na(d$d, na = TRUE)
   expect_equal(x, c(NA, NA, FALSE, FALSE, NA), tolerance = 1e-3, ignore_attr = TRUE)
   expect_message(
-    x <- convert_to_na(d$e, na = as.complex(4)),
+    x <- convert_to_na(d$e, na = as.complex(4)), # nolint
     "variables of class `complex`"
   )
   expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE)

diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R
@@ -235,3 +235,211 @@ test_that("data_read, convert_factors", {
 })
 
 unlink(temp_file)
+
+
+
+
+# SPSS file, many value labels  -----------------------------------
+
+# Output validated against SPSS output from original dataset
+
+temp_file <- tempfile(fileext = ".sav")
+request <- httr::GET("https://raw.github.com/easystats/circus/master/data/spss_many_labels.sav")
+httr::stop_for_status(request)
+writeBin(httr::content(request, type = "raw"), temp_file)
+
+test_that("data_read, convert many labels correctly", {
+  d <- data_read(
+    temp_file,
+    verbose = FALSE
+  )
+  # all are factors by default
+  expect_identical(
+    vapply(d, class, character(1)),
+    c(selv1 = "factor", c12 = "factor", c12a = "factor", c12c = "factor")
+  )
+  expect_identical(
+    levels(d$selv1),
+    c(
+      "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)", "Vignette 2 weiblich (Gülsen E. Anwältin B)",
+      "Vignette 3 weiblich (Monika E. Reinigungskraft B)", "Vignette 4 weiblich (Monika E. Anwältin B)",
+      "Vignette 5 männlich (Hasan E. Reinigungskraft B)", "Vignette 6 männlich (Hasan E. Anwalt B)",
+      "Vignette 7 männlich (Martin E. Reinigungskraft B)", "Vignette 8 männlich (Martin E. Anwalt B)",
+      "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)", "Vignette 10 weiblich (Gülsen E. Anwältin E)",
+      "Vignette 11 weiblich (Monika E. Reinigungskraft E)", "Vignette 12 weiblich (Monika E. Anwältin E)",
+      "Vignette 13 männlich (Hasan E. Reinigungskraft E)", "Vignette 14 männlich (Hasan E. Anwalt E)",
+      "Vignette 15 männlich (Martin E. Reinigungskraft E)", "Vignette 16 männlich (Martin E. Anwalt E)"
+    )
+  )
+  out <- capture.output(data_tabulate(d$selv1))
+  expect_identical(
+    out,
+    c(
+      "d$selv1 <categorical>", "# total N=2413 valid N=2413", "",
+      "Value                                              |   N | Raw % | Valid % | Cumulative %",
+      "---------------------------------------------------+-----+-------+---------+-------------",
+      "Vignette 1 weiblich (Gülsen E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |         6.22",
+      "Vignette 2 weiblich (Gülsen E. Anwältin B)         | 150 |  6.22 |    6.22 |        12.43",
+      "Vignette 3 weiblich (Monika E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        18.65",
+      "Vignette 4 weiblich (Monika E. Anwältin B)         | 151 |  6.26 |    6.26 |        24.91",
+      "Vignette 5 männlich (Hasan E. Reinigungskraft B)   | 151 |  6.26 |    6.26 |        31.16",
+      "Vignette 6 männlich (Hasan E. Anwalt B)            | 153 |  6.34 |    6.34 |        37.51",
+      "Vignette 7 männlich (Martin E. Reinigungskraft B)  | 150 |  6.22 |    6.22 |        43.72",
+      "Vignette 8 männlich (Martin E. Anwalt B)           | 150 |  6.22 |    6.22 |        49.94",
+      "Vignette 9 weiblich (Gülsen E. Reinigungskraft E)  | 151 |  6.26 |    6.26 |        56.20",
+      "Vignette 10 weiblich (Gülsen E. Anwältin E)        | 150 |  6.22 |    6.22 |        62.41",
+      "Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        68.63",
+      "Vignette 12 weiblich (Monika E. Anwältin E)        | 151 |  6.26 |    6.26 |        74.89",
+      "Vignette 13 männlich (Hasan E. Reinigungskraft E)  | 155 |  6.42 |    6.42 |        81.31",
+      "Vignette 14 männlich (Hasan E. Anwalt E)           | 150 |  6.22 |    6.22 |        87.53",
+      "Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 |  6.22 |    6.22 |        93.74",
+      "Vignette 16 männlich (Martin E. Anwalt E)          | 151 |  6.26 |    6.26 |       100.00",
+      "<NA>                                               |   0 |  0.00 |    <NA> |         <NA>"
+    )
+  )
+
+  expect_identical(levels(d$c12), c("ja", "nein", "keine Angabe"))
+  out <- capture.output(data_tabulate(d$c12))
+  expect_identical(
+    out,
+    c(
+      "Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (d$c12) <categorical>",
+      "# total N=2413 valid N=2413",
+      "",
+      "Value        |    N | Raw % | Valid % | Cumulative %",
+      "-------------+------+-------+---------+-------------",
+      "ja           |  786 | 32.57 |   32.57 |        32.57",
+      "nein         | 1616 | 66.97 |   66.97 |        99.54",
+      "keine Angabe |   11 |  0.46 |    0.46 |       100.00",
+      "<NA>         |    0 |  0.00 |    <NA> |         <NA>"
+    )
+  )
+
+  expect_identical(levels(d$c12a), c("Filter", "ja", "nein", "keine Angabe"))
+  out <- capture.output(data_tabulate(d$c12a))
+  expect_identical(
+    out,
+    c(
+      "Haben Sie deswegen Behandlung(en) in Anspruch genommen? (d$c12a) <categorical>",
+      "# total N=2413 valid N=2413",
+      "",
+      "Value        |    N | Raw % | Valid % | Cumulative %",
+      "-------------+------+-------+---------+-------------",
+      "Filter       | 1627 | 67.43 |   67.43 |        67.43",
+      "ja           |  500 | 20.72 |   20.72 |        88.15",
+      "nein         |  285 | 11.81 |   11.81 |        99.96",
+      "keine Angabe |    1 |  0.04 |    0.04 |       100.00",
+      "<NA>         |    0 |  0.00 |    <NA> |         <NA>"
+    )
+  )
+  expect_identical(
+    levels(d$c12c),
+    c(
+      "Filter", "0 = keine", "1", "2", "3", "4", "5", "6", "7", "8",
+      "9", "10 = sehr starke", "weiß nicht / keine Angabe"
+    )
+  )
+  out <- capture.output(data_tabulate(d$c12c))
+  expect_identical(
+    out,
+    c(
+      "Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (d$c12c) <categorical>", # nolint
+      "# total N=2413 valid N=2413",
+      "",
+      "Value                     |    N | Raw % | Valid % | Cumulative %",
+      "--------------------------+------+-------+---------+-------------",
+      "Filter                    | 1913 | 79.28 |   79.28 |        79.28",
+      "0 = keine                 |   34 |  1.41 |    1.41 |        80.69",
+      "1                         |    2 |  0.08 |    0.08 |        80.77",
+      "2                         |   11 |  0.46 |    0.46 |        81.23",
+      "3                         |   14 |  0.58 |    0.58 |        81.81",
+      "4                         |   19 |  0.79 |    0.79 |        82.59",
+      "5                         |   61 |  2.53 |    2.53 |        85.12",
+      "6                         |   42 |  1.74 |    1.74 |        86.86",
+      "7                         |   63 |  2.61 |    2.61 |        89.47",
+      "8                         |   97 |  4.02 |    4.02 |        93.49",
+      "9                         |   53 |  2.20 |    2.20 |        95.69",
+      "10 = sehr starke          |   99 |  4.10 |    4.10 |        99.79",
+      "weiß nicht / keine Angabe |    5 |  0.21 |    0.21 |       100.00",
+      "<NA>                      |    0 |  0.00 |    <NA> |         <NA>"
+    )
+  )
+})
+
+test_that("data_read, convert many labels correctly", {
+  d <- data_read(
+    temp_file,
+    convert_factors = FALSE,
+    verbose = FALSE
+  )
+  # all are factors by default
+  expect_identical(
+    vapply(d, class, character(1)),
+    c(selv1 = "numeric", c12 = "numeric", c12a = "numeric", c12c = "numeric")
+  )
+  out <- capture.output(table(d$selv1))
+  expect_identical(
+    out,
+    c(
+      "",
+      "  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 ",
+      "150 150 150 151 151 153 150 150 151 150 150 151 155 150 150 151 "
+    )
+  )
+  expect_identical(
+    attributes(d$selv1)$labels,
+    c(
+      `Vignette 1 weiblich (Gülsen E. Reinigungskraft B)` = 1, `Vignette 2 weiblich (Gülsen E. Anwältin B)` = 2,
+      `Vignette 3 weiblich (Monika E. Reinigungskraft B)` = 3, `Vignette 4 weiblich (Monika E. Anwältin B)` = 4,
+      `Vignette 5 männlich (Hasan E. Reinigungskraft B)` = 5, `Vignette 6 männlich (Hasan E. Anwalt B)` = 6,
+      `Vignette 7 männlich (Martin E. Reinigungskraft B)` = 7, `Vignette 8 männlich (Martin E. Anwalt B)` = 8,
+      `Vignette 9 weiblich (Gülsen E. Reinigungskraft E)` = 9, `Vignette 10 weiblich (Gülsen E. Anwältin E)` = 10,
+      `Vignette 11 weiblich (Monika E. Reinigungskraft E)` = 11, `Vignette 12 weiblich (Monika E. Anwältin E)` = 12,
+      `Vignette 13 männlich (Hasan E. Reinigungskraft E)` = 13, `Vignette 14 männlich (Hasan E. Anwalt E)` = 14,
+      `Vignette 15 männlich (Martin E. Reinigungskraft E)` = 15, `Vignette 16 männlich (Martin E. Anwalt E)` = 16,
+      `99` = 99
+    )
+  )
+
+  out <- capture.output(table(d$c12))
+  expect_identical(
+    out,
+    c(
+      "",
+      "   1    2   99 ",
+      " 786 1616   11 "
+    )
+  )
+  expect_identical(attributes(d$c12)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99))
+
+  out <- capture.output(table(d$c12a))
+  expect_identical(
+    out,
+    c(
+      "",
+      "  -2    1    2   99 ",
+      "1627  500  285    1 "
+    )
+  )
+  expect_identical(attributes(d$c12a)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99))
+
+  out <- capture.output(table(d$c12c))
+  expect_identical(
+    out,
+    c(
+      "",
+      "  -2    0    1    2    3    4    5    6    7    8    9   10   99 ",
+      "1913   34    2   11   14   19   61   42   63   97   53   99    5 "
+    )
+  )
+  expect_identical(
+    attributes(d$c12c)$labels,
+    c(
+      Filter = -2, `0 = keine` = 0, `1` = 1, `2` = 2, `3` = 3, `4` = 4,
+      `5` = 5, `6` = 6, `7` = 7, `8` = 8, `9` = 9, `10 = sehr starke` = 10,
+      `weiß nicht / keine Angabe` = 99
+    )
+  )
+})
+
+unlink(temp_file)