Skip to content

Commit

Permalink
Fix converting to factors when more than 9 levels (#367)
Browse files Browse the repository at this point in the history
* Fix converting to factors when more than 9 levels

* version, news

* Apply automatic changes

* drop haven class attr

* rd

* check if the behaviour is reproducible

* Apply automatic changes

* add test

* add test

* fix

* fix strict test

* fix lints

---------

Co-authored-by: strengejacke <[email protected]>
Co-authored-by: Indrajeet Patil <[email protected]>
Co-authored-by: IndrajeetPatil <[email protected]>
  • Loading branch information
4 people committed Mar 2, 2023
1 parent 9f101d1 commit fe74c96
Show file tree
Hide file tree
Showing 6 changed files with 373 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.6.5.7
Version: 0.6.5.8
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ BUG FIXES
* Select helpers now work in custom functions when argument is called `select`
(#356).
* Fix unexpected warning in `convert_na_to()` when `select` is a list (#352).
* Fixed issue with correct labelling of numeric variables with more than nine
unique values and associated value labels.

# datawizard 0.6.5

Expand Down
13 changes: 12 additions & 1 deletion R/data_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,11 @@ data_read <- function(path,
if (!is.character(i)) {
# if all values are labelled, we assume factor. Use labels as levels
if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
i <- factor(as.character(i), labels = names(value_labels))
if (is.numeric(i)) {
i <- factor(i, labels = names(value_labels))
} else {
i <- factor(as.character(i), labels = names(value_labels))
}
value_labels <- NULL
} else {
# else, fall back to numeric
Expand All @@ -171,6 +175,13 @@ data_read <- function(path,
}
i
})
} else {
# drop haven class attributes
x[] <- lapply(x, function(i) {
# save labels
class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
i
})
}

class(x) <- "data.frame"
Expand Down
10 changes: 5 additions & 5 deletions tests/testthat/test-convert_to_na.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ test_that("convert_to_na-factor", {
expect_identical(as.vector(table(x)), c(50L, 50L))

expect_message(
x <- convert_to_na(iris$Species, na = 2),
x <- convert_to_na(iris$Species, na = 2), # nolint
"for a factor or character variable"
)
expect_identical(sum(is.na(x)), 0L)
Expand All @@ -39,7 +39,7 @@ test_that("convert_to_na-numeric", {

test_that("convert_to_na-df", {
expect_message(
x <- convert_to_na(iris, na = 5),
x <- convert_to_na(iris, na = 5), # nolint
"needs to be a character vector"
)
expect_identical(sum(is.na(x)), sum(vapply(iris, function(i) sum(i == 5), FUN.VALUE = integer(1L))))
Expand All @@ -49,7 +49,7 @@ test_that("convert_to_na-df", {

data(iris)
expect_message(
x <- convert_to_na(iris, na = 3),
x <- convert_to_na(iris, na = 3), # nolint
"needs to be a character vector"
)
expect_identical(sum(is.na(x)), sum(vapply(iris, function(i) {
Expand All @@ -76,7 +76,7 @@ test_that("convert_to_na other classes", {

x <- convert_to_na(d$a, na = 3)
expect_equal(x, c(1, 2, NA, 4, 5), tolerance = 1e-3, ignore_attr = TRUE)
expect_message(x <- convert_to_na(d$a, na = "c"), "needs to be a numeric vector")
expect_message(x <- convert_to_na(d$a, na = "c"), "needs to be a numeric vector") # nolint
expect_equal(x, 1:5, tolerance = 1e-3, ignore_attr = TRUE)

x <- convert_to_na(d$b, na = "c")
Expand Down Expand Up @@ -109,7 +109,7 @@ test_that("convert_to_na other classes", {
x <- convert_to_na(d$d, na = TRUE)
expect_equal(x, c(NA, NA, FALSE, FALSE, NA), tolerance = 1e-3, ignore_attr = TRUE)
expect_message(
x <- convert_to_na(d$e, na = as.complex(4)),
x <- convert_to_na(d$e, na = as.complex(4)), # nolint
"variables of class `complex`"
)
expect_equal(x, d$e, tolerance = 1e-3, ignore_attr = TRUE)
Expand Down
208 changes: 208 additions & 0 deletions tests/testthat/test-data_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,211 @@ test_that("data_read, convert_factors", {
})

unlink(temp_file)




# SPSS file, many value labels -----------------------------------

# Output validated against SPSS output from original dataset

temp_file <- tempfile(fileext = ".sav")
request <- httr::GET("https://raw.github.com/easystats/circus/master/data/spss_many_labels.sav")
httr::stop_for_status(request)
writeBin(httr::content(request, type = "raw"), temp_file)

test_that("data_read, convert many labels correctly", {
d <- data_read(
temp_file,
verbose = FALSE
)
# all are factors by default
expect_identical(
vapply(d, class, character(1)),
c(selv1 = "factor", c12 = "factor", c12a = "factor", c12c = "factor")
)
expect_identical(
levels(d$selv1),
c(
"Vignette 1 weiblich (Gülsen E. Reinigungskraft B)", "Vignette 2 weiblich (Gülsen E. Anwältin B)",
"Vignette 3 weiblich (Monika E. Reinigungskraft B)", "Vignette 4 weiblich (Monika E. Anwältin B)",
"Vignette 5 männlich (Hasan E. Reinigungskraft B)", "Vignette 6 männlich (Hasan E. Anwalt B)",
"Vignette 7 männlich (Martin E. Reinigungskraft B)", "Vignette 8 männlich (Martin E. Anwalt B)",
"Vignette 9 weiblich (Gülsen E. Reinigungskraft E)", "Vignette 10 weiblich (Gülsen E. Anwältin E)",
"Vignette 11 weiblich (Monika E. Reinigungskraft E)", "Vignette 12 weiblich (Monika E. Anwältin E)",
"Vignette 13 männlich (Hasan E. Reinigungskraft E)", "Vignette 14 männlich (Hasan E. Anwalt E)",
"Vignette 15 männlich (Martin E. Reinigungskraft E)", "Vignette 16 männlich (Martin E. Anwalt E)"
)
)
out <- capture.output(data_tabulate(d$selv1))
expect_identical(
out,
c(
"d$selv1 <categorical>", "# total N=2413 valid N=2413", "",
"Value | N | Raw % | Valid % | Cumulative %",
"---------------------------------------------------+-----+-------+---------+-------------",
"Vignette 1 weiblich (Gülsen E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 6.22",
"Vignette 2 weiblich (Gülsen E. Anwältin B) | 150 | 6.22 | 6.22 | 12.43",
"Vignette 3 weiblich (Monika E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 18.65",
"Vignette 4 weiblich (Monika E. Anwältin B) | 151 | 6.26 | 6.26 | 24.91",
"Vignette 5 männlich (Hasan E. Reinigungskraft B) | 151 | 6.26 | 6.26 | 31.16",
"Vignette 6 männlich (Hasan E. Anwalt B) | 153 | 6.34 | 6.34 | 37.51",
"Vignette 7 männlich (Martin E. Reinigungskraft B) | 150 | 6.22 | 6.22 | 43.72",
"Vignette 8 männlich (Martin E. Anwalt B) | 150 | 6.22 | 6.22 | 49.94",
"Vignette 9 weiblich (Gülsen E. Reinigungskraft E) | 151 | 6.26 | 6.26 | 56.20",
"Vignette 10 weiblich (Gülsen E. Anwältin E) | 150 | 6.22 | 6.22 | 62.41",
"Vignette 11 weiblich (Monika E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 68.63",
"Vignette 12 weiblich (Monika E. Anwältin E) | 151 | 6.26 | 6.26 | 74.89",
"Vignette 13 männlich (Hasan E. Reinigungskraft E) | 155 | 6.42 | 6.42 | 81.31",
"Vignette 14 männlich (Hasan E. Anwalt E) | 150 | 6.22 | 6.22 | 87.53",
"Vignette 15 männlich (Martin E. Reinigungskraft E) | 150 | 6.22 | 6.22 | 93.74",
"Vignette 16 männlich (Martin E. Anwalt E) | 151 | 6.26 | 6.26 | 100.00",
"<NA> | 0 | 0.00 | <NA> | <NA>"
)
)

expect_identical(levels(d$c12), c("ja", "nein", "keine Angabe"))
out <- capture.output(data_tabulate(d$c12))
expect_identical(
out,
c(
"Sind oder waren Sie schon einmal selbst von solchen Beschwerden betroffen? (d$c12) <categorical>",
"# total N=2413 valid N=2413",
"",
"Value | N | Raw % | Valid % | Cumulative %",
"-------------+------+-------+---------+-------------",
"ja | 786 | 32.57 | 32.57 | 32.57",
"nein | 1616 | 66.97 | 66.97 | 99.54",
"keine Angabe | 11 | 0.46 | 0.46 | 100.00",
"<NA> | 0 | 0.00 | <NA> | <NA>"
)
)

expect_identical(levels(d$c12a), c("Filter", "ja", "nein", "keine Angabe"))
out <- capture.output(data_tabulate(d$c12a))
expect_identical(
out,
c(
"Haben Sie deswegen Behandlung(en) in Anspruch genommen? (d$c12a) <categorical>",
"# total N=2413 valid N=2413",
"",
"Value | N | Raw % | Valid % | Cumulative %",
"-------------+------+-------+---------+-------------",
"Filter | 1627 | 67.43 | 67.43 | 67.43",
"ja | 500 | 20.72 | 20.72 | 88.15",
"nein | 285 | 11.81 | 11.81 | 99.96",
"keine Angabe | 1 | 0.04 | 0.04 | 100.00",
"<NA> | 0 | 0.00 | <NA> | <NA>"
)
)
expect_identical(
levels(d$c12c),
c(
"Filter", "0 = keine", "1", "2", "3", "4", "5", "6", "7", "8",
"9", "10 = sehr starke", "weiß nicht / keine Angabe"
)
)
out <- capture.output(data_tabulate(d$c12c))
expect_identical(
out,
c(
"Wie sehr haben diese Behandlung(en) Ihre Beeinträchtigung durch die Beschwerden verbessert? (d$c12c) <categorical>", # nolint
"# total N=2413 valid N=2413",
"",
"Value | N | Raw % | Valid % | Cumulative %",
"--------------------------+------+-------+---------+-------------",
"Filter | 1913 | 79.28 | 79.28 | 79.28",
"0 = keine | 34 | 1.41 | 1.41 | 80.69",
"1 | 2 | 0.08 | 0.08 | 80.77",
"2 | 11 | 0.46 | 0.46 | 81.23",
"3 | 14 | 0.58 | 0.58 | 81.81",
"4 | 19 | 0.79 | 0.79 | 82.59",
"5 | 61 | 2.53 | 2.53 | 85.12",
"6 | 42 | 1.74 | 1.74 | 86.86",
"7 | 63 | 2.61 | 2.61 | 89.47",
"8 | 97 | 4.02 | 4.02 | 93.49",
"9 | 53 | 2.20 | 2.20 | 95.69",
"10 = sehr starke | 99 | 4.10 | 4.10 | 99.79",
"weiß nicht / keine Angabe | 5 | 0.21 | 0.21 | 100.00",
"<NA> | 0 | 0.00 | <NA> | <NA>"
)
)
})

test_that("data_read, convert many labels correctly", {
d <- data_read(
temp_file,
convert_factors = FALSE,
verbose = FALSE
)
# all are factors by default
expect_identical(
vapply(d, class, character(1)),
c(selv1 = "numeric", c12 = "numeric", c12a = "numeric", c12c = "numeric")
)
out <- capture.output(table(d$selv1))
expect_identical(
out,
c(
"",
" 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ",
"150 150 150 151 151 153 150 150 151 150 150 151 155 150 150 151 "
)
)
expect_identical(
attributes(d$selv1)$labels,
c(
`Vignette 1 weiblich (Gülsen E. Reinigungskraft B)` = 1, `Vignette 2 weiblich (Gülsen E. Anwältin B)` = 2,
`Vignette 3 weiblich (Monika E. Reinigungskraft B)` = 3, `Vignette 4 weiblich (Monika E. Anwältin B)` = 4,
`Vignette 5 männlich (Hasan E. Reinigungskraft B)` = 5, `Vignette 6 männlich (Hasan E. Anwalt B)` = 6,
`Vignette 7 männlich (Martin E. Reinigungskraft B)` = 7, `Vignette 8 männlich (Martin E. Anwalt B)` = 8,
`Vignette 9 weiblich (Gülsen E. Reinigungskraft E)` = 9, `Vignette 10 weiblich (Gülsen E. Anwältin E)` = 10,
`Vignette 11 weiblich (Monika E. Reinigungskraft E)` = 11, `Vignette 12 weiblich (Monika E. Anwältin E)` = 12,
`Vignette 13 männlich (Hasan E. Reinigungskraft E)` = 13, `Vignette 14 männlich (Hasan E. Anwalt E)` = 14,
`Vignette 15 männlich (Martin E. Reinigungskraft E)` = 15, `Vignette 16 männlich (Martin E. Anwalt E)` = 16,
`99` = 99
)
)

out <- capture.output(table(d$c12))
expect_identical(
out,
c(
"",
" 1 2 99 ",
" 786 1616 11 "
)
)
expect_identical(attributes(d$c12)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99))

out <- capture.output(table(d$c12a))
expect_identical(
out,
c(
"",
" -2 1 2 99 ",
"1627 500 285 1 "
)
)
expect_identical(attributes(d$c12a)$labels, c(Filter = -2, ja = 1, nein = 2, `keine Angabe` = 99))

out <- capture.output(table(d$c12c))
expect_identical(
out,
c(
"",
" -2 0 1 2 3 4 5 6 7 8 9 10 99 ",
"1913 34 2 11 14 19 61 42 63 97 53 99 5 "
)
)
expect_identical(
attributes(d$c12c)$labels,
c(
Filter = -2, `0 = keine` = 0, `1` = 1, `2` = 2, `3` = 3, `4` = 4,
`5` = 5, `6` = 6, `7` = 7, `8` = 8, `9` = 9, `10 = sehr starke` = 10,
`weiß nicht / keine Angabe` = 99
)
)
})

unlink(temp_file)
Loading

0 comments on commit fe74c96

Please sign in to comment.