Skip to content

Commit

Permalink
Coalesce partner birth year and relationship start year data across w…
Browse files Browse the repository at this point in the history
…aves #25
  • Loading branch information
emilycantrell committed Jun 2, 2024
1 parent 9e560ff commit cdf7acc
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 6 deletions.
98 changes: 98 additions & 0 deletions explorations/examine_patterns_in_partner_data_across_waves.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Emily Cantrell
# Exploration of data about partner from LISS
library(tidyverse)

# The purpose of this code is to determine which features about the partner require us to
# coalesce data across waves. For example:
# (1) Partner's birthyear is only reported once. If the person remains with the same partner, the birthyear
# is not reported again in subsequent waves. Therefore, we need to coalesce data across waves.
# (2) Partner's gender is recorded repeatedly across waves even if there is no change. Therefore,
# we can use data from just the most recent wave (or a few waves), and don't need to coalesce across waves.

# Read in the training features and the outcome file
train_full <- read.csv("/Users/ecantrell/Documents/PreFer\ 2024/prefer_data/training_data/PreFer_train_data.csv")
outcome <- read.csv("/Users/ecantrell/Documents/PreFer\ 2024/prefer_data/training_data/PreFer_train_outcome.csv")

# Attach the outcome to the features, then filter to only people with non-missing outcomes.
# The join key is named explicitly so that the join cannot silently pick up any other
# column the two files happen to share, and so dplyr does not print a "Joining with" message.
# NOTE(review): nomem_encr is assumed to be the person ID present in both files -- confirm.
train <- train_full %>%
  left_join(outcome, by = "nomem_encr") %>%
  filter(!is.na(new_child))

# Wave codes as they appear inside the variable names, from the earliest wave
# ("08a") to the latest ("20m"); e.g. cf20m024 is question 024 in wave "20m"
wave_codes <- c("08a", "09b", "10c", "11d", "12e", "13f", "14g",
                "15h", "16i", "17j", "18k", "19l", "20m")

# Build the column names of one question across all waves, earliest wave first
# (e.g. wave_cols("024") gives "cf08a024", "cf09b024", ..., "cf20m024")
wave_cols <- function(item) {
  paste0("cf", wave_codes, item)
}

# Question 024: Do you currently have a partner?
train %>%
  select(all_of(wave_cols("024"))) %>%
  View()
# Decision: the same value is recorded repeatedly across waves, so there is no need to coalesce across years

# Question 025: Do you live with that partner?
train %>%
  select(all_of(wave_cols("025"))) %>%
  View()
# Decision: the same value is recorded repeatedly across waves, so there is no need to coalesce across years

# Question 026: What is his or her year of birth?
# (shown next to the 2020 "currently have a partner" answer)
train %>%
  select(all_of(c("cf20m024", wave_cols("026")))) %>%
  View()
# Decision: the partner's birthyear seems to be collected in only one wave, and which wave that is
# varies by person depending on when they started the survey or got together with the partner.
# We should therefore make a variable that takes the most recently reported partner birthyear
# (use the "coalesce" function).

# Question 027: In what country was your partner born?
train %>%
  select(all_of(wave_cols("027"))) %>%
  View()
train %>%
  group_by(cf08a027) %>%
  count()
# Decision: almost all partners were born in Netherlands or have NA for this question, so I won't use it

# Question 028: In what year did relationship begin?
train %>%
  select(all_of(wave_cols("028"))) %>%
  View()
# Decision: coalesce the data across years to get the most recently reported value

# Question 029: In what year did you start living with partner?
train %>%
  select(all_of(wave_cols("029"))) %>%
  View()
# Decision: the same value is recorded repeatedly across waves, so there is no need to coalesce across years

# Question 030: Are you married?
train %>%
  select(all_of(wave_cols("030"))) %>%
  View()
# Decision: the same value is recorded repeatedly across waves, so there is no need to coalesce across years

# Question 031: In what year did you marry?
train %>%
  select(all_of(wave_cols("031"))) %>%
  View()
# Decision: the same value is recorded repeatedly across waves, so there is no need to coalesce across years

# Question 032: What is your partner's gender?
# (shown next to the 2020 "currently have a partner" answer)
gender_cols <- c("cf20m024", wave_cols("032"))
train %>%
  select(all_of(gender_cols)) %>%
  View()
# Cross-tabulate the 2020 partner status against whether the 2020 partner gender is missing
train %>%
  group_by(cf20m024, is.na(cf20m032)) %>%
  count()
train %>%
  select(all_of(gender_cols)) %>%
  filter(cf20m024 == 1) %>% # Keep only people who currently have a partner
  View()
# Everyone who said they currently have a partner in 2020 has partner's gender reported in 2020
# Decision: no need to coalesce across years

#### Draft code for creating the variables ####
# I will insert a version of this code into submission.R

# coalesce() returns, for each person, the first non-missing value among its arguments.
# The waves are therefore listed from the latest (cf20m) back to the earliest (cf08a), so that
# the result is the most recently reported value.

# Partner's birth year
toy <- train %>%
  mutate(partner_birth_year = coalesce(cf20m026, cf19l026, cf18k026, cf17j026, cf16i026, cf15h026, cf14g026, cf13f026, cf12e026, cf11d026, cf10c026, cf09b026, cf08a026))
# Inspect the new variable next to its inputs and the 2020 "currently have a partner" answer.
# (The new column is partner_birth_year; selecting a column that does not exist makes select() fail.)
toy %>%
  select(cf20m024, cf08a026, cf09b026, cf10c026, cf11d026, cf12e026, cf13f026, cf14g026, cf15h026, cf16i026, cf17j026, cf18k026, cf19l026, cf20m026, partner_birth_year) %>%
  View()

# Year the relationship began
toy <- train %>%
  mutate(year_relationship_began = coalesce(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028))
# Inspect the new variable next to its inputs
toy %>%
  select(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028, year_relationship_began) %>%
  View()
8 changes: 7 additions & 1 deletion feature_time_shift.R
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,13 @@ features_for_2018to2020 <- features_for_2018to2020 %>%
cf10c129 = NA,
cf08a130 = NA,
cf09b130 = NA,
cf10c130 = NA
cf10c130 = NA,
cf08a026 = NA,
cf09b026 = NA,
cf10c026 = NA,
cf08a028 = NA,
cf09b028 = NA,
cf10c028 = NA
)

######## STEP 6: SAVE THE FILES! ########
Expand Down
Binary file modified model.rds
Binary file not shown.
17 changes: 13 additions & 4 deletions submission.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@ clean_df <- function(df, background_df) {
"cd20m034",
# Data about partner from 2020. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards cf20m029
"cf20m024", "cf20m025", "cf20m026", "cf20m027", "cf20m028", "cf20m029", "cf20m030", "cf20m031", "cf20m032",
"cf20m024", "cf20m025", "cf20m029", "cf20m030", "cf20m031", "cf20m032", # I skipped feature on country of origin because almost all are from Netherlands
# Data about partner from 2019
"cf19l024", "cf19l025", "cf19l026", "cf19l027", "cf19l028", "cf19l029", "cf19l030", "cf19l031", "cf19l032",
"cf19l024", "cf19l025", "cf19l029", "cf19l030", "cf19l031", "cf19l032",
# Data about partner from 2018
"cf18k024", "cf18k025", "cf18k026", "cf18k027", "cf18k028", "cf18k029", "cf18k030", "cf18k031", "cf18k032",
"cf18k024", "cf18k025", "cf18k029", "cf18k030", "cf18k031", "cf18k032",
# Data about partner's birth year (we need to coalesce data across years to find the most recently reported value)
"cf20m026", "cf19l026", "cf18k026", "cf17j026", "cf16i026", "cf15h026", "cf14g026", "cf13f026", "cf12e026", "cf11d026", "cf10c026", "cf09b026", "cf08a026",
# Data about year relationship began (we need to coalesce data across years to find the most recently reported value)
"cf20m028", "cf19l028", "cf18k028", "cf17j028", "cf16i028", "cf15h028", "cf14g028", "cf13f028", "cf12e028", "cf11d028", "cf10c028", "cf09b028", "cf08a028",
# Birth year of first child
"cf18k456", "cf19l456", "cf20m456",
# Birth year of second child
Expand Down Expand Up @@ -173,6 +177,10 @@ clean_df <- function(df, background_df) {
cf20m030 = ifelse(cf20m024 == 2, 2, cf20m030),
cf19l030 = ifelse(cf19l024 == 2, 2, cf19l030),
cf18k030 = ifelse(cf18k024 == 2, 2, cf18k030),
# Identify partner's birth year based on most recent wave in which it was reported
partner_birth_year = coalesce(cf20m026, cf19l026, cf18k026, cf17j026, cf16i026, cf15h026, cf14g026, cf13f026, cf12e026, cf11d026, cf10c026, cf09b026, cf08a026),
# Identify year relationship began based on most recent wave in which it was reported
year_relationship_began = coalesce(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028),
# If no expected kids, then expected number of kids is 0
# Note: in some years, "I don't know" was an option for *128; we don't use that info here, so the recoded *129 may not contain all info from *128
cf08a129 = ifelse(cf08a128 == 2, 0, cf08a129),
Expand Down Expand Up @@ -284,6 +292,8 @@ clean_df <- function(df, background_df) {
woning_2020 = case_when(woning_2020 == 1 ~ 1, woning_2020 %in% 2:4 ~ 0)
) %>%
select(-outcome_available,
-cf20m026, -cf19l026, -cf18k026, -cf17j026, -cf16i026, -cf15h026, -cf14g026, -cf13f026, -cf12e026, -cf11d026, -cf10c026, -cf09b026, -cf08a026,
-cf20m028, -cf19l028, -cf18k028, -cf17j028, -cf16i028, -cf15h028, -cf14g028, -cf13f028, -cf12e028, -cf11d028, -cf10c028, -cf09b028, -cf08a028,
-ca20g078, -ca20g013,
-cf20m454,
-cf19l454,
Expand Down Expand Up @@ -314,7 +324,6 @@ clean_df <- function(df, background_df) {
mutate(
across(everything(), as.numeric),
across(c(belbezig_2020, migration_background_bg, oplmet_2020,
cf18k027, cf19l027, cf20m027,
cf08a128, cf09b128, cf10c128, cf11d128, cf12e128,
cf13f128, cf14g128, cf15h128, cf16i128, cf17j128,
cf18k128, cf19l128, cf20m128), factor) # Some of the *128 are binary but it varies by year, so since we are doing a time-shift, I am one-hot encoding them all for simplicity
Expand Down
1 change: 0 additions & 1 deletion training.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
recipe <- recipe(new_child ~ ., original_plus_timeshifted_model_df) %>%
step_rm(nomem_encr, nohouse_encr) %>%
step_dummy(c(belbezig_2020, migration_background_bg, oplmet_2020,
cf18k027, cf19l027, cf20m027,
cf08a128, cf09b128, cf10c128, cf11d128, cf12e128,
cf13f128, cf14g128, cf15h128, cf16i128, cf17j128,
cf18k128, cf19l128, cf20m128), # Some of the *128 are binary but it varies by year, so since we are doing a time-shift, I am one-hot encoding them all for simplicity
Expand Down

0 comments on commit cdf7acc

Please sign in to comment.