Coalesce partner birth year and relationship start year data across waves #25
citp · Jun 2, 2024 · cdf7acc · cdf7acc
1 parent 9e560ff
commit cdf7acc
Show file tree

Hide file tree

Showing 5 changed files with 118 additions and 6 deletions.
diff --git a/explorations/examine_patterns_in_partner_data_across_waves.R b/explorations/examine_patterns_in_partner_data_across_waves.R
@@ -0,0 +1,98 @@
+# Emily Cantrell
+# Exploration of data about partner from LISS 
+library(tidyverse)
+
+# The purpose of this code is to determine which features about the partner require us to
+# coalesce data across waves. For example: 
+# (1) Partner's birthyear is only reported once. If the person remains with the same partner, the birthyear
+# is not reported again in subsequent waves. Therefore, we need to coalesce data across waves.
+# (2) Partner's gender is recorded repeatedly across waves even if there is no change. Therefore, 
+# we can use data from just the most recent wave (or a few waves), and don't need to coalesce across waves.
+
+# Read in the training features and outcomes (local paths). A space needs no escape inside an R string; "\ " is an unrecognized escape and a parse error.
+train_full <- read.csv("/Users/ecantrell/Documents/PreFer 2024/prefer_data/training_data/PreFer_train_data.csv")
+outcome <- read.csv("/Users/ecantrell/Documents/PreFer 2024/prefer_data/training_data/PreFer_train_outcome.csv")
+
+# Keep only people with a non-missing outcome (new_child). left_join() without `by` joins on all column names the two tables share.
+train <- train_full %>%
+  left_join(outcome) %>%
+  filter(!is.na(new_child))
+
+# *024: Do you currently have a partner? (one column per wave; view them side by side)
+train %>% 
+  select(cf08a024, cf09b024, cf10c024, cf11d024, cf12e024, cf13f024, cf14g024, cf15h024, cf16i024, cf17j024, cf18k024, cf19l024, cf20m024) %>%
+  View()
+# Decision: no need to coalesce across years; the same value is recorded repeatedly across waves
+
+# *025: Do you live with that partner?
+train %>%
+  select(cf08a025, cf09b025, cf10c025, cf11d025, cf12e025, cf13f025, cf14g025, cf15h025, cf16i025, cf17j025, cf18k025, cf19l025, cf20m025) %>% 
+  View()
+# Decision: no need to coalesce across years; the same value is recorded repeatedly across waves
+
+# *026: What is his or her year of birth? (cf20m024, the 2020 "has partner" answer, is shown alongside)
+train %>% 
+  select(cf20m024, cf08a026, cf09b026, cf10c026, cf11d026, cf12e026, cf13f026, cf14g026, cf15h026, cf16i026, cf17j026, cf18k026, cf19l026, cf20m026) %>%
+  View()
+# Decision: the partner's birth year seems to be collected in only one wave, but the wave varies by person depending
+# on when they started the survey or got together with the partner, so we should make a variable that takes the most
+# recently reported partner birth year (use the "coalesce" function).
+
+# *027: In what country was your partner born?
+train %>% 
+  select(cf08a027, cf09b027, cf10c027, cf11d027, cf12e027, cf13f027, cf14g027, cf15h027, cf16i027, cf17j027, cf18k027, cf19l027, cf20m027) %>%
+  View()
+train %>% group_by(cf08a027) %>% count()
+# Decision (from the view above and the cf08a027 counts): almost all partners were born in the Netherlands or have NA for this question, so I won't use it
+
+# *028: In what year did the relationship begin?
+train %>% 
+  select(cf08a028, cf09b028, cf10c028, cf11d028, cf12e028, cf13f028, cf14g028, cf15h028, cf16i028, cf17j028, cf18k028, cf19l028, cf20m028) %>%
+  View()
+# Decision: coalesce the data across years to get the most recently reported value
+
+# *029: In what year did you start living with your partner?
+train %>%
+  select(cf08a029, cf09b029, cf10c029, cf11d029, cf12e029, cf13f029, cf14g029, cf15h029, cf16i029, cf17j029, cf18k029, cf19l029, cf20m029) %>%
+  View()
+# Decision: no need to coalesce across years; the same value is recorded repeatedly across waves
+
+# *030: Are you married?
+train %>% 
+  select(cf08a030, cf09b030, cf10c030, cf11d030, cf12e030, cf13f030, cf14g030, cf15h030, cf16i030, cf17j030, cf18k030, cf19l030, cf20m030) %>%
+  View()
+# Decision: no need to coalesce across years; the same value is recorded repeatedly across waves
+
+# *031: In what year did you marry?
+train %>%
+  select(cf08a031, cf09b031, cf10c031, cf11d031, cf12e031, cf13f031, cf14g031, cf15h031, cf16i031, cf17j031, cf18k031, cf19l031, cf20m031) %>% 
+  View()
+# Decision: no need to coalesce across years; the same value is recorded repeatedly across waves
+
+# *032: What is your partner's gender? (cf20m024, the 2020 "has partner" answer, is shown alongside)
+train %>%
+  select(cf20m024, cf08a032, cf09b032, cf10c032, cf11d032, cf12e032, cf13f032, cf14g032, cf15h032, cf16i032, cf17j032, cf18k032, cf19l032, cf20m032) %>%
+  View()
+train %>%
+  group_by(cf20m024, is.na(cf20m032)) %>%
+  count()
+train %>%
+  select(cf20m024, cf08a032, cf09b032, cf10c032, cf11d032, cf12e032, cf13f032, cf14g032, cf15h032, cf16i032, cf17j032, cf18k032, cf19l032, cf20m032) %>%
+  filter(cf20m024 == 1) %>% # Keep only people who currently have a partner (cf20m024 == 1)
+  View()
+# Everyone who said they currently have a partner in 2020 has their partner's gender reported in 2020
+# Decision: no need to coalesce across years
+
+#### Draft code for creating the variables ####
+# coalesce() returns the first non-missing value, so waves are listed newest-first to get the most recent report. A version of this code goes into submission.R
+toy <- train %>% 
+  mutate(partner_birth_year = coalesce(cf20m026, cf19l026, cf18k026, cf17j026, cf16i026, cf15h026, cf14g026, cf13f026, cf12e026, cf11d026, cf10c026, cf09b026, cf08a026))
+toy %>%
+  select(cf20m024, cf08a026, cf09b026, cf10c026, cf11d026, cf12e026, cf13f026, cf14g026, cf15h026, cf16i026, cf17j026, cf18k026, cf19l026, cf20m026, partner_birth_year) %>%
+  View()
+# Same approach for the year the relationship began; view the inputs next to the coalesced result to check it
+toy <- train %>%
+  mutate(year_relationship_began = coalesce(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028))
+toy %>%
+  select(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028, year_relationship_began) %>%
+  View()
diff --git a/feature_time_shift.R b/feature_time_shift.R
@@ -246,7 +246,13 @@ features_for_2018to2020 <- features_for_2018to2020 %>%
          cf10c129 = NA, 
          cf08a130 = NA, 
          cf09b130 = NA, 
-         cf10c130 = NA
+         cf10c130 = NA, 
+         cf08a026 = NA, 
+         cf09b026 = NA, 
+         cf10c026 = NA, 
+         cf08a028 = NA, 
+         cf09b028 = NA, 
+         cf10c028 = NA 
   )
 
 ######## STEP 6: SAVE THE FILES! ######## 

diff --git a/model.rds b/model.rds
diff --git a/submission.R b/submission.R
@@ -49,11 +49,15 @@ clean_df <- function(df, background_df) {
     "cd20m034",
     # Data about partner from 2020. We thank Sayash Kapoor and Benedikt Strobl's L1
     # regression for directing our attention towards cf20m029
-    "cf20m024", "cf20m025", "cf20m026", "cf20m027", "cf20m028", "cf20m029", "cf20m030", "cf20m031", "cf20m032",
+    "cf20m024", "cf20m025", "cf20m029", "cf20m030", "cf20m031", "cf20m032", # I skipped feature on country of origin because almost all are from Netherlands
     # Data about partner from 2019
-    "cf19l024", "cf19l025", "cf19l026", "cf19l027", "cf19l028", "cf19l029", "cf19l030", "cf19l031", "cf19l032",
+    "cf19l024", "cf19l025", "cf19l029", "cf19l030", "cf19l031", "cf19l032",
     # Data about partner from 2018
-    "cf18k024", "cf18k025", "cf18k026", "cf18k027", "cf18k028", "cf18k029", "cf18k030", "cf18k031", "cf18k032",
+    "cf18k024", "cf18k025", "cf18k029", "cf18k030", "cf18k031", "cf18k032",
+    # Data about partner's birth year (we need to coalesce data across years to find the most recently reported value)
+    "cf20m026", "cf19l026", "cf18k026", "cf17j026", "cf16i026", "cf15h026", "cf14g026", "cf13f026", "cf12e026", "cf11d026", "cf10c026", "cf09b026", "cf08a026",
+    # Data about year relationship began (we need to coalesce data across years to find the most recently reported value)
+    "cf20m028", "cf19l028", "cf18k028", "cf17j028", "cf16i028", "cf15h028", "cf14g028", "cf13f028", "cf12e028", "cf11d028", "cf10c028", "cf09b028", "cf08a028",
     # Birth year of first child
     "cf18k456", "cf19l456", "cf20m456", 
     # Birth year of second child 
@@ -173,6 +177,10 @@ clean_df <- function(df, background_df) {
       cf20m030 = ifelse(cf20m024 == 2, 2, cf20m030),
       cf19l030 = ifelse(cf19l024 == 2, 2, cf19l030),
       cf18k030 = ifelse(cf18k024 == 2, 2, cf18k030),
+      # Identify partner's birth year based on most recent wave in which it was reported
+      partner_birth_year = coalesce(cf20m026, cf19l026, cf18k026, cf17j026, cf16i026, cf15h026, cf14g026, cf13f026, cf12e026, cf11d026, cf10c026, cf09b026, cf08a026),
+      # Identify year relationship began based on most recent wave in which it was reported
+      year_relationship_began = coalesce(cf20m028, cf19l028, cf18k028, cf17j028, cf16i028, cf15h028, cf14g028, cf13f028, cf12e028, cf11d028, cf10c028, cf09b028, cf08a028),
       # If no expected kids, then expected number of kids is 0
       # Note: in some years, "I don't know" was an option for *128; we don't use that info here, so the recoded *129 may not contain all info from *128
       cf08a129 = ifelse(cf08a128 == 2, 0, cf08a129),
@@ -284,6 +292,8 @@ clean_df <- function(df, background_df) {
       woning_2020 = case_when(woning_2020 == 1 ~ 1, woning_2020 %in% 2:4 ~ 0)
     ) %>%
     select(-outcome_available,
+      -cf20m026, -cf19l026, -cf18k026, -cf17j026, -cf16i026, -cf15h026, -cf14g026, -cf13f026, -cf12e026, -cf11d026, -cf10c026, -cf09b026, -cf08a026,
+      -cf20m028, -cf19l028, -cf18k028, -cf17j028, -cf16i028, -cf15h028, -cf14g028, -cf13f028, -cf12e028, -cf11d028, -cf10c028, -cf09b028, -cf08a028,
       -ca20g078, -ca20g013,
       -cf20m454,
       -cf19l454, 
@@ -314,7 +324,6 @@ clean_df <- function(df, background_df) {
     mutate(
       across(everything(), as.numeric),
       across(c(belbezig_2020, migration_background_bg, oplmet_2020,
-               cf18k027, cf19l027, cf20m027,
                cf08a128, cf09b128, cf10c128, cf11d128, cf12e128,  
                cf13f128, cf14g128, cf15h128, cf16i128, cf17j128,
                cf18k128, cf19l128, cf20m128), factor) # Some of the *128 are binary but it varies by year, so since we are doing a time-shift, I am one-hot encoding them all for simplicity

diff --git a/training.R b/training.R
@@ -38,7 +38,6 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
   recipe <- recipe(new_child ~ ., original_plus_timeshifted_model_df) %>%
     step_rm(nomem_encr, nohouse_encr) %>%
     step_dummy(c(belbezig_2020, migration_background_bg, oplmet_2020,
-                 cf18k027, cf19l027, cf20m027,
                  cf08a128, cf09b128, cf10c128, cf11d128, cf12e128,  
                  cf13f128, cf14g128, cf15h128, cf16i128, cf17j128,
                  cf18k128, cf19l128, cf20m128), # Some of the *128 are binary but it varies by year, so since we are doing a time-shift, I am one-hot encoding them all for simplicity