Skip to content

Commit

Permalink
Use 2019/2020 fertility intentions as the only features #21
Browse files Browse the repository at this point in the history
The time shift indicator is also included as a feature since this branch still uses the time shift strategy
  • Loading branch information
emilycantrell committed May 17, 2024
1 parent 9dcc2ab commit 3d1bb81
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 201 deletions.
211 changes: 11 additions & 200 deletions submission.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,73 +43,10 @@ clean_df <- function(df, background_df) {
keepcols <- c("nomem_encr", # ID variable required for predictions,
"outcome_available", # Is there an outcome to predict?
"time_shifted_data", # Indicates whether this is original data or time-shifted data
# Savings
"ca20g012", "ca20g013", "ca20g078",
# Number of rooms
"cd20m034",
# Partnership status. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards cf20m029
"cf20m024", "cf20m025", "cf20m026", "cf20m029", "cf20m030",
# Expected kids
# Expected kids reported in 2020
"cf20m128", "cf20m129", "cf20m130",
# Feelings about being single
"cf20m166",
# Existing children
"cf20m454", "cf20m455",
# Relationship with child
"cf20m513",
"cf20m514",
"cf20m515",
"cf20m516",
"cf20m517",
"cf20m518",
"cf20m519",
"cf20m520",
"cf20m521",
# Health
"ch20m004",
# Gynaecologist. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards this variable
"ch20m219",
# Gendered religiosity
"cr18k101", "cr18k102", "cr18k103", "cr18k104", "cr18k105",
# Religiosity
"cr20m162",
# Traditional fertility
"cv10c135", "cv10c136", "cv10c137", "cv10c138",
# Traditional motherhood
"cv20l109", "cv20l110", "cv20l111",
# Traditional fatherhood
"cv20l112", "cv20l113", "cv20l114", "cv20l115",
# Traditional marriage
"cv20l124",
"cv20l125",
"cv20l126",
"cv20l127",
"cv20l128",
"cv20l129",
"cv20l130",
# Against working mothers
"cv20l143", "cv20l144", "cv20l145", "cv20l146",
# Sexism
"cv20l151", "cv20l152", "cv20l153", "cv20l154",
# Birth year
"birthyear_bg",
# Primary occupation. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards this variable
"belbezig_2020",
# Gender
"gender_bg",
# Origins
"migration_background_bg",
# Household Income
"nettohh_f_2020",
# Education
"oplmet_2020",
# Urban
"sted_2020",
# Dwelling type
"woning_2020"
# Expected kids reported in 2019
"cf19l128", "cf19l129", "cf19l130"
)

## Keeping data with variables selected
Expand All @@ -119,148 +56,22 @@ clean_df <- function(df, background_df) {
df <- filter(df, outcome_available == 1) %>%
rowwise() %>%
mutate(
# Impute savings with range midpoints. Two exceptions: We impute -1200
# for those in the smallest category. -1200 is roughly the average
# savings of those who are in that category. Similarly we impute 62500
# for those in the largest category.
# Also, if one does not have accounts, then one does not have any savings
ca20g012 = case_when(ca20g078 == 0 ~ 0,
ca20g013 == 1 ~ -1200,
ca20g013 == 2 ~ 150,
ca20g013 == 3 ~ 375,
ca20g013 == 4 ~ 625,
ca20g013 == 5 ~ 875,
ca20g013 == 6 ~ 1750,
ca20g013 == 7 ~ 3750,
ca20g013 == 8 ~ 6250,
ca20g013 == 9 ~ 8750,
ca20g013 == 10 ~ 10750,
ca20g013 == 11 ~ 12750,
ca20g013 == 12 ~ 15500,
ca20g013 == 13 ~ 18500,
ca20g013 == 14 ~ 22500,
ca20g013 == 15 ~ 62500,
ca20g013 == 999 ~ NA,
ca20g012 < -9999999997 ~ NA,
TRUE ~ ca20g012
),
# If no partner, then one is not living together with partner
cf20m025 = ifelse(cf20m024 == 2, 2, cf20m025),
# If no partner, then one is not married to partner
cf20m030 = ifelse(cf20m024 == 2, 2, cf20m030),
# If no expected kids, then expected number of kids is 0
cf20m129 = ifelse(cf20m128 == 2, 0, cf20m129),
cf19l129 = ifelse(cf19l128 == 2, 0, cf19l129),
# If no expected kids, then a lower-bound estimate for the number of
# years within which to have kids is 31,
cf20m130 = case_when(cf20m128 == 2 ~ 31, cf20m130 == 2025 ~ 5,
TRUE ~ cf20m130
),
# Feeling about being single
cf20m166 = ifelse(cf20m166 == 99, NA, cf20m166),
# If one never had children, then one does not have any living children
cf20m455 = ifelse(cf20m454 == 2, 0, cf20m455),
# Scale for feeling towards child
across(c(cf20m515, cf20m516, cf20m518, cf20m519, cf20m520, cf20m521),
~ 8 - .x
),
child_feeling = mean(c(cf20m513,
cf20m514,
cf20m515,
cf20m516,
cf20m517,
cf20m518,
cf20m519,
cf20m520,
cf20m521
),
na.rm = TRUE
),
# Scale on gendered religiosity
across(c(cr18k101, cr18k102, cr18k103, cr18k104, cr18k105),
~ case_when(.x == 1 ~ 3, .x == 2 ~ 1, .x > 2 ~ 2)
),
across(c(cr18k102, cr18k105), ~ 4 - .x),
gendered_religiosity = mean(
c(cr18k101, cr18k102, cr18k103, cr18k104, cr18k105),
na.rm = TRUE
),
# Religiosity
cr20m162 = ifelse(cr20m162 == -9, NA, cr20m162),
# Scale on traditional fertility
traditional_fertility = mean(c(cv10c135, cv10c136, cv10c137, cv10c138),
na.rm = TRUE
),
# Scale on traditional motherhood
cv20l109 = 6 - cv20l109,
traditional_motherhood = mean(c(cv20l109, cv20l110, cv20l111),
na.rm = TRUE
),
# Scale on traditional fatherhood
across(c(cv20l112, cv20l114, cv20l115), ~ 6 - .x),
traditional_fatherhood = mean(c(cv20l112, cv20l113, cv20l114, cv20l115),
na.rm = TRUE
),
# Scale on traditional marriage
across(c(cv20l126, cv20l127, cv20l128, cv20l129, cv20l130), ~ 6 - .x),
traditional_marriage = mean(c(
cv20l124, cv20l125, cv20l126, cv20l127, cv20l128, cv20l129, cv20l130
),
na.rm = TRUE
),
# Scale on being against working mothers
working_mother = mean(c(cv20l143, cv20l144, cv20l145, cv20l146),
na.rm = TRUE
),
# Scale on sexism
sexism = mean(c(cv20l151, cv20l152, cv20l153, cv20l154), na.rm = TRUE),
# Primary occupations: employees, freelancers, seeking lost jobs,
# students, homemakers, work disability
belbezig_2020 = ifelse(belbezig_2020 %in% c(1, 3, 4, 7, 8, 10),
belbezig_2020, NA
),
# Distinguish first- and second-generation non-Western migrants from others
migration_background_bg =
case_when(
migration_background_bg %in% c(0, 101, 201) ~ 0, # Western origin
migration_background_bg %in% c(102, 202) ~ migration_background_bg
),
# Combine the lowest levels of education
oplmet_2020 = case_when(oplmet_2020 > 7 ~ 0, oplmet_2020 == 7 ~ NA,
TRUE ~ oplmet_2020
TRUE ~ cf20m130
),
# Distinguish between home owners and non-home owners
woning_2020 = case_when(woning_2020 == 1 ~ 1, woning_2020 %in% 2:4 ~ 0)
) %>%
select(-outcome_available,
-ca20g078, -ca20g013,
-cf20m128,
-cf20m454,
-cf20m513,
-cf20m514,
-cf20m515,
-cf20m516,
-cf20m517,
-cf20m518,
-cf20m519,
-cf20m520,
-cf20m521,
-cr18k101, -cr18k102, -cr18k103, -cr18k104, -cr18k105,
-cv10c135, -cv10c136, -cv10c137, -cv10c138,
-cv20l109, -cv20l110, -cv20l111,
-cv20l112, -cv20l113, -cv20l114, -cv20l115,
-cv20l124,
-cv20l125,
-cv20l126,
-cv20l127,
-cv20l128,
-cv20l129,
-cv20l130,
-cv20l143, -cv20l144, -cv20l145, -cv20l146,
-cv20l151, -cv20l152, -cv20l153, -cv20l154,
) %>%
cf19l130 = case_when(cf19l128 == 2 ~ 31,
TRUE ~ cf19l130)
) %>%
select(-outcome_available
) %>%
mutate(
across(everything(), as.numeric),
across(c(belbezig_2020, migration_background_bg, oplmet_2020), factor)
across(cf20m128, factor) # Options for cf20m128 were yes/no/IDK, but for cf19l128, options were only yes/no
)

# Append household ID
Expand Down
2 changes: 1 addition & 1 deletion training.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
# and mean impute everything
recipe <- recipe(new_child ~ ., original_plus_timeshifted_model_df) %>%
step_rm(nomem_encr, nohouse_encr) %>%
step_dummy(c(belbezig_2020, migration_background_bg, oplmet_2020),
step_dummy(c(cf20m128),
one_hot = TRUE
) %>%
step_impute_mean(everything(), -new_child)
Expand Down

0 comments on commit 3d1bb81

Please sign in to comment.