Add time-shifted data and use household IDs in cross-validation #4 #16

citp · May 9, 2024 · f20de01 · f20de01
1 parent f985cbc
commit f20de01
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 10 deletions.
diff --git a/model.rds b/model.rds
diff --git a/training.R b/training.R
@@ -12,21 +12,37 @@ library(tidyr)
 library(tidymodels)
 library(xgboost)
 
-train_save_model <- function(cleaned_df, outcome_df) {
+train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023, 
+                             cleaned_train_2018to2020, outcome_2018to2020,
+                             background) {
   # Trains a model using the cleaned dataframe and saves the model to a file.
 
-  # Parameters:
-  # cleaned_df (dataframe): The cleaned data from clean_df function to be used for training the model.
-  # outcome_df (dataframe): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
+  # Parameters (all of these are dataframes):
+  # cleaned_train_2021to2023: PreFer_train_data.csv after it has gone through the clean_df function 
+  # outcome_2021to2023: PreFer_train_outcome.csv 
+  # cleaned_train_2018to2020: A "time-shifted" dataframe of feature data, after it has gone through the clean_df function
+  # outcome_2018to2020: Outcome data for fertility in 2018-2020
+  # background: PreFer_train_background_data.csv
+
   set.seed(0)
 
-  # Combine cleaned_df and outcome_df
-  model_df <- merge(cleaned_df, outcome_df, by = "nomem_encr") %>%
-    select(-nomem_encr) %>%
+  # Combine cleaned training data and outcome_df
+  model_df_2021to2023 <- merge(cleaned_train_2021to2023, outcome_2021to2023, by = "nomem_encr") %>%
     mutate(new_child = factor(new_child))
+
+  model_df_2018to2020 <- merge(cleaned_train_2018to2020, outcome_2018to2020, by = "nomem_encr") %>%
+    mutate(new_child = factor(new_child)) %>%
+    select(-nomem_encr)
+
+  original_plus_timeshifted_model_df <- bind_rows(model_df_2021to2023, model_df_2018to2020) %>%
+    select(-nomem_encr)
 
   # Dummy-encode the categorical variables and mean impute everything
-  recipe <- recipe(new_child ~ ., model_df) %>%
+  recipe <- recipe(new_child ~ ., model_df_2021to2023) %>%
+    step_dummy(oplmet_2020, one_hot = TRUE) %>%
+    step_impute_mean(everything(), -new_child)
+
+  recipe <- recipe(new_child ~ ., model_df_2018to2020) %>%
     step_dummy(oplmet_2020, one_hot = TRUE) %>%
     step_impute_mean(everything(), -new_child)
 
@@ -36,7 +52,40 @@ train_save_model <- function(cleaned_df, outcome_df) {
     mtry = tune(), trees = tune(), tree_depth = tune(), learn_rate = tune()
   ) %>%
     set_engine("xgboost", counts = FALSE)
-  folds <- vfold_cv(model_df, v = 5)
+  # Set up cross-validation folds
+    # Identify the household each person was a member of at the last time that person
+    # was observed, up through December 2020
+    household_linkage <- background %>% 
+      arrange(desc(wave)) %>%
+      group_by(nomem_encr) %>%
+      slice_head %>%
+      select(nomem_encr, nohouse_encr)
+    # Merge the household ID with model_df_2021to2023
+    model_df_2021to2023 <- left_join(model_df_2021to2023, household_linkage)
+    # Set up CV folds within the original data 
+    n_folds <- 5
+    folds <- group_vfold_cv(data = model_df_2021to2023, 
+                            group = nohouse_encr, # Puts household members in same fold as each other
+                            balance = "observations",
+                            v = n_folds)
+    # Within each CV fold, remove ID numbers and append time-shifted data.
+    # Note: We are appending time-shifted data here rather than prior to creating the
+    # CV folds because we only want time-shifted data in training folds, not in test folds.
+    for(i in 1:n_folds) {
+      # Remove the personal and household ID numbers
+      folds$splits[[i]][[1]] <- folds$splits[[i]][[1]] %>%
+        select(-c(nomem_encr, nohouse_encr))
+      # Identify what index the first time-shifted observation will be placed at
+      start_index <- nrow(folds$splits[[i]][[1]]) + 1
+      # Append the time-shifted data
+      folds$splits[[i]][[1]] <- bind_rows(folds$splits[[i]][[1]], model_df_2018to2020)
+      # Add the indices for time-shifted data to the vector of train fold indices 
+      end_index <- nrow(folds$splits[[i]][[1]])
+      time_shifted_data_indices <- c(start_index:end_index)
+      folds$splits[[i]][[2]] <- c(folds$splits[[i]][[2]], time_shifted_data_indices)
+    } 
+
+  # Grid search for hyperparameter tuning
   grid <- expand.grid(
     mtry = c(.05, .1, .15, .2, .25, .3, .35, .4),
     trees = c(10, 50, 100, 300, 600, 900, 1200),
@@ -63,7 +112,7 @@ train_save_model <- function(cleaned_df, outcome_df) {
   model <- workflow() %>%
     add_model(model) %>%
     add_recipe(recipe) %>%
-    fit(model_df)
+    fit(original_plus_timeshifted_model_df)
 
   # Save the model
   saveRDS(model, "model.rds")