Merge branch 'main' into main_postprocess_update

HopkinsIDD · Jun 6, 2024 · c771178 · c771178
2 parents 160ce81 + ad733ca
commit c771178
Show file tree

Hide file tree

Showing 9 changed files with 119 additions and 105 deletions.
diff --git a/build/conda_environment.yml b/build/conda_environment.yml
@@ -478,7 +478,6 @@ dependencies:
   - conda-forge/linux-64::r-readxl==1.4.1=r42h3ebcfa7_1
   - conda-forge/noarch::r-reprex==2.0.2=r42hc72bb7e_1
   - conda-forge/linux-64::r-tidyr==1.3.0=r42h38f115c_0
-  - conda-forge/noarch::r-tigris==2.0.1=r42hc72bb7e_0
   - conda-forge/noarch::r-waldo==0.4.0=r42hc72bb7e_1
   - conda-forge/noarch::r-broom==1.0.3=r42hc72bb7e_0
   - conda-forge/linux-64::r-gdtools==0.3.0=r42he0ce631_0

diff --git a/build/local_install.R b/build/local_install.R
@@ -8,7 +8,7 @@ local({r <- getOption("repos")
 
 library(devtools)
 
-install.packages(c("covidcast","data.table","vroom","dplyr"), quiet=TRUE, dependencies = TRUE)
+install.packages(c("covidcast","data.table","vroom","dplyr"), quiet=TRUE)
 # devtools::install_github("hrbrmstr/cdcfluview")
 
 # To run if operating in the container -----

diff --git a/datasetup/build_US_setup.R b/datasetup/build_US_setup.R
@@ -70,7 +70,6 @@ state_level <- ifelse(!is.null(config$subpop_setup$state_level) && config$subpop
 # tidycensus::census_api_key(key = census_key)
 
 
-
 filterUSPS <- c("WY","VT","DC","AK","ND","SD","DE","MT","RI","ME","NH","HI","ID","WV","NE","NM",
                 "KS","NV","MS","AR","UT","IA","CT","OK","OR","KY","LA","AL","SC","MN","CO","WI",
                 "MD","MO","IN","TN","MA","AZ","WA","VA","NJ","MI","NC","GA","OH","IL","PA","NY","FL","TX","CA")

diff --git a/flepimop/R_packages/flepicommon/NAMESPACE b/flepimop/R_packages/flepicommon/NAMESPACE
@@ -17,7 +17,6 @@ export(fix_negative_counts_single_subpop)
 export(get_CSSE_US_data)
 export(get_CSSE_US_matchGlobal_data)
 export(get_CSSE_global_data)
-export(get_USAFacts_data)
 export(get_covidcast_data)
 export(get_groundtruth_from_source)
 export(load_config)

diff --git a/flepimop/R_packages/flepicommon/R/DataUtils.R b/flepimop/R_packages/flepicommon/R/DataUtils.R
@@ -426,64 +426,6 @@ aggregate_counties_to_state <- function(df, state_fips){
 }
 
 
-##'
-##' Pull case and death count data from USAFacts
-##'
-##' Pulls the USAFacts cumulative case count and death data. Calculates incident counts.
-##' USAFacts does not include data for all the territories (aka island areas). These data are pulled from NYTimes.
-##'
-##' Returned data preview:
-##' tibble [352,466 × 7] (S3: grouped_df/tbl_df/tbl/data.frame)
-##'  $ FIPS       : chr [1:352466] "00001" "00001" "00001" "00001" ...
-##'  $ source     : chr [1:352466] "NY" "NY" "NY" "NY" ...
-##'  $ Update     : Date[1:352466], format: "2020-01-22" "2020-01-23" ...
-##'  $ Confirmed  : num [1:352466] 0 0 0 0 0 0 0 0 0 0 ...
-##'  $ Deaths     : num [1:352466] 0 0 0 0 0 0 0 0 0 0 ...
-##'  $ incidI     : num [1:352466] 0 0 0 0 0 0 0 0 0 0 ...
-##'  $ incidDeath : num [1:352466] 0 0 0 0 0 0 0 0 0 0 ...
-##'
-##' @param case_data_filename Filename where case data are stored
-##' @param death_data_filename Filename where death data are stored
-##' @param incl_unassigned Includes data unassigned to counties (default is FALSE)
-##' @return the case and deaths data frame
-##'
-##'
-##' @export
-##'
-get_USAFacts_data <- function(case_data_filename = "data/case_data/USAFacts_case_data.csv",
-                              death_data_filename = "data/case_data/USAFacts_death_data.csv",
-                              incl_unassigned = FALSE){
-
-  USAFACTS_CASE_DATA_URL <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv"
-  USAFACTS_DEATH_DATA_URL <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv"
-  usafacts_case <- download_USAFacts_data(case_data_filename, USAFACTS_CASE_DATA_URL, "Confirmed", incl_unassigned)
-  usafacts_death <- download_USAFacts_data(death_data_filename, USAFACTS_DEATH_DATA_URL, "Deaths", incl_unassigned)
-
-  usafacts_data <- dplyr::full_join(usafacts_case, usafacts_death)
-  usafacts_data <- dplyr::select(usafacts_data, Update, source, FIPS, Confirmed, Deaths)
-  usafacts_data <- rbind(usafacts_data, get_islandareas_data()) # Append island areas
-  usafacts_data <- dplyr::arrange(usafacts_data, source, FIPS, Update)
-
-  # Create columns incidI and incidDeath
-  usafacts_data <- dplyr::group_modify(
-    dplyr::group_by(
-      usafacts_data,
-      FIPS
-    ),
-    function(.x,.y){
-      .x$incidI = c(.x$Confirmed[1],diff(.x$Confirmed))
-      .x$incidDeath = c(.x$Deaths[1],diff(.x$Deaths,))
-      return(.x)
-    }
-  )
-
-  # Fix incidence counts that go negative and NA values or missing dates
-  usafacts_data <- fix_negative_counts(usafacts_data, "Confirmed", "incidI")
-  usafacts_data <- fix_negative_counts(usafacts_data, "Deaths", "incidDeath")
-
-  return(usafacts_data)
-}
-
 
 
 ##'

diff --git a/flepimop/R_packages/flepiconfig/R/process_npi_list.R b/flepimop/R_packages/flepiconfig/R/process_npi_list.R
@@ -16,6 +16,36 @@ NULL
 
 
 
+load_geodata_file <- function(filename,
+                              geoid_len = 0,
+                              geoid_pad = "0",
+                              state_name = TRUE) {
+
+    if(!file.exists(filename)){stop(paste(filename,"does not exist in",getwd()))}
+    geodata <- readr::read_csv(filename) %>%
+        dplyr::mutate(geoid = as.character(geoid))
+
+    if (!("geoid" %in% names(geodata))) {
+        stop(paste(filename, "does not have a column named geoid"))
+    }
+
+    if (geoid_len > 0) {
+        geodata$geoid <- stringr::str_pad(geodata$geoid, geoid_len, pad = geoid_pad)
+    }
+
+    if(state_name) {
+        utils::data(fips_us_county, package = "flepicommon") # arrow::read_parquet("datasetup/usdata/fips_us_county.parquet")
+        geodata <- fips_us_county %>%
+            dplyr::distinct(state, state_name) %>%
+            dplyr::rename(USPS = state) %>%
+            dplyr::rename(state = state_name) %>%
+            dplyr::mutate(state = dplyr::recode(state, "U.S. Virgin Islands" = "Virgin Islands")) %>%
+            dplyr::right_join(geodata)
+    }
+
+    return(geodata)
+}
+
 ##' find_truncnorm_mean_parameter
 ##'
 ##' Convenience function that estimates the mean value for a truncnorm distribution given a, b, and sd that will have the expected value of the input mean.

diff --git a/flepimop/main_scripts/create_seeding_added.R b/flepimop/main_scripts/create_seeding_added.R
@@ -44,6 +44,7 @@ library(purrr)
 
 option_list <- list(
     optparse::make_option(c("-c", "--config"), action = "store", default = Sys.getenv("CONFIG_PATH"), type = "character", help = "path to the config file"),
+    optparse::make_option(c("-p", "--flepi_path"), action="store", type='character', help="path to the flepiMoP directory", default = Sys.getenv("FLEPI_PATH", "flepiMoP/")),
     optparse::make_option(c("-k", "--keep_all_seeding"), action="store",default=TRUE,type='logical',help="Whether to filter away seeding prior to the start date of the simulation.")
 )
 opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))

diff --git a/flepimop/main_scripts/inference_slot.R b/flepimop/main_scripts/inference_slot.R
@@ -52,7 +52,8 @@ option_list = list(
   optparse::make_option(c("-H","--save_hosp"), action = "store", default = Sys.getenv("SAVE_HOSP", TRUE), type = 'logical', help = 'Should the HOSP output files be saved for each iteration'),
   optparse::make_option(c("-M", "--memory_profiling"), action = "store", default = Sys.getenv("FLEPI_MEM_PROFILE", FALSE), type = 'logical', help = 'Should the memory profiling be run during iterations'),
   optparse::make_option(c("-P", "--memory_profiling_iters"), action = "store", default = Sys.getenv("FLEPI_MEM_PROF_ITERS", 100), type = 'integer', help = 'If doing memory profiling, after every X iterations run the profiler'),
-  optparse::make_option(c("-g", "--subpop_len"), action="store", default=Sys.getenv("SUBPOP_LENGTH", 5), type='integer', help = "number of digits in subpop")
+  optparse::make_option(c("-g", "--subpop_len"), action="store", default=Sys.getenv("SUBPOP_LENGTH", 5), type='integer', help = "number of digits in subpop"),
+  optparse::make_option(c("-a", "--incl_aggr_likelihood"), action = "store", default = Sys.getenv("INCL_AGGR_LIKELIHOOD", TRUE), type = 'logical', help = 'Should the likelihood be calculated with the aggregate estiamtes.')
 )
 
 parser=optparse::OptionParser(option_list=option_list)
@@ -90,6 +91,17 @@ if (opt$config == ""){
 }
 config = flepicommon::load_config(opt$config)
 
+if (!is.null(config$inference$incl_aggr_likelihood)){
+    print("Using config option for `incl_aggr_likelihood`.")
+    opt$incl_aggr_likelihood <- config$inference$incl_aggr_likelihood
+    if (!is.null(config$inference$total_ll_multiplier)){
+        print("Using config option for `total_ll_multiplier`.")
+        opt$total_ll_multiplier <- config$inference$total_ll_multiplier
+    } else {
+        opt$total_ll_multiplier <- 1
+    }
+}
+
 ## Check for errors in config ---------------------------------------------------------------------
 
 ##  seeding section
@@ -262,6 +274,20 @@ if (config$inference$do_inference){
     dplyr::right_join(tidyr::expand_grid(subpop = unique(.$subpop), date = unique(.$date))) %>%
     dplyr::mutate_if(is.numeric, dplyr::coalesce, 0)
 
+
+  # add aggregate groundtruth to the obs data for the likelihood calc
+  if (opt$incl_aggr_likelihood){
+      obs <- obs %>%
+          dplyr::bind_rows(
+              obs %>%
+                  dplyr::select(date, where(is.numeric)) %>%
+                  dplyr::group_by(date) %>%
+                  dplyr::summarise(across(everything(), sum)) %>% # no likelihood is calculated for time periods with missing data for any subpop
+                  dplyr::mutate(source = "Total",
+                                subpop = "Total")
+          )
+  }
+
   subpopnames <- unique(obs[[obs_subpop]])
 
 
@@ -605,7 +631,19 @@ for(seir_modifiers_scenario in seir_modifiers_scenarios) {
       # run
       if (config$inference$do_inference){
         sim_hosp <- flepicommon::read_file_of_type(gsub(".*[.]","",this_global_files[['hosp_filename']]))(this_global_files[['hosp_filename']]) %>%
-          dplyr::filter(time >= min(obs$date),time <= max(obs$date))
+          dplyr::filter(time >= min(obs$date), time <= max(obs$date))
+
+        # add aggregate groundtruth to the obs data for the likelihood calc
+        if (opt$incl_aggr_likelihood){
+            sim_hosp <- sim_hosp %>%
+                dplyr::bind_rows(
+                    sim_hosp %>%
+                        dplyr::select(-tidyselect::all_of(obs_subpop), -tidyselect::starts_with("date")) %>%
+                        dplyr::group_by(time) %>%
+                        dplyr::summarise(dplyr::across(tidyselect::everything(), sum)) %>% # no likelihood is calculated for time periods with missing data for any subpop
+                        dplyr::mutate(!!obs_subpop := "Total")
+                )
+        }
 
         lhs <- unique(sim_hosp[[obs_subpop]])
         rhs <- unique(names(data_stats))
@@ -641,6 +679,12 @@ for(seir_modifiers_scenario in seir_modifiers_scenarios) {
 
       rm(sim_hosp)
 
+      # multiply aggregate likelihood by a factor if specified in config
+      if (opt$incl_aggr_likelihood){
+        proposed_likelihood_data$ll[proposed_likelihood_data$subpop == "Total"] <- proposed_likelihood_data$ll[proposed_likelihood_data$subpop == "Total"] * opt$total_ll_multiplier
+      }
+
+
       # write proposed likelihood to global file
       arrow::write_parquet(proposed_likelihood_data, this_global_files[['llik_filename']])