NSAPH-Data-Processing
diff --git a/‎README.md‎
Lines changed: 24 additions & 15 deletions b/‎README.md‎
Lines changed: 24 additions & 15 deletions
diff --git a/‎code/03_run_get_weights_par.R‎
Lines changed: 5 additions & 5 deletions b/‎code/03_run_get_weights_par.R‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎code/03_run_get_weights_par.sbatch‎
Lines changed: 0 additions & 9 deletions b/‎code/03_run_get_weights_par.sbatch‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎code/03_run_get_weights_par_script.R‎
Lines changed: 11 additions & 0 deletions b/‎code/03_run_get_weights_par_script.R‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎code/04_daily_aggregation.R‎
Lines changed: 11 additions & 37 deletions b/‎code/04_daily_aggregation.R‎
Lines changed: 11 additions & 37 deletions
diff --git a/‎code/05_pobox_zip_association.R‎
Lines changed: 0 additions & 65 deletions b/‎code/05_pobox_zip_association.R‎
Lines changed: 0 additions & 65 deletions
diff --git a/‎code/06_pobox_smoke_association.R‎
Lines changed: 0 additions & 46 deletions b/‎code/06_pobox_smoke_association.R‎
Lines changed: 0 additions & 46 deletions
diff --git a/‎code/07_annual_average.R‎
Lines changed: 0 additions & 30 deletions b/‎code/07_annual_average.R‎
Lines changed: 0 additions & 30 deletions
@@ -1,30 +1,39 @@
 # smoke_aggregation
-Repository for computing the standardized weight across different zipcode given the 10km grids
+Repository for computing standardized weights between zipcodes and the 10 km grid cells, then calculating daily smoke values per zipcode for 2006-2016
+
 * Input: 
-  * # 10km_grid/10km_grid_wgs84/: This is a folder that contains the shapefile for the 10 km grid.
+  * 10km_grid/10km_grid_wgs84/: This is a folder that contains the shapefile for the 10 km grid.
   * 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds: This is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
-  * Yearly zipcode grid shapefile: ./data/input/Zipcode_Info/polygon/ESRI", year, "USZIP5_POLY_WGS84.shp 
+  * Yearly zipcode grid shapefile: `./data/input/Zipcode_Info/polygon/ESRI{YY}USZIP5_POLY_WGS84.shp`, where `{YY}` is the two-digit year (e.g. `ESRI06USZIP5_POLY_WGS84.shp`)
 
-* Output: Rds file containing zip_id, grid_id, and weight
+* Output: one CSV file per year containing zip, date, and smoke
 
 Example output: 
 ```
-  zip_id grid_id  w
-1 03281 104504 0.120474045
-2 03281 104505 0.489675352
-3 03281 104506 0.001825444
-4 03281 105019 0.080026705
-5 03281 105020 0.307998349
+  zip   date       smoke
+  <chr> <date>     <dbl>
+1 00012 2016-01-01     0
+2 00012 2016-01-02     0
+3 00012 2016-01-03     0
+4 00012 2016-01-04     0
+5 00012 2016-01-05     0
+6 00012 2016-01-06     0
 ```
 
 To run example weights:   
 ```
-mkdir $HOME/singularity_images
-cd $HOME/singularity_images
-singularity pull docker://nsaph/r_exposures:v0
 cd code
 Rscript 01_yearly_grid_mean.R
 Rscript 02_match_cts_extent_list.R
-Rscript test_get_weights.R
-sbatch 03_run_get_weights_par.sbatch
+Rscript 03_run_get_weights_par_script.R
+Rscript 04_daily_aggregation.R
+
 ```
+## Notebook EDA 
+This is a high-level explanation of what each notebook in the notebooks folder does: 
+1. 01_eda_preds.Rmd: Identify grids with smoke per day, distribution of smoke values
+2. 02_yearly_grid_mean.Rmd: Aggregation to obtain grid_id, year, and smoke values
+3. 03_match_cts_extent.Rmd: Match the crs and extents of sf objects, create bounding box, plot the mapping between zipcode and grid_id
+4. 04_test_grid_to_zip.Rmd: Join grid with smoke values with 'over' function
+5. 07_get_weights_par_test.Rmd: get_weights_par unit test. Tests that the weights for a single zipcode sum to 1 
+6. 08_zip_code_EDA.Rmd: ZIP code EDA to see the coverage of the smoke aggregation files
@@ -8,7 +8,7 @@ library(parallel)
 library(argparse)
 print("This is the sf package version we are using:")
 print(packageVersion("sf"))
-
+print(Sys.time())
 ## define parser arguments ----
 parser <- ArgumentParser()
 parser$add_argument("-y", "--year", default=2006,
@@ -22,7 +22,7 @@ print("use R script get_weights_par")
 # args$cores = 24
 
 ## read functions ----
-source("../../lib/get_weights_par.R")
+source("../lib/get_weights_par.R")
 
 print("load data")
 ## Load grid and zip sf objects ----
@@ -50,8 +50,8 @@ zip_weights_df$year <- args$year
 ## save output ----
 write_csv(
   zip_weights_df, 
-  paste0("../data/output/scratch/", 
-         "zip_weights_df_test_par", as.character(args$year), ".csv")
+  paste0("../data/output/", 
+         "zip_weights_df_", as.character(args$year), ".csv")
 )
-
+print(Sys.time())
 print("completed")
@@ -0,0 +1,11 @@
+# Run 03_run_get_weights_par.R once per year, each in its own Rscript process.
+# The worker is deliberately not source()d here: sourcing would execute it
+# in-process with its default arguments (year 2006) before the loop starts.
+years <- 2006:2016
+
+for (year in years) {
+  # system2() takes the arguments as a vector and returns the exit status.
+  worker_args <- c("03_run_get_weights_par.R", "-y", year, "-c", "40")
+  status <- system2("Rscript", worker_args)
+  if (status != 0) warning("weights run failed for year ", year, call. = FALSE)
+}
@@ -13,7 +13,7 @@ grid_10km = read_sf("../data/input/local_data/10km_grid_wgs84/10km_grid_wgs84.sh
   )
 
 # Load smokePM predictions on smoke days
-preds = read_csv("/net/rcstorenfs02/ifs/rc_labs/dominici_lab/lab/data/exposures/smoke/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>% 
+preds = read_csv("../data/input/local_data/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>% 
   mutate(
     date = ymd(date)
   ) %>% 
@@ -28,9 +28,10 @@ zip_smoke_df = data.frame(zip = character(),
                           smoke = double(),
                           stringsAsFactors = FALSE)
 
-for(y_ in 2006:2007){
-  y_ <- 2006
+for(y_ in 2016:2016){
   # Load full set of dates
+  print("Now Processing year:")
+  print(y_)
   dates = seq.Date(ymd(paste0(y_, "0101")), 
                    ymd(paste0(y_, "1231")), 
                    by = "day")
@@ -59,38 +60,11 @@ for(y_ in 2006:2007){
     group_by(zip, date)  %>%
     summarise(smoke =  weighted.mean(smoke, w=w))
 
-  zip_smoke_df = rbind(zip_smoke_df, zip_smoke_df_y)
-  
-}
-
-zip_sf = read_rds("../data/intermediate/scratch/zip_sf_list.rds") 
-
-save(zip_smoke_df, file = "../data/output/smoke/daily_zip_test.RData")
-
-
-
-# Load required libraries
-library(tidyverse)
-library(sf)
-
-# Read in the shapefile for zipcodes
-zip_sf <- read_sf("../data/input/local_data/Zipcode_Info/polygon/ESRI06USZIP5_POLY_WGS84.shp")
-
-zip_smoke_df$date <- as.Date(zip_smoke_df$date)
-
-# Subset the data for the given date
-zip_smoke_subset <- zip_smoke_df %>% filter(date == "2006-01-01")
-
-# Join the data with the shapefile based on the zipcode
-zip_sf <- left_join(zip_sf, zip_smoke_subset, by = c("ZIP" = "zip"))
-
-# Drop rows with NULL values in smoke column
-zip_sf <- zip_sf %>% drop_na(smoke)
-
-
-# Create a map of smoke in every zipcode with thinner line width
-ggplot() +
-  geom_sf(data = zip_sf, aes(fill = smoke), lwd = 0.1) +
-  scale_fill_gradient(low = "yellow", high = "red") +
-  theme_void()
+  #save(zip_smoke_df_y, file = "../data/output/smoke/daily_zip_.RData")
+  write_csv(
+    zip_smoke_df_y, 
+    paste0("../data/output/", 
+           "daily_zip_", as.character(y_), ".csv")
+  )
 
+}