Skip to content

Commit d157e39

Browse files
Merge pull request #1 from NSAPH-Data-Processing/issue-1
add readme
2 parents 84d95cd + d883b89 commit d157e39

22 files changed

+904
-1
lines changed

README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,30 @@
11
# smoke_aggregation
2-
Repository for computing the standardized weight across different zipcode given the 10km grids
2+
Repository for computing the standardized weights across different zipcodes given the 10 km grid
3+
* Input:
4+
* 10km_grid/10km_grid_wgs84/: This is a folder that contains the shapefile for the 10 km grid.
5+
* 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds: This is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
6+
* Yearly zipcode polygon shapefile: ./data/input/Zipcode_Info/polygon/ESRI{year}USZIP5_POLY_WGS84.shp, where {year} is the two-digit year (e.g. 06 for 2006).
7+
8+
* Output: an .rds file containing the columns zip_id, grid_id, and w (weight)
9+
10+
Example output:
11+
```
12+
zip_id grid_id w
13+
1 03281 104504 0.120474045
14+
2 03281 104505 0.489675352
15+
3 03281 104506 0.001825444
16+
4 03281 105019 0.080026705
17+
5 03281 105020 0.307998349
18+
```
19+
20+
To run example weights:
21+
```
22+
mkdir $HOME/singularity_images
23+
cd $HOME/singularity_images
24+
singularity pull docker://nsaph/r_exposures:v0
25+
cd code
26+
Rscript 01_yearly_grid_mean.R
27+
Rscript 02_match_cts_extent_list.R
28+
Rscript test_get_weights.R
29+
sbatch 03_run_get_weights_par.sbatch
30+
```

code/01_yearly_grid_mean.R

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
# --------------------------------------------------------------------------------
3+
# 10 km grid
4+
# --------------------------------------------------------------------------------
5+
# 10km_grid/10km_grid_wgs84/:
6+
# This is a folder that contains the shapefile for the 10 km grid.
7+
#
8+
# --------------------------------------------------------------------------------
9+
# 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds:
10+
# This is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
11+
#
12+
# All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 ug/m^3 and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the grid cell-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on grid cell-days in the 10 km grid and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
13+
14+
15+
library(tidyverse)
16+
library(lubridate)
17+
library(sf)
18+
19+
# Load smokePM predictions on smoke days
20+
preds = read_csv("../data/input/smoke_PM/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
21+
mutate(
22+
date = ymd(date)
23+
) %>%
24+
rename(
25+
smoke = smokePM_pred,
26+
grid_id = grid_id_10km
27+
)
28+
29+
# Load 10 km grid
30+
grid_10km = read_sf("../data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")
31+
32+
smoke_grid_df_list = list()
33+
years_ = 2006:2016
34+
35+
for(y_ in years_) {
36+
print(y_)
37+
# Load full set of dates
38+
#dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
39+
dates = seq.Date(ymd(paste0(y_, "0101")),
40+
ymd(paste0(y_, "1231")),
41+
by = "day")
42+
43+
# Get full combination of grid cell-days
44+
# Warning: this may require a large amount of memory
45+
out = expand.grid(grid_id = grid_10km$ID, date = dates)
46+
47+
# Match smokePM predictions on smoke days to grid cell-days
48+
out = left_join(out, preds, by = c("grid_id", "date"))
49+
50+
# Predict 0 for remaining grid cell-days, which are non-smoke days
51+
out = mutate(out, smoke = replace_na(smoke, 0))
52+
53+
# Compute smoke yearly grid mean
54+
out %<>%
55+
mutate(year = year(date)) %>%
56+
group_by(grid_id, year) %>%
57+
summarise(smoke = mean(smoke))
58+
59+
smoke_grid_df_list[[as.character(y_)]] <- out
60+
}
61+
62+
write_rds(smoke_grid_df_list,
63+
"../data/intermediate/scratch/smoke_grid_df_list.rds")

code/02_match_cts_extent_list.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
library(tidyverse)
2+
library(magrittr)
3+
library(lubridate)
4+
library(sf)
5+
library(raster)
6+
library(rgeos)
7+
library(viridis)
8+
library(tictoc)
9+
library(dplyr)
10+
sf_use_s2(FALSE)
11+
12+
years = sprintf("%02d", c(6:16))
13+
grid_sf = read_sf("./data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")
14+
15+
zip_sf_list = list()
16+
17+
for(year in years){
18+
19+
zip_sf = read_sf(paste0("./data/input/Zipcode_Info/polygon/ESRI", year, "USZIP5_POLY_WGS84.shp"))
20+
21+
# making sure crs are equivalent
22+
# if(st_crs(grid_sf)!= st_crs(zip_sf)){
23+
# zip_sf <- st_transform(st_crs(grid_sf))
24+
# }
25+
26+
zip_sf <- st_make_valid(zip_sf)
27+
zip_sf <- st_crop(zip_sf, st_bbox(grid_sf))
28+
29+
zip_sf_list[[paste0("20",year)]] = zip_sf
30+
31+
}
32+
33+
write_rds(zip_sf_list, "./data/intermediate/scratch/zip_sf_list.rds")
34+
35+
36+
# zip_sf_list = read_rds("./data/intermediate/scratch/zip_sf_list.rds")
37+
# zip_s = zip_sf_list[["2016"]]
38+
39+
# zip_s %>%
40+
# ggplot() +
41+
# geom_sf(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
42+
# theme(legend.position = "none")
43+
44+
45+
46+
47+

code/03_run_get_weights_par.R

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
## load libraries ----
2+
library(dplyr)
3+
library(tidyverse)
4+
library(magrittr)
5+
library(sf)
6+
library(raster)
7+
library(parallel)
8+
library(argparse)
9+
print("This is the sf package version we are using:")
10+
print(packageVersion("sf"))
11+
12+
## define parser arguments ----
13+
parser <- ArgumentParser()
14+
parser$add_argument("-y", "--year", default=2006,
15+
help="Year to run", type="integer")
16+
parser$add_argument("-c", "--cores", default=24,
17+
help="Number of cores", type="integer")
18+
args = parser$parse_args()
19+
print("use R script get_weights_par")
20+
# args = list()
21+
# args$year = 2006
22+
# args$cores = 24
23+
24+
## read functions ----
25+
source("../../lib/get_weights_par.R")
26+
27+
print("load data")
28+
## Load grid and zip sf objects ----
29+
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
30+
rename(grid_id = ID)
31+
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
32+
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
33+
rename(zip_id = ZIP)
34+
rm(zip_sf_list)
35+
36+
37+
print("run aggregations")
38+
## run aggregations ----
39+
zip_weights_df <- get_weights_par(
40+
x_poly_sf = zip_sf,
41+
y_poly_sf = grid_sf,
42+
x_id = "zip_id",
43+
y_id = "grid_id",
44+
cores = args$cores
45+
)
46+
47+
print("finish aggregations")
48+
zip_weights_df$year <- args$year
49+
50+
## save output ----
51+
write_csv(
52+
zip_weights_df,
53+
paste0("../data/output/scratch/",
54+
"zip_weights_df_test_par", as.character(args$year), ".csv")
55+
)
56+
57+
print("completed")

code/03_run_get_weights_par.sbatch

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
2+
#
3+
#SBATCH -p fasse # partition (queue)
4+
#SBATCH -c 48 # number of cores
5+
#SBATCH --mem 100GB # memory pool for all cores
6+
#SBATCH -t 1-12:00 # time (D-HH:MM)
7+
8+
9+
singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y 2010 -c 40

code/test_get_weights.R

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
## load libraries ----
2+
library(dplyr)
3+
library(tidyverse)
4+
library(magrittr)
5+
library(sf)
6+
library(raster)
7+
library(parallel)
8+
library(argparse)
9+
print("This is the sf package version we are using:")
10+
print(packageVersion("sf"))
11+
12+
## define parser arguments ----
13+
parser <- ArgumentParser()
14+
parser$add_argument("-y", "--year", default=2006,
15+
help="Year to run", type="integer")
16+
parser$add_argument("-c", "--cores", default=24,
17+
help="Number of cores", type="integer")
18+
args = parser$parse_args()
19+
print("use R script get_weights_par")
20+
# args = list()
21+
# args$year = 2006
22+
# args$cores = 24
23+
24+
## read functions ----
25+
source("../../lib/get_weights_par.R")
26+
27+
print("load data")
28+
## Load grid and zip sf objects ----
29+
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
30+
rename(grid_id = ID)
31+
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
32+
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
33+
rename(zip_id = ZIP)
34+
rm(zip_sf_list)
35+
x_poly_sf = zip_sf
36+
y_poly_sf = grid_sf
37+
x_id = "zip_id"
38+
y_id = "grid_id"
39+
cores = args$cores
40+
41+
## crop polygons within same bounding box ----
42+
x_poly_sf <- st_make_valid(x_poly_sf)
43+
x_poly_sf <- st_crop(x_poly_sf, st_bbox(y_poly_sf))
44+
45+
# assign 1s to zipcode polygons ----
46+
x_poly_sf$w <- 1
47+
48+
49+
x_to_y <- data.frame()
50+
51+
# error zipcodes
52+
for(i in c("03281")) {
53+
x_i_sf <- dplyr::select(x_poly_sf[x_poly_sf[[x_id]] == i, ], c("w", "geometry"))
54+
y_i_sf <- dplyr::select(st_crop(y_poly_sf, extent(x_i_sf)), c(y_id, "geometry"))
55+
56+
# y_i_w <- st_drop_geometry(st_interpolate_aw(x_i_sf, y_i_sf, extensive = T))
57+
tryCatch({
58+
y_i_w <- st_drop_geometry(st_interpolate_aw(x_i_sf, y_i_sf, extensive = T))
59+
60+
x_to_y_i <- data.frame(
61+
x_id = i,
62+
y_id = y_i_sf[[y_id]][as.numeric(rownames(y_i_w))],
63+
w = y_i_w$w)
64+
65+
x_to_y <- rbind(x_to_y, x_to_y_i)
66+
67+
}, error=function(e){
68+
print("An error occurred while calculating the weights.")
69+
print(i)
70+
})
71+
}
72+
73+
74+
## example successful one
75+
test_x <- dplyr::select(x_poly_sf[x_poly_sf[[x_id]] == "01604", ], c("w", "geometry"))
76+
test_y <- dplyr::select(st_crop(y_poly_sf, extent(test_x)), c(y_id, "geometry"))
77+
78+
test_interpolate <- st_interpolate_aw(test_x, test_y, extensive = T)
79+
80+
############
81+
# The success one
82+
plot(test_x[[2]])
83+
plot(test_y[[2]], add = TRUE)
84+
plot(test_interpolate[[2]])
85+
86+
87+
# The error one
88+
plot(x_i_sf[[2]])
89+
plot(y_i_sf[[2]], add = TRUE)
90+
st_interpolate_aw(x_i_sf, y_i_sf, extensive = T)
91+
92+

data/input/local_data/.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ignore everything (dirs, files, sub-dirs, sub-files)
2+
/*
3+
# but do not ignore the .gitignore file
4+
!.gitignore
5+
# but do not ignore the README.md file
6+
!README.md
7+
# do not ignore x folder
8+
!/x

data/input/local_data/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
```
3+
ln -s /n/dominici_nsaph/Lab/data/shapefiles/zip_shape_files/Zipcode_Info .
4+
ln -s /n/dominici_lab/lab/data/10km_grid_wgs84 .
5+
```

data/input/remote_data/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Folder containing data that is pushed to the remote repository.

data/intermediate/.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ignore everything (dirs, files, sub-dirs, sub-files)
2+
/*
3+
# but do not ignore the .gitignore file
4+
!.gitignore
5+
# but do not ignore the README.md file
6+
!README.md
7+
# do not ignore the remote_data folder
8+
!/remote_data

0 commit comments

Comments
 (0)