
Commit b2bfe52

Author: Kezia Irene
Commit message: add smoke aggregation
2 parents d1703e5 + d157e39 commit b2bfe52

28 files changed: +1189 -2 lines changed

README.md

Lines changed: 29 additions & 1 deletion
@@ -1,2 +1,30 @@
# smoke_aggregation

Repository for computing the standardized weights across different zipcodes given the 10 km grids.

* Input:
  * 10km_grid/10km_grid_wgs84/: a folder containing the shapefile for the 10 km grid.
  * 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds: a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
  * Yearly zipcode polygon shapefiles: ./data/input/Zipcode_Info/polygon/ESRI<year>USZIP5_POLY_WGS84.shp, where <year> is the two-digit year (e.g. 06 for 2006).

* Output: Rds file containing zip_id, grid_id, and weight (w)

Example output:

```
  zip_id grid_id           w
1  03281  104504 0.120474045
2  03281  104505 0.489675352
3  03281  104506 0.001825444
4  03281  105019 0.080026705
5  03281  105020 0.307998349
```

To run example weights:

```
mkdir $HOME/singularity_images
cd $HOME/singularity_images
singularity pull docker://nsaph/r_exposures:v0
cd code
Rscript 01_yearly_grid_mean.R
Rscript 02_match_cts_extent_list.R
Rscript test_get_weights.R
sbatch 03_run_get_weights_par.sbatch
```
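
In the example output above, the weights for zip 03281 sum to 1, so zip-level values are area-weighted means of the overlapping grid cells (this is how code/04_daily_aggregation.R applies them). Below is a minimal sketch of that use; `grid_smoke` is a hypothetical data frame of grid-level values, and the weights file name follows code/04_daily_aggregation.R rather than being prescribed by this README.

```
library(tidyverse)

# Hypothetical grid-level smoke values (in practice these come from the
# smokePM predictions; see code/04_daily_aggregation.R)
grid_smoke <- tibble(
  grid_id = c(104504, 104505, 104506, 105019, 105020),
  smoke   = c(2.1, 0.0, 5.4, 1.3, 0.7)
)

# Area-based weights produced by this repository: zip_id, grid_id, w
weights <- read_csv("data/output/zip_weights_df_2006.csv")

# Zip-level smoke = weighted mean over the overlapping grid cells
zip_smoke <- weights %>%
  inner_join(grid_smoke, by = "grid_id") %>%
  group_by(zip_id) %>%
  summarise(smoke = weighted.mean(smoke, w = w))
```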

code/01_yearly_grid_mean.R

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# --------------------------------------------------------------------------------
# 10 km grid
# --------------------------------------------------------------------------------
# 10km_grid/10km_grid_wgs84/:
# This is a folder that contains the shapefile for the 10 km grid.
#
# --------------------------------------------------------------------------------
# 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds:
# This is a file that contains a data frame with the final set of daily smoke
# PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to
# December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this
# file corresponds to the 'ID' column in the 10 km grid shapefile.
#
# All rows in this file are predictions on smoke days. Predictions on non-smoke
# days are by construction 0 ug/m^3 and not included in this file. A smoke PM2.5
# prediction of 0 in this file means that the grid cell-day did have a smoke day
# but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on
# both smoke days and non-smoke days can be obtained by setting the smoke PM2.5
# prediction to 0 on grid cell-days in the 10 km grid and in the
# January 1, 2006-December 31, 2020 date range that are not in this file.
# For example, the R code below returns the full set of smoke PM2.5 predictions:

library(tidyverse)
library(lubridate)
library(sf)
library(magrittr)  # for the %<>% assignment pipe used below

# Load smokePM predictions on smoke days
preds = read_csv("../data/input/smoke_PM/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
  mutate(
    date = ymd(date)
  ) %>%
  rename(
    smoke = smokePM_pred,
    grid_id = grid_id_10km
  )

# Load 10 km grid
grid_10km = read_sf("../data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")

smoke_grid_df_list = list()
years_ = 2006:2016

for(y_ in years_) {
  print(y_)
  # Load full set of dates for this year
  # dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
  dates = seq.Date(ymd(paste0(y_, "0101")),
                   ymd(paste0(y_, "1231")),
                   by = "day")

  # Get full combination of grid cell-days
  # Warning: this may require a large amount of memory
  out = expand.grid(grid_id = grid_10km$ID, date = dates)

  # Match smokePM predictions on smoke days to grid cell-days
  out = left_join(out, preds, by = c("grid_id", "date"))

  # Predict 0 for remaining grid cell-days, which are non-smoke days
  out = mutate(out, smoke = replace_na(smoke, 0))

  # Compute the yearly grid mean of smoke
  out %<>%
    mutate(year = year(date)) %>%
    group_by(grid_id, year) %>%
    summarise(smoke = mean(smoke))

  smoke_grid_df_list[[as.character(y_)]] <- out
}

write_rds(smoke_grid_df_list,
          "../data/intermediate/scratch/smoke_grid_df_list.rds")

code/02_match_cts_extent_list.R

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
library(raster)
library(rgeos)
library(viridis)
library(tictoc)
library(dplyr)
sf_use_s2(FALSE)

years = sprintf("%02d", c(6:16))
grid_sf = read_sf("./data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")

zip_sf_list = list()

for(year in years){

  zip_sf = read_sf(paste0("./data/input/Zipcode_Info/polygon/ESRI", year, "USZIP5_POLY_WGS84.shp"))

  # making sure crs are equivalent
  # if(st_crs(grid_sf) != st_crs(zip_sf)){
  #   zip_sf <- st_transform(zip_sf, st_crs(grid_sf))
  # }

  zip_sf <- st_make_valid(zip_sf)
  zip_sf <- st_crop(zip_sf, st_bbox(grid_sf))

  zip_sf_list[[paste0("20", year)]] = zip_sf

}

write_rds(zip_sf_list, "./data/intermediate/scratch/zip_sf_list.rds")


# zip_sf_list = read_rds("./data/intermediate/scratch/zip_sf_list.rds")
# zip_s = zip_sf_list[["2016"]]

# zip_s %>%
#   ggplot() +
#   geom_sf(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
#   theme(legend.position = "none")

code/03_run_get_weights_par.R

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
## load libraries ----
library(dplyr)
library(tidyverse)
library(magrittr)
library(sf)
library(raster)
library(parallel)
library(argparse)
print("This is the sf package version we are using:")
print(packageVersion("sf"))

## define parser arguments ----
parser <- ArgumentParser()
parser$add_argument("-y", "--year", default=2006,
                    help="Year to run", type="integer")
parser$add_argument("-c", "--cores", default=24,
                    help="Number of cores", type="integer")
args = parser$parse_args()
print("use R script get_weights_par")
# args = list()
# args$year = 2006
# args$cores = 24

## read functions ----
source("../../lib/get_weights_par.R")

print("load data")
## Load grid and zip sf objects ----
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
  rename(grid_id = ID)
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
  rename(zip_id = ZIP)
rm(zip_sf_list)


print("run aggregations")
## run aggregations ----
zip_weights_df <- get_weights_par(
  x_poly_sf = zip_sf,
  y_poly_sf = grid_sf,
  x_id = "zip_id",
  y_id = "grid_id",
  cores = args$cores
)

print("finish aggregations")
zip_weights_df$year <- args$year

## save output ----
write_csv(
  zip_weights_df,
  paste0("../data/output/scratch/",
         "zip_weights_df_test_par", as.character(args$year), ".csv")
)

print("completed")

code/03_run_get_weights_par.sbatch

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
#!/bin/bash
#
#SBATCH -p fasse          # partition (queue)
#SBATCH -c 48             # number of cores
#SBATCH --mem 100GB       # memory pool for all cores
#SBATCH -t 1-12:00        # time (D-HH:MM)


singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y 2010 -c 40
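
The script above submits a single hard-coded year (2010). If all years from 2006 to 2016 are needed, one option, sketched below and not part of this commit, is a Slurm job array that maps the array index to a year; the partition, image name, and resource settings are copied from the script above.

```
#!/bin/bash
#
#SBATCH -p fasse          # partition (queue)
#SBATCH -c 48             # number of cores
#SBATCH --mem 100GB       # memory pool for all cores
#SBATCH -t 1-12:00        # time (D-HH:MM)
#SBATCH --array=0-10      # 11 tasks, one per year 2006-2016 (sketch only)

YEAR=$((2006 + SLURM_ARRAY_TASK_ID))
singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y ${YEAR} -c 40
```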

code/04_daily_aggregation.R

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
library(dplyr)
library(ggplot2)
sf_use_s2(FALSE)

# Load grid
grid_10km = read_sf("../data/input/local_data/10km_grid_wgs84/10km_grid_wgs84.shp") %>%
  rename(
    grid_id = ID
  )

# Load smokePM predictions on smoke days
preds = read_csv("/net/rcstorenfs02/ifs/rc_labs/dominici_lab/lab/data/exposures/smoke/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
  mutate(
    date = ymd(date)
  ) %>%
  rename(
    smoke = smokePM_pred,
    grid_id = grid_id_10km
  )

# Create output data.frame
zip_smoke_df = data.frame(zip = character(),
                          date = Date(),
                          smoke = double(),
                          stringsAsFactors = FALSE)

for(y_ in 2006:2007){
  # y_ <- 2006
  # Load full set of dates for this year
  dates = seq.Date(ymd(paste0(y_, "0101")),
                   ymd(paste0(y_, "1231")),
                   by = "day")

  # Get full combination of grid cell-days
  out = expand.grid(grid_id = grid_10km$grid_id, date = dates)

  # Match smokePM predictions on smoke days to grid cell-days
  out = left_join(out, preds, by = c("grid_id", "date"))

  # Predict 0 for remaining grid cell-days, which are non-smoke days
  out = mutate(out, smoke = replace_na(smoke, 0))


  # Load area-based weights
  zip_to_grid = read.csv(paste0("../data/output/zip_weights_df_", as.character(y_), ".csv")) %>%
    mutate(
      zip = sprintf("%05d", zip_id)
    )

  # Merge gridded smoke values with weights
  zip_grid_smoke = merge(out, zip_to_grid)

  # Compute zip-code level smoke values
  zip_smoke_df_y = zip_grid_smoke %>%
    group_by(zip, date) %>%
    summarise(smoke = weighted.mean(smoke, w = w))

  zip_smoke_df = rbind(zip_smoke_df, zip_smoke_df_y)

}

zip_sf = read_rds("../data/intermediate/scratch/zip_sf_list.rds")

save(zip_smoke_df, file = "../data/output/smoke/daily_zip_test.RData")



# Load required libraries
library(tidyverse)
library(sf)

# Read in the shapefile for zipcodes
zip_sf <- read_sf("../data/input/local_data/Zipcode_Info/polygon/ESRI06USZIP5_POLY_WGS84.shp")

zip_smoke_df$date <- as.Date(zip_smoke_df$date)

# Subset the data for the given date
zip_smoke_subset <- zip_smoke_df %>% filter(date == "2006-01-01")

# Join the data with the shapefile based on the zipcode
zip_sf <- left_join(zip_sf, zip_smoke_subset, by = c("ZIP" = "zip"))

# Drop rows with NA values in the smoke column
zip_sf <- zip_sf %>% drop_na(smoke)


# Create a map of smoke in every zipcode with thinner line width
ggplot() +
  geom_sf(data = zip_sf, aes(fill = smoke), lwd = 0.1) +
  scale_fill_gradient(low = "yellow", high = "red") +
  theme_void()

code/05_pobox_zip_association.R

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
#library(viridis)
#library(fst)
#library(data.table)
#library(dplyr)
#library(rgeos)
#library(sp)
library(ggplot2)
#library(rgdal)
#sf_use_s2(FALSE)

years = sprintf("%02d", c(06:16))

# Load zip sf list
zip_sf_list = read_rds("../data/input/smoke/zip_sf_list.rds")

pobox_zip_df = data.frame(PO_BOX = character(),
                          ZIP = character(),
                          #LAT = double(),
                          #LON = double(),
                          year = double(),
                          stringsAsFactors = FALSE)

for(year in years){

  zip_sf = zip_sf_list[[paste0("20", year)]]

  po_box = read_csv(paste0("/n/dominici_nsaph_l3/Lab/data/shapefiles/zip_shape_files/Zipcode_Info/pobox_csv/ESRI", year, "USZIP5_POINT_WGS84_POBOX.csv")) %>%
    rename(PO_BOX = ZIP) %>%
    mutate(PO_BOX = sprintf("%05d", PO_BOX))

  po_box_sf = st_as_sf(po_box[, c(1, 3, 4)], coords = c("POINT_X", "POINT_Y"), crs = st_crs(zip_sf))

  po_box_sf <- st_crop(po_box_sf, st_bbox(zip_sf))

  # zip_sf %>%
  #   st_simplify() %>%
  #   ggplot(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
  #   geom_sf() +
  #   geom_sf(data = po_box_sf) +
  #   theme(legend.position = "none")

  # Assign each PO box point the ZIP of the polygon it falls in
  #po_box_sf$ZIP = unlist(st_drop_geometry(zip_sf[unlist(st_intersects(po_box_sf, zip_sf)),"ZIP"]))
  po_box_sf$ZIP = st_drop_geometry(zip_sf)$ZIP[unlist(st_intersects(po_box_sf, zip_sf))]

  # po_box %<>%
  #   merge(po_box_sf, by = "PO_BOX")
  #
  # po_box = st_drop_geometry(po_box[, c(1, 6, 3, 4)]) %>%
  #   rename(LAT = POINT_X, LON = POINT_Y)
  #
  # po_box$year = as.numeric(paste0("20",year))
  # pobox_zip_df = rbind(pobox_zip_df, po_box)

  po_box_sf$year = as.numeric(paste0("20", year))
  pobox_zip_df = rbind(pobox_zip_df,
                       st_drop_geometry(po_box_sf))

}

#save(pobox_zip_df, file = "../data/intermediate/scratch/pobox_zip_df.RData")
saveRDS(pobox_zip_df, file = "../data/input/smoke/pobox_zip_df.rds")
