
Commit b2bfe52

Author: Kezia Irene
Commit message: add smoke aggregation
2 parents d1703e5 + d157e39 commit b2bfe52

28 files changed: +1189 -2 lines changed

README.md

Lines changed: 29 additions & 1 deletion
@@ -1,2 +1,30 @@
# smoke_aggregation

Repository for computing the standardized weights across different zipcodes given the 10 km grids.

* Input:
  * 10km_grid/10km_grid_wgs84/: a folder containing the shapefile for the 10 km grid.
  * 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds: a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
  * Yearly zipcode polygon shapefiles: ./data/input/Zipcode_Info/polygon/ESRI<year>USZIP5_POLY_WGS84.shp, where <year> is the two-digit year (e.g. 06 for 2006).

* Output: Rds file containing zip_id, grid_id, and weight (w)

Example output:

```
  zip_id grid_id           w
1  03281  104504 0.120474045
2  03281  104505 0.489675352
3  03281  104506 0.001825444
4  03281  105019 0.080026705
5  03281  105020 0.307998349
```

To run example weights:

```
mkdir $HOME/singularity_images
cd $HOME/singularity_images
singularity pull docker://nsaph/r_exposures:v0
cd code
Rscript 01_yearly_grid_mean.R
Rscript 02_match_cts_extent_list.R
Rscript test_get_weights.R
sbatch 03_run_get_weights_par.sbatch
```
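
In the example output above, the weights for zip 03281 sum to 1, so zip-level values are area-weighted means of the overlapping grid cells (this is how code/04_daily_aggregation.R applies them). Below is a minimal sketch of that use; `grid_smoke` is a hypothetical data frame of grid-level values, and the weights file name follows code/04_daily_aggregation.R rather than being prescribed by this README.

```
library(tidyverse)

# Hypothetical grid-level smoke values (in practice these come from the
# smokePM predictions; see code/04_daily_aggregation.R)
grid_smoke <- tibble(
  grid_id = c(104504, 104505, 104506, 105019, 105020),
  smoke   = c(2.1, 0.0, 5.4, 1.3, 0.7)
)

# Area-based weights produced by this repository: zip_id, grid_id, w
weights <- read_csv("data/output/zip_weights_df_2006.csv")

# Zip-level smoke = weighted mean over the overlapping grid cells
zip_smoke <- weights %>%
  inner_join(grid_smoke, by = "grid_id") %>%
  group_by(zip_id) %>%
  summarise(smoke = weighted.mean(smoke, w = w))
```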

code/01_yearly_grid_mean.R

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# --------------------------------------------------------------------------------
# 10 km grid
# --------------------------------------------------------------------------------
# 10km_grid/10km_grid_wgs84/:
# This is a folder that contains the shapefile for the 10 km grid.
#
# --------------------------------------------------------------------------------
# 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds:
# This is a file that contains a data frame with the final set of daily smoke
# PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to
# December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this
# file corresponds to the 'ID' column in the 10 km grid shapefile.
#
# All rows in this file are predictions on smoke days. Predictions on non-smoke
# days are by construction 0 ug/m^3 and not included in this file. A smoke PM2.5
# prediction of 0 in this file means that the grid cell-day did have a smoke day
# but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on
# both smoke days and non-smoke days can be obtained by setting the smoke PM2.5
# prediction to 0 on grid cell-days in the 10 km grid and in the
# January 1, 2006-December 31, 2020 date range that are not in this file.
# For example, the R code below returns the full set of smoke PM2.5 predictions:

library(tidyverse)
library(lubridate)
library(sf)
library(magrittr)  # for the %<>% assignment pipe used below

# Load smokePM predictions on smoke days
preds = read_csv("../data/input/smoke_PM/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
  mutate(
    date = ymd(date)
  ) %>%
  rename(
    smoke = smokePM_pred,
    grid_id = grid_id_10km
  )

# Load 10 km grid
grid_10km = read_sf("../data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")

smoke_grid_df_list = list()
years_ = 2006:2016

for(y_ in years_) {
  print(y_)
  # Load full set of dates for this year
  # dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
  dates = seq.Date(ymd(paste0(y_, "0101")),
                   ymd(paste0(y_, "1231")),
                   by = "day")

  # Get full combination of grid cell-days
  # Warning: this may require a large amount of memory
  out = expand.grid(grid_id = grid_10km$ID, date = dates)

  # Match smokePM predictions on smoke days to grid cell-days
  out = left_join(out, preds, by = c("grid_id", "date"))

  # Predict 0 for remaining grid cell-days, which are non-smoke days
  out = mutate(out, smoke = replace_na(smoke, 0))

  # Compute the yearly grid mean of smoke
  out %<>%
    mutate(year = year(date)) %>%
    group_by(grid_id, year) %>%
    summarise(smoke = mean(smoke))

  smoke_grid_df_list[[as.character(y_)]] <- out
}

write_rds(smoke_grid_df_list,
          "../data/intermediate/scratch/smoke_grid_df_list.rds")

code/02_match_cts_extent_list.R

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
library(raster)
library(rgeos)
library(viridis)
library(tictoc)
library(dplyr)
sf_use_s2(FALSE)

years = sprintf("%02d", c(6:16))
grid_sf = read_sf("./data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")

zip_sf_list = list()

for(year in years){

  zip_sf = read_sf(paste0("./data/input/Zipcode_Info/polygon/ESRI", year, "USZIP5_POLY_WGS84.shp"))

  # making sure crs are equivalent
  # if(st_crs(grid_sf) != st_crs(zip_sf)){
  #   zip_sf <- st_transform(zip_sf, st_crs(grid_sf))
  # }

  zip_sf <- st_make_valid(zip_sf)
  zip_sf <- st_crop(zip_sf, st_bbox(grid_sf))

  zip_sf_list[[paste0("20", year)]] = zip_sf

}

write_rds(zip_sf_list, "./data/intermediate/scratch/zip_sf_list.rds")


# zip_sf_list = read_rds("./data/intermediate/scratch/zip_sf_list.rds")
# zip_s = zip_sf_list[["2016"]]

# zip_s %>%
#   ggplot() +
#   geom_sf(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
#   theme(legend.position = "none")

code/03_run_get_weights_par.R

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
## load libraries ----
library(dplyr)
library(tidyverse)
library(magrittr)
library(sf)
library(raster)
library(parallel)
library(argparse)
print("This is the sf package version we are using:")
print(packageVersion("sf"))

## define parser arguments ----
parser <- ArgumentParser()
parser$add_argument("-y", "--year", default=2006,
                    help="Year to run", type="integer")
parser$add_argument("-c", "--cores", default=24,
                    help="Number of cores", type="integer")
args = parser$parse_args()
print("use R script get_weights_par")
# args = list()
# args$year = 2006
# args$cores = 24

## read functions ----
source("../../lib/get_weights_par.R")

print("load data")
## Load grid and zip sf objects ----
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
  rename(grid_id = ID)
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
  rename(zip_id = ZIP)
rm(zip_sf_list)


print("run aggregations")
## run aggregations ----
zip_weights_df <- get_weights_par(
  x_poly_sf = zip_sf,
  y_poly_sf = grid_sf,
  x_id = "zip_id",
  y_id = "grid_id",
  cores = args$cores
)

print("finish aggregations")
zip_weights_df$year <- args$year

## save output ----
write_csv(
  zip_weights_df,
  paste0("../data/output/scratch/",
         "zip_weights_df_test_par", as.character(args$year), ".csv")
)

print("completed")

code/03_run_get_weights_par.sbatch

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
#!/bin/bash
#
#SBATCH -p fasse          # partition (queue)
#SBATCH -c 48             # number of cores
#SBATCH --mem 100GB       # memory pool for all cores
#SBATCH -t 1-12:00        # time (D-HH:MM)


singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y 2010 -c 40
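
The script above submits a single hard-coded year (2010). If all years from 2006 to 2016 are needed, one option, sketched below and not part of this commit, is a Slurm job array that maps the array index to a year; the partition, image name, and resource settings are copied from the script above.

```
#!/bin/bash
#
#SBATCH -p fasse          # partition (queue)
#SBATCH -c 48             # number of cores
#SBATCH --mem 100GB       # memory pool for all cores
#SBATCH -t 1-12:00        # time (D-HH:MM)
#SBATCH --array=0-10      # 11 tasks, one per year 2006-2016 (sketch only)

YEAR=$((2006 + SLURM_ARRAY_TASK_ID))
singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y ${YEAR} -c 40
```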

code/04_daily_aggregation.R

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
library(dplyr)
library(ggplot2)
sf_use_s2(FALSE)

# Load grid
grid_10km = read_sf("../data/input/local_data/10km_grid_wgs84/10km_grid_wgs84.shp") %>%
  rename(
    grid_id = ID
  )

# Load smokePM predictions on smoke days
preds = read_csv("/net/rcstorenfs02/ifs/rc_labs/dominici_lab/lab/data/exposures/smoke/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
  mutate(
    date = ymd(date)
  ) %>%
  rename(
    smoke = smokePM_pred,
    grid_id = grid_id_10km
  )

# Create output data.frame
zip_smoke_df = data.frame(zip = character(),
                          date = Date(),
                          smoke = double(),
                          stringsAsFactors = FALSE)

for(y_ in 2006:2007){
  # y_ <- 2006
  # Load full set of dates for this year
  dates = seq.Date(ymd(paste0(y_, "0101")),
                   ymd(paste0(y_, "1231")),
                   by = "day")

  # Get full combination of grid cell-days
  out = expand.grid(grid_id = grid_10km$grid_id, date = dates)

  # Match smokePM predictions on smoke days to grid cell-days
  out = left_join(out, preds, by = c("grid_id", "date"))

  # Predict 0 for remaining grid cell-days, which are non-smoke days
  out = mutate(out, smoke = replace_na(smoke, 0))


  # Load area-based weights
  zip_to_grid = read.csv(paste0("../data/output/zip_weights_df_", as.character(y_), ".csv")) %>%
    mutate(
      zip = sprintf("%05d", zip_id)
    )

  # Merge gridded smoke values with weights
  zip_grid_smoke = merge(out, zip_to_grid)

  # Compute zip-code level smoke values
  zip_smoke_df_y = zip_grid_smoke %>%
    group_by(zip, date) %>%
    summarise(smoke = weighted.mean(smoke, w = w))

  zip_smoke_df = rbind(zip_smoke_df, zip_smoke_df_y)

}

zip_sf = read_rds("../data/intermediate/scratch/zip_sf_list.rds")

save(zip_smoke_df, file = "../data/output/smoke/daily_zip_test.RData")



# Load required libraries
library(tidyverse)
library(sf)

# Read in the shapefile for zipcodes
zip_sf <- read_sf("../data/input/local_data/Zipcode_Info/polygon/ESRI06USZIP5_POLY_WGS84.shp")

zip_smoke_df$date <- as.Date(zip_smoke_df$date)

# Subset the data for the given date
zip_smoke_subset <- zip_smoke_df %>% filter(date == "2006-01-01")

# Join the data with the shapefile based on the zipcode
zip_sf <- left_join(zip_sf, zip_smoke_subset, by = c("ZIP" = "zip"))

# Drop rows with NA values in the smoke column
zip_sf <- zip_sf %>% drop_na(smoke)


# Create a map of smoke in every zipcode with thinner line width
ggplot() +
  geom_sf(data = zip_sf, aes(fill = smoke), lwd = 0.1) +
  scale_fill_gradient(low = "yellow", high = "red") +
  theme_void()

code/05_pobox_zip_association.R

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
library(tidyverse)
library(magrittr)
library(lubridate)
library(sf)
#library(viridis)
#library(fst)
#library(data.table)
#library(dplyr)
#library(rgeos)
#library(sp)
library(ggplot2)
#library(rgdal)
#sf_use_s2(FALSE)

years = sprintf("%02d", c(06:16))

# Load zip sf list
zip_sf_list = read_rds("../data/input/smoke/zip_sf_list.rds")

pobox_zip_df = data.frame(PO_BOX = character(),
                          ZIP = character(),
                          #LAT = double(),
                          #LON = double(),
                          year = double(),
                          stringsAsFactors = FALSE)

for(year in years){

  zip_sf = zip_sf_list[[paste0("20", year)]]

  po_box = read_csv(paste0("/n/dominici_nsaph_l3/Lab/data/shapefiles/zip_shape_files/Zipcode_Info/pobox_csv/ESRI", year, "USZIP5_POINT_WGS84_POBOX.csv")) %>%
    rename(PO_BOX = ZIP) %>%
    mutate(PO_BOX = sprintf("%05d", PO_BOX))

  po_box_sf = st_as_sf(po_box[, c(1, 3, 4)], coords = c("POINT_X", "POINT_Y"), crs = st_crs(zip_sf))

  po_box_sf <- st_crop(po_box_sf, st_bbox(zip_sf))

  # zip_sf %>%
  #   st_simplify() %>%
  #   ggplot(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
  #   geom_sf() +
  #   geom_sf(data = po_box_sf) +
  #   theme(legend.position = "none")

  # Assign each PO box point the ZIP of the polygon it falls in
  #po_box_sf$ZIP = unlist(st_drop_geometry(zip_sf[unlist(st_intersects(po_box_sf, zip_sf)),"ZIP"]))
  po_box_sf$ZIP = st_drop_geometry(zip_sf)$ZIP[unlist(st_intersects(po_box_sf, zip_sf))]

  # po_box %<>%
  #   merge(po_box_sf, by = "PO_BOX")
  #
  # po_box = st_drop_geometry(po_box[, c(1, 6, 3, 4)]) %>%
  #   rename(LAT = POINT_X, LON = POINT_Y)
  #
  # po_box$year = as.numeric(paste0("20",year))
  # pobox_zip_df = rbind(pobox_zip_df, po_box)

  po_box_sf$year = as.numeric(paste0("20", year))
  pobox_zip_df = rbind(pobox_zip_df,
                       st_drop_geometry(po_box_sf))

}

#save(pobox_zip_df, file = "../data/intermediate/scratch/pobox_zip_df.RData")
saveRDS(pobox_zip_df, file = "../data/input/smoke/pobox_zip_df.rds")
