Skip to content

Commit d157e39

Browse files
Merge pull request #1 from NSAPH-Data-Processing/issue-1
add readme
2 parents 84d95cd + d883b89 commit d157e39

22 files changed

+904
-1
lines changed

README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,30 @@
11
# smoke_aggregation
2-
Repository for computing the standardized weight across different zipcode given the 10km grids
2+
Repository for computing the standardized weights across different zipcodes given the 10 km grid
3+
* Input:
4+
* 10km_grid/10km_grid_wgs84/: This is a folder that contains the shapefile for the 10 km grid.
5+
* 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds: This is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
6+
* Yearly zipcode polygon shapefile: ./data/input/Zipcode_Info/polygon/ESRI{year}USZIP5_POLY_WGS84.shp, where {year} is the two-digit year (e.g. 06 for 2006).
7+
8+
* Output: an .rds file containing the columns zip_id, grid_id, and w (weight)
9+
10+
Example output:
11+
```
12+
zip_id grid_id w
13+
1 03281 104504 0.120474045
14+
2 03281 104505 0.489675352
15+
3 03281 104506 0.001825444
16+
4 03281 105019 0.080026705
17+
5 03281 105020 0.307998349
18+
```
19+
20+
To run example weights:
21+
```
22+
mkdir $HOME/singularity_images
23+
cd $HOME/singularity_images
24+
singularity pull docker://nsaph/r_exposures:v0
25+
cd code
26+
Rscript 01_yearly_grid_mean.R
27+
Rscript 02_match_cts_extent_list.R
28+
Rscript test_get_weights.R
29+
sbatch 03_run_get_weights_par.sbatch
30+
```

code/01_yearly_grid_mean.R

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
# --------------------------------------------------------------------------------
3+
# 10 km grid
4+
# --------------------------------------------------------------------------------
5+
# 10km_grid/10km_grid_wgs84/:
6+
# This is a folder that contains the shapefile for the 10 km grid.
7+
#
8+
# --------------------------------------------------------------------------------
9+
# 10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds:
10+
# This is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The 'grid_id_10km' column in this file corresponds to the 'ID' column in the 10 km grid shapefile.
11+
#
12+
# All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 ug/m^3 and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the grid cell-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on grid cell-days in the 10 km grid and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
13+
14+
15+
library(tidyverse)
16+
library(lubridate)
17+
library(sf)
18+
19+
# Load smokePM predictions on smoke days
20+
preds = read_csv("../data/input/smoke_PM/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv") %>%
21+
mutate(
22+
date = ymd(date)
23+
) %>%
24+
rename(
25+
smoke = smokePM_pred,
26+
grid_id = grid_id_10km
27+
)
28+
29+
# Load 10 km grid
30+
grid_10km = read_sf("../data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")
31+
32+
smoke_grid_df_list = list()
33+
years_ = 2006:2016
34+
35+
for(y_ in years_) {
36+
print(y_)
37+
# Load full set of dates
38+
#dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
39+
dates = seq.Date(ymd(paste0(y_, "0101")),
40+
ymd(paste0(y_, "1231")),
41+
by = "day")
42+
43+
# Get full combination of grid cell-days
44+
# Warning: this may require a large amount of memory
45+
out = expand.grid(grid_id = grid_10km$ID, date = dates)
46+
47+
# Match smokePM predictions on smoke days to grid cell-days
48+
out = left_join(out, preds, by = c("grid_id", "date"))
49+
50+
# Predict 0 for remaining grid cell-days, which are non-smoke days
51+
out = mutate(out, smoke = replace_na(smoke, 0))
52+
53+
# Compute smoke yearly grid mean
54+
out %<>%
55+
mutate(year = year(date)) %>%
56+
group_by(grid_id, year) %>%
57+
summarise(smoke = mean(smoke))
58+
59+
smoke_grid_df_list[[as.character(y_)]] <- out
60+
}
61+
62+
write_rds(smoke_grid_df_list,
63+
"../data/intermediate/scratch/smoke_grid_df_list.rds")

code/02_match_cts_extent_list.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
library(tidyverse)
2+
library(magrittr)
3+
library(lubridate)
4+
library(sf)
5+
library(raster)
6+
library(rgeos)
7+
library(viridis)
8+
library(tictoc)
9+
library(dplyr)
10+
sf_use_s2(FALSE)
11+
12+
years = sprintf("%02d", c(6:16))
13+
grid_sf = read_sf("./data/input/remote_data/10km_grid_wgs84/10km_grid_wgs84.shp")
14+
15+
zip_sf_list = list()
16+
17+
for(year in years){
18+
19+
zip_sf = read_sf(paste0("./data/input/Zipcode_Info/polygon/ESRI", year, "USZIP5_POLY_WGS84.shp"))
20+
21+
# making sure crs are equivalent
22+
# if(st_crs(grid_sf)!= st_crs(zip_sf)){
23+
# zip_sf <- st_transform(st_crs(grid_sf))
24+
# }
25+
26+
zip_sf <- st_make_valid(zip_sf)
27+
zip_sf <- st_crop(zip_sf, st_bbox(grid_sf))
28+
29+
zip_sf_list[[paste0("20",year)]] = zip_sf
30+
31+
}
32+
33+
write_rds(zip_sf_list, "./data/intermediate/scratch/zip_sf_list.rds")
34+
35+
36+
# zip_sf_list = read_rds("./data/intermediate/scratch/zip_sf_list.rds")
37+
# zip_s = zip_sf_list[["2016"]]
38+
39+
# zip_s %>%
40+
# ggplot() +
41+
# geom_sf(aes(fill = "red"), alpha = 0.75, lwd = 0.1) +
42+
# theme(legend.position = "none")
43+
44+
45+
46+
47+

code/03_run_get_weights_par.R

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
## load libraries ----
2+
library(dplyr)
3+
library(tidyverse)
4+
library(magrittr)
5+
library(sf)
6+
library(raster)
7+
library(parallel)
8+
library(argparse)
9+
print("This is the sf package version we are using:")
10+
print(packageVersion("sf"))
11+
12+
## define parser arguments ----
13+
parser <- ArgumentParser()
14+
parser$add_argument("-y", "--year", default=2006,
15+
help="Year to run", type="integer")
16+
parser$add_argument("-c", "--cores", default=24,
17+
help="Number of cores", type="integer")
18+
args = parser$parse_args()
19+
print("use R script get_weights_par")
20+
# args = list()
21+
# args$year = 2006
22+
# args$cores = 24
23+
24+
## read functions ----
25+
source("../../lib/get_weights_par.R")
26+
27+
print("load data")
28+
## Load grid and zip sf objects ----
29+
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
30+
rename(grid_id = ID)
31+
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
32+
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
33+
rename(zip_id = ZIP)
34+
rm(zip_sf_list)
35+
36+
37+
print("run aggregations")
38+
## run aggregations ----
39+
zip_weights_df <- get_weights_par(
40+
x_poly_sf = zip_sf,
41+
y_poly_sf = grid_sf,
42+
x_id = "zip_id",
43+
y_id = "grid_id",
44+
cores = args$cores
45+
)
46+
47+
print("finish aggregations")
48+
zip_weights_df$year <- args$year
49+
50+
## save output ----
51+
write_csv(
52+
zip_weights_df,
53+
paste0("../data/output/scratch/",
54+
"zip_weights_df_test_par", as.character(args$year), ".csv")
55+
)
56+
57+
print("completed")

code/03_run_get_weights_par.sbatch

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
2+
#
3+
#SBATCH -p fasse # partition (queue)
4+
#SBATCH -c 48 # number of cores
5+
#SBATCH --mem 100GB # memory pool for all cores
6+
#SBATCH -t 1-12:00 # time (D-HH:MM)
7+
8+
9+
singularity exec $HOME/singularity_images/smoke_weights_v0.sif Rscript 03_run_get_weights_par.R -y 2010 -c 40

code/test_get_weights.R

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
## load libraries ----
2+
library(dplyr)
3+
library(tidyverse)
4+
library(magrittr)
5+
library(sf)
6+
library(raster)
7+
library(parallel)
8+
library(argparse)
9+
print("This is the sf package version we are using:")
10+
print(packageVersion("sf"))
11+
12+
## define parser arguments ----
13+
parser <- ArgumentParser()
14+
parser$add_argument("-y", "--year", default=2006,
15+
help="Year to run", type="integer")
16+
parser$add_argument("-c", "--cores", default=24,
17+
help="Number of cores", type="integer")
18+
args = parser$parse_args()
19+
print("use R script get_weights_par")
20+
# args = list()
21+
# args$year = 2006
22+
# args$cores = 24
23+
24+
## read functions ----
25+
source("../../lib/get_weights_par.R")
26+
27+
print("load data")
28+
## Load grid and zip sf objects ----
29+
grid_sf = read_rds("../data/intermediate/scratch/grid_sf.rds") %>%
30+
rename(grid_id = ID)
31+
zip_sf_list = read_rds("../data/intermediate/scratch/zip_sf_list.rds")
32+
zip_sf = zip_sf_list[[as.character(args$year)]] %>%
33+
rename(zip_id = ZIP)
34+
rm(zip_sf_list)
35+
x_poly_sf = zip_sf
36+
y_poly_sf = grid_sf
37+
x_id = "zip_id"
38+
y_id = "grid_id"
39+
cores = args$cores
40+
41+
## crop polygons within same bounding box ----
42+
x_poly_sf <- st_make_valid(x_poly_sf)
43+
x_poly_sf <- st_crop(x_poly_sf, st_bbox(y_poly_sf))
44+
45+
# assign 1s to zipcode polygons ----
46+
x_poly_sf$w <- 1
47+
48+
49+
x_to_y <- data.frame()
50+
51+
# error zipcodes
52+
for(i in c("03281")) {
53+
x_i_sf <- dplyr::select(x_poly_sf[x_poly_sf[[x_id]] == i, ], c("w", "geometry"))
54+
y_i_sf <- dplyr::select(st_crop(y_poly_sf, extent(x_i_sf)), c(y_id, "geometry"))
55+
56+
# y_i_w <- st_drop_geometry(st_interpolate_aw(x_i_sf, y_i_sf, extensive = T))
57+
tryCatch({
58+
y_i_w <- st_drop_geometry(st_interpolate_aw(x_i_sf, y_i_sf, extensive = T))
59+
60+
x_to_y_i <- data.frame(
61+
x_id = i,
62+
y_id = y_i_sf[[y_id]][as.numeric(rownames(y_i_w))],
63+
w = y_i_w$w)
64+
65+
x_to_y <- rbind(x_to_y, x_to_y_i)
66+
67+
}, error=function(e){
68+
print("An error occurred while calculating the weights.")
69+
print(i)
70+
})
71+
}
72+
73+
74+
## example successful one
75+
test_x <- dplyr::select(x_poly_sf[x_poly_sf[[x_id]] == "01604", ], c("w", "geometry"))
76+
test_y <- dplyr::select(st_crop(y_poly_sf, extent(test_x)), c(y_id, "geometry"))
77+
78+
test_interpolate <- st_interpolate_aw(test_x, test_y, extensive = T)
79+
80+
############
81+
# The success one
82+
plot(test_x[[2]])
83+
plot(test_y[[2]], add = TRUE)
84+
plot(test_interpolate[[2]])
85+
86+
87+
# The error one
88+
plot(x_i_sf[[2]])
89+
plot(y_i_sf[[2]], add = TRUE)
90+
st_interpolate_aw(x_i_sf, y_i_sf, extensive = T)
91+
92+

data/input/local_data/.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ignore everything (dirs, files, sub-dirs, sub-files)
2+
/*
3+
# but do not ignore the .gitignore file
4+
!.gitignore
5+
# but do not ignore the README.md file
6+
!README.md
7+
# do not ignore x folder
8+
!/x

data/input/local_data/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
```
3+
ln -s /n/dominici_nsaph/Lab/data/shapefiles/zip_shape_files/Zipcode_Info .
4+
ln -s /n/dominici_lab/lab/data/10km_grid_wgs84 .
5+
```

data/input/remote_data/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Folder containing data that is pushed to the remote repository.

data/intermediate/.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ignore everything (dirs, files, sub-dirs, sub-files)
2+
/*
3+
# but do not ignore the .gitignore file
4+
!.gitignore
5+
# but do not ignore the README.md file
6+
!README.md
7+
# do not ignore the remote_data folder
8+
!/remote_data

0 commit comments

Comments
 (0)