Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ RUN mamba env update -n base -f requirements.yaml
#&& mamba clean -a

# Create paths to data placeholders
RUN python utils/create_dir_paths.py datapaths.input.satellite_pm25.annual=null datapaths.input.satellite_pm25.monthly=null
RUN python utils/create_dir_paths.py datapaths.input.satellite_pm25.yearly=null datapaths.input.satellite_pm25.monthly=null

# snakemake --configfile conf/config.yaml --cores 4 -C temporal_freq=annual
# snakemake --configfile conf/config.yaml --cores 4 -C temporal_freq=yearly
ENTRYPOINT ["snakemake", "--configfile", "conf/config.yaml"]
CMD ["--cores", "4", "-C", "polygon_name=county", "temporal_freq=annual"]
CMD ["--cores", "4", "-C", "polygon_name=county", "temporal_freq=yearly"]
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# pm25_washu_raster2polygon
# pm25_randall_raster2polygon

Code to produce spatial aggregations of pm25 estimates as generated by the [Atmospheric Composition Analysis Group](https://sites.wustl.edu/acag/datasets/surface-pm2-5/). The spatial aggregations are performed for satellite pm25 from grid/raster (NetCDF) to polygons (shp).

Expand All @@ -10,7 +10,7 @@ The [Atmospheric Composition Analysis Group](https://sites.wustl.edu/acag/datase

The version [V5.GL.04](https://sites.wustl.edu/acag/datasets/surface-pm2-5/#V5.GL.04) consists of mean PM2.5 (ug/m3) available at:

* Temporal frequency: Annual and monthly
* Temporal frequency: yearly and monthly
* Grid resolutions: (0.1° × 0.1°) and (0.01° × 0.01°)
* Geographic regions: North America, Europe, Asia, and Global

Expand Down Expand Up @@ -47,7 +47,7 @@ The configuration structure withing the `/conf` folder allow you to modify the i
* aggregate pm25: `src/aggregate_pm25.py`

The key parameters are:
* `temporal_freq` which determines whether the original annual or monthly pm25 files will be aggregated. The options are: `annual` and `monthly`.
* `temporal_freq` which determines whether the original yearly or monthly pm25 files will be aggregated. The options are: `yearly` and `monthly`.
* `polygon_name` which determines into which polygons the pm25 grid will be aggregated. The options are: `zcta` and `county`.

---
Expand Down Expand Up @@ -98,7 +98,7 @@ python src/aggregate_pm25.py
or run the pipeline:

```bash
snakemake --cores 4 -C polygon_name=county temporal_freq=annual
snakemake --cores 4 -C polygon_name=county temporal_freq=yearly
```

Modify `cores`, `polygon_name` and `temporal_freq` as you find convenient.
Expand All @@ -115,7 +115,7 @@ mkdir <path>/satellite_pm25_raster2polygon

```bash
docker pull nsaph/satellite_pm25_raster2polygon
docker run -v <path>:/app/data/input/satellite_pm25/annual <path>/satellite_pm25_raster2polygon/:/app/data/output/satellite_pm25_raster2polygon nsaph/satellite_pm25_raster2polygon
docker run -v <path>:/app/data/input/satellite_pm25/yearly -v <path>/satellite_pm25_raster2polygon/:/app/data/output/satellite_pm25_raster2polygon nsaph/satellite_pm25_raster2polygon
```

If you are interested in storing the input raw and intermediate data run
Expand Down
20 changes: 10 additions & 10 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,21 @@ temporal_freq = config['temporal_freq']
polygon_name = config['polygon_name']

with initialize(version_base=None, config_path="conf"):
hydra_cfg = compose(config_name="config", overrides=[f"temporal_freq={temporal_freq}", f"polygon_name={polygon_name}"])
cfg = compose(config_name="config", overrides=[f"temporal_freq={temporal_freq}", f"polygon_name={polygon_name}"])

satellite_pm25_cfg = hydra_cfg.satellite_pm25
shapefiles_cfg = hydra_cfg.shapefiles
satellite_pm25_cfg = cfg.satellite_pm25
shapefiles_cfg = cfg.shapefiles

shapefile_years_list = list(shapefiles_cfg[polygon_name].keys())

months_list = "01" if temporal_freq == 'yearly' else [str(i).zfill(2) for i in range(1, 12 + 1)]
years_list = list(range(1998, 2022 + 1))
years_list = list(range(1998, 2023 + 1))

# == Define rules ==
rule all:
input:
expand(
f"data/output/pm25__washu/{polygon_name}_{temporal_freq}/pm25__washu__{polygon_name}_{temporal_freq}__" +
f"{cfg.datapaths.base_path}/output/{polygon_name}_{temporal_freq}/pm25__randall__{polygon_name}_{temporal_freq}__" +
("{year}.parquet" if temporal_freq == 'yearly' else "{year}_{month}.parquet"),
year=years_list,
month=months_list
Expand All @@ -40,14 +40,14 @@ rule all:
# remove and use symlink to the us census geoboundaries
rule download_shapefiles:
output:
f"data/input/shapefiles/shapefile_{polygon_name}_" + "{shapefile_year}/shapefile.shp"
f"{cfg.datapaths.base_path}/input/shapefiles/shapefile_{polygon_name}_" + "{shapefile_year}/shapefile.shp"
shell:
f"python src/download_shapefile.py polygon_name={polygon_name} " + "shapefile_year={wildcards.shapefile_year}"

rule download_satellite_pm25:
output:
expand(
f"data/input/pm25__washu__raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." +
f"{cfg.datapaths.base_path}/input/raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." +
("{year}01-{year}12.nc" if temporal_freq == 'yearly' else "{year}{month}-{year}{month}.nc"),
year=years_list,
month=months_list)
Expand All @@ -58,20 +58,20 @@ rule download_satellite_pm25:

def get_shapefile_input(wildcards):
shapefile_year = available_shapefile_year(int(wildcards.year), shapefile_years_list)
return f"data/input/shapefiles/shapefile_{polygon_name}_{shapefile_year}/shapefile.shp"
return f"{cfg.datapaths.base_path}/input/shapefiles/shapefile_{polygon_name}_{shapefile_year}/shapefile.shp"

rule aggregate_pm25:
input:
get_shapefile_input,
expand(
f"data/input/pm25__washu__raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." +
f"{cfg.datapaths.base_path}/input/raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." +
("{{year}}01-{{year}}12.nc" if temporal_freq == 'yearly' else "{{year}}{month}-{{year}}{month}.nc"),
month=months_list
)

output:
expand(
f"data/output/pm25__washu/{polygon_name}_{temporal_freq}/pm25__washu__{polygon_name}_{temporal_freq}__" +
f"{cfg.datapaths.base_path}/output/{polygon_name}_{temporal_freq}/pm25__randall__{polygon_name}_{temporal_freq}__" +
("{{year}}.parquet" if temporal_freq == 'yearly' else "{{year}}_{month}.parquet"),
month=months_list # we only want to expand months_list and keep year as wildcard
)
Expand Down
6 changes: 3 additions & 3 deletions conf/config.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
defaults:
- _self_
- datapaths: cannon_datapaths
- datapaths: cannon_v5gl
- shapefiles: shapefiles
- satellite_pm25: us_pm25
- satellite_pm25: V5GL0502.HybridPM25c_0p05.NorthAmerica

# == aggregation args
temporal_freq: yearly # yearly, monthly to be matched with cfg.satellite_pm25
year: 2020

# == shapefile download args
polygon_name: zcta # zcta, county to be matched with cfg.shapefiles
polygon_name: county # zcta, county to be matched with cfg.shapefiles
shapefile_year: 2020 #to be matched with cfg.shapefiles

show_progress: false
Expand Down
13 changes: 0 additions & 13 deletions conf/datapaths/cannon_datapaths.yaml

This file was deleted.

14 changes: 14 additions & 0 deletions conf/datapaths/cannon_v5gl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
base_path: data/V5GL

dirs:
input:
raw:
yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/raw/yearly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/yearly
monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/raw/monthly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/monthly
shapefiles: null

output:
zcta_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/zcta_yearly
zcta_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/zcta_monthly
county_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/county_yearly
county_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/county_monthly
14 changes: 14 additions & 0 deletions conf/datapaths/cannon_v6gl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
base_path: data/V6GL

dirs:
input:
raw:
yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/raw/yearly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/yearly
monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/raw/monthly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/monthly
shapefiles: null

output:
zcta_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/zcta_yearly
zcta_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/zcta_monthly
county_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/county_yearly
county_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/county_monthly
17 changes: 9 additions & 8 deletions conf/datapaths/datapaths.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# if files are stored within the local copy of the repository, then use null:
input:
pm25__washu__raw:
yearly: null
monthly: null
shapefiles: null
base_path: data/V6GL

output:
pm25__washu:
dirs:
input:
raw:
yearly: null
monthly: null
shapefiles: null

output:
zcta_yearly: null
zcta_monthly: null
county_yearly: null
Expand Down
19 changes: 19 additions & 0 deletions conf/satellite_pm25/V5GL0502.HybridPM25c_0p05.NorthAmerica.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
yearly:
url: https://wustl.app.box.com/v/ACAG-V5GL0502-GWRPM25c0p05/folder/293383209520

zipname: Annual

file_prefix: "V5GL0502.HybridPM25c_0p05.NorthAmerica"
#file name convention is V5GL0502.HybridPM25c_0p05.NorthAmerica.yyyymm-yyyymm.nc

monthly:
url: https://wustl.app.box.com/v/ACAG-V5GL0502-GWRPM25c0p05/folder/293385030318

zipname: Monthly

file_prefix: "V5GL0502.HybridPM25c_0p05.NorthAmerica"
#file name convention is V5GL0502.HybridPM25c_0p05.NorthAmerica.yyyymm-yyyymm.nc

layer: "GWRPM25" #geographic weighted regression PM2.5
latitude_layer: "lat"
longitude_layer: "lon"
18 changes: 18 additions & 0 deletions conf/satellite_pm25/V6GL02.04.CNNPM25.0p10.NA.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
yearly:
url: https://wustl.app.box.com/s/s7eiaxytjr9w1z7glat45cesitcemprv/folder/327763225614

zipname: Annual

file_prefix: "V6GL02.04.CNNPM25.0p10.NA"
#file name convention is V6GL02.04.CNNPM25.0p10.NA.yyyymm-yyyymm.nc

monthly:
url: https://wustl.app.box.com/s/s7eiaxytjr9w1z7glat45cesitcemprv/folder/327764742544
zipname: Monthly

file_prefix: "V6GL02.04.CNNPM25.0p10.NA"
#file name convention is V6GL02.04.CNNPM25.0p10.NA.yyyymm-yyyymm.nc

layer: "GWRPM25" #geographic weighted regression PM2.5
latitude_layer: "lat"
longitude_layer: "lon"
5 changes: 1 addition & 4 deletions requirements.yaml → environment.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: satellite_pm25_raster2polygon
name: pm25_randall
channels:
- conda-forge
- defaults
Expand All @@ -19,6 +19,3 @@ dependencies:
- selenium==4.29.0
- chromedriver-binary==135.0.7030.0.0
- tqdm==4.67.1
- torch==2.6.0
- torchaudio==2.6.0
- torchvision==0.21.0
8 changes: 0 additions & 8 deletions fasrc_jobs/county_monthly.sbatch

This file was deleted.

8 changes: 0 additions & 8 deletions fasrc_jobs/zcta_monthly.sbatch

This file was deleted.

File renamed without changes.
10 changes: 10 additions & 0 deletions jobs/county_monthly.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
#SBATCH -p serial_requeue # partition (queue)
#SBATCH -c 16 # number of cores
#SBATCH --mem 96GB # memory
#SBATCH -t 0-02:00 # time (D-HH:MM)

# Slurm batch job: run the snakemake pipeline for county polygons at monthly
# temporal frequency. --cores below matches the 16 cores requested with -c.
# Containerized alternative, kept for reference only (not executed):
#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=county temporal_freq=monthly

snakemake --cores 16 -C polygon_name=county temporal_freq=monthly
13 changes: 13 additions & 0 deletions jobs/v5gl.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
#SBATCH -p serial_requeue # partition (queue)
#SBATCH -c 48 # number of cores
#SBATCH --mem 184GB # memory
#SBATCH -t 0-12:00 # time (D-HH:MM)

# Slurm batch job: run the snakemake pipeline for every polygon/frequency
# combination (county and zcta, yearly and monthly), one after the other.
# NOTE(review): 48 cores are requested above but each snakemake call below
# uses --cores 24 and the calls run sequentially - confirm this is intended.
# Containerized alternative, kept for reference only (not executed); its
# flags do not match the live commands below:
#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=county temporal_freq=monthly

# No `set -e`: a failing run does not stop the following runs.
snakemake --cores 24 -C polygon_name=county temporal_freq=yearly
snakemake --cores 24 -C polygon_name=county temporal_freq=monthly
snakemake --cores 24 -C polygon_name=zcta temporal_freq=yearly
snakemake --cores 24 -C polygon_name=zcta temporal_freq=monthly
11 changes: 11 additions & 0 deletions jobs/zcta_monthly.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
#SBATCH -p shared # partition (queue)
#SBATCH -c 32 # number of cores
#SBATCH --mem 96GB # memory
#SBATCH -t 0-01:00 # time (D-HH:MM)

# Slurm batch job: run the snakemake pipeline for ZCTA polygons at monthly
# temporal frequency. --cores below matches the 32 cores requested with -c.
# Containerized alternative, kept for reference only (not executed; note it
# uses 16 cores rather than the 32 requested above):
#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=zcta temporal_freq=monthly

# temporal_freq must be `monthly` here; the yearly ZCTA run lives in
# jobs/zcta_yearly.sbatch.
snakemake --cores 32 -C polygon_name=zcta temporal_freq=monthly

10 changes: 10 additions & 0 deletions jobs/zcta_yearly.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
#SBATCH -p shared # partition (queue)
#SBATCH -c 32 # number of cores
#SBATCH --mem 96GB # memory
#SBATCH -t 0-01:00 # time (D-HH:MM)

# Slurm batch job: run the snakemake pipeline for ZCTA polygons at yearly
# temporal frequency. --cores below matches the 32 cores requested with -c.
# Containerized alternative, kept for reference only (not executed).
# NOTE(review): this commented line still says temporal_freq=monthly and
# --cores 16, which does not match the live command below.
#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=zcta temporal_freq=monthly

snakemake --cores 32 -C polygon_name=zcta temporal_freq=yearly
14 changes: 7 additions & 7 deletions notes/eda_input.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -38,7 +38,7 @@
],
"source": [
"# Open the netCDF file\n",
"file_path = \"../data/input/satellite_pm25/annual/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n",
"file_path = f\"../{cfg.datapaths.base_path}/input/satellite_pm25/yearly/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n",
"dataset = netCDF4.Dataset(file_path)\n",
"\n",
"# Print the global attributes\n",
Expand Down Expand Up @@ -183,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -210,7 +210,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Open the netCDF file\n",
"file_path = \"data/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n",
"file_path = f\"{cfg.datapaths.base_path}/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n",
"dataset = netCDF4.Dataset(file_path)\n",
"\n",
"# Get the latitude and longitude variables\n",
Expand Down Expand Up @@ -260,7 +260,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -280,12 +280,12 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Read the CSV file\n",
"pm25_data = pd.read_csv('data/county_pm25.csv')\n",
"pm25_data = pd.read_csv(f'{cfg.datapaths.base_path}/county_pm25.csv')\n",
"# Convert GEOID to string, left-padding with zeros to 5 characters\n",
"pm25_data['GEOID'] = pm25_data['GEOID'].astype(str).str.zfill(5)\n",
"\n",
"# Read the shapefile\n",
"shapefile = gpd.read_file('data/shapefile_cb_county_2015/shapefile.shp')\n",
"shapefile = gpd.read_file(f\"{cfg.datapaths.base_path}/shapefile_cb_county_2015/shapefile.shp\")\n",
"\n",
"# Merge the data\n",
"merged_data = shapefile.merge(pm25_data, on='GEOID', how='left')\n",
Expand Down
Loading