diff --git a/Dockerfile b/Dockerfile index c871f80..d69b3fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,8 @@ RUN mamba env update -n base -f requirements.yaml #&& mamba clean -a # Create paths to data placeholders -RUN python utils/create_dir_paths.py datapaths.input.satellite_pm25.annual=null datapaths.input.satellite_pm25.monthly=null +RUN python utils/create_dir_paths.py datapaths.input.satellite_pm25.yearly=null datapaths.input.satellite_pm25.monthly=null -# snakemake --configfile conf/config.yaml --cores 4 -C temporal_freq=annual +# snakemake --configfile conf/config.yaml --cores 4 -C temporal_freq=yearly ENTRYPOINT ["snakemake", "--configfile", "conf/config.yaml"] -CMD ["--cores", "4", "-C", "polygon_name=county", "temporal_freq=annual"] +CMD ["--cores", "4", "-C", "polygon_name=county", "temporal_freq=yearly"] diff --git a/README.md b/README.md index cb3a85f..83af885 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# pm25_washu_raster2polygon +# pm25_randall_raster2polygon Code to produce spatial aggregations of pm25 estimates as generated by the [Atmospheric Composition Analysis Group](https://sites.wustl.edu/acag/datasets/surface-pm2-5/). The spatial aggregation are performed for satellite pm25 from grid/raster (NetCDF) to polygons (shp). @@ -10,7 +10,7 @@ The [Atmospheric Composition Analysis Group](https://sites.wustl.edu/acag/datase The version [V5.GL.04](https://sites.wustl.edu/acag/datasets/surface-pm2-5/#V5.GL.04) consists of mean PM2.5 (ug/m3) available at: -* Temporal frequency: Annual and monthly +* Temporal frequency: yearly and monthly * Grid resolutions: (0.1° × 0.1°) and (0.01° × 0.01°) * Geographic regions: North America, Europe, Asia, and Global @@ -47,7 +47,7 @@ The configuration structure withing the `/conf` folder allow you to modify the i * aggregate pm25: `src/aggregate_pm25.py` The key parameters are: -* `temporal_freq` which determines whether the original annual or monthly pm25 files will be aggregated. 
The options are: `annual` and `monthly`. +* `temporal_freq` which determines whether the original yearly or monthly pm25 files will be aggregated. The options are: `yearly` and `monthly`. * `polygon_name` which determines into which polygons the pm25 grid will the aggregated. The options are: `zcta` and `county`. --- @@ -98,7 +98,7 @@ python src/aggregate_pm25.py or run the pipeline: ```bash -snakemake --cores 4 -C polygon_name=county temporal_freq=annual +snakemake --cores 4 -C polygon_name=county temporal_freq=yearly ``` Modify `cores`, `polygon_name` and `temporal_freq` as you find convenient. @@ -115,7 +115,7 @@ mkdir /satellite_pm25_raster2polygon ```bash docker pull nsaph/satellite_pm25_raster2polygon -docker run -v :/app/data/input/satellite_pm25/annual /satellite_pm25_raster2polygon/:/app/data/output/satellite_pm25_raster2polygon nsaph/satellite_pm25_raster2polygon +docker run -v :/app/data/input/satellite_pm25/yearly /satellite_pm25_raster2polygon/:/app/data/output/satellite_pm25_raster2polygon nsaph/satellite_pm25_raster2polygon ``` If you are interested in storing the input raw and intermediate data run diff --git a/Snakefile b/Snakefile index df4aa44..b6e63a1 100644 --- a/Snakefile +++ b/Snakefile @@ -17,21 +17,21 @@ temporal_freq = config['temporal_freq'] polygon_name = config['polygon_name'] with initialize(version_base=None, config_path="conf"): - hydra_cfg = compose(config_name="config", overrides=[f"temporal_freq={temporal_freq}", f"polygon_name={polygon_name}"]) + cfg = compose(config_name="config", overrides=[f"temporal_freq={temporal_freq}", f"polygon_name={polygon_name}"]) -satellite_pm25_cfg = hydra_cfg.satellite_pm25 -shapefiles_cfg = hydra_cfg.shapefiles +satellite_pm25_cfg = cfg.satellite_pm25 +shapefiles_cfg = cfg.shapefiles shapefile_years_list = list(shapefiles_cfg[polygon_name].keys()) months_list = "01" if temporal_freq == 'yearly' else [str(i).zfill(2) for i in range(1, 12 + 1)] -years_list = list(range(1998, 2022 + 1)) +years_list = 
list(range(1998, 2023 + 1)) # == Define rules == rule all: input: expand( - f"data/output/pm25__washu/{polygon_name}_{temporal_freq}/pm25__washu__{polygon_name}_{temporal_freq}__" + + f"{cfg.datapaths.base_path}/output/{polygon_name}_{temporal_freq}/pm25__randall__{polygon_name}_{temporal_freq}__" + ("{year}.parquet" if temporal_freq == 'yearly' else "{year}_{month}.parquet"), year=years_list, month=months_list @@ -40,14 +40,14 @@ rule all: # remove and use symlink to the us census geoboundaries rule download_shapefiles: output: - f"data/input/shapefiles/shapefile_{polygon_name}_" + "{shapefile_year}/shapefile.shp" + f"{cfg.datapaths.base_path}/input/shapefiles/shapefile_{polygon_name}_" + "{shapefile_year}/shapefile.shp" shell: f"python src/download_shapefile.py polygon_name={polygon_name} " + "shapefile_year={wildcards.shapefile_year}" rule download_satellite_pm25: output: expand( - f"data/input/pm25__washu__raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." + + f"{cfg.datapaths.base_path}/input/raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." + ("{year}01-{year}12.nc" if temporal_freq == 'yearly' else "{year}{month}-{year}{month}.nc"), year=years_list, month=months_list) @@ -58,20 +58,20 @@ rule download_satellite_pm25: def get_shapefile_input(wildcards): shapefile_year = available_shapefile_year(int(wildcards.year), shapefile_years_list) - return f"data/input/shapefiles/shapefile_{polygon_name}_{shapefile_year}/shapefile.shp" + return f"{cfg.datapaths.base_path}/input/shapefiles/shapefile_{polygon_name}_{shapefile_year}/shapefile.shp" rule aggregate_pm25: input: get_shapefile_input, expand( - f"data/input/pm25__washu__raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." + + f"{cfg.datapaths.base_path}/input/raw/{temporal_freq}/{satellite_pm25_cfg[temporal_freq]['file_prefix']}." 
+ ("{{year}}01-{{year}}12.nc" if temporal_freq == 'yearly' else "{{year}}{month}-{{year}}{month}.nc"), month=months_list ) output: expand( - f"data/output/pm25__washu/{polygon_name}_{temporal_freq}/pm25__washu__{polygon_name}_{temporal_freq}__" + + f"{cfg.datapaths.base_path}/output/{polygon_name}_{temporal_freq}/pm25__randall__{polygon_name}_{temporal_freq}__" + ("{{year}}.parquet" if temporal_freq == 'yearly' else "{{year}}_{month}.parquet"), month=months_list # we only want to expand months_list and keep year as wildcard ) diff --git a/conf/config.yaml b/conf/config.yaml index c3c4483..3c502cd 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -1,15 +1,15 @@ defaults: - _self_ - - datapaths: cannon_datapaths + - datapaths: cannon_v5gl - shapefiles: shapefiles - - satellite_pm25: us_pm25 + - satellite_pm25: V5GL0502.HybridPM25c_0p05.NorthAmerica # == aggregation args temporal_freq: yearly # yearly, monthly to be matched with cfg.satellite_pm25 year: 2020 # == shapefile download args -polygon_name: zcta # zcta, county to be matched with cfg.shapefiles +polygon_name: county # zcta, county to be matched with cfg.shapefiles shapefile_year: 2020 #to be matched with cfg.shapefiles show_progress: false diff --git a/conf/datapaths/cannon_datapaths.yaml b/conf/datapaths/cannon_datapaths.yaml deleted file mode 100644 index c41a3fe..0000000 --- a/conf/datapaths/cannon_datapaths.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# if files are stored within the local copy of the repository, then use null: -input: - pm25__washu__raw: - yearly: /n/netscratch/dominici_lab/Lab/pm25__washu__raw/yearly/ #/n/dominici_lab/lab/lego/environmnetal/pm25__washu/raw/annual - monthly: /n/netscratch/dominici_lab/Lab/pm25__washu__raw/monthly/ #/n/dominici_lab/lab/lego/environmnetal/pm25__washu/raw/monthly - shapefiles: null - -output: - pm25__washu: - zcta_yearly: /n/dominici_lab/lab/lego/environmental/pm25__washu/zcta_yearly - zcta_monthly: 
/n/dominici_lab/lab/lego/environmental/pm25__washu/zcta_monthly - county_yearly: /n/dominici_lab/lab/lego/environmental/pm25__washu/county_yearly - county_monthly: /n/dominici_lab/lab/lego/environmental/pm25__washu/county_monthly diff --git a/conf/datapaths/cannon_v5gl.yaml b/conf/datapaths/cannon_v5gl.yaml new file mode 100644 index 0000000..45845f8 --- /dev/null +++ b/conf/datapaths/cannon_v5gl.yaml @@ -0,0 +1,14 @@ +base_path: data/V5GL + +dirs: + input: + raw: + yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/raw/yearly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/yearly + monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/raw/monthly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/monthly + shapefiles: null + + output: + zcta_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/zcta_yearly + zcta_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/zcta_monthly + county_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/county_yearly + county_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V5GL/county_monthly diff --git a/conf/datapaths/cannon_v6gl.yaml b/conf/datapaths/cannon_v6gl.yaml new file mode 100644 index 0000000..50fcbf7 --- /dev/null +++ b/conf/datapaths/cannon_v6gl.yaml @@ -0,0 +1,14 @@ +base_path: data/V6GL + +dirs: + input: + raw: + yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/raw/yearly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/yearly + monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/raw/monthly #/n/netscratch/dominici_lab/Lab/pm25__randall__raw/monthly + shapefiles: null + + output: + zcta_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/zcta_yearly + zcta_monthly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/zcta_monthly + county_yearly: /n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/county_yearly + county_monthly: 
/n/dominici_lab/lab/lego/environmental/pm25__randall/V6GL/county_monthly diff --git a/conf/datapaths/datapaths.yaml b/conf/datapaths/datapaths.yaml index e9d1be7..3250093 100644 --- a/conf/datapaths/datapaths.yaml +++ b/conf/datapaths/datapaths.yaml @@ -1,12 +1,13 @@ -# if files are stored within the local copy of the repository, then use null: -input: - pm25__washu__raw: - yearly: null - monthly: null - shapefiles: null +base_path: data/V6GL -output: - pm25__washu: +dirs: + input: + raw: + yearly: null + monthly: null + shapefiles: null + + output: zcta_yearly: null zcta_monthly: null county_yearly: null diff --git a/conf/satellite_pm25/us_pm25.yaml b/conf/satellite_pm25/V5GL04.HybridPM25c_0p10.NorthAmerica.yaml similarity index 100% rename from conf/satellite_pm25/us_pm25.yaml rename to conf/satellite_pm25/V5GL04.HybridPM25c_0p10.NorthAmerica.yaml diff --git a/conf/satellite_pm25/V5GL0502.HybridPM25c_0p05.NorthAmerica.yaml b/conf/satellite_pm25/V5GL0502.HybridPM25c_0p05.NorthAmerica.yaml new file mode 100644 index 0000000..94ab872 --- /dev/null +++ b/conf/satellite_pm25/V5GL0502.HybridPM25c_0p05.NorthAmerica.yaml @@ -0,0 +1,19 @@ +yearly: + url: https://wustl.app.box.com/v/ACAG-V5GL0502-GWRPM25c0p05/folder/293383209520 + + zipname: Annual + + file_prefix: "V5GL0502.HybridPM25c_0p05.NorthAmerica" + #file name convention is V5GL0502.HybridPM25c_0p05.NorthAmerica.yyyymm-yyyymm.nc + +monthly: + url: https://wustl.app.box.com/v/ACAG-V5GL0502-GWRPM25c0p05/folder/293385030318 + + zipname: Monthly + + file_prefix: "V5GL0502.HybridPM25c_0p05.NorthAmerica" + #file name convention is V5GL0502.HybridPM25c_0p05.NorthAmerica.yyyymm-yyyymm.nc + +layer: "GWRPM25" #geographic weighted regression PM2.5 +latitude_layer: "lat" +longitude_layer: "lon" diff --git a/conf/satellite_pm25/V6GL02.04.CNNPM25.0p10.NA.yaml b/conf/satellite_pm25/V6GL02.04.CNNPM25.0p10.NA.yaml new file mode 100644 index 0000000..97cb511 --- /dev/null +++ b/conf/satellite_pm25/V6GL02.04.CNNPM25.0p10.NA.yaml @@ 
-0,0 +1,18 @@ +yearly: + url: https://wustl.app.box.com/s/s7eiaxytjr9w1z7glat45cesitcemprv/folder/327763225614 + + zipname: Annual + + file_prefix: "V6GL02.04.CNNPM25.0p10.NA" + #file name convention is V6GL02.04.CNNPM25.0p10.NA.yyyymm-yyyymm.nc + +monthly: + url: https://wustl.app.box.com/s/s7eiaxytjr9w1z7glat45cesitcemprv/folder/327764742544 + zipname: Monthly + + file_prefix: "V6GL02.04.CNNPM25.0p10.NA" + #file name convention is V6GL02.04.CNNPM25.0p10.NA.yyyymm-yyyymm.nc + +layer: "GWRPM25" #geographic weighted regression PM2.5 +latitude_layer: "lat" +longitude_layer: "lon" diff --git a/requirements.yaml b/environment.yaml similarity index 78% rename from requirements.yaml rename to environment.yaml index 6f0ee4d..596e80a 100644 --- a/requirements.yaml +++ b/environment.yaml @@ -1,4 +1,4 @@ -name: satellite_pm25_raster2polygon +name: pm25_randall channels: - conda-forge - defaults @@ -19,6 +19,3 @@ dependencies: - selenium==4.29.0 - chromedriver-binary==135.0.7030.0.0 - tqdm==4.67.1 - - torch==2.6.0 - - torchaudio==2.6.0 - - torchvision==0.21.0 diff --git a/fasrc_jobs/county_monthly.sbatch b/fasrc_jobs/county_monthly.sbatch deleted file mode 100644 index 2f47486..0000000 --- a/fasrc_jobs/county_monthly.sbatch +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# -#SBATCH -p serial_requeue # partition (queue) -#SBATCH -c 16 # number of cores -#SBATCH --mem 96GB # memory -#SBATCH -t 0-02:00 # time (D-HH:MM) - -singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=county temporal_freq=monthly diff --git a/fasrc_jobs/zcta_monthly.sbatch b/fasrc_jobs/zcta_monthly.sbatch deleted file mode 100644 index 38c04a4..0000000 --- a/fasrc_jobs/zcta_monthly.sbatch +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# -#SBATCH -p shared # partition (queue) -#SBATCH -c 32 # number of cores -#SBATCH --mem 96GB # memory -#SBATCH -t 0-01:00 # time (D-HH:MM) - -singularity exec 
$HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=zcta temporal_freq=monthly diff --git a/fasrc_jobs/README.md b/jobs/README.md similarity index 100% rename from fasrc_jobs/README.md rename to jobs/README.md diff --git a/jobs/county_monthly.sbatch b/jobs/county_monthly.sbatch new file mode 100644 index 0000000..4e28318 --- /dev/null +++ b/jobs/county_monthly.sbatch @@ -0,0 +1,10 @@ +#!/bin/bash +# +#SBATCH -p serial_requeue # partition (queue) +#SBATCH -c 16 # number of cores +#SBATCH --mem 96GB # memory +#SBATCH -t 0-02:00 # time (D-HH:MM) + +#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=county temporal_freq=monthly + +snakemake --cores 16 -C polygon_name=county temporal_freq=monthly diff --git a/jobs/v5gl.sbatch b/jobs/v5gl.sbatch new file mode 100644 index 0000000..8bb89cc --- /dev/null +++ b/jobs/v5gl.sbatch @@ -0,0 +1,13 @@ +#!/bin/bash +# +#SBATCH -p serial_requeue # partition (queue) +#SBATCH -c 48 # number of cores +#SBATCH --mem 184GB # memory +#SBATCH -t 0-12:00 # time (D-HH:MM) + +#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=county temporal_freq=monthly + +snakemake --cores 24 -C polygon_name=county temporal_freq=yearly +snakemake --cores 24 -C polygon_name=county temporal_freq=monthly +snakemake --cores 24 -C polygon_name=zcta temporal_freq=yearly +snakemake --cores 24 -C polygon_name=zcta temporal_freq=monthly diff --git a/jobs/zcta_monthly.sbatch b/jobs/zcta_monthly.sbatch new file mode 100644 index 0000000..2f1d953 --- /dev/null +++ b/jobs/zcta_monthly.sbatch @@ -0,0 +1,11 @@ +#!/bin/bash +# +#SBATCH -p shared # partition (queue) +#SBATCH -c 32 # number of cores +#SBATCH --mem 96GB # memory +#SBATCH -t 0-01:00 # time (D-HH:MM) + +#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C 
polygon_name=zcta temporal_freq=monthly + +snakemake --cores 32 -C polygon_name=zcta temporal_freq=monthly + diff --git a/jobs/zcta_yearly.sbatch b/jobs/zcta_yearly.sbatch new file mode 100644 index 0000000..9d94155 --- /dev/null +++ b/jobs/zcta_yearly.sbatch @@ -0,0 +1,10 @@ +#!/bin/bash +# +#SBATCH -p shared # partition (queue) +#SBATCH -c 32 # number of cores +#SBATCH --mem 96GB # memory +#SBATCH -t 0-01:00 # time (D-HH:MM) + +#singularity exec $HOME/singularity_images/satellite_pm25_raster2polygon_latest.sif snakemake --cores 16 -C polygon_name=zcta temporal_freq=monthly + +snakemake --cores 32 -C polygon_name=zcta temporal_freq=yearly diff --git a/notes/eda_input.ipynb b/notes/eda_input.ipynb index 9bb877c..55a1ae2 100644 --- a/notes/eda_input.ipynb +++ b/notes/eda_input.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -38,7 +38,7 @@ ], "source": [ "# Open the netCDF file\n", - "file_path = \"../data/input/satellite_pm25/annual/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n", + "file_path = f\"../{cfg.datapaths.base_path}/input/satellite_pm25/yearly/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n", "dataset = netCDF4.Dataset(file_path)\n", "\n", "# Print the global attributes\n", @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -210,7 +210,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Open the netCDF file\n", - "file_path = \"data/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n", + "file_path = f\"{cfg.datapaths.base_path}/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n", "dataset = netCDF4.Dataset(file_path)\n", "\n", "# Get the latitude and longitude variables\n", @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -280,12 +280,12 @@ "import matplotlib.pyplot as plt\n", "\n", "# 
Read the CSV file\n", - "pm25_data = pd.read_csv('data/county_pm25.csv')\n", + "pm25_data = pd.read_csv(f'{cfg.datapaths.base_path}/county_pm25.csv')\n", "# Convert GEOID to string using trailing zeros\n", "pm25_data['GEOID'] = pm25_data['GEOID'].astype(str).str.zfill(5)\n", "\n", "# Read the shapefile\n", - "shapefile = gpd.read_file('data/shapefile_cb_county_2015/shapefile.shp')\n", + "shapefile = gpd.read_file(f\"{cfg.datapaths.base_path}/shapefile_cb_county_2015/shapefile.shp\")\n", "\n", "# Merge the data\n", "merged_data = shapefile.merge(pm25_data, on='GEOID', how='left')\n", diff --git a/notes/eda_output.ipynb b/notes/eda_output.ipynb index 9ab63ca..dcc5f7e 100644 --- a/notes/eda_output.ipynb +++ b/notes/eda_output.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -47,7 +47,7 @@ ], "source": [ "# Open the netCDF file\n", - "file_path = \"data/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n", + "file_path = f\"{cfg.datapaths.base_path}/input/satellite_pm25/yearly/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n", "dataset = netCDF4.Dataset(file_path)\n", "\n", "# Print the global attributes\n", @@ -192,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -219,7 +219,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Open the netCDF file\n", - "file_path = \"data/V5GL04.HybridPM25.NorthAmerica.202201-202212.nc\"\n", + "file_path = f\"{cfg.datapaths.base_path}/input/satellite_pm25/yearly/V5GL04.HybridPM25c_0p10.NorthAmerica.202201-202212.nc\"\n", "dataset = netCDF4.Dataset(file_path)\n", "\n", "# Get the latitude and longitude variables\n", @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -279,7 +279,7 @@ "import pyarrow.parquet as pq\n", "\n", "# Read parquet file with pm25 at county level for 
2015\n", - "pm25_data = pq.read_table(\"data/output/satellite_pm25_raster2polygon/monthly/satellite_pm25_zcta_2015_01.parquet\").to_pandas()" + "pm25_data = pq.read_table(f\"{cfg.datapaths.base_path}/output/satellite_pm25_raster2polygon/monthly/satellite_pm25_zcta_2015_01.parquet\").to_pandas()" ] }, { @@ -368,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -388,12 +388,12 @@ "import matplotlib.pyplot as plt\n", "\n", "# Read the CSV file\n", - "pm25_data = pd.read_csv('data/county_pm25.csv')\n", + "pm25_data = pd.read_csv(f'{cfg.datapaths.base_path}/county_pm25.csv')\n", "# Convert GEOID to string using trailing zeros\n", "pm25_data['GEOID'] = pm25_data['GEOID'].astype(str).str.zfill(5)\n", "\n", "# Read the shapefile\n", - "shapefile = gpd.read_file('data/shapefile_cb_county_2015/shapefile.shp')\n", + "shapefile = gpd.read_file(f\"{cfg.datapaths.base_path}/shapefile_cb_county_2015/shapefile.shp\")\n", "\n", "# Merge the data\n", "merged_data = shapefile.merge(pm25_data, on='GEOID', how='left')\n", diff --git a/src/aggregate_pm25.py b/src/aggregate_pm25.py index 21a7dd1..cb320b5 100644 --- a/src/aggregate_pm25.py +++ b/src/aggregate_pm25.py @@ -39,7 +39,7 @@ def main(cfg): #use previously available shapefile shapefile_year = available_shapefile_year(cfg.year, shapefile_years_list) - shape_path = f'data/input/shapefiles/shapefile_{cfg.polygon_name}_{shapefile_year}/shapefile.shp' + shape_path = f'{cfg.datapaths.base_path}/input/shapefiles/shapefile_{cfg.polygon_name}_{shapefile_year}/shapefile.shp' polygon = gpd.read_file(shape_path) polygon_ids = polygon[cfg.shapefiles[cfg.polygon_name][shapefile_year].idvar].values @@ -62,7 +62,7 @@ def main(cfg): # load the first file to obtain the affine transform/boundaries LOGGER.info("Mapping polygons to raster cells.") - ds = 
xarray.open_dataset(f"data/input/pm25__washu__raw/{cfg.temporal_freq}/{filenames[0]}") + ds = xarray.open_dataset(f"{cfg.datapaths.base_path}/input/raw/{cfg.temporal_freq}/{filenames[0]}") layer = getattr(ds, cfg.satellite_pm25.layer) # obtain affine transform/boundaries @@ -90,7 +90,7 @@ def main(cfg): if i > 0: # reload the file only if it is different from the first one - ds = xarray.open_dataset(f"data/input/pm25__washu__raw/{cfg.temporal_freq}/{filename}") + ds = xarray.open_dataset(f"{cfg.datapaths.base_path}/input/raw/{cfg.temporal_freq}/{filename}") layer = getattr(ds, cfg.satellite_pm25.layer) # === obtain stats quickly using precomputed mapping @@ -111,15 +111,15 @@ def main(cfg): # == save output file if cfg.temporal_freq == "yearly": # ignore month since len(filenames) == 1 - output_filename = f"pm25__washu__{cfg.polygon_name}_{cfg.temporal_freq}__{cfg.year}.parquet" + output_filename = f"pm25__randall__{cfg.polygon_name}_{cfg.temporal_freq}__{cfg.year}.parquet" elif cfg.temporal_freq == "monthly": # use month in filename since len(filenames) = 12 month = f"{i + 1:02d}" df["month"] = month - output_filename = f"pm25__washu__{cfg.polygon_name}_{cfg.temporal_freq}__{cfg.year}_{month}.parquet" + output_filename = f"pm25__randall__{cfg.polygon_name}_{cfg.temporal_freq}__{cfg.year}_{month}.parquet" - output_path = f"data/output/pm25__washu/{cfg.polygon_name}_{cfg.temporal_freq}/{output_filename}" + output_path = f"{cfg.datapaths.base_path}/output/{cfg.polygon_name}_{cfg.temporal_freq}/{output_filename}" df.to_parquet(output_path) # plot aggregation map using geopandas diff --git a/utils/create_dir_paths.py b/src/create_datapaths.py similarity index 77% rename from utils/create_dir_paths.py rename to src/create_datapaths.py index 696390d..a3d0d13 100644 --- a/utils/create_dir_paths.py +++ b/src/create_datapaths.py @@ -5,14 +5,31 @@ LOGGER = logging.getLogger(__name__) +def init_folder(folder_cfg=None): + folder_dict = folder_cfg.dirs + + # defines a base path 
for the data + datapath = folder_cfg.base_path + if datapath is None: + datapath = "data" + # check if datapath exists, if not create it + if os.path.exists(datapath): + LOGGER.info(f"Base path {datapath} already exists") + else: + LOGGER.info(f"Creating base path {datapath}") + os.makedirs(datapath, exist_ok=True) + + # create subfolders and symbolic links + create_subfolders_and_links(datapath=datapath, folder_dict=folder_dict) def create_subfolders_and_links(datapath="data", folder_dict=None): """ Recursively create subfolders and symbolic links. """ if not os.path.exists(datapath): - LOGGER.info(f"Error: {datapath} does not exists.") + LOGGER.info(f"Error: {datapath} does not exist.") return + if isinstance(folder_dict, DictConfig): for path, subfolder_dict in folder_dict.items(): sub_datapath = os.path.join(datapath, path) @@ -50,7 +67,7 @@ def create_subfolders_and_links(datapath="data", folder_dict=None): @hydra.main(config_path="../conf", config_name="config", version_base=None) def main(cfg): """Create data subfolders and symbolic links as indicated in config file.""" - create_subfolders_and_links(folder_dict=cfg.datapaths) + init_folder(folder_cfg=cfg.datapaths) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/download_pm25.py b/src/download_pm25.py index a063c41..2466aec 100644 --- a/src/download_pm25.py +++ b/src/download_pm25.py @@ -25,7 +25,7 @@ def main(cfg): # == setup chrome driver # Expand the tilde to the user's home directory - download_dir = f"data/input/pm25__washu__raw/" + download_dir = f"{cfg.datapaths.base_path}/input/raw/" download_dir = os.path.abspath(download_dir) download_zip = f"{download_dir}/{cfg.satellite_pm25[cfg.temporal_freq].zipname}.zip" src_dir = f"{download_dir}/{cfg.satellite_pm25[cfg.temporal_freq].zipname}" diff --git a/src/download_shapefile.py b/src/download_shapefile.py index 492c9ca..0ea0059 100644 --- a/src/download_shapefile.py +++ b/src/download_shapefile.py @@ -8,7 +8,7 @@ 
def main(cfg): url = cfg.shapefiles[cfg.polygon_name][cfg.shapefile_year].url - tgt = f"data/input/shapefiles/shapefile_{cfg.polygon_name}_{cfg.shapefile_year}" + tgt = f"{cfg.datapaths.base_path}/input/shapefiles/shapefile_{cfg.polygon_name}_{cfg.shapefile_year}" tgtdir = os.path.dirname(tgt) tgtfile = os.path.basename(tgt)