Skip to content

Commit e5f494a

Browse files
Fixed the Snakemake/Hydra behavior. Fixes #47
1 parent 54092cc commit e5f494a

File tree

2 files changed

+39
-21
lines changed

2 files changed

+39
-21
lines changed

Snakefile

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,37 @@
1-
import yaml
1+
import hydra
2+
from omegaconf import OmegaConf
3+
import json
24

35
conda: "requirements.yaml"
4-
configfile: "conf/config.yaml"
56

6-
# == Load configuration ==
7+
# the workflow configuration file is orchestrated by hydra
8+
# read config with hydra
9+
with hydra.initialize(config_path="conf", version_base=None):
10+
cfg = hydra.compose(config_name="config", overrides=[])
11+
#print(OmegaConf.to_yaml(cfg))
712

8-
# dynamic config files
9-
defaults_dict = {key: value for d in config['defaults'] if isinstance(d, dict) for key, value in d.items()}
10-
shapefiles_cfg = yaml.safe_load(open(f"conf/shapefiles/{defaults_dict['shapefiles']}.yaml", 'r'))
11-
# == Define variables ==
12-
shapefile_list = shapefiles_cfg.keys()
13-
print(shapefile_list)
13+
# convert to dict of single shapefile dicts
14+
shapefiles_cfg = OmegaConf.to_container(cfg.shapefiles, resolve=True)
15+
#print(shapefiles_cfg)
16+
shapefiles_cfg_dict = {shapefile["name"]: "[" + json.dumps(shapefile).replace('"', '') + "]" for shapefile in shapefiles_cfg}
17+
#print(shapefiles_cfg_dict)
18+
shapefiles_list = list(shapefiles_cfg_dict.keys())
19+
#print(shapefiles_list)
20+
# print(f"""
21+
# python src/aggregate_climate_types.py "+shapefiles={shapefiles_cfg_dict[shapefiles_list[0]]}"
22+
# """)
23+
24+
#raise ValueError("stop here")
1425

1526
rule all:
1627
input:
1728
expand(f"data/output/climate_types_raster2polygon/climate_types_{{shapefile_name}}.parquet",
18-
shapefile_name=shapefile_list
29+
shapefile_name=shapefiles_list
1930
)
2031

2132
rule download_climate_types:
2233
output:
23-
f"data/input/climate_types/{config['climate_types_file']}"
34+
f"data/input/climate_types/{cfg.climate_types_file}"
2435
shell:
2536
"python src/download_climate_types.py"
2637

@@ -33,11 +44,17 @@ rule download_climate_types:
3344

3445
rule aggregate_climate_types:
3546
input:
36-
f"data/input/climate_types/{config['climate_types_file']}",
47+
f"data/input/climate_types/{cfg.climate_types_file}",
3748
f"data/input/shapefiles/{{shapefile_name}}/{{shapefile_name}}.shp"
3849
output:
3950
f"data/output/climate_types_raster2polygon/climate_types_{{shapefile_name}}.parquet",
4051
f"data/intermediate/climate_pcts/climate_pcts_{{shapefile_name}}.json",
4152
f"data/intermediate/climate_pcts/climate_types_{{shapefile_name}}.csv"
53+
params:
54+
shapefile_name = lambda wildcards: shapefiles_cfg_dict[wildcards.shapefile_name]
4255
shell:
43-
f"python src/aggregate_climate_types.py"
56+
(f"""
57+
echo {{wildcards.shapefile_name}}
58+
python src/aggregate_climate_types.py "+shapefiles={{params.shapefile_name}}"
59+
""")
60+
#python src/aggregate_climate_types.py "+shapefiles=[{name: CAN_ADM2, url: null, idvar: shapeID, output_idvar: id}]"

src/aggregate_climate_types.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
@hydra.main(config_path="../conf", config_name="config", version_base=None)
1515
def main(cfg):
16+
print(cfg.shapefiles)
1617
LOGGER.info("""
1718
# Extract transform, crs, nodata from raster
1819
""")
@@ -39,10 +40,10 @@ def main(cfg):
3940
)
4041

4142
# read shapefile
42-
for shapefile_name in cfg.shapefiles:
43-
LOGGER.info(f"Shapefile: {shapefile_name}")
44-
idvar = cfg.shapefiles[shapefile_name].idvar
45-
shp_path = f"data/input/shapefiles/{shapefile_name}/{shapefile_name}.shp"
43+
for shapefile in cfg.shapefiles:
44+
LOGGER.info(f"Shapefile: {shapefile.name}")
45+
idvar = shapefile.idvar
46+
shp_path = f"data/input/shapefiles/{shapefile.name}/{shapefile.name}.shp"
4647
LOGGER.info(f"Reading shapefile {shp_path}")
4748
shp = gpd.read_file(shp_path)
4849
LOGGER.info(f"Read shapefile with head\n: {shp.drop(columns='geometry').head()}")
@@ -93,7 +94,7 @@ def main(cfg):
9394
LOGGER.info(f"Fraction of locations with ties: {100 * frac_ties:.2f}%")
9495

9596
intermediate_dir = f"data/intermediate/climate_pcts"
96-
pcts_file = f"{intermediate_dir}/climate_pcts_{shapefile_name}.json"
97+
pcts_file = f"{intermediate_dir}/climate_pcts_{shapefile.name}.json"
9798
LOGGER.info(f"Saving pcts to {pcts_file}")
9899
with open(pcts_file, "w") as f:
99100
json.dump(avs, f)
@@ -109,7 +110,7 @@ def main(cfg):
109110
class_df["climate_type_long"] = class_df["climate_type_num"].map(codedict_long) # if a polygon intersects only with water then there is no assignment
110111
class_df = class_df.drop(columns="climate_type_num")
111112

112-
class_file = f"{intermediate_dir}/climate_types_{shapefile_name}.csv"
113+
class_file = f"{intermediate_dir}/climate_types_{shapefile.name}.csv"
113114
LOGGER.info(f"Saving classification to {class_file}")
114115
class_df.to_csv(class_file, index=False)
115116

@@ -125,9 +126,9 @@ def main(cfg):
125126

126127
output_df = pd.merge(class_df, output_df, on="id")
127128

128-
output_file = f"data/output/climate_types_raster2polygon/climate_types_{shapefile_name}.parquet"
129+
output_file = f"data/output/climate_types_raster2polygon/climate_types_{shapefile.name}.parquet"
129130
LOGGER.info(f"Saving output to {output_file}")
130-
output_df.rename(columns={"id": cfg.shapefiles[shapefile_name].output_idvar}, inplace=True)
131+
output_df.rename(columns={"id": shapefile.output_idvar}, inplace=True)
131132
output_df.to_parquet(output_file)
132133

133134
if __name__ == "__main__":

0 commit comments

Comments (0)