NSAPH-Data-Processing · mauriciogtec · Sep 23, 2025 · Sep 23, 2025 · Sep 23, 2025 · Sep 23, 2025
diff --git a/conf/datapaths/datapaths_cannon.yaml b/conf/datapaths/datapaths_cannon.yaml
@@ -3,4 +3,5 @@ name: null
 dirs:
   input:
     lego: /n/dominici_lab/lab/lego
-  output: /n/dominici_lab/lab/lego_loader_x/output
+  covars: /n/dominici_lab/lab/lego_loader_x/output
+  health: /n/dominici_lab/lab/lego_loader_x/synthetic_health
diff --git a/conf/synthetic/config.yaml b/conf/synthetic/config.yaml
@@ -0,0 +1,54 @@
+year: 2010
+horizons: [30, 90, 180] # Horizons in days (including daily)
+
+# conf
+var_group: health
+vg_name: synthetic_health
+
+var: diabetes
+
+spatial_res: zcta
+temporal_res: daily
+
+input_dir: data/input/
+output_dir: data/health/
+
+# var_group
+min_year: 2000
+max_year: 2015
+min_spatial_res: zcta
+min_temporal_res: daily
+lego_nm: synthetic_sparse_counts
+lego_dir: lego/synthetic/medpar_outcomes/ccw/zcta_daily
+
+# Debug options
+debug_days: 3  # Set to null or remove for full year processing
+
+# Synthetic data parameters
+synthetic:
+  # Paths for ZCTA data
+  zcta_unique_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_uniqueid__census/zcta_yearly
+  zcta_shapefile_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_shapefile__census/zcta_yearly
+  population_path: data/input/lego/social/demographics__census/raw/core/zcta__dec__population.parquet
+
+  # Poisson distribution parameters for synthetic data generation
+  poisson_params:
+    base_rate: 0.11             # Base rate for Poisson distribution (target: ~85% zeros)
+    seasonal_amplitude: 0.02    # Seasonal variation amplitude (reduced to maintain sparsity)
+    spatial_variance: 0.03      # Spatial variance across ZCTAs (reduced to maintain sparsity)
+    latitude_effect: 0.2        # Effect of latitude on incidence
+    longitude_effect: 0.1       # Effect of longitude on incidence
+    population_effect: 0.0001   # Population scaling factor (per capita effect)
+    random_seed: 42             # For reproducibility
+
+  # Geographic constraints
+  mainland_only: true           # Filter for continental US only
+
+  # Date range for synthetic data
+  date_range:
+    start_year: 2000
+    end_year: 2015
+
+hydra:
+  run:
+    dir: logs/synthetic/${now:%Y-%m-%d}/${now:%H-%M-%S}
diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,11 @@
 numpy
 torch
-pandas==2.2.2
-pyarrow==11.0.0
+pandas>=2.2.2  # inclusive lower bound: 2.2.2 was the previously pinned version
+pyarrow
 duckdb==0.9.2
 hydra-core==1.3.2
 snakemake==8.16
 tqdm
 ipykernel
+geopandas
+scipy
diff --git a/snakefile_synthetic_health.smk b/snakefile_synthetic_health.smk
@@ -0,0 +1,31 @@
+# Snakemake file for synthetic health data generation
+
+# Load config
+configfile: "conf/synthetic/config.yaml"
+
+# Years and vars are hard-coded here rather than read from the loaded config.
+# Years cover 2000 through 2014, same as the real health data (can be different).
+years = list(range(2000, 2015))
+vars = ["diabetes"]  # Only diabetes for now; add other synthetic vars here
+
+# Rule: final targets are one sentinel file per var/year (the Dec 31 parquet)
+rule all:
+    input:
+        expand(
+            "data/health/synthetic_health/{var}/{var}__{year}1231.parquet",
+            var=vars,
+            year=years
+        )
+
+# Rule: preprocess synthetic health data for one var/year (debug_days=null lifts the debug day limit)
+rule preprocess_synthetic_health:
+    output:
+        "data/health/synthetic_health/{var}/{var}__{year}1231.parquet"
+    shell:
+        """
+        python src/preprocessing_synth_health.py \
+            hydra.run.dir=. \
+            var={wildcards.var} \
+            year={wildcards.year} \
+            debug_days=null
+        """