Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion conf/datapaths/datapaths_cannon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ name: null
# Filesystem roots for this datapaths profile.
# NOTE(review): this is a diff hunk whose indentation was lost in extraction;
# the nesting of the keys below cannot be determined from here — confirm
# against the repository before use.
dirs:
input:
lego: /n/dominici_lab/lab/lego
# NOTE(review): the hunk summary reports 2 additions and 1 deletion, so
# `output` is presumably the removed line and `covars` / `health` the added
# ones — TODO confirm.
output: /n/dominici_lab/lab/lego_loader_x/output
covars: /n/dominici_lab/lab/lego_loader_x/output
health: /n/dominici_lab/lab/lego_loader_x/synthetic_health
54 changes: 54 additions & 0 deletions conf/synthetic/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Settings for synthetic health data generation.
# NOTE(review): nesting below was reconstructed from the section comments;
# confirm that `mainland_only` and `date_range` belong under `synthetic`.
year: 2010
horizons: [30, 90, 180]  # Horizons in days (including daily)

# conf
var_group: health
vg_name: synthetic_health

var: diabetes

spatial_res: zcta
temporal_res: daily

input_dir: data/input/
output_dir: data/health/

# var_group
min_year: 2000
max_year: 2015
min_spatial_res: zcta
min_temporal_res: daily
lego_nm: synthetic_sparse_counts
lego_dir: lego/synthetic/medpar_outcomes/ccw/zcta_daily

# Debug options
debug_days: 3  # Set to null or remove for full year processing

# Synthetic data parameters
synthetic:
  # Paths for ZCTA data
  zcta_unique_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_uniqueid__census/zcta_yearly
  zcta_shapefile_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_shapefile__census/zcta_yearly
  population_path: data/input/lego/social/demographics__census/raw/core/zcta__dec__population.parquet

  # Poisson distribution parameters for synthetic data generation
  poisson_params:
    base_rate: 0.11  # Base rate for Poisson distribution (target: ~85% zeros)
    seasonal_amplitude: 0.02  # Seasonal variation amplitude (reduced to maintain sparsity)
    spatial_variance: 0.03  # Spatial variance across ZCTAs (reduced to maintain sparsity)
    latitude_effect: 0.2  # Effect of latitude on incidence
    longitude_effect: 0.1  # Effect of longitude on incidence
    population_effect: 0.0001  # Population scaling factor (per capita effect)
    random_seed: 42  # For reproducibility

  # Geographic constraints
  mainland_only: true  # Filter for continental US only

  # Date range for synthetic data
  date_range:
    start_year: 2000
    end_year: 2015

# Hydra run directory: one timestamped folder per invocation.
hydra:
  run:
    dir: logs/synthetic/${now:%Y-%m-%d}/${now:%H-%M-%S}
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# NOTE(review): this is a diff hunk shown without +/- markers. Both the old
# and new pins for pandas and pyarrow appear below; per the hunk header
# (9 lines before, 11 after) the `==` pins are presumably the removed lines
# and the two entries that follow them the replacements — TODO confirm.
# A real requirements file must not list the same package twice.
numpy
torch
pandas==2.2.2
pyarrow==11.0.0
pandas>2.2.2
pyarrow
duckdb==0.9.2
hydra-core==1.3.2
snakemake==8.16
tqdm
ipykernel
geopandas
scipy
31 changes: 31 additions & 0 deletions snakefile_synthetic_health.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Snakemake workflow for synthetic health data generation.
#
# Targets one sentinel parquet file (dated Dec 31) per variable and year, and
# produces each by invoking the preprocessing script with Hydra overrides.

# Load config
configfile: "conf/synthetic/config.yaml"

# Years and variables to generate. These are defined here rather than read
# from `config` - using same years as real health data but can be different.
years = list(range(2000, 2015))  # 2000 through 2014 inclusive
vars = ["diabetes"]  # Start with diabetes, can expand to other synthetic vars


# Rule: final output is one sentinel file per var/year (Dec 31)
rule all:
    input:
        expand(
            "data/health/synthetic_health/{var}/{var}__{year}1231.parquet",
            var=vars,
            year=years
        )


# Rule: preprocess synthetic health data for given var and year.
# `debug_days=null` overrides the config's debug setting on the command line.
rule preprocess_synthetic_health:
    output:
        "data/health/synthetic_health/{var}/{var}__{year}1231.parquet"
    shell:
        """
        python src/preprocessing_synth_health.py \
            hydra.run.dir=. \
            var={wildcards.var} \
            year={wildcards.year} \
            debug_days=null
        """
Loading