Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion conf/datapaths/datapaths_cannon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ name: null
dirs:
input:
lego: /n/dominici_lab/lab/lego
output: /n/dominici_lab/lab/lego_loader_x/output
covars: /n/dominici_lab/lab/lego_loader_x/output
health: /n/dominici_lab/lab/lego_loader_x/synthetic_health
54 changes: 54 additions & 0 deletions conf/synthetic/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Hydra configuration for synthetic health data generation.
# NOTE(review): indentation is not visible in this view; nesting hints below
# are taken from the key paths used in snakefile_synthetic_health.smk.

# The Snakefile passes year={wildcards.year} on the command line for each job.
year: 2010
horizons: [30, 90, 180]  # Horizons in days (including daily)

# conf
var_group: health
vg_name: synthetic_health

# The Snakefile passes var={wildcards.var} on the command line for each job.
var: diabetes

spatial_res: zcta
temporal_res: daily

input_dir: data/input/
output_dir: data/health/

# var_group
min_year: 2000
# NOTE(review): conf/synthetic/snakemake.yaml lists years 2000-2014 only;
# confirm whether 2015 is intentionally left out of the workflow.
max_year: 2015
min_spatial_res: zcta
min_temporal_res: daily
lego_nm: synthetic_sparse_counts
lego_dir: lego/synthetic/medpar_outcomes/ccw/zcta_daily

# Debug options
# The Snakefile passes debug_days=null, i.e. full-year processing.
debug_days: 3  # Set to null or remove for full year processing

# Synthetic data parameters
synthetic:
# Paths for ZCTA data
zcta_unique_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_uniqueid__census/zcta_yearly
zcta_shapefile_path: data/input/lego/geoboundaries/us_geoboundaries__census/us_shapefile__census/zcta_yearly
population_path: data/input/lego/social/demographics__census/raw/core/zcta__dec__population.parquet

# Poisson distribution parameters for synthetic data generation
# The Snakefile addresses each key below as synthetic.poisson_params.<key>
# and overrides all seven of them per disease.
poisson_params:
base_rate: 0.11  # Base rate for Poisson distribution (target: ~85% zeros)
seasonal_amplitude: 0.02  # Seasonal variation amplitude (reduced to maintain sparsity)
spatial_variance: 0.03  # Spatial variance across ZCTAs (reduced to maintain sparsity)
latitude_effect: 0.2  # Effect of latitude on incidence
longitude_effect: 0.1  # Effect of longitude on incidence
population_effect: 0.0001  # Population scaling factor (per capita effect)
random_seed: 42  # For reproducibility

# Geographic constraints
# NOTE(review): nesting level of mainland_only and date_range (under
# `synthetic` or top-level) cannot be confirmed from this view.
mainland_only: true  # Filter for continental US only

# Date range for synthetic data
date_range:
start_year: 2000
end_year: 2015

# Hydra run directory, timestamped with the ${now:...} resolver.
# The Snakefile overrides it with hydra.run.dir=. for workflow runs.
hydra:
run:
dir: logs/synthetic/${now:%Y-%m-%d}/${now:%H-%M-%S}
112 changes: 112 additions & 0 deletions conf/synthetic/snakemake.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Years to generate; the Snakefile expands one target per (var, year) pair.
years:
  - 2000
  - 2001
  - 2002
  - 2003
  - 2004
  - 2005
  - 2006
  - 2007
  - 2008
  - 2009
  - 2010
  - 2011
  - 2012
  - 2013
  - 2014

# Health outcomes to synthesize (the same set as the real health data).
vars:
  - anemia
  - ami
  - alzh
  - alzhdmta
  - atrialfb
  - cataract
  - chrnkidn
  - copd
  - chf
  - diabetes
  - stroke
  - breastCancer
  - colorectalCancer
  - prostateCancer
  - lungCancer
  - endometrialCancer
  - hyperp
  - glaucoma
  - hipfrac
  - ischmcht
  - depressn
  - osteoprs
  - ra_oa
  - asthma
  - hyperl
  - hypert
  - hypoth

# Baseline Poisson parameters; every disease starts from a copy of these
# values before its own overrides are applied.
default_params:
  base_rate: 0.11
  seasonal_amplitude: 0.02
  spatial_variance: 0.03
  latitude_effect: 0.2
  longitude_effect: 0.1
  population_effect: 0.0001
  # Replaced with a per-disease seed by get_disease_params in the Snakefile.
  random_seed: 42

# Per-disease overrides, keyed by the names listed under `vars`.
# Any parameter not given here falls back to `default_params`.
disease_params:
  # --- More common diseases ---
  diabetes:
    base_rate: 0.15
    seasonal_amplitude: 0.015

  hypert:
    base_rate: 0.18
    seasonal_amplitude: 0.01

  hyperl:
    base_rate: 0.16
    seasonal_amplitude: 0.01

  chf:
    base_rate: 0.12
    seasonal_amplitude: 0.03

  # --- Seasonal diseases ---
  asthma:
    base_rate: 0.10
    seasonal_amplitude: 0.04

  copd:
    base_rate: 0.09
    seasonal_amplitude: 0.035

  stroke:
    base_rate: 0.08
    seasonal_amplitude: 0.025

  # --- Cancers: weaker seasonality, stronger spatial variation ---
  breastCancer:
    base_rate: 0.05
    seasonal_amplitude: 0.005
    spatial_variance: 0.05

  lungCancer:
    base_rate: 0.06
    seasonal_amplitude: 0.008
    spatial_variance: 0.06

  colorectalCancer:
    base_rate: 0.04
    seasonal_amplitude: 0.005
    spatial_variance: 0.04

  prostateCancer:
    base_rate: 0.07
    seasonal_amplitude: 0.005
    spatial_variance: 0.05

  endometrialCancer:
    base_rate: 0.03
    seasonal_amplitude: 0.003
    spatial_variance: 0.04

  # --- Age-related diseases: larger population effect ---
  alzh:
    base_rate: 0.06
    population_effect: 0.0002
    spatial_variance: 0.04

  alzhdmta:
    base_rate: 0.05
    population_effect: 0.0002
    spatial_variance: 0.04

  osteoprs:
    base_rate: 0.08
    population_effect: 0.00015
    seasonal_amplitude: 0.015

  cataract:
    base_rate: 0.10
    population_effect: 0.00015
    seasonal_amplitude: 0.01

  # --- Other diseases ---
  ami:
    base_rate: 0.07
    seasonal_amplitude: 0.025

  hipfrac:
    base_rate: 0.04
    seasonal_amplitude: 0.02

  anemia:
    base_rate: 0.09
    seasonal_amplitude: 0.02
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
numpy
torch
pandas==2.2.2
pyarrow==11.0.0
pandas>2.2.2
pyarrow
duckdb==0.9.2
hydra-core==1.3.2
snakemake==8.16
tqdm
ipykernel
geopandas
scipy
52 changes: 52 additions & 0 deletions snakefile_synthetic_health.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Snakemake file for synthetic health data generation

# Load config
# Populates the global `config` dict from the YAML file below.
configfile: "conf/synthetic/snakemake.yaml"

# Get config values
# `years` and `vars` are the two axes expanded into targets by `rule all`.
years = config["years"]
# NOTE(review): this module-level name shadows Python's built-in vars()
# for the rest of the Snakefile.
vars = config["vars"]

def get_disease_params(var):
    """Build the Poisson parameter set used to synthesize data for one disease.

    Starts from a copy of ``config["default_params"]``, overlays the entry for
    ``var`` in ``config["disease_params"]`` (if any), and then sets a
    per-disease ``random_seed``.

    Args:
        var: Disease identifier, e.g. ``"diabetes"``.

    Returns:
        dict: Parameter name -> value. The shared ``config`` is not modified.
    """
    # Local import: zlib is only needed here, for the stable per-disease seed.
    import zlib

    # Copy so that per-disease overrides never leak into the shared defaults.
    params = dict(config["default_params"])

    # Apply disease-specific variations if they exist. A missing or empty
    # ``disease_params`` section means "use the defaults for every disease".
    overrides = (config.get("disease_params") or {}).get(var)
    if overrides:
        params.update(overrides)

    # Disease-specific seed in [42, 1041]. The built-in hash() of a str is
    # salted per interpreter process (PYTHONHASHSEED), so it would produce a
    # different seed on every Snakemake invocation. CRC32 of the name is
    # stable across runs and machines, which is what reproducibility needs.
    params["random_seed"] = zlib.crc32(var.encode("utf-8")) % 1000 + 42

    return params

# Default target: request the Dec 31 sentinel parquet for every
# combination of var and year.
rule all:
    input:
        expand("data/health/synthetic_health/{var}/{var}__{year}1231.parquet", var=vars, year=years)

# Rule: preprocess synthetic health data for given var and year
# Declares the Dec 31 parquet as the output for one (var, year) pair and
# invokes the preprocessing script with Hydra-style key=value overrides.
rule preprocess_synthetic_health:
output:
"data/health/synthetic_health/{var}/{var}__{year}1231.parquet"
params:
# Resolved per job: defaults merged with the overrides for this var.
disease_params = lambda wildcards: get_disease_params(wildcards.var)
# The command pins hydra.run.dir to the working directory, disables
# debug_days, and forwards all seven Poisson parameters individually.
shell:
"""
python src/preprocessing_synth_health.py \
hydra.run.dir=. \
var={wildcards.var} \
year={wildcards.year} \
debug_days=null \
synthetic.poisson_params.base_rate={params.disease_params[base_rate]} \
synthetic.poisson_params.seasonal_amplitude={params.disease_params[seasonal_amplitude]} \
synthetic.poisson_params.spatial_variance={params.disease_params[spatial_variance]} \
synthetic.poisson_params.latitude_effect={params.disease_params[latitude_effect]} \
synthetic.poisson_params.longitude_effect={params.disease_params[longitude_effect]} \
synthetic.poisson_params.population_effect={params.disease_params[population_effect]} \
synthetic.poisson_params.random_seed={params.disease_params[random_seed]}
"""
Loading