diff --git a/.gitignore b/.gitignore index 15eb7d1..fc6880b 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,15 @@ results/ profiles/ docs/*.html docs/index_files/ + +# --- Local junk / scratch artifacts (root) --- +sample*.csv +*.log +Untitled +tl_2023_17_bg.zip + +# --- Quarantine area (local) --- +archive_quarantine/ + +# Legacy artifacts (do not track) +archive/ diff --git a/analysis/clustering/euclidean_clustering_k_search.py b/analysis/clustering/euclidean_clustering_k_search.py index b655065..5ab66f2 100644 --- a/analysis/clustering/euclidean_clustering_k_search.py +++ b/analysis/clustering/euclidean_clustering_k_search.py @@ -228,7 +228,7 @@ def predict_and_write_assignments_streaming( pf = pq.ParquetFile(input_path) out_schema = pf.schema_arrow.append(pa.field("cluster", pa.int32())) - writer = pq.ParquetWriter(output_path, out_schema, compression="zstd") + writer = pq.ParquetWriter(output_path, out_schema, compression="snappy") k = int(model.n_clusters) counts = np.zeros(k, dtype=np.int64) @@ -273,7 +273,7 @@ def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: x = np.arange(n_timepoints) xlabel = "Time Interval" - fig, ax = plt.subplots(figsize=(12, 6)) + _fig, ax = plt.subplots(figsize=(12, 6)) for i in range(k): ax.plot(x, centroids[i], label=f"Cluster {i}", linewidth=2) diff --git a/analysis/clustering/euclidean_clustering_minibatch.py b/analysis/clustering/euclidean_clustering_minibatch.py index 49a690d..8ddd1b3 100644 --- a/analysis/clustering/euclidean_clustering_minibatch.py +++ b/analysis/clustering/euclidean_clustering_minibatch.py @@ -16,10 +16,14 @@ from __future__ import annotations import argparse +import gc import json import logging +import os +import time from collections.abc import Iterable from pathlib import Path +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -29,15 +33,118 @@ from sklearn.cluster import MiniBatchKMeans from sklearn.metrics import silhouette_score +try: + import psutil # optional +except Exception: + psutil = None + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) +CGROUP_ROOT = Path("/sys/fs/cgroup") +_BASELINE_CGROUP_EVENTS: dict[str, int] = {} + # ============================================================================= -# IO + batch utilities +# MEMORY TELEMETRY (unprivileged container-safe) # ============================================================================= +def _read_text_file(path: Path) -> str | None: + try: + return path.read_text(encoding="utf-8").strip() + except Exception: + return None + + +def _read_int_file(path: Path) -> int | None: + s = _read_text_file(path) + if s is None: + return None + try: + return int(s) + except Exception: + return None + +def _get_rss_bytes() -> int | None: + try: + if psutil is not None: + return int(psutil.Process(os.getpid()).memory_info().rss) + except Exception: # noqa: S110 + pass + try: + with open("/proc/self/status", encoding="utf-8") as f: + for line in f: + if line.startswith("VmRSS:"): + parts = line.split() + return int(parts[1]) * 1024 + except Exception: + return None + return None + +def _read_cgroup_memory_bytes() -> dict[str, int | None]: + cur = _read_int_file(CGROUP_ROOT / "memory.current") + peak = _read_int_file(CGROUP_ROOT / "memory.peak") + maxv = _read_text_file(CGROUP_ROOT / "memory.max") + limit = None + if maxv is not None and maxv != "max": + try: + limit = int(maxv) + except Exception: + limit = None + return {"current": 
cur, "peak": peak, "limit": limit} + + +def _read_cgroup_memory_events() -> dict[str, int]: + out: dict[str, int] = {} + try: + txt = (CGROUP_ROOT / "memory.events").read_text(encoding="utf-8") + for line in txt.splitlines(): + parts = line.split() + if len(parts) == 2: + out[parts[0]] = int(parts[1]) + except Exception: # noqa: S110 + pass + return out + + +def _mb(x: int | None) -> float | None: + if x is None: + return None + return round(x / (1024.0 * 1024.0), 3) + + +def log_memory(stage: str, extra: dict[str, Any] | None = None) -> None: + global _BASELINE_CGROUP_EVENTS + if not _BASELINE_CGROUP_EVENTS: + _BASELINE_CGROUP_EVENTS = _read_cgroup_memory_events() + + rss_b = _get_rss_bytes() + cg = _read_cgroup_memory_bytes() + ev = _read_cgroup_memory_events() + keys = set(_BASELINE_CGROUP_EVENTS) | set(ev) + delta = {k: ev.get(k, 0) - _BASELINE_CGROUP_EVENTS.get(k, 0) for k in keys} + + payload: dict[str, Any] = { + "ts": round(time.time(), 3), + "event": "mem", + "stage": stage, + "rss_mb": _mb(rss_b), + "cgroup_current_mb": _mb(cg.get("current")), + "cgroup_peak_mb": _mb(cg.get("peak")), + "cgroup_limit_mb": _mb(cg.get("limit")), + "cgroup_oom_kill_delta": delta.get("oom_kill", 0), + "cgroup_events_delta": delta, + } + if extra: + payload.update(extra) + + logger.info("[MEMORY] %s", json.dumps(payload, separators=(",", ":"), sort_keys=True)) + + +# ============================================================================= +# IO + batch utilities +# ============================================================================= def parquet_num_rows(path: Path) -> int: """Return Parquet row count from file metadata (no full scan).""" pf = pq.ParquetFile(path) @@ -57,15 +164,82 @@ def iter_profile_batches(path: Path, batch_size: int, columns: list[str] | None) yield from pf.iter_batches(batch_size=batch_size, columns=columns) +def _list_array_to_numpy_matrix(arr: pa.Array) -> np.ndarray: + """ + Convert Arrow list-like array (List/LargeList/FixedSizeList) to a 2D NumPy array + WITHOUT materializing Python lists. + + Assumes lists are uniform length within the batch (true for 48-pt profiles). + """ + if isinstance(arr, pa.FixedSizeListArray): + n = len(arr) + list_size = int(arr.type.list_size) + values = arr.values + v = values.to_numpy(zero_copy_only=False) + if v.size != n * list_size: + raise ValueError(f"Unexpected FixedSizeList flatten size={v.size} for n={n}, list_size={list_size}") + X = v.reshape((n, list_size)) + return X + + if not (pa.types.is_list(arr.type) or pa.types.is_large_list(arr.type)): + raise TypeError(f"Expected List/LargeList/FixedSizeList; got {arr.type}") + + n = len(arr) + if n == 0: + return np.empty((0, 0), dtype=np.float64) + + # ListArray / LargeListArray: use offsets + values + # Offsets length = n+1; differences must be constant (48) for this dataset. 
+ offsets = arr.offsets.to_numpy(zero_copy_only=False) + diffs = np.diff(offsets) + if diffs.size == 0: + return np.empty((0, 0), dtype=np.float64) + + first = int(diffs[0]) + if not np.all(diffs == first): + raise ValueError("Non-uniform list lengths detected in profile column within a batch; cannot reshape safely.") + if first <= 0: + raise ValueError(f"Invalid list length inferred from offsets: {first}") + + values = arr.values + v = values.to_numpy(zero_copy_only=False) + expected = int(n * first) + if v.size < expected: + raise ValueError(f"Flattened values too short: {v.size} < expected {expected}") + if v.size != expected: + # This should not happen for uniform lists; fail loud to avoid silent corruption. + raise ValueError(f"Flattened values size mismatch: {v.size} != expected {expected}") + + X = v.reshape((n, first)) + return X + + def recordbatch_profiles_to_numpy(rb: pa.RecordBatch, profile_col: str = "profile") -> np.ndarray: - """Convert RecordBatch `profile` list column into a 2D float64 NumPy array.""" + """ + Convert RecordBatch `profile` list column into a 2D float64 NumPy array, + avoiding Python object materialization. + """ idx = rb.schema.get_field_index(profile_col) if idx < 0: raise ValueError(f"RecordBatch missing required column '{profile_col}'") - profiles = rb.column(idx).to_pylist() - X = np.asarray(profiles, dtype=np.float64) + col = rb.column(idx) + + # RecordBatch columns are typically Array, not ChunkedArray; handle defensively. + if isinstance(col, pa.ChunkedArray): + if col.num_chunks != 1: + # Concatenate chunks cheaply in Arrow-space (still bounded by batch size). + col = pa.chunked_array(col.chunks).combine_chunks() + col_arr = col.chunk(0) + else: + col_arr = col + + X = _list_array_to_numpy_matrix(col_arr).astype(np.float64, copy=False) if X.ndim != 2: raise ValueError(f"Expected 2D profile array; got shape={X.shape}") + + if not np.isfinite(X).all(): + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + return X @@ -96,8 +270,6 @@ def parse_output_columns(spec: str) -> list[str] | None: # ============================================================================= # Normalization # ============================================================================= - - def normalize_batch(X: np.ndarray, method: str) -> np.ndarray: """Row-wise normalization: minmax, zscore, or none. 
Constant rows -> zeros.""" if method in ("none", "", None): @@ -126,8 +298,6 @@ def normalize_batch(X: np.ndarray, method: str) -> np.ndarray: # ============================================================================= # Clustering # ============================================================================= - - def fit_minibatch_kmeans( input_path: Path, k: int, @@ -139,6 +309,7 @@ def fit_minibatch_kmeans( ) -> MiniBatchKMeans: """Fit MiniBatchKMeans by streaming over `profile` batches and calling partial_fit().""" logger.info("Fitting MiniBatchKMeans (k=%d, batch_size=%s, n_init=%d)...", k, f"{batch_size:,}", n_init) + log_memory("fit_start", {"k": int(k), "batch_size": int(batch_size), "n_init": int(n_init)}) model = MiniBatchKMeans( n_clusters=k, @@ -159,10 +330,15 @@ def fit_minibatch_kmeans( X = normalize_batch(X, normalize_method) model.partial_fit(X) seen += int(X.shape[0]) - if bi % 10 == 0 or bi == n_batches: - logger.info(" Trained batch %d/%d (seen=%s)", bi, n_batches, f"{seen:,}") + + if bi == 1 or bi == n_batches or bi % 10 == 0: + log_memory("fit_progress", {"batch_i": int(bi), "n_batches": int(n_batches), "seen": int(seen)}) + + del X, rb + gc.collect() logger.info(" Training complete. Inertia: %s", f"{float(model.inertia_):,.2f}") + log_memory("fit_done", {"seen": int(seen), "inertia": float(model.inertia_)}) return model @@ -200,6 +376,7 @@ def predict_write_and_optional_silhouette( # noqa: C901 """ logger.info("Predicting labels + writing assignments streaming: %s", output_path) output_path.parent.mkdir(parents=True, exist_ok=True) + log_memory("predict_start", {"batch_size": int(batch_size)}) k = int(model.n_clusters) counts = np.zeros(k, dtype=np.int64) @@ -211,15 +388,15 @@ def predict_write_and_optional_silhouette( # noqa: C901 global_row = 0 writer: pq.ParquetWriter | None = None - out_schema: pa.Schema | None = None try: - for rb in iter_profile_batches(input_path, batch_size=batch_size, columns=read_columns): + for bi, rb in enumerate(iter_profile_batches(input_path, batch_size=batch_size, columns=read_columns), start=1): # Compute labels (requires profile) X = recordbatch_profiles_to_numpy(rb, profile_col="profile") if normalize and normalize_method != "none": X = normalize_batch(X, normalize_method) - labels = model.predict(X).astype(np.int32) + + labels = model.predict(X).astype(np.int32, copy=False) counts += np.bincount(labels, minlength=k) # Silhouette sampling (uses labels on this pass; avoids a second scan) @@ -236,9 +413,10 @@ def predict_write_and_optional_silhouette( # noqa: C901 if sample_pos > start: idx_in_batch = silhouette_sample_idx[start:sample_pos] - global_row + # Copy rows into a compact array later; these are small (<=5000). 
for j in idx_in_batch: jj = int(j) - sample_X.append(X[jj]) + sample_X.append(X[jj].copy()) sample_y.append(int(labels[jj])) # Build output batch: select write columns (or all columns), then append cluster @@ -255,14 +433,18 @@ def predict_write_and_optional_silhouette( # noqa: C901 out_rb = out_rb.append_column("cluster", pa.array(labels, type=pa.int32())) - # Initialize writer lazily from first out_rb schema if writer is None: - out_schema = out_rb.schema - writer = pq.ParquetWriter(output_path, out_schema, compression="zstd") - writer.write_batch(out_rb) + writer = pq.ParquetWriter(output_path, out_rb.schema, compression="snappy") + writer.write_batch(out_rb) global_row += int(labels.shape[0]) + if bi == 1 or (bi % 10 == 0): + log_memory("predict_progress", {"batch_i": int(bi), "rows_written": int(global_row)}) + + del out_rb, labels, X, rb + gc.collect() + finally: if writer is not None: writer.close() @@ -279,21 +461,22 @@ def predict_write_and_optional_silhouette( # noqa: C901 if np.unique(ys).size >= 2: Xs = np.asarray(sample_X, dtype=np.float64) logger.info("Computing silhouette on sample_size=%s ...", f"{len(ys):,}") + log_memory("silhouette_start", {"sample_size": len(ys)}) sil = float(silhouette_score(Xs, ys, metric="euclidean")) logger.info(" Silhouette score (sample): %.3f", sil) + log_memory("silhouette_done", {"silhouette": float(sil)}) else: logger.info("Silhouette: skipped (sample fell into a single cluster)") elif use_sil: logger.info("Silhouette: skipped (insufficient sample)") + log_memory("predict_done", {"rows_written": int(total)}) return counts, sil # ============================================================================= # Plotting + outputs # ============================================================================= - - def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: """Save a line plot of cluster centroids.""" k = int(centroids.shape[0]) @@ -306,7 +489,7 @@ def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: x = np.arange(n_timepoints) xlabel = "Time Interval" - fig, ax = plt.subplots(figsize=(12, 6)) + _fig, ax = plt.subplots(figsize=(12, 6)) for i in range(k): ax.plot(x, centroids[i], label=f"Cluster {i}", linewidth=2) @@ -348,8 +531,6 @@ def save_metadata(metadata: dict[str, object], output_path: Path) -> None: # ============================================================================= # Main # ============================================================================= - - def build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( description="Memory-Efficient K-Means Clustering (MiniBatch, PyArrow-batched)", @@ -412,18 +593,16 @@ def main() -> int: n_profiles = parquet_num_rows(args.input) logger.info("Profiles (from parquet metadata): %s", f"{n_profiles:,}") + log_memory("start", {"n_profiles": int(n_profiles)}) - # Output column handling requested_out_cols = parse_output_columns(args.output_columns) - # Read columns must include profile for prediction; if requested_out_cols is None -> read all columns. - # If a subset was requested, read only: requested_out_cols + profile (deduped). 
if requested_out_cols is None: read_columns = None - write_columns = None # write everything we read (which is everything) + write_columns = None logger.info("Assignments output columns: ALL (default)") else: - read_columns = [] + read_columns: list[str] = [] seen = set() for c in [*requested_out_cols, "profile"]: if c not in seen: @@ -432,7 +611,6 @@ def main() -> int: write_columns = requested_out_cols logger.info("Assignments output columns: %s (+cluster)", ",".join(write_columns)) - # Fit model = fit_minibatch_kmeans( input_path=args.input, k=int(args.k), @@ -444,14 +622,12 @@ def main() -> int: ) centroids = model.cluster_centers_ - # Deterministic silhouette sample indices (optional) sample_idx = make_silhouette_sample_idx(int(n_profiles), int(args.silhouette_sample_size), int(args.seed)) if sample_idx.size > 0: logger.info("Silhouette sample size: %s", f"{sample_idx.size:,}") else: logger.info("Silhouette: disabled") - # Predict + write assignments (and optional silhouette) assignments_path = args.output_dir / "cluster_assignments.parquet" counts, sil_score = predict_write_and_optional_silhouette( model=model, @@ -465,7 +641,6 @@ def main() -> int: silhouette_sample_idx=sample_idx, ) - # Save artifacts centroids_plot_path = args.output_dir / "cluster_centroids.png" centroids_parquet_path = args.output_dir / "cluster_centroids.parquet" metadata_path = args.output_dir / "clustering_metadata.json" @@ -503,6 +678,7 @@ def main() -> int: if sil_score is not None: logger.info("Silhouette (sample): %.3f", float(sil_score)) logger.info("Output: %s", args.output_dir) + log_memory("end") return 0 diff --git a/analysis/clustering/prepare_clustering_data_households.py b/analysis/clustering/prepare_clustering_data_households.py index d285d6b..451ae3b 100644 --- a/analysis/clustering/prepare_clustering_data_households.py +++ b/analysis/clustering/prepare_clustering_data_households.py @@ -6,115 +6,311 @@ Transforms interval-level energy data into daily load profiles at the HOUSEHOLD (account) level for k-means clustering. -MODIFIED: Now accepts multiple input parquet files (e.g., split by time period). +MODIFIED: Accepts multiple input parquet files (e.g., split by time period). +ROBUST: Also accepts *_accounts.parquet and *_dates.parquet as inputs (as produced +by the pipeline), and will not attempt to treat them as interval data. What this does: - 1. Validate schema (no full data scan) - 2. Build/load manifests for accounts and dates (streaming, memory-safe) + 1. Validate schema (schema-only checks; no full scan) + 2. Build/load manifests for accounts and dates (memory-safe) 3. Sample households + dates from manifests 4. Create daily 48-point load profiles per household 5. Output profiles ready for clustering Design notes: - Uses MANIFESTS to avoid OOM on unique().collect() for large files - - Manifests are built once via sink_parquet (streaming) and cached - - Standard mode: single filtered collect() for selected households/dates - * good for up to ~5k-10k households sampled (depending on memory) - - Chunked streaming mode: fully streaming pipeline with per-chunk sinks - * required for 10k+ households on constrained hardware - - Profiles are 48 half-hourly kWh values in chronological order (00:30-24:00) - - Incomplete days (not 48 intervals) are dropped as "missing/irregular data" + - Chunked streaming mode writes per-chunk parquet files and then performs a + bounded-memory merge via PyArrow iter_batches + ParquetWriter. 
+ - In streaming mode we DO NOT read the full merged sampled_profiles.parquet + into memory; we derive the household map and stats via lazy scans. Output files: - sampled_profiles.parquet: - One row per (account_identifier, date) with 'profile' = list[float] of length 48 + One row per (account_identifier, date) with 'profile' = list[float] length 48 - household_zip4_map.parquet: Unique account_identifier → zip_code mapping for later joins - -Manifest files (auto-generated alongside input, via smart_meter_analysis.manifests): - - {input_stem}_accounts.parquet: unique (account_identifier, zip_code) pairs - - {input_stem}_dates.parquet: unique (date, is_weekend, weekday) tuples """ from __future__ import annotations import argparse import gc +import json import logging +import os +import time +from calendar import monthrange +from collections.abc import Sequence from pathlib import Path from typing import Any, Literal import polars as pl +import pyarrow.parquet as pq from smart_meter_analysis.manifests import ensure_account_manifest, ensure_date_manifest +try: + import psutil # optional +except Exception: + psutil = None + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -REQUIRED_ENERGY_COLS = ["zip_code", "account_identifier", "datetime", "kwh"] -REQUIRED_TIME_COLS = ["date", "hour", "is_weekend", "weekday"] +REQUIRED_INTERVAL_COLS = ["zip_code", "account_identifier", "datetime", "kwh", "date", "hour", "is_weekend", "weekday"] +REQUIRED_ACCOUNT_MANIFEST_COLS = ["account_identifier", "zip_code"] +REQUIRED_DATE_MANIFEST_COLS = ["date", "is_weekend", "weekday"] + +CGROUP_ROOT = Path("/sys/fs/cgroup") + +# No CLI flags per requirement: keep bounded-memory merge settings internal. 
+MERGE_BATCH_SIZE_ROWS = 65_536 +FINAL_PARQUET_COMPRESSION = "snappy" # ============================================================================= -# MEMORY INSTRUMENTATION +# MEMORY TELEMETRY (unprivileged container-safe) # ============================================================================= -def log_memory(label: str) -> None: - """Log current RSS memory usage (Linux only).""" +_BASELINE_CGROUP_EVENTS: dict[str, int] = {} + + +def _read_text_file(path: Path) -> str | None: try: - with open("/proc/self/status") as f: + return path.read_text(encoding="utf-8").strip() + except Exception: + return None + + +def _read_int_file(path: Path) -> int | None: + s = _read_text_file(path) + if s is None: + return None + try: + return int(s) + except Exception: + return None + + +def _get_rss_bytes() -> int | None: + try: + if psutil is not None: + return int(psutil.Process(os.getpid()).memory_info().rss) + except Exception: # noqa: S110 + pass + + try: + with open("/proc/self/status", encoding="utf-8") as f: for line in f: if line.startswith("VmRSS:"): - mem_mb = int(line.split()[1]) / 1024 - logger.info("[MEMORY] %s: %.0f MB", label, mem_mb) - break - except Exception as exc: - logger.debug("Skipping memory log for %s: %s", label, exc) + parts = line.split() + return int(parts[1]) * 1024 + except Exception: + return None + return None + + +def _read_cgroup_memory_bytes() -> dict[str, int | None]: + cur = _read_int_file(CGROUP_ROOT / "memory.current") + peak = _read_int_file(CGROUP_ROOT / "memory.peak") + maxv = _read_text_file(CGROUP_ROOT / "memory.max") + limit = None + if maxv is not None and maxv != "max": + try: + limit = int(maxv) + except Exception: + limit = None + return {"current": cur, "peak": peak, "limit": limit} + + +def _read_cgroup_memory_events() -> dict[str, int]: + out: dict[str, int] = {} + try: + txt = (CGROUP_ROOT / "memory.events").read_text(encoding="utf-8") + for line in txt.splitlines(): + parts = line.split() + if len(parts) == 2: + out[parts[0]] = int(parts[1]) + except Exception: # noqa: S110 + pass + return out + + +def _mb(x: int | None) -> float | None: + if x is None: + return None + return round(x / (1024.0 * 1024.0), 3) + + +def log_memory(label: str, extra: dict[str, Any] | None = None) -> None: + global _BASELINE_CGROUP_EVENTS + if not _BASELINE_CGROUP_EVENTS: + _BASELINE_CGROUP_EVENTS = _read_cgroup_memory_events() + + rss_b = _get_rss_bytes() + cg = _read_cgroup_memory_bytes() + ev = _read_cgroup_memory_events() + keys = set(_BASELINE_CGROUP_EVENTS) | set(ev) + delta = {k: ev.get(k, 0) - _BASELINE_CGROUP_EVENTS.get(k, 0) for k in keys} + + payload: dict[str, Any] = { + "ts": round(time.time(), 3), + "event": "mem", + "stage": label, + "rss_mb": _mb(rss_b), + "cgroup_current_mb": _mb(cg.get("current")), + "cgroup_peak_mb": _mb(cg.get("peak")), + "cgroup_limit_mb": _mb(cg.get("limit")), + "cgroup_oom_kill_delta": delta.get("oom_kill", 0), + "cgroup_events_delta": delta, + } + if extra: + payload.update(extra) + + logger.info("[MEMORY] %s", json.dumps(payload, separators=(",", ":"), sort_keys=True)) # ============================================================================= -# MULTI-FILE SUPPORT +# INPUT CLASSIFICATION # ============================================================================= +def _schema_names_for_path(p: Path) -> list[str]: + try: + return pl.scan_parquet(p).collect_schema().names() + except Exception as exc: + raise RuntimeError(f"Failed to read parquet schema for {p}: {exc}") from exc + + +def classify_inputs(paths: 
list[Path]) -> tuple[list[Path], list[Path], list[Path]]: + """ + Classify inputs into: + - interval_paths: full interval dataset parts + - account_manifest_paths: *_accounts.parquet (or schema matches) + - date_manifest_paths: *_dates.parquet (or schema matches) + + This makes the script robust to the orchestrator passing mixed inputs. + """ + interval_paths: list[Path] = [] + account_manifest_paths: list[Path] = [] + date_manifest_paths: list[Path] = [] + + for p in paths: + name = p.name.lower() + + # Fast filename heuristics first + if name.endswith("_accounts.parquet"): + account_manifest_paths.append(p) + continue + if name.endswith("_dates.parquet"): + date_manifest_paths.append(p) + continue + + cols = set(_schema_names_for_path(p)) + + if set(REQUIRED_INTERVAL_COLS).issubset(cols): + interval_paths.append(p) + continue + + # Schema-based fallback for manifests + if set(REQUIRED_ACCOUNT_MANIFEST_COLS).issubset(cols) and "datetime" not in cols and "kwh" not in cols: + account_manifest_paths.append(p) + continue + if set(REQUIRED_DATE_MANIFEST_COLS).issubset(cols) and "datetime" not in cols and "kwh" not in cols: + date_manifest_paths.append(p) + continue + + raise ValueError( + "Unrecognized parquet input (neither interval data nor expected manifest schema): " + f"{p} with columns={sorted(cols)}" + ) + + if not interval_paths: + raise ValueError("No interval parquet inputs detected. Expected at least one file with interval schema.") + + logger.info( + "Classified inputs: interval=%d, accounts_manifest=%d, dates_manifest=%d", + len(interval_paths), + len(account_manifest_paths), + len(date_manifest_paths), + ) + return interval_paths, account_manifest_paths, date_manifest_paths + + def scan_multiple_parquet(paths: list[Path]) -> pl.LazyFrame: - """Create a unified LazyFrame from multiple parquet files.""" + """Create a unified LazyFrame from multiple interval parquet files.""" if len(paths) == 1: return pl.scan_parquet(paths[0]) - logger.info("Combining %d input files...", len(paths)) + logger.info("Combining %d interval input files...", len(paths)) return pl.concat([pl.scan_parquet(p) for p in paths]) # ============================================================================= -# DATA INSPECTION & SAMPLING +# VALIDATION # ============================================================================= -def validate_schema(paths: list[Path]) -> dict[str, Any]: - """Validate input data has required columns (schema-only check).""" - lf = scan_multiple_parquet(paths) - schema = lf.collect_schema() - +def validate_interval_schema(interval_paths: list[Path]) -> dict[str, Any]: + """ + Validate required columns exist and all interval parts share the same schema. + Schema-only checks; does not scan full data. 
+ """ errors: list[str] = [] - missing_energy = [c for c in REQUIRED_ENERGY_COLS if c not in schema.names()] - missing_time = [c for c in REQUIRED_TIME_COLS if c not in schema.names()] - if missing_energy: - errors.append(f"Missing energy columns: {missing_energy}") - if missing_time: - errors.append(f"Missing time columns: {missing_time}") + first_schema = pl.scan_parquet(interval_paths[0]).collect_schema() + first_names = first_schema.names() + missing = [c for c in REQUIRED_INTERVAL_COLS if c not in first_names] + if missing: + errors.append(f"Missing interval columns in first file {interval_paths[0]}: {missing}") - return {"valid": len(errors) == 0, "errors": errors, "columns": schema.names()} + # Ensure all interval parts match schema (fail-loud with useful diagnostics) + for p in interval_paths[1:]: + s = pl.scan_parquet(p).collect_schema() + if s.names() != first_names: + errors.append( + "Interval schema mismatch:\n" + f" first={interval_paths[0]} cols={first_names}\n" + f" this ={p} cols={s.names()}" + ) + return {"valid": len(errors) == 0, "errors": errors, "columns": first_names} + +# ============================================================================= +# SAMPLING HELPERS +# ============================================================================= def _as_polars_date_list(dates: list[Any]) -> list[Any]: - """ - Ensure Python/Polars date values are normalized to Polars Date dtype - before using in `.is_in()`. - """ if not dates: return dates - # This handles python datetime.date, python datetime.datetime, strings, etc. return pl.Series("dates", dates).cast(pl.Date).to_list() +def _sum_parquet_rows(paths: Sequence[Path]) -> int: + total = 0 + for p in paths: + pf = pq.ParquetFile(str(p)) + md = pf.metadata + if md is None: + raise RuntimeError(f"Missing parquet metadata for input: {p}") + total += int(md.num_rows) + return total + + +def _load_union_account_manifest(paths: list[Path]) -> pl.DataFrame: + return ( + pl.concat([pl.scan_parquet(p) for p in paths]) + .unique(subset=["account_identifier", "zip_code"]) + .collect(engine="streaming") + ) + + +def _load_union_date_manifest(paths: list[Path]) -> pl.DataFrame: + return ( + pl.concat([pl.scan_parquet(p) for p in paths]) + .unique(subset=["date", "is_weekend", "weekday"]) + .collect(engine="streaming") + ) + + def get_metadata_and_samples( # noqa: C901 - input_paths: list[Path], + interval_paths: list[Path], + account_manifest_paths: list[Path], + date_manifest_paths: list[Path], sample_households: int | None, sample_days: int, day_strategy: Literal["stratified", "random"], @@ -123,52 +319,27 @@ def get_metadata_and_samples( # noqa: C901 month: int | None = None, ) -> dict[str, Any]: """ - Get summary statistics and sample households + dates using MANIFESTS. + Get summary statistics and sample households + dates using manifests. + + If manifests are provided via CLI inputs (as the orchestrator does), use them directly. + Otherwise, fall back to ensure_*_manifest(interval_file) for each interval file. 
""" - logger.info("Scanning input data for metadata and sampling...") - log_memory("start of get_metadata_and_samples") - - lf = scan_multiple_parquet(input_paths) - - logger.info(" Computing summary statistics...") - summary_df = lf.select([ - pl.len().alias("n_rows"), - pl.col("zip_code").n_unique().alias("n_zip_codes"), - pl.col("account_identifier").n_unique().alias("n_accounts"), - pl.col("date").min().alias("min_date"), - pl.col("date").max().alias("max_date"), - ]).collect(engine="streaming") - summary = summary_df.to_dicts()[0] - - if summary["n_rows"] == 0: - raise ValueError("Input files contain no rows.") - - logger.info(" %s rows", f"{summary['n_rows']:,}") - logger.info(" %s households", f"{summary['n_accounts']:,}") - logger.info(" %s ZIP+4 codes", f"{summary['n_zip_codes']:,}") - logger.info(" Date range: %s to %s", summary["min_date"], summary["max_date"]) - - # Load manifests from first input - logger.info(" Loading manifests from first input file...") - account_manifest0 = ensure_account_manifest(input_paths[0]) - date_manifest0 = ensure_date_manifest(input_paths[0]) - - accounts_df = pl.read_parquet(account_manifest0) - dates_df = pl.read_parquet(date_manifest0) - - # If multiple files, combine manifests from all files first - if len(input_paths) > 1: - logger.info(" Loading manifests from additional input files...") - for path in input_paths[1:]: - acc_manifest = ensure_account_manifest(path) - date_manifest_extra = ensure_date_manifest(path) - accounts_df = pl.concat([accounts_df, pl.read_parquet(acc_manifest)]).unique() - dates_df = pl.concat([dates_df, pl.read_parquet(date_manifest_extra)]).unique() - - # Apply month filter if year/month are specified (after all dates are assembled) - if year is not None and month is not None: - from calendar import monthrange + logger.info("Sampling from manifests...") + log_memory("start_of_get_metadata_and_samples") + + if account_manifest_paths: + accounts_df = _load_union_account_manifest(account_manifest_paths) + else: + manifests = [ensure_account_manifest(p) for p in interval_paths] + accounts_df = _load_union_account_manifest([Path(m) for m in manifests]) + + if date_manifest_paths: + dates_df = _load_union_date_manifest(date_manifest_paths) + else: + manifests = [ensure_date_manifest(p) for p in interval_paths] + dates_df = _load_union_date_manifest([Path(m) for m in manifests]) + if year is not None and month is not None: _, last_day = monthrange(year, month) start_date = pl.date(year, month, 1) end_date = pl.date(year, month, last_day) @@ -178,18 +349,31 @@ def get_metadata_and_samples( # noqa: C901 logger.info(" No month filter applied (using all available dates): %d", dates_df.height) if accounts_df.height == 0: - raise ValueError("No account_identifier values found in manifest.") + raise ValueError("No account_identifier values found in account manifest(s).") if dates_df.height == 0: - raise ValueError("No dates found in manifest.") + raise ValueError("No dates found in date manifest(s).") + + summary = { + "n_rows": _sum_parquet_rows(interval_paths), + "n_accounts": int(accounts_df.height), + "n_zip_codes": int(accounts_df.select(pl.col("zip_code").n_unique()).item()), + "min_date": dates_df.select(pl.col("date").min()).item(), + "max_date": dates_df.select(pl.col("date").max()).item(), + } - log_memory("after loading manifests") + logger.info(" %s rows (from interval parquet metadata)", f"{summary['n_rows']:,}") + logger.info(" %s households (from manifests)", f"{summary['n_accounts']:,}") + logger.info(" %s ZIP+4 
codes (from manifests)", f"{summary['n_zip_codes']:,}") + logger.info(" Date range: %s to %s (from manifests)", summary["min_date"], summary["max_date"]) + + log_memory("after_loading_manifests", {"accounts_rows": accounts_df.height, "dates_rows": dates_df.height}) # Sample households - if sample_households is not None and sample_households < len(accounts_df): + if sample_households is not None and sample_households < accounts_df.height: accounts_df = accounts_df.sample(n=sample_households, shuffle=True, seed=seed) - logger.info(" Sampled %s households", f"{len(accounts_df):,}") + logger.info(" Sampled %s households", f"{accounts_df.height:,}") else: - logger.info(" Using all %s households", f"{len(accounts_df):,}") + logger.info(" Using all %s households", f"{accounts_df.height:,}") accounts = accounts_df["account_identifier"].to_list() @@ -202,11 +386,14 @@ def get_metadata_and_samples( # noqa: C901 day_strategy = "random" if day_strategy == "stratified": + weekday_df = dates_df.filter(~pl.col("is_weekend")) + weekend_df = dates_df.filter(pl.col("is_weekend")) + n_weekdays = int(sample_days * 0.7) n_weekends = sample_days - n_weekdays - n_weekdays = min(n_weekdays, len(weekday_df)) - n_weekends = min(n_weekends, len(weekend_df)) + n_weekdays = min(n_weekdays, weekday_df.height) + n_weekends = min(n_weekends, weekend_df.height) sampled_weekdays = ( weekday_df.sample(n=n_weekdays, shuffle=True, seed=seed)["date"].to_list() if n_weekdays > 0 else [] @@ -217,107 +404,44 @@ def get_metadata_and_samples( # noqa: C901 dates = sorted(sampled_weekdays + sampled_weekends) logger.info( - " Sampled %d weekdays + %d weekend days (stratified)", - len(sampled_weekdays), - len(sampled_weekends), + " Sampled %d weekdays + %d weekend days (stratified)", len(sampled_weekdays), len(sampled_weekends) ) else: - n_sample = min(sample_days, len(dates_df)) + n_sample = min(sample_days, dates_df.height) dates = dates_df.sample(n=n_sample, shuffle=True, seed=seed)["date"].to_list() logger.info(" Sampled %d days (random)", len(dates)) if not dates: raise ValueError("No dates were sampled; check input data and sampling settings.") - # Normalize date types now (so downstream `.is_in()` is reliable) dates = _as_polars_date_list(dates) - log_memory("end of get_metadata_and_samples") - logger.info(" Sampled days requested: %d; sampled days returned: %d", sample_days, len(dates)) - logger.info(" Sampled dates: %s", dates) + del accounts_df, dates_df + gc.collect() + log_memory("end_of_get_metadata_and_samples", {"sampled_accounts": len(accounts), "sampled_dates": len(dates)}) return {"summary": summary, "accounts": accounts, "dates": dates} # ============================================================================= -# PROFILE CREATION +# PROFILE CREATION (STREAMING) # ============================================================================= -def create_household_profiles(input_paths: list[Path], accounts: list[str], dates: list[Any]) -> pl.DataFrame: - """ - Create daily load profiles for selected households and dates (in-memory). 
- """ - logger.info("Creating profiles for %s households x %d days...", f"{len(accounts):,}", len(dates)) - log_memory("start of create_household_profiles") - - if not accounts: - raise ValueError("No accounts provided for profile creation.") - if not dates: - raise ValueError("No dates provided for profile creation.") - - # ensure dates are Polars Date-compatible - dates = _as_polars_date_list(dates) - - lf = scan_multiple_parquet(input_paths).filter( - pl.col("account_identifier").is_in(accounts) & pl.col("date").is_in(dates) - ) - - # Diagnostic: how many of the sampled dates actually appear after filtering? - present_dates = lf.select(pl.col("date").unique().sort()).collect(engine="streaming")["date"].to_list() - logger.info(" Sampled dates present in filtered interval data: %d / %d", len(present_dates), len(dates)) - if len(present_dates) < len(dates): - missing = sorted(set(dates) - set(present_dates)) - logger.warning(" Missing sampled dates after interval filter: %s", missing) - - df = lf.sort(["account_identifier", "datetime"]).collect() - - if df.is_empty(): - logger.warning(" No data found for selected households/dates") - return pl.DataFrame() - - logger.info(" Loaded %s interval records", f"{len(df):,}") - log_memory("after loading filtered intervals") - - profiles_df = df.group_by(["account_identifier", "zip_code", "date"]).agg([ - pl.col("kwh").alias("profile"), - pl.col("is_weekend").first(), - pl.col("weekday").first(), - pl.len().alias("num_intervals"), - ]) - - n_before = len(profiles_df) - profiles_df = profiles_df.filter(pl.col("num_intervals") == 48) - n_dropped = n_before - len(profiles_df) - - if n_dropped > 0: - logger.info(" Dropped %s incomplete profiles (missing or irregular data)", f"{n_dropped:,}") - - logger.info(" Created %s complete profiles", f"{len(profiles_df):,}") - log_memory("end of create_household_profiles") - - return profiles_df.drop("num_intervals") - - def _create_profiles_for_chunk_streaming( - input_paths: list[Path], + interval_paths: list[Path], accounts_chunk: list[str], dates: list[Any], chunk_idx: int, total_chunks: int, tmp_dir: Path, ) -> Path: - """ - Create profiles for a single chunk of households and write directly to parquet. 
- """ logger.info(" Chunk %d/%d: %s households...", chunk_idx + 1, total_chunks, f"{len(accounts_chunk):,}") - log_memory(f"chunk {chunk_idx + 1} start") + log_memory(f"chunk_{chunk_idx + 1}_start", {"chunk_accounts": len(accounts_chunk), "n_chunks": total_chunks}) - # ensure dates are Polars Date-compatible dates = _as_polars_date_list(dates) - chunk_path = tmp_dir / f"sampled_profiles_chunk_{chunk_idx:03d}.parquet" ( - scan_multiple_parquet(input_paths) + scan_multiple_parquet(interval_paths) .filter(pl.col("account_identifier").is_in(accounts_chunk) & pl.col("date").is_in(dates)) .group_by(["account_identifier", "zip_code", "date"]) .agg([ @@ -331,29 +455,89 @@ def _create_profiles_for_chunk_streaming( .sink_parquet(chunk_path) ) - log_memory(f"chunk {chunk_idx + 1} done") + log_memory(f"chunk_{chunk_idx + 1}_done", {"chunk_path": str(chunk_path)}) logger.info(" Wrote chunk parquet: %s", chunk_path) return chunk_path +def _merge_parquet_chunks_bounded_memory(chunk_paths: Sequence[Path], output_path: Path) -> int: + if not chunk_paths: + raise ValueError("No chunk files provided for merge.") + + chunk_files = sorted((Path(p) for p in chunk_paths), key=lambda p: p.name) + output_path = Path(output_path) + tmp_out = output_path.with_suffix(output_path.suffix + ".tmp") + + if tmp_out.exists(): + tmp_out.unlink() + + log_memory("merge_start", {"n_chunks": len(chunk_files), "batch_rows": MERGE_BATCH_SIZE_ROWS}) + + expected_rows = 0 + for p in chunk_files: + pf = pq.ParquetFile(str(p)) + md = pf.metadata + if md is None: + raise RuntimeError(f"Missing parquet metadata for chunk: {p}") + expected_rows += int(md.num_rows) + + first_pf = pq.ParquetFile(str(chunk_files[0])) + schema = first_pf.schema_arrow + + writer = pq.ParquetWriter( + where=str(tmp_out), + schema=schema, + compression=FINAL_PARQUET_COMPRESSION, + use_dictionary=True, + write_statistics=True, + ) + + rows_written = 0 + t0 = time.time() + try: + for i, p in enumerate(chunk_files, start=1): + pf = pq.ParquetFile(str(p)) + if not pf.schema_arrow.equals(schema, check_metadata=False): + raise RuntimeError( + "Schema mismatch across chunk files; refusing to merge.\n" + f"First schema: {schema}\n" + f"Mismatch file: {p}\n" + f"File schema: {pf.schema_arrow}" + ) + + for rb in pf.iter_batches(batch_size=MERGE_BATCH_SIZE_ROWS): + writer.write_batch(rb) + rows_written += int(rb.num_rows) + + del pf + gc.collect() + + if i == 1 or i == len(chunk_files) or (i % 10 == 0): + log_memory("merge_progress", {"chunk_i": i, "n_chunks": len(chunk_files), "rows_written": rows_written}) + finally: + writer.close() + + if rows_written != expected_rows: + raise RuntimeError(f"Merged row count mismatch: wrote {rows_written} rows but expected {expected_rows} rows.") + + os.replace(str(tmp_out), str(output_path)) + log_memory("merge_done", {"rows_written": rows_written, "seconds": round(time.time() - t0, 3)}) + return rows_written + + def create_household_profiles_chunked_streaming( - input_paths: list[Path], + interval_paths: list[Path], accounts: list[str], dates: list[Any], output_path: Path, chunk_size: int = 5000, ) -> int: - """ - Create daily load profiles using chunked streaming. 
- """ if not accounts: raise ValueError("No accounts provided for chunked streaming profile creation.") if not dates: raise ValueError("No dates provided for chunked streaming profile creation.") - # Defensive: ensure dates are Polars Date-compatible dates = _as_polars_date_list(dates) - n_accounts = len(accounts) n_chunks = (n_accounts + chunk_size - 1) // chunk_size @@ -364,7 +548,7 @@ def create_household_profiles_chunked_streaming( f"{n_accounts:,}", len(dates), ) - log_memory("before chunked streaming") + log_memory("before_chunked_streaming", {"n_accounts": n_accounts, "n_chunks": n_chunks, "chunk_size": chunk_size}) output_path.parent.mkdir(parents=True, exist_ok=True) tmp_dir = output_path.parent @@ -376,7 +560,7 @@ def create_household_profiles_chunked_streaming( accounts_chunk = accounts[start_idx:end_idx] chunk_path = _create_profiles_for_chunk_streaming( - input_paths=input_paths, + interval_paths=interval_paths, accounts_chunk=accounts_chunk, dates=dates, chunk_idx=i, @@ -393,14 +577,10 @@ def create_household_profiles_chunked_streaming( logger.warning("No profiles created in chunked streaming mode!") return 0 - logger.info("Combining %d chunk files into %s", len(chunk_paths), output_path) - log_memory("before streaming concat") + logger.info("Combining %d chunk files into %s (bounded-memory merge)", len(chunk_paths), output_path) + gc.collect() + n_profiles = _merge_parquet_chunks_bounded_memory(chunk_paths=chunk_paths, output_path=output_path) - pl.concat([pl.scan_parquet(p) for p in chunk_paths]).sink_parquet(output_path) - - log_memory("after streaming concat") - - n_profiles = pl.scan_parquet(output_path).select(pl.len()).collect()[0, 0] logger.info(" Created %s complete profiles (chunked streaming)", f"{n_profiles:,}") logger.info(" Saved to %s", output_path) @@ -410,6 +590,7 @@ def create_household_profiles_chunked_streaming( except OSError as exc: logger.warning("Failed to delete temp chunk file %s: %s", p, exc) + log_memory("after_chunk_cleanup", {"deleted_chunks": len(chunk_paths)}) return int(n_profiles) @@ -428,7 +609,6 @@ def prepare_clustering_data( year: int | None = None, month: int | None = None, ) -> dict[str, Any]: - """Prepare household-level clustering data from interval parquet.""" logger.info("=" * 70) logger.info("PREPARING HOUSEHOLD-LEVEL CLUSTERING DATA") if len(input_paths) > 1: @@ -436,14 +616,18 @@ def prepare_clustering_data( if streaming: logger.info("(STREAMING MODE ENABLED, chunk_size=%d)", chunk_size) logger.info("=" * 70) - log_memory("start of prepare_clustering_data") + log_memory("start_of_prepare_clustering_data", {"streaming": streaming, "n_inputs": len(input_paths)}) - validation = validate_schema(input_paths) + interval_paths, account_manifest_paths, date_manifest_paths = classify_inputs(input_paths) + + validation = validate_interval_schema(interval_paths) if not validation["valid"]: raise ValueError(f"Input validation failed: {validation['errors']}") metadata = get_metadata_and_samples( - input_paths=input_paths, + interval_paths=interval_paths, + account_manifest_paths=account_manifest_paths, + date_manifest_paths=date_manifest_paths, sample_households=sample_households, sample_days=sample_days, day_strategy=day_strategy, @@ -457,10 +641,11 @@ def prepare_clustering_data( output_dir.mkdir(parents=True, exist_ok=True) profiles_path = output_dir / "sampled_profiles.parquet" + map_path = output_dir / "household_zip4_map.parquet" if streaming: n_profiles = create_household_profiles_chunked_streaming( - input_paths=input_paths, + 
interval_paths=interval_paths, accounts=accounts, dates=dates, output_path=profiles_path, @@ -468,12 +653,48 @@ def prepare_clustering_data( ) if n_profiles == 0: raise ValueError("No profiles created in chunked streaming mode - check input data and sampling settings.") - profiles_df = pl.read_parquet(profiles_path) + + # Memory-safe: do not read entire profiles parquet + log_memory("build_household_map_start") + (pl.scan_parquet(profiles_path).select(["account_identifier", "zip_code"]).unique().sink_parquet(map_path)) + log_memory("build_household_map_done", {"map_path": str(map_path)}) + + stats_row = ( + pl.scan_parquet(profiles_path) + .select([ + pl.len().alias("n_profiles"), + pl.col("account_identifier").n_unique().alias("n_households"), + pl.col("zip_code").n_unique().alias("n_zip4s"), + pl.col("date").n_unique().alias("n_dates"), + ]) + .collect(engine="streaming") + .to_dicts()[0] + ) + stats = {k: int(stats_row[k]) for k in ["n_profiles", "n_households", "n_zip4s", "n_dates"]} else: - profiles_df = create_household_profiles(input_paths=input_paths, accounts=accounts, dates=dates) + # Non-streaming mode unchanged from original implementation. + profiles_df = ( + scan_multiple_parquet(interval_paths) + .filter(pl.col("account_identifier").is_in(accounts) & pl.col("date").is_in(_as_polars_date_list(dates))) + .sort(["account_identifier", "datetime"]) + .collect() + ) if profiles_df.is_empty(): raise ValueError("No profiles created - check input data and sampling settings.") - profiles_df.select([ + + profiles_out = ( + profiles_df.group_by(["account_identifier", "zip_code", "date"]) + .agg([ + pl.col("kwh").alias("profile"), + pl.col("is_weekend").first(), + pl.col("weekday").first(), + pl.len().alias("num_intervals"), + ]) + .filter(pl.col("num_intervals") == 48) + .drop("num_intervals") + ) + + profiles_out.select([ "account_identifier", "zip_code", "date", @@ -483,17 +704,16 @@ def prepare_clustering_data( ]).write_parquet(profiles_path) logger.info(" Saved profiles: %s", profiles_path) - household_map = profiles_df.select(["account_identifier", "zip_code"]).unique() - map_path = output_dir / "household_zip4_map.parquet" - household_map.write_parquet(map_path) - logger.info(" Saved household-ZIP+4 map: %s (%s households)", map_path, f"{len(household_map):,}") - - stats = { - "n_profiles": len(profiles_df), - "n_households": profiles_df["account_identifier"].n_unique(), - "n_zip4s": profiles_df["zip_code"].n_unique(), - "n_dates": profiles_df["date"].n_unique(), - } + household_map = profiles_out.select(["account_identifier", "zip_code"]).unique() + household_map.write_parquet(map_path) + logger.info(" Saved household-ZIP+4 map: %s (%s households)", map_path, f"{household_map.height:,}") + + stats = { + "n_profiles": int(profiles_out.height), + "n_households": int(profiles_out["account_identifier"].n_unique()), + "n_zip4s": int(profiles_out["zip_code"].n_unique()), + "n_dates": int(profiles_out["date"].n_unique()), + } logger.info("") logger.info("=" * 70) @@ -504,7 +724,7 @@ def prepare_clustering_data( logger.info(" ZIP+4s represented: %s", f"{stats['n_zip4s']:,}") logger.info(" Days: %d", stats["n_dates"]) logger.info(" Output: %s", output_dir) - log_memory("end of prepare_clustering_data") + log_memory("end_of_prepare_clustering_data", stats) return stats diff --git a/analysis/stage2/stage2_multinom_blockgroup_weighted.R b/analysis/stage2/stage2_multinom_blockgroup_weighted.R index 3efb410..5e595bc 100644 --- a/analysis/stage2/stage2_multinom_blockgroup_weighted.R +++ 
b/analysis/stage2/stage2_multinom_blockgroup_weighted.R @@ -4,36 +4,24 @@ # Stage 2 Multinomial Logit (Block Group): Cluster Composition ~ Census Predictors # ============================================================================= # -# What this script does (high-level) -# ---------------------------------- -# 1) Reads household-day cluster assignments (ZIP+4 resolution) from a Parquet file. -# 2) Aggregates these household-days into ZIP+4 × cluster counts using Arrow compute -# (i.e., without loading the full row-level data into memory). -# 3) Joins ZIP+4 to Census Block Group using a crosswalk, then aggregates to -# Block Group × cluster counts and total household-days per Block Group. -# 4) Joins Block Group-level census predictors (Parquet output from upstream census pipeline). -# 5) Fits a multinomial logit model with COUNT response: -# Y_bg = (n_bg,clusterA, n_bg,clusterB, ..., n_bg,clusterBaseline) -# where cluster probabilities are modeled as a function of BG predictors. -# -# Why this structure -# ------------------ -# - The regression is formulated at the Block Group level to avoid per-row modeling at the -# household-day level while still leveraging the full count information (multinomial likelihood). -# - Arrow Dataset aggregation avoids memory blowups when the input Parquet is very large. -# - Predictors are inferred from the census parquet to reduce coupling / hardcoding. -# - VGAM is used by default because it is typically more numerically stable at large scale than -# nnet::multinom, but VGAM requires a full-rank design matrix; rank-deficient terms are dropped -# deterministically to satisfy that constraint. +# Key principles for regulatory defensibility +# ------------------------------------------- +# - Predictors are inferred from the census parquet to avoid fragile hard-coding. +# - Some predictors are "shares" that sum to 100 within a group (compositional). +# To ensure a unique model, one pre-specified reference share per group is omitted +# from estimation. Omitted shares are documented explicitly in outputs. +# - Predictors are NOT mean-centered. Optional scaling divides by SD only. +# - If VGAM is used, any remaining rank-deficient terms are dropped deterministically. # # Outputs # ------- -# - regression_results.parquet: coefficient table with cluster, predictor, estimate, SE, z, p, q. -# - regression_diagnostics.json: model fit diagnostics (LL, deviance, pseudo-R2, AIC/BIC, etc.) -# - stage2_input_qc.json: data lineage + drop counts + inferred/used predictor lists -# - regression_data_blockgroups_wide.parquet: the modeled BG dataset (counts + predictors) -# - stage2_manifest.json: paths of all outputs -# - stage2_metadata.json: runtime + package versions (provenance) +# - regression_results.parquet +# - regression_diagnostics.json +# - stage2_input_qc.json +# - regression_data_blockgroups_wide.parquet +# - omitted_predictors.parquet <-- NEW: documents reference omissions + rank drops +# - stage2_manifest.json +# - stage2_metadata.json # # ============================================================================= @@ -41,9 +29,6 @@ # ----------------------------- # CLI args (no external deps) # ----------------------------- -# Notes: -# - Avoids argparse-style dependencies in R, keeping the script self-contained. -# - Supports both "--flag value" and "--flag=value". 
print_help_and_exit <- function(exit_code = 0) { cat( paste0( @@ -59,24 +44,17 @@ print_help_and_exit <- function(exit_code = 0) { " --baseline-cluster K Baseline cluster label (default: choose most frequent)\n", " --min-obs-per-bg N Drop BGs with total household-days < N (default: 50)\n", " --allow-missing-predictors 0|1 If 0, abort if predictor NA would drop any BGs (default: 0)\n", - " --standardize 0|1 Z-score standardize predictors (default: 0)\n", + " --standardize 0|1 If 1, divide predictors by SD (NO mean-centering) (default: 0)\n", " --use-vgam 0|1 Use VGAM::vglm() (IRLS) instead of nnet::multinom (default: 1)\n", " --verbose 0|1 Verbose logging (default: 1)\n", " --no-emoji 0|1 Disable unicode icons (default: 0)\n", " --help Print this help and exit\n\n", "Notes:\n", " - Predictors are inferred from the census parquet columns.\n", + " - Some predictors are compositional shares; one reference share per group is omitted\n", + " deterministically and documented in omitted_predictors.parquet.\n", " - Model uses COUNT response: cbind(count_clusterA, count_clusterB, ...)\n", - " - Zeros are handled naturally by the multinomial likelihood; no smoothing is applied.\n", - " - Standardization (--standardize=1) is STRONGLY RECOMMENDED for numerical stability.\n", - " - VGAM requires a full-rank design matrix; this script drops rank-deficient terms deterministically.\n", - " - Outputs written under out-dir:\n", - " regression_results.parquet\n", - " regression_diagnostics.json\n", - " stage2_input_qc.json\n", - " regression_data_blockgroups_wide.parquet\n", - " stage2_manifest.json\n", - " stage2_metadata.json\n\n" + " - Zeros are handled naturally by the multinomial likelihood; no smoothing is applied.\n\n" ) ) quit(status = exit_code) @@ -134,12 +112,6 @@ if (is.null(CLUSTERS_PATH) || is.null(CROSSWALK_PATH) || is.null(CENSUS_PATH) || # ----------------------------- # Deps # ----------------------------- -# The approach uses: -# - arrow: efficient parquet I/O and dataset aggregation -# - dplyr/tibble: data manipulation -# - jsonlite: writing QC/diagnostics -# - nnet: fallback multinomial logit solver -# - VGAM: default multinomial logit solver for stability (full-rank requirement) require_pkg <- function(pkg) requireNamespace(pkg, quietly = TRUE) if (!require_pkg("arrow")) stopf("Missing R package 'arrow'. Install with: install.packages('arrow')") @@ -176,13 +148,72 @@ logi("%s Config: standardize=%d use_vgam=%d", IC$ok, STANDARDIZE, USE_VGAM) t_total_start <- Sys.time() +# ----------------------------- +# Reference-category specification (deterministic omissions) +# ----------------------------- +REFERENCE_GROUPS <- list( + list( + group_id = "income_3bin", + group_name = "Household income distribution (3 bins)", + variables = c("pct_income_under_25k", "pct_income_25k_to_75k", "pct_income_75k_plus"), + reference = "pct_income_75k_plus", + rationale = "Shares sum to 100; omit one bin to ensure a unique model. Highest-income share is a stable reference for policy interpretation." + ), + list( + group_id = "tenure_2bin", + group_name = "Housing tenure (occupied units)", + variables = c("pct_owner_occupied", "pct_renter_occupied"), + reference = "pct_owner_occupied", + rationale = "Owner + renter shares sum to 100; omit owners so renter share is interpreted relative to owners." 
+ ), + list( + group_id = "vintage_3bin", + group_name = "Housing vintage distribution (3 bins)", + variables = c("pct_housing_built_2000_plus", "pct_housing_built_1980_1999", "old_building_pct"), + reference = "pct_housing_built_2000_plus", + rationale = "Shares sum to 100; omit newest stock so older stock shares are interpreted relative to newer construction." + ), + list( + group_id = "home_value_3bin", + group_name = "Owner-occupied home value distribution (3 bins)", + variables = c("pct_home_value_under_150k", "pct_home_value_150k_to_299k", "pct_home_value_300k_plus"), + reference = "pct_home_value_300k_plus", + rationale = "Shares sum to 100; omit highest-value share so lower/middle value shares are interpreted relative to highest-value homes." + ), + list( + group_id = "age_6bin", + group_name = "Population age distribution (6 bins)", + variables = c( + "pct_population_under_5", "pct_population_5_to_17", "pct_population_18_to_24", + "pct_population_25_to_44", "pct_population_45_to_64", "pct_population_65_plus" + ), + reference = "pct_population_25_to_44", + rationale = "Shares sum to 100; omit 25–44 as a central working-age reference group." + ) +) + +REFERENCE_VARS <- unique(vapply(REFERENCE_GROUPS, function(g) g$reference, character(1))) + +implied_definitions <- lapply(REFERENCE_GROUPS, function(g) { + incl <- setdiff(g$variables, g$reference) + list( + group_id = g$group_id, + group_name = g$group_name, + reference = g$reference, + implied_as = paste0(g$reference, " = 100 - (", paste(incl, collapse = " + "), ")") + ) +}) + +REFERENCE_CAVEAT <- paste( + "Some predictors are shares that sum to 100 within a group (compositional).", + "To ensure a unique multinomial logit model, one pre-specified reference share per group is omitted from estimation.", + "The omitted share is not independently estimated; it is implied by the included shares via the identity in the diagnostics." +) + + # ----------------------------- # Helpers: keys + inference # ----------------------------- -# ZIP+4 normalization: -# - Cluster parquet uses "zip_code" which can appear as: -# - "60601-1234" or "606011234" or other string forms. -# - Crosswalk expects Zip + Zip4 columns; we standardize to "#####-####". normalize_zip4 <- function(x) { s <- as.character(x) s <- trimws(s) @@ -193,7 +224,6 @@ normalize_zip4 <- function(x) { out } -# Ensure Zip4 is exactly 4 digits, leading zeros preserved. zfill4 <- function(x) { s <- as.character(x) s <- trimws(s) @@ -203,8 +233,6 @@ zfill4 <- function(x) { s } -# Census GEOID column inference: -# - Upstream data may name the key GEOID, CensusKey2023, CensusKey2020, etc. infer_geoid_col <- function(df) { nms <- names(df) low <- tolower(nms) @@ -214,10 +242,6 @@ infer_geoid_col <- function(df) { NULL } -# Predictor inference: -# - Uses numeric/integer/logical columns as candidate predictors. -# - Excludes id-like columns (GEOID/NAME + the inferred geoid key). -# - Drops columns that are entirely NA. infer_predictors <- function(census_df) { geoid_col <- infer_geoid_col(census_df) if (is.null(geoid_col)) { @@ -240,16 +264,6 @@ infer_predictors <- function(census_df) { list(geoid_col = geoid_col, predictors = preds, dropped_all_na = all_na) } -# Rank-deficiency handling for VGAM: -# - VGAM::vglm(multinomial()) requires a full-rank model matrix. 
-# - We build a standard model.matrix with intercept and remove: -# (a) constant columns (excluding intercept) -# (b) columns beyond QR rank using pivot ordering -# -# Note: This is deterministic and reproducible, but may drop terms that are -# substantively meaningful if predictors are highly collinear. Treat as a -# pragmatic requirement for full-rank MLE and revisit for a more principled -# approach if needed. drop_rank_deficient_terms <- function(model_df, predictors) { ftmp <- stats::as.formula(paste0("~ ", paste(predictors, collapse = " + "))) Xmm <- stats::model.matrix(ftmp, data = model_df) @@ -311,9 +325,6 @@ drop_rank_deficient_terms <- function(model_df, predictors) { # ----------------------------- # Read + aggregate clusters (memory-safe) # ----------------------------- -# Key point: -# - We do not read the full household-day dataset into memory. -# - Instead we open it as an Arrow dataset and aggregate "zip_code × cluster" counts. if (!file.exists(CLUSTERS_PATH)) stopf("Clusters parquet not found: %s", CLUSTERS_PATH) if (!file.exists(CROSSWALK_PATH)) stopf("Crosswalk file not found: %s", CROSSWALK_PATH) if (!file.exists(CENSUS_PATH)) stopf("Census predictors parquet not found: %s", CENSUS_PATH) @@ -333,11 +344,6 @@ zip_cluster_counts <- clusters_ds %>% dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% dplyr::collect() -if (!("zip_code" %in% names(zip_cluster_counts))) stopf("Clusters parquet must include column 'zip_code'") -if (!("cluster" %in% names(zip_cluster_counts))) stopf("Clusters parquet must include column 'cluster'") -if (!("n" %in% names(zip_cluster_counts))) stopf("Internal error: missing 'n' after aggregation") - -# Normalize and sanitize types zip_cluster_counts <- zip_cluster_counts %>% dplyr::mutate( zip4 = normalize_zip4(zip_code), @@ -353,14 +359,6 @@ household_day_rows_total <- sum(zip_cluster_counts$n, na.rm = TRUE) # ----------------------------- # Read crosswalk # ----------------------------- -# Crosswalk requirements: -# - Must include Zip, Zip4, and CensusKey2023 (or equivalent). -# - We standardize to "zip4" = "#####-####" and "block_group_geoid" = 12-char BG GEOID. -# -# Design choice: -# - If multiple BGs map to the same ZIP+4, we deterministically pick the first after sorting. -# This enforces a single mapping and avoids row inflation, but it should be reviewed for whether -# a probabilistic or fractional allocation is preferable. logi("%s Reading crosswalk: %s", IC$ok, CROSSWALK_PATH) cw_tbl <- tryCatch( @@ -403,7 +401,6 @@ cw <- cw %>% if (nrow(cw) == 0) stopf("Crosswalk produced 0 usable rows after cleaning/filtering.") -# Deterministic one-to-one ZIP+4 → BG mapping cw <- cw %>% dplyr::arrange(zip4, block_group_geoid) %>% dplyr::group_by(zip4) %>% @@ -417,10 +414,6 @@ if (nrow(dup_zip4) > 0) stopf("Crosswalk still has non-unique zip4 after determi # ----------------------------- # Join + aggregate to BG counts # ----------------------------- -# After joining the crosswalk, we can compute: -# - BG×cluster counts -# - BG total household-days -# and optionally drop BGs with too few observations (min_obs_per_bg). 
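The crosswalk cleanup above enforces a single block group per ZIP+4 by sorting and keeping the first match. A minimal sketch of that rule on toy rows (the ZIP+4 values, GEOIDs, and the slice(1) step are illustrative; the script's exact statement falls outside this hunk):

```r
library(dplyr)

# Toy crosswalk rows: one ZIP+4 maps to two block groups, so a
# deterministic rule is needed to pick exactly one of them.
cw_toy <- tibble::tibble(
  zip4              = c("60601-1234", "60601-1234", "60622-0007"),
  block_group_geoid = c("170310001001", "170310001002", "170310002003")
)

cw_dedup <- cw_toy %>%
  arrange(zip4, block_group_geoid) %>%  # fixed sort order => reproducible choice
  group_by(zip4) %>%
  slice(1) %>%                          # keep the first BG per ZIP+4
  ungroup()

cw_dedup
# 60601-1234 -> 170310001001 (lexicographically first GEOID wins)
# 60622-0007 -> 170310002003
```

This keeps counts from inflating at the join, at the cost of ignoring secondary BG matches; a fractional allocation across matched BGs would be the main alternative.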
logi("%s Joining ZIP+4×cluster counts to crosswalk...", IC$ok) clusters2 <- zip_cluster_counts %>% @@ -445,7 +438,6 @@ total_by_bg <- clusters2 %>% clusters_observed <- sort(unique(bg_counts$cluster)) if (length(clusters_observed) < 2) stopf("Need at least 2 clusters observed after aggregation; found: %s", paste(clusters_observed, collapse = ",")) -# Wide BG frame: GEOID + total + one column per cluster count bg_wide <- total_by_bg %>% dplyr::rename(GEOID = block_group_geoid) for (k in clusters_observed) { @@ -460,11 +452,9 @@ for (k in clusters_observed) { dplyr::select(-n) } -# Observation floor: avoids fragile inference for tiny BG totals. bg_wide <- bg_wide %>% dplyr::filter(total_household_days >= as.integer(MIN_OBS_PER_BG)) if (nrow(bg_wide) == 0) stopf("No block groups remain after --min-obs-per-bg filtering (N=%d).", MIN_OBS_PER_BG) -# Zero count diagnostic: proportion of BGs with zero count in each cluster column. zero_stats <- list() for (k in clusters_observed) { colname <- paste0("cluster_", k) @@ -479,31 +469,26 @@ for (k in clusters_observed) { # ----------------------------- # Read census predictors + infer predictors # ----------------------------- -# This step establishes the modeling covariate set at the BG level. -# Predictors are inferred (numeric/integer/logical columns) rather than hard-coded. logi("%s Reading census predictors: %s", IC$ok, CENSUS_PATH) census <- arrow::read_parquet(CENSUS_PATH, as_data_frame = TRUE) inf <- infer_predictors(census) CENSUS_GEOID_COL <- inf$geoid_col -PREDICTORS <- inf$predictors +PREDICTORS_RAW <- inf$predictors DROPPED_ALL_NA <- inf$dropped_all_na -if (length(PREDICTORS) == 0) stopf("No usable numeric predictors inferred from census parquet.") +if (length(PREDICTORS_RAW) == 0) stopf("No usable numeric predictors inferred from census parquet.") -# Deduplicate by GEOID defensively; keep only inferred predictors. census <- census %>% dplyr::mutate(GEOID = as.character(.data[[CENSUS_GEOID_COL]])) %>% - dplyr::select(GEOID, dplyr::any_of(PREDICTORS)) %>% + dplyr::select(GEOID, dplyr::any_of(PREDICTORS_RAW)) %>% dplyr::distinct(GEOID, .keep_all = TRUE) logi("%s Joining census predictors to BG counts...", IC$ok) bg_model <- bg_wide %>% dplyr::inner_join(census, by = "GEOID") -# Missing predictor handling: -# - If allow_missing_predictors=0: fail fast if any BG would be dropped by NA. -# - If allow_missing_predictors=1: drop incomplete BGs (complete-case analysis). 
-pred_mat <- bg_model %>% dplyr::select(dplyr::any_of(PREDICTORS)) +# Missing predictor handling +pred_mat <- bg_model %>% dplyr::select(dplyr::any_of(PREDICTORS_RAW)) any_na <- apply(is.na(pred_mat), 1, any) drop_missing_pred_bg <- sum(any_na) @@ -522,17 +507,26 @@ if (nrow(bg_model) == 0) stopf("No block groups remain after predictor missingne # ----------------------------- -# Optional: standardize predictors (z-score) +# Deterministic reference-variable omission (if present) +# ----------------------------- +omit_refs_present <- intersect(REFERENCE_VARS, PREDICTORS_RAW) +PREDICTORS <- setdiff(PREDICTORS_RAW, omit_refs_present) + +if (length(omit_refs_present) > 0) { + logi("%s Omitted %d reference-share predictor(s): %s", IC$warn, length(omit_refs_present), paste(omit_refs_present, collapse = ", ")) +} + +if (length(PREDICTORS) == 0) stopf("No predictors remain after reference omissions.") + + +# ----------------------------- +# Optional: scale by SD only (NO mean-centering) # ----------------------------- -# Rationale: -# - Improves numerical conditioning for optimization / IRLS. -# - Makes coefficients more comparable (effect per 1 SD change). -# - Retains original scale parameters in scaling_info for provenance. scaling_info <- NULL zero_var_predictors <- character(0) if (STANDARDIZE == 1L) { - logi("%s Standardizing %d predictors (z-score: mean=0, sd=1)...", IC$ok, length(PREDICTORS)) + logi("%s Scaling %d predictors by SD only (no mean-centering)...", IC$ok, length(PREDICTORS)) scaling_info <- list() for (col in PREDICTORS) { @@ -540,16 +534,14 @@ if (STANDARDIZE == 1L) { x <- bg_model[[col]] if (is.logical(x)) x <- as.numeric(x) - mu <- mean(x, na.rm = TRUE) sigma <- stats::sd(x, na.rm = TRUE) - scaling_info[[col]] <- list(mean = as.numeric(mu), sd = as.numeric(sigma)) + scaling_info[[col]] <- list(sd = as.numeric(sigma), centered = FALSE) if (is.finite(sigma) && sigma > 1e-10) { - bg_model[[col]] <- (x - mu) / sigma + bg_model[[col]] <- x / sigma } else { - # Retain original scale if SD ~ 0; keep a list for QC outputs. zero_var_predictors <- c(zero_var_predictors, col) - cat(sprintf("%s Predictor '%s' has ~zero variance; not standardized.\n", IC$warn, col)) + cat(sprintf("%s Predictor '%s' has ~zero variance; not scaled.\n", IC$warn, col)) bg_model[[col]] <- x } } @@ -559,11 +551,6 @@ if (STANDARDIZE == 1L) { # ----------------------------- # Choose baseline + response matrix # ----------------------------- -# The response is a matrix of counts per BG across clusters: -# - Columns are ordered so the baseline cluster is last, matching the refLevel config for VGAM. -# - Baseline selection: -# - If user specifies --baseline-cluster, use it. -# - Otherwise choose the most frequent cluster by total count. resp_cols <- paste0("cluster_", clusters_observed) resp_cols <- resp_cols[resp_cols %in% names(bg_model)] if (length(resp_cols) < 2) stopf("Need >=2 cluster count columns; found: %s", paste(resp_cols, collapse = ",")) @@ -586,7 +573,6 @@ if (!paste0("cluster_", baseline_cluster) %in% resp_cols) { resp_cols_ordered <- c(setdiff(resp_cols, paste0("cluster_", baseline_cluster)), paste0("cluster_", baseline_cluster)) Y <- as.matrix(bg_model[, resp_cols_ordered, drop = FALSE]) -# Basic response validation if (any(is.na(Y))) stopf("NA values detected in response matrix.") if (any(Y < 0, na.rm = TRUE)) stopf("Negative counts detected in response matrix.") rs <- rowSums(Y) @@ -595,13 +581,9 @@ if (any(rs <= 0, na.rm = TRUE)) { stopf("Block groups with non-positive total counts. 
Example row indices: %s", paste(head(bad_rows, 10), collapse = ", ")) } -# Model frame for fitting model_df <- bg_model[, c("GEOID", "total_household_days", PREDICTORS), drop = FALSE] model_df$Y <- I(Y) -# Map “equations” to cluster labels: -# - Multinomial logit has one linear predictor per non-baseline outcome. -# - Here, eq index i corresponds to nonbase_clusters[i]. nonbase_cols <- resp_cols_ordered[resp_cols_ordered != paste0("cluster_", baseline_cluster)] nonbase_clusters <- as.integer(sub("^cluster_", "", nonbase_cols)) @@ -620,21 +602,16 @@ if (USE_VGAM == 1L) { design_rank <- as.integer(rk$rank) design_ncol <- as.integer(rk$ncol_design) - # Rebuild with kept predictors only model_df <- bg_model[, c("GEOID", "total_household_days", PREDICTORS), drop = FALSE] model_df$Y <- I(Y) } +if (length(PREDICTORS) == 0) stopf("No predictors remain after rank-deficiency handling.") + # ----------------------------- # Fit model # ----------------------------- -# Model form: -# Y ~ X1 + X2 + ... + Xp -# -# For VGAM::vglm: -# - family = multinomial(refLevel = K) where K is the last column (baseline). -# - IRLS typically handles scale better than nnet::multinom in large-count settings. logi( "%s Fitting multinomial logit (counts) with %d BGs, %d predictors, %d clusters (baseline=%d)...", IC$ok, nrow(model_df), length(PREDICTORS), length(resp_cols_ordered), baseline_cluster @@ -658,7 +635,6 @@ if (USE_VGAM == 1L) { error = function(e) stopf("Model fit failed (VGAM::vglm): %s", e$message) ) - # Null model (intercept-only): provides baseline for pseudo-R2 and deviance comparisons fit0 <- tryCatch( withCallingHandlers( VGAM::vglm(Y ~ 1, family = VGAM::multinomial(refLevel = length(resp_cols_ordered)), data = model_df), @@ -692,10 +668,6 @@ logi("%s Model fit completed in %.1f seconds", IC$ok, fit_duration) # ----------------------------- # Convergence heuristics # ----------------------------- -# Approach: -# - Primary: check warnings that suggest iteration/step issues. -# - Secondary: compare deviance_full vs deviance_null; if too close, signal weak fit or instability. -# Note: VGAM is S4; we avoid `$iter` or similar direct slot assumptions here. convergence_ok <- TRUE convergence_message <- "Model converged successfully" @@ -732,43 +704,9 @@ if (is.finite(deviance_ratio) && deviance_ratio > 0.95) { } -# ----------------------------- -# Correlations (diagnostic) -# ----------------------------- -# This is a quick heuristic to identify predictors correlated with observed cluster proportions. -# It is not used for modeling; it is logged for reviewer sanity-checking and interpretation guidance. 
-if (all(c("cluster_0", "cluster_1", "cluster_3") %in% names(bg_model))) { - bg_tmp <- bg_model %>% - dplyr::mutate( - prop_cluster_0 = cluster_0 / total_household_days, - prop_cluster_1 = cluster_1 / total_household_days, - prop_cluster_3 = cluster_3 / total_household_days - ) - - cor_matrix <- tryCatch( - stats::cor( - bg_tmp[, PREDICTORS, drop = FALSE], - bg_tmp[, c("prop_cluster_0", "prop_cluster_1", "prop_cluster_3"), drop = FALSE], - use = "complete.obs" - ), - error = function(e) NULL - ) - - if (!is.null(cor_matrix)) { - strong_cors <- apply(abs(cor_matrix), 1, max) - cat("\nPredictors with |correlation| > 0.1 to any cluster proportion:\n") - print(sort(strong_cors[strong_cors > 0.1], decreasing = TRUE)) - } -} - - # ----------------------------- # Core stats # ----------------------------- -# McFadden pseudo-R2: -# 1 - (LL_full / LL_null) -# Interpreted as improvement over intercept-only baseline on the log-likelihood scale. -# Not directly comparable to OLS R^2; treat as a relative fit measure. ll_full <- as.numeric(stats::logLik(fit)) ll_null <- as.numeric(stats::logLik(fit0)) pseudo_r2 <- 1.0 - (ll_full / ll_null) @@ -777,15 +715,6 @@ pseudo_r2 <- 1.0 - (ll_full / ll_null) # ----------------------------- # Coefficients table (nnet + VGAM) -> ALWAYS returns 'cluster' # ----------------------------- -# This function normalizes output format across engines: -# - Each row: (cluster, predictor, coefficient, std_err, z_stat, p_value) -# - cluster indicates the non-baseline outcome corresponding to the equation. -# -# For VGAM: -# - We use vcov(fit) for SEs and align by exact coefficient names. -# - We robustly parse equation indices from coefficient naming conventions: -# A) "term:1" style -# B) "log(mu[,1]/mu[,K]):term" style extract_coef_table <- function(fit_obj, nonbase_clusters) { if (inherits(fit_obj, "multinom")) { coefs <- summary(fit_obj)$coefficients @@ -841,7 +770,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { eq <- rep(NA_integer_, length(nm_vec)) term <- rep(NA_character_, length(nm_vec)) - # A) "term:1" m_a <- regexec("^(.*):([0-9]+)$", nm_vec) r_a <- regmatches(nm_vec, m_a) has_a <- lengths(r_a) == 3 @@ -850,7 +778,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { eq[has_a] <- suppressWarnings(as.integer(vapply(r_a[has_a], function(x) x[[3]], character(1)))) } - # B) "log(mu[,k]/mu[,K]):term" need_b <- !has_a if (any(need_b)) { nm_b <- nm_vec[need_b] @@ -898,9 +825,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { res_tbl <- extract_coef_table(fit, nonbase_clusters) -# Multiple testing control: -# - BH q-values computed within each cluster equation (i.e., within each non-baseline outcome), -# which is a sensible default for “per-equation” screening. 
res_tbl <- res_tbl %>% dplyr::group_by(cluster) %>% dplyr::mutate(q_value = p.adjust(p_value, method = "BH")) %>% @@ -918,12 +842,39 @@ res_tbl <- res_tbl %>% # ----------------------------- # QC + metadata outputs # ----------------------------- -# This section writes: -# - input_qc: what was dropped and why; predictor inference outcomes; configuration -# - diag: fit diagnostics and cluster marginals -# - manifest: file path registry for downstream automation household_day_rows_modeled <- sum(bg_model$total_household_days, na.rm = TRUE) +# New omitted predictors artifact (reference omissions + rank drops) +omitted_tbl <- tibble::tibble( + predictor = c(omit_refs_present, dropped_predictors_rank), + reason = c( + rep("omitted_reference_share", length(omit_refs_present)), + rep("dropped_rank_deficient", length(dropped_predictors_rank)) + ) +) %>% + dplyr::mutate( + caveat = dplyr::case_when( + reason == "omitted_reference_share" ~ REFERENCE_CAVEAT, + TRUE ~ "Dropped because the fitted design matrix was not full rank (VGAM full-rank requirement)." + ) + ) + +# Add group metadata for reference omissions +if (nrow(omitted_tbl) > 0) { + ref_map <- do.call(rbind.data.frame, lapply(REFERENCE_GROUPS, function(g) { + data.frame( + predictor = g$reference, + group_id = g$group_id, + group_name = g$group_name, + implied_definition = implied_definitions[[which(vapply(implied_definitions, function(x) x$group_id, character(1)) == g$group_id)[1]]]$implied_as, + rationale = g$rationale, + stringsAsFactors = FALSE + ) + })) + omitted_tbl <- omitted_tbl %>% + dplyr::left_join(ref_map, by = "predictor") +} + input_qc <- list( inputs = list( clusters = CLUSTERS_PATH, @@ -934,7 +885,15 @@ input_qc <- list( "Predictors are inferred from the census parquet columns (numeric/logical), excluding GEOID/NAME.", "Counts are computed memory-safely using Arrow Dataset aggregation; the full household-day parquet is not read into RAM.", "Zero counts are expected in BG×cluster composition; multinomial likelihood handles zeros naturally (no smoothing/alpha).", - "If VGAM is used, rank-deficient predictors are dropped to satisfy full-rank design requirement." + "Reference-share variables (compositional groups) are omitted deterministically when present to ensure identifiability.", + "If VGAM is used, remaining rank-deficient predictors are dropped deterministically to satisfy full-rank design requirement." 
+ ), + reference_categories = list( + reference_groups = REFERENCE_GROUPS, + reference_variables = REFERENCE_VARS, + reference_variables_present_in_input = omit_refs_present, + implied_definitions = implied_definitions, + caveat = REFERENCE_CAVEAT ), counts = list( household_day_rows_total_after_basic_filter = as.integer(household_day_rows_total), @@ -950,9 +909,10 @@ input_qc <- list( geoid_column = CENSUS_GEOID_COL, predictors_inferred = inf$predictors, predictors_used = PREDICTORS, + predictors_omitted_reference_share = omit_refs_present, predictors_dropped_all_na = DROPPED_ALL_NA, predictors_dropped_rank_deficient = dropped_predictors_rank, - predictors_zero_variance_not_standardized = zero_var_predictors + predictors_zero_variance_not_scaled = zero_var_predictors ), model = list( clusters_observed = as.integer(clusters_observed), @@ -998,6 +958,17 @@ diag <- list( clusters_observed = as.integer(clusters_observed), baseline_cluster = as.integer(baseline_cluster) ), + reference_categories = list( + reference_groups = REFERENCE_GROUPS, + reference_variables = REFERENCE_VARS, + reference_variables_present_in_input = omit_refs_present, + implied_definitions = implied_definitions, + caveat = REFERENCE_CAVEAT + ), + omitted_predictors_summary = list( + n_omitted_reference_share = as.integer(length(omit_refs_present)), + n_dropped_rank_deficient = as.integer(length(dropped_predictors_rank)) + ), cluster_marginal_distributions = lapply(names(cluster_totals), function(cn) { k <- as.integer(sub("^cluster_", "", cn)) list(cluster = k, household_days = as.integer(cluster_totals[[cn]]), proportion = as.numeric(cluster_props[[cn]])) @@ -1011,6 +982,7 @@ manifest <- list( regression_diagnostics = file.path(OUT_DIR, "regression_diagnostics.json"), stage2_input_qc = file.path(OUT_DIR, "stage2_input_qc.json"), regression_data_blockgroups_wide = file.path(OUT_DIR, "regression_data_blockgroups_wide.parquet"), + omitted_predictors = file.path(OUT_DIR, "omitted_predictors.parquet"), stage2_manifest = file.path(OUT_DIR, "stage2_manifest.json"), stage2_metadata = file.path(OUT_DIR, "stage2_metadata.json") ) @@ -1020,6 +992,7 @@ logi("%s Writing outputs to: %s", IC$ok, OUT_DIR) arrow::write_parquet(res_tbl, file.path(OUT_DIR, "regression_results.parquet")) arrow::write_parquet(bg_model, file.path(OUT_DIR, "regression_data_blockgroups_wide.parquet")) +arrow::write_parquet(omitted_tbl, file.path(OUT_DIR, "omitted_predictors.parquet")) safe_write_json(diag, file.path(OUT_DIR, "regression_diagnostics.json")) safe_write_json(input_qc, file.path(OUT_DIR, "stage2_input_qc.json")) safe_write_json(manifest, file.path(OUT_DIR, "stage2_manifest.json")) @@ -1047,6 +1020,8 @@ cat(sprintf("%s Stage 2 multinomial logit complete.\n", IC$ok)) cat(sprintf(" Block Groups (modeled): %d\n", nrow(bg_model))) cat(sprintf(" Household-days (modeled): %s\n", format(household_day_rows_modeled, big.mark = ","))) cat(sprintf(" Predictors (used): %d\n", length(PREDICTORS))) +cat(sprintf(" Omitted reference predictors (present): %d\n", length(omit_refs_present))) +cat(sprintf(" Rank-deficient predictors dropped: %d\n", length(dropped_predictors_rank))) cat(sprintf(" Pseudo R^2 (McFadden): %.4f\n", pseudo_r2)) cat(sprintf(" Converged: %s\n", ifelse(convergence_ok, "true", "false"))) cat("\n") diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png deleted file 
mode 100644 index 8f606f0..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png deleted file mode 100644 index 97d6ebc..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png deleted file mode 100644 index 8c5511c..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png deleted file mode 100644 index a62da58..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt deleted file mode 100644 index d455a91..0000000 --- a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt +++ /dev/null @@ -1,24 +0,0 @@ -================================================================================ -TABLE 1: Monthly Sample Summary -================================================================================ - -Month Customers ZIP Codes Mean kWh Std Dev Observations --------------------------------------------------------------------------------- -2024-01 995 10 0.4011 0.5584 1,479,469 -2024-02 979 10 0.2643 0.3245 1,361,693 -2024-03 987 10 0.2252 0.3126 1,465,503 -2024-04 * 229 47 0.3035 0.6872 318,930 -2024-05 781 8 0.2611 0.3299 1,156,931 -2024-06 964 10 0.4038 0.5014 1,387,196 -2024-07 962 10 0.4036 0.4819 1,428,964 -2024-08 908 10 0.5073 1.0055 1,349,911 -2024-09 958 10 0.3175 0.3892 1,372,516 -2024-10 954 10 0.2242 0.2682 1,418,168 -2024-11 963 10 0.2669 0.3413 1,387,539 -2024-12 977 10 0.2838 0.3554 1,443,393 --------------------------------------------------------------------------------- -TOTAL 10,270 - - - 15,570,213 - -*April 2024: Limited data availability due to utility data quality issues. - Sample size reduced to 229 customers (24% of typical) despite city-wide - sampling effort across 47 ZIP codes. diff --git a/archive/residential_load_viz_2024/combine_chicago_zips.py b/archive/residential_load_viz_2024/combine_chicago_zips.py deleted file mode 100755 index 54364be..0000000 --- a/archive/residential_load_viz_2024/combine_chicago_zips.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -Combine all sampled ZIP codes into one Chicago-wide dataset. -Checks for April 2024 data quality issues. 
-""" - -from pathlib import Path - -import polars as pl - -BASE_DIR = Path("analysis/chicago_2024_full_year") -OUTPUT_DIR = BASE_DIR / "combined" -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -print("=" * 80) -print("COMBINING ALL CHICAGO ZIP CODES - 2024 FULL YEAR") -print("=" * 80) - -# Find all CLIPPED_CM90 files -cm90_files = sorted(BASE_DIR.glob("zip*/final/*_CLIPPED_CM90.parquet")) - -print(f"Found {len(cm90_files)} ZIP datasets") - -if not cm90_files: - print("❌ No files found! Run sampling first.") - exit(1) - -# Combine all ZIPs -print("\nLoading and combining...") -all_data = [] - -for file in cm90_files: - zipcode = file.parent.parent.name.replace("zip", "") - print(f" ZIP {zipcode}...", end=" ") - - df = pl.read_parquet(file) - df = df.with_columns(pl.lit(zipcode).alias("zipcode")) - all_data.append(df) - print(f"{len(df):,} rows") - -combined = pl.concat(all_data) - -print("\n✅ Combined dataset:") -print(f" Total rows: {len(combined):,}") -print(f" Unique customers: {combined['account_identifier'].n_unique():,}") -print(f" ZIP codes: {combined['zipcode'].n_unique()}") -print(f" Months: {combined['sample_month'].n_unique()}") -print(f" Date range: {combined['date'].min()} to {combined['date'].max()}") - -# Summary by month -monthly = ( - combined.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.len().alias("rows"), - ]) - .sort("sample_month") -) - -print("\n📊 Monthly Summary (ALL 12 months of 2024):") -print(monthly) - -# Check April specifically -april = monthly.filter(pl.col("sample_month") == "202404") -if april.height > 0: - april_customers = april["customers"][0] - other_avg = monthly.filter(pl.col("sample_month") != "202404")["customers"].mean() - print("\n⚠️ April 2024 check:") - print(f" April customers: {april_customers}") - print(f" Other months avg: {other_avg:.0f}") - if april_customers < other_avg * 0.3: - print(" 🚨 WARNING: April has significantly fewer customers!") - print(" Consider excluding April from analysis.") - -# Summary by ZIP -zip_summary = ( - combined.group_by("zipcode") - .agg([ - pl.col("sample_month").n_unique().alias("months"), - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("zipcode") -) - -print("\n📍 ZIP Code Summary:") -print(zip_summary) - -# Save -output_file = OUTPUT_DIR / "chicago_all_zips_2024_CM90.parquet" -combined.write_parquet(output_file) - -print(f"\n✅ Saved: {output_file}") -print(f" File size: {output_file.stat().st_size / 1e9:.2f} GB") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py b/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py deleted file mode 100644 index f2d849c..0000000 --- a/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python -""" -Combine all sampled ZIP codes using streaming (lazy frames). 
-""" - -from pathlib import Path - -import polars as pl - -BASE_DIR = Path("analysis/chicago_2024_full_year") -OUTPUT_DIR = BASE_DIR / "combined" -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -print("=" * 80) -print("COMBINING ALL CHICAGO ZIP CODES - 2024 FULL YEAR (STREAMING)") -print("=" * 80) - -# Find all CLIPPED_CM90 files -cm90_files = sorted(BASE_DIR.glob("zip*/final/*_CLIPPED_CM90.parquet")) -print(f"Found {len(cm90_files)} ZIP datasets\n") - -if not cm90_files: - print("❌ No files found!") - exit(1) - -# Add zipcode column to each file and save temporarily -temp_dir = OUTPUT_DIR / "temp" -temp_dir.mkdir(exist_ok=True) - -print("Adding zipcode column to each file...") -temp_files = [] - -for file in cm90_files: - zipcode = file.parent.parent.name.replace("zip", "") - temp_file = temp_dir / f"{zipcode}.parquet" - - if not temp_file.exists(): - print(f" Processing ZIP {zipcode}...", end=" ") - df = pl.read_parquet(file).with_columns(pl.lit(zipcode).alias("zipcode")) - df.write_parquet(temp_file) - print(f"{len(df):,} rows") - else: - print(f" ZIP {zipcode} already processed") - - temp_files.append(temp_file) - -# Now use lazy scan to combine WITHOUT loading into memory -print("\nCombining all ZIPs (streaming)...") -output_file = OUTPUT_DIR / "chicago_all_zips_2024_CM90.parquet" - -# Scan all temp files lazily and sink to final file -lf = pl.scan_parquet(temp_files) -lf.sink_parquet(output_file) - -print(f"✅ Saved: {output_file}") -print(f" File size: {output_file.stat().st_size / 1e9:.2f} GB") - -# Now compute summaries using streaming -print("\n📊 Computing monthly summary (streaming)...") -monthly = ( - pl.scan_parquet(output_file) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.len().alias("rows"), - ]) - .sort("sample_month") - .collect(streaming=True) -) - -print("\nMonthly Summary (ALL 12 months of 2024):") -print(monthly) - -# Check April -april = monthly.filter(pl.col("sample_month") == "202404") -if april.height > 0: - april_customers = april["customers"][0] - other_months = monthly.filter(pl.col("sample_month") != "202404") - other_avg = other_months["customers"].mean() - - print("\n⚠️ April 2024 Data Quality Check:") - print(f" April customers: {april_customers:,}") - print(f" Other months avg: {other_avg:.0f}") - - if april_customers < other_avg * 0.5: - print(f" 🚨 WARNING: April has {(1 - april_customers / other_avg) * 100:.0f}% fewer customers!") - print(" Recommendation: Exclude April from analysis") - else: - print(" ✅ April looks okay!") - -# ZIP summary -print("\n📍 Computing ZIP summary (streaming)...") -zip_summary = ( - pl.scan_parquet(output_file) - .group_by("zipcode") - .agg([ - pl.col("sample_month").n_unique().alias("months"), - pl.col("account_identifier").n_unique().alias("customers_total"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("zipcode") - .collect(streaming=True) -) - -print("\nZIP Code Summary:") -print(zip_summary) - -# Overall stats -print("\n📈 Overall Statistics:") -overall = ( - pl.scan_parquet(output_file) - .select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_rows"), - pl.col("zipcode").n_unique().alias("zip_count"), - pl.col("sample_month").n_unique().alias("month_count"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - ]) - .collect(streaming=True) -) - -print(f" 
Total customers: {overall['unique_customers'][0]:,}") -print(f" Total rows: {overall['total_rows'][0]:,}") -print(f" ZIP codes: {overall['zip_count'][0]}") -print(f" Months: {overall['month_count'][0]}") -print(f" Mean kWh: {overall['mean_kwh'][0]:.4f}") -print(f" Date range: {overall['start_date'][0]} to {overall['end_date'][0]}") - -print("\n" + "=" * 80) -print("✅ COMPLETE! Ready for visualization.") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/combine_with_april_boost.py b/archive/residential_load_viz_2024/combine_with_april_boost.py deleted file mode 100755 index 0338ccd..0000000 --- a/archive/residential_load_viz_2024/combine_with_april_boost.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -""" -Combine original 10-ZIP dataset with April city-wide boost. -""" - -from pathlib import Path - -import polars as pl - -print("=" * 80) -print("COMBINING: Original + April City-Wide Boost") -print("=" * 80) - -# Original 10 ZIPs (all 12 months) -original = Path("analysis/chicago_2024_full_year/combined/chicago_all_zips_2024_CM90.parquet") - -# April boost from 45 additional ZIPs -april_files = list(Path("analysis/chicago_april_citywide").glob("zip*/final/*_CLIPPED_CM90.parquet")) - -print(f"Original dataset: {original}") -print(f"April boost files: {len(april_files)} ZIPs") - -if not april_files: - print("\n❌ No April boost files! Run sample_april_citywide.py first") - exit(1) - -# Load April boost data -print("\nLoading April boost data...") -april_frames = [] -for file in april_files: - zipcode = file.parent.parent.name.replace("zip", "") - df = pl.read_parquet(file).with_columns(pl.lit(zipcode).alias("zipcode")) - customers = df["account_identifier"].n_unique() - print(f" ZIP {zipcode}: {customers} customers (after CM90)") - april_frames.append(df) - -if april_frames: - april_boost = pl.concat(april_frames) - total_april_boost = april_boost["account_identifier"].n_unique() - print(f"\n✅ April boost total: {total_april_boost:,} customers from {len(april_files)} ZIPs") -else: - print("\n❌ No April data passed CM90!") - exit(1) - -# Combine -print("\nCombining with original dataset...") -combined = pl.concat([pl.scan_parquet(original), pl.LazyFrame(april_boost)], how="vertical_relaxed") - -output = Path("analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet") -combined.sink_parquet(output) - -print(f"\n✅ Saved: {output}") -print(f" Size: {output.stat().st_size / 1e9:.2f} GB") - -# Monthly summary -monthly = ( - pl.scan_parquet(output) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") -) - -print("\n📊 FINAL Monthly Summary (with April boost):") -print(monthly) - -april_row = monthly.filter(pl.col("sample_month") == "202404") -if april_row.height > 0: - april_customers = april_row["customers"][0] - april_zips = april_row["zips"][0] - other_avg = monthly.filter(pl.col("sample_month") != "202404")["customers"].mean() - - print(f"\n{'=' * 80}") - print("APRIL 2024 FINAL STATUS:") - print(f" Customers: {april_customers:,}") - print(f" ZIPs: {april_zips}") - print(f" Other months avg: {other_avg:.0f}") - print(f" April is {april_customers / other_avg * 100:.0f}% of typical month") - - if april_customers >= other_avg * 0.8: - print(" ✅ EXCELLENT - April is now comparable!") - elif april_customers >= other_avg * 0.5: - print(" ✅ GOOD - April is usable 
with minor caveat") - else: - print(" ⚠️ FAIR - April still lower but much better") - print(f"{'=' * 80}") - -print("\n✅ Ready for visualization!") diff --git a/archive/residential_load_viz_2024/create_heatmap.py b/archive/residential_load_viz_2024/create_heatmap.py deleted file mode 100644 index 66cc270..0000000 --- a/archive/residential_load_viz_2024/create_heatmap.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python -""" -Create load heatmap: Hour × Day of Year -Aggregated across all customers (200 residential accounts). -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - - -def create_load_heatmap(usage_df: pl.DataFrame, output_path: str = "load_heatmap.png"): - """Create beautiful seaborn heatmap of usage patterns.""" - print("📊 Creating heatmap...") - - # Add day_of_year and hour - df = usage_df.with_columns([ - pl.col("datetime").dt.ordinal_day().alias("day_of_year"), - pl.col("datetime").dt.hour().alias("hour"), - ]) - - # Aggregate: total kWh for each (day_of_year, hour) combination - heatmap_data = ( - df.group_by(["day_of_year", "hour"]).agg(pl.col("kwh").sum().alias("total_kwh")).sort(["day_of_year", "hour"]) - ) - - print(f" Data points: {heatmap_data.height:,} hour×day combinations") - - # Pivot to matrix: rows=hours (0-23), cols=days (1-366) - matrix = heatmap_data.pivot(index="hour", columns="day_of_year", values="total_kwh").fill_null(0) - - # Convert to numpy - hour_labels = matrix.select("hour").to_series().to_list() - day_cols = [c for c in matrix.columns if c != "hour"] - data_matrix = matrix.select(day_cols).to_numpy() - - print(f" Matrix: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} days") - - # Create figure - fig, ax = plt.subplots(figsize=(24, 10)) - - # Seaborn heatmap with YlOrRd colormap - sns.heatmap( - data_matrix, - cmap="YlOrRd", # Yellow-Orange-Red - cbar_kws={"label": "Total kWh (200 homes)", "shrink": 0.8}, - xticklabels=30, # Show every 30th day - yticklabels=hour_labels, - linewidths=0, - ax=ax, - robust=True, # Use robust quantiles for color limits - ) - - # Labels and title - ax.set_xlabel("Day of Year 2024", fontsize=16, fontweight="bold") - ax.set_ylabel("Hour of Day", fontsize=16, fontweight="bold") - ax.set_title( - "Illinois Residential Electricity Load Patterns - 2024\n200 Random Households", - fontsize=18, - fontweight="bold", - pad=20, - ) - - # Invert y-axis so midnight is at top - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(usage_df: pl.DataFrame, output_path: str = "hourly_profile.png"): - """Create average hourly load profile.""" - print("\n📊 Creating hourly profile...") - - # Average by hour across all days - hourly = ( - usage_df.with_columns(pl.col("datetime").dt.hour().alias("hour")) - .group_by("hour") - .agg([pl.col("kwh").mean().alias("avg_kwh"), pl.col("kwh").std().alias("std_kwh")]) - .sort("hour") - ) - - # Plot - fig, ax = plt.subplots(figsize=(12, 6)) - - hours = hourly["hour"].to_list() - avg = hourly["avg_kwh"].to_list() - std = hourly["std_kwh"].to_list() - - ax.plot(hours, avg, linewidth=3, color="#d62728", marker="o") - ax.fill_between( - hours, [a - s for a, s in zip(avg, std)], [a + s for a, s in zip(avg, std)], alpha=0.3, color="#d62728" - ) - - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold") - ax.set_ylabel("Average kWh", fontsize=14, fontweight="bold") - ax.set_title("Average Hourly Electricity 
Usage - 2024\n200 Illinois Homes", fontsize=16, fontweight="bold") - ax.grid(True, alpha=0.3) - ax.set_xticks(range(0, 24, 2)) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 60) - print("RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - # Load data - data_file = Path("sampled_customers_2024.parquet") - print(f"\n📂 Loading: {data_file}") - df = pl.read_parquet(data_file) - - print(f" Rows: {df.height:,}") - print(f" Date range: {df['date'].min()} to {df['date'].max()}") - print(f" ZIP+4 codes: {df['zip_code'].n_unique()}") - - # Create visualizations - create_load_heatmap(df, "residential_load_heatmap_2024.png") - create_hourly_profile(df, "residential_hourly_profile_2024.png") - - print("\n" + "=" * 60) - print("✅ VISUALIZATIONS COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_heatmap_professional.py b/archive/residential_load_viz_2024/create_heatmap_professional.py deleted file mode 100644 index 92b6e7e..0000000 --- a/archive/residential_load_viz_2024/create_heatmap_professional.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations for presentation. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set Garamond font -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 - - -def create_load_heatmap(usage_df: pl.DataFrame, output_path: str = "load_heatmap_pro.png"): - """Create professional heatmap with month labels.""" - print("📊 Creating professional heatmap...") - - # Prepare data - df = usage_df.with_columns([ - pl.col("datetime").dt.ordinal_day().alias("day_of_year"), - pl.col("datetime").dt.hour().alias("hour"), - ]) - - # Aggregate - heatmap_data = ( - df.group_by(["day_of_year", "hour"]).agg(pl.col("kwh").sum().alias("total_kwh")).sort(["day_of_year", "hour"]) - ) - - # Pivot to matrix - matrix = heatmap_data.pivot(index="hour", columns="day_of_year", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - day_cols = [c for c in matrix.columns if c != "hour"] - data_matrix = matrix.select(day_cols).to_numpy() - - print(f" Data range: {data_matrix.min():.0f} to {data_matrix.max():.0f} kWh") - print(f" Matrix shape: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} days") - - # Create figure - fig, ax = plt.subplots(figsize=(20, 8)) - - # Heatmap - sns.heatmap( - data_matrix, - cmap="rocket_r", - cbar_kws={"label": "Total kWh (8,131 homes)", "shrink": 0.8, "pad": 0.02}, - xticklabels=False, # We'll add custom labels - yticklabels=hour_labels, - linewidths=0, - ax=ax, - robust=True, - ) - - # Add month labels on x-axis - # 2024 is a leap year - month_starts = [1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336] - month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] - month_midpoints = [(month_starts[i] + (month_starts[i + 1] if i < 11 else 367)) / 2 - 1 for i in range(12)] - - # Set month names as x-tick labels - ax.set_xticks(month_midpoints) - ax.set_xticklabels(month_names, fontsize=11) - - # Add 
subtle month divider lines - for day in month_starts[1:]: - ax.axvline(x=day - 1, color="white", linewidth=0.8, alpha=0.4) - - # Labels - ax.set_xlabel("Month (2024)", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - "Illinois Residential Electricity Load Patterns - 2024\n8,131 Random Households", - fontsize=17, - fontweight="bold", - pad=20, - ) - - # Invert y-axis so midnight is at top - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(usage_df: pl.DataFrame, output_path: str = "hourly_profile_pro.png"): - """Create hourly profile with IQR.""" - print("\n📊 Creating professional hourly profile...") - - # Calculate stats - hourly = ( - usage_df.with_columns(pl.col("datetime").dt.hour().alias("hour")) - .group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - ) - - # Reality check output - print("\n 📊 REALITY CHECK:") - print(f" Lowest avg: {hourly['mean_kwh'].min():.3f} kWh per 30-min") - print(f" Highest avg: {hourly['mean_kwh'].max():.3f} kWh per 30-min") - print(f" Daily total: ~{hourly['mean_kwh'].sum():.1f} kWh per home per day") - - # Create figure - fig, ax = plt.subplots(figsize=(14, 7)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Interquartile range (tighter than std dev) - ax.fill_between(hours, p25, p75, alpha=0.25, color="#d62728", label="Interquartile range (25th-75th %ile)") - ax.plot(hours, mean, linewidth=3, color="#d62728", marker="o", markersize=6, label="Average usage", zorder=5) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Average kWh per 30-min interval", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - "Average Hourly Electricity Usage - 2024\n8,131 Illinois Homes", fontsize=17, fontweight="bold", pad=20 - ) - ax.grid(True, alpha=0.3, linestyle="--", linewidth=0.5) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.08, color="blue", label="Overnight (12-6 AM)") - ax.axvspan(17, 21, alpha=0.08, color="red", label="Evening peak (5-9 PM)") - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak: {peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 3, peak_value + 0.03), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round", facecolor="yellow", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload: {min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 2, min_value + 0.05), - fontsize=11, - bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="black", fancybox=False) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() 
- - -if __name__ == "__main__": - print("=" * 60) - print("PROFESSIONAL RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - data_file = Path("sampled_customers_2024.parquet") - print(f"\n📂 Loading: {data_file}") - df = pl.read_parquet(data_file) - - print(f" Rows: {df.height:,}") - print(f" Customers: {df['account_identifier'].n_unique():,}") - print(f" Date range: {df['date'].min()} to {df['date'].max()}") - - create_load_heatmap(df, "residential_load_heatmap_professional.png") - create_hourly_profile(df, "residential_hourly_profile_professional.png") - - print("\n" + "=" * 60) - print("✅ COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_visualization.py b/archive/residential_load_viz_2024/create_visualization.py deleted file mode 100644 index 2ff9f6c..0000000 --- a/archive/residential_load_viz_2024/create_visualization.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations - FIXED for June 2024-June 2025 data. -Memory-safe for Codespaces. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set Garamond font -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_monthly.png"): - """ - Create professional heatmap using MONTHLY aggregation (memory-safe). - Shows hour-of-day vs month (not day-of-year). - """ - print("📊 Creating monthly heatmap (memory-safe)...") - - # LAZY LOAD - aggregate in-database - lf = pl.scan_parquet(data_path) - - # Get stats - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - print(f" Customers: {n_customers:,}") - print(f" Date range: {date_range}") - - # Aggregate by month and hour (streaming) - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - print(f" Aggregated to {len(monthly_hourly):,} rows") - - # Pivot to matrix - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - print(f" Matrix: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} months") - print(f" Range: {data_matrix.min():,.0f} to {data_matrix.max():,.0f} kWh") - - # Create figure - fig, ax = plt.subplots(figsize=(16, 8)) - - # Heatmap - sns.heatmap( - data_matrix, - cmap="rocket_r", - cbar_kws={"label": f"Total kWh (~{n_customers:,} homes)", "shrink": 0.8, "pad": 0.02}, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], # Format as YYYY-MM - yticklabels=hour_labels, - linewidths=0.5, - linecolor="white", - ax=ax, - robust=True, - ) - - # Labels - ax.set_xlabel("Month", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Hour of Day", 
fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - f"Illinois Residential Electricity Load Patterns\n{date_range} • {n_customers:,} Random Households", - fontsize=17, - fontweight="bold", - pad=20, - ) - - # Rotate x-labels for readability - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - - # Invert y-axis - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str = "hourly_profile_pro.png"): - """Create hourly profile with IQR (memory-safe).""" - print("\n📊 Creating hourly profile (memory-safe)...") - - # LAZY LOAD - lf = pl.scan_parquet(data_path) - - # Get customer count - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - # Calculate stats (streaming aggregation) - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - print("\n 📊 Statistics:") - print(f" Customers: {n_customers:,}") - print(f" Lowest avg: {hourly['mean_kwh'].min():.3f} kWh per 30-min") - print(f" Highest avg: {hourly['mean_kwh'].max():.3f} kWh per 30-min") - print(f" Daily total: ~{hourly['mean_kwh'].sum():.1f} kWh per home per day") - - # Create figure - fig, ax = plt.subplots(figsize=(14, 7)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Interquartile range - ax.fill_between(hours, p25, p75, alpha=0.25, color="#d62728", label="Interquartile range (25th-75th %ile)") - ax.plot(hours, mean, linewidth=3, color="#d62728", marker="o", markersize=6, label="Average usage", zorder=5) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Average kWh per 30-min interval", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - f"Average Hourly Electricity Usage\n{n_customers:,} Illinois Homes • June 2024 - June 2025", - fontsize=17, - fontweight="bold", - pad=20, - ) - ax.grid(True, alpha=0.3, linestyle="--", linewidth=0.5) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.08, color="blue", label="Overnight (12-6 AM)") - ax.axvspan(17, 21, alpha=0.08, color="red", label="Evening peak (5-9 PM)") - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak: {peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 3, peak_value + 0.03), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round", facecolor="yellow", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload: {min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 2, min_value + 0.05), - fontsize=11, - bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="black", fancybox=False) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", 
facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str = "weekend_comparison.png"): - """Compare weekday vs weekend load profiles.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - # Aggregate by hour and weekend status - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(14, 7)) - - ax.plot( - weekday["hour"], weekday["mean_kwh"], linewidth=3, color="#1f77b4", marker="o", label="Weekday", markersize=6 - ) - ax.plot( - weekend["hour"], weekend["mean_kwh"], linewidth=3, color="#ff7f0e", marker="s", label="Weekend", markersize=6 - ) - - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold") - ax.set_ylabel("Average kWh per 30-min", fontsize=14, fontweight="bold") - ax.set_title("Weekday vs Weekend Load Profiles", fontsize=17, fontweight="bold", pad=20) - ax.grid(True, alpha=0.3, linestyle="--") - ax.legend(fontsize=13, framealpha=0.95) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 60) - print("PROFESSIONAL RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - # YOUR ACTUAL FILE - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED.parquet" - - print(f"\n📂 Using: {data_file}") - - # Check file exists - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - print("Please update the path!") - exit(1) - - # Create visualizations (all memory-safe) - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison(data_file, "residential_weekend_comparison.png") - - print("\n" + "=" * 60) - print("✅ COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_visualizations_cm90.py b/archive/residential_load_viz_2024/create_visualizations_cm90.py deleted file mode 100644 index b3df259..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_cm90.py +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env python -""" -Professional visualizations using CM90 (quality-filtered) data. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def create_monthly_heatmap_cm90(data_path: str, output_path: str = "heatmap_cm90.png"): - """Create professional seaborn heatmap from CM90 data.""" - print("\n📊 Creating CM90 heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Quality-Filtered Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns (CM90 Quality-Filtered)\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Households\n" - f"Natural Gas Heating | Electric Air Conditioning", - fontsize=17, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_cm90(data_path: str, output_path: str = "hourly_profile_cm90.png"): - """Create hourly profile from CM90 data.""" - print("\n📊 Creating CM90 hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of 
Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage (CM90 Quality-Filtered)\n" - f"{n_customers:,} Chicago Households • June 2024 - June 2025", - fontsize=17, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak", zorder=0) - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison_cm90(data_path: str, output_path: str = "weekend_comparison_cm90.png"): - """Compare weekday vs weekend from CM90 data.""" - print("\n📊 Creating CM90 weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles (CM90 Quality-Filtered)\nTemporal Consumption Patterns", - fontsize=17, - fontweight="bold", - pad=25, - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Annotations - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - 
fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Low\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Low\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CM90 QUALITY-FILTERED VISUALIZATIONS") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - create_monthly_heatmap_cm90(data_file, "residential_heatmap_cm90.png") - create_hourly_profile_cm90(data_file, "residential_hourly_cm90.png") - create_weekend_comparison_cm90(data_file, "residential_weekend_cm90.png") - - print("\n" + "=" * 80) - print("✅ CM90 VISUALIZATIONS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📈 residential_heatmap_cm90.png") - print(" 📈 residential_hourly_cm90.png") - print(" 📈 residential_weekend_cm90.png") - print("\nNote: These use quality-filtered data (≥90% valid days)") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_enhanced.py b/archive/residential_load_viz_2024/create_visualizations_enhanced.py deleted file mode 100644 index 5868fbf..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_enhanced.py +++ /dev/null @@ -1,447 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations with summary statistics. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -sns.set_palette("husl") - -# Professional typography -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def generate_summary_statistics(data_path: str): - """Generate comprehensive summary statistics table.""" - print("\n📊 Generating Summary Statistics...") - - lf = pl.scan_parquet(data_path) - - # Overall statistics - overall = lf.select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_observations"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.col("kwh").min().alias("min_kwh"), - pl.col("kwh").max().alias("max_kwh"), - pl.col("kwh").quantile(0.25).alias("q25_kwh"), - pl.col("kwh").quantile(0.75).alias("q75_kwh"), - ]).collect(engine="streaming") - - # Monthly breakdown - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.len().alias("observations"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").sum().alias("total_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - # Weekday vs Weekend - weekend_stats = ( - lf.group_by("is_weekend") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .collect(engine="streaming") - ) - - # Hourly statistics - hourly_stats = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Print comprehensive report - print("\n" + "=" * 80) - print("COMPREHENSIVE SUMMARY STATISTICS") - print("=" * 80) - - print("\n📋 OVERALL STATISTICS") - print("-" * 80) - print(f" Total Unique Customers: {overall['unique_customers'][0]:>12,}") - print(f" Total Observations: {overall['total_observations'][0]:>12,}") - print(f" Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}") - print(f" Average kWh per 30-min: {overall['mean_kwh'][0]:>12.4f}") - print(f" Median kWh per 30-min: {overall['median_kwh'][0]:>12.4f}") - print(f" Std Dev kWh: {overall['std_kwh'][0]:>12.4f}") - print(f" Min kWh: {overall['min_kwh'][0]:>12.4f}") - print(f" Max kWh: {overall['max_kwh'][0]:>12.4f}") - print(f" 25th Percentile: {overall['q25_kwh'][0]:>12.4f}") - print(f" 75th Percentile: {overall['q75_kwh'][0]:>12.4f}") - - # Estimated daily/annual usage - daily_kwh = overall["mean_kwh"][0] * 48 # 48 half-hour intervals per day - annual_kwh = daily_kwh * 365 - print(f"\n Est. Daily Usage per Home: {daily_kwh:>12.1f} kWh/day") - print(f" Est. 
Annual Usage per Home: {annual_kwh:>12,.0f} kWh/year") - - print("\n📅 MONTHLY BREAKDOWN") - print("-" * 80) - print(f"{'Month':<12} {'Customers':>10} {'Observations':>15} {'Avg kWh':>12} {'Total kWh':>15}") - print("-" * 80) - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - print(f"{month_label:<12} {row[1]:>10,} {row[2]:>15,} {row[3]:>12.4f} {row[4]:>15,.0f}") - - print("\n📊 WEEKDAY vs WEEKEND COMPARISON") - print("-" * 80) - print(f"{'Period':<15} {'Mean kWh':>12} {'Median kWh':>12} {'Std Dev':>12}") - print("-" * 80) - for row in weekend_stats.iter_rows(): - period = "Weekend" if row[0] else "Weekday" - print(f"{period:<15} {row[1]:>12.4f} {row[2]:>12.4f} {row[3]:>12.4f}") - - print("\n⏰ HOURLY STATISTICS (Peak Hours)") - print("-" * 80) - # Show top 5 and bottom 5 hours - top5 = hourly_stats.sort("mean_kwh", descending=True).head(5) - bottom5 = hourly_stats.sort("mean_kwh").head(5) - - print(" Highest Usage Hours:") - for row in top5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n Lowest Usage Hours:") - for row in bottom5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n" + "=" * 80) - - # Save to file - with open("summary_statistics.txt", "w") as f: - f.write("=" * 80 + "\n") - f.write("COMPREHENSIVE SUMMARY STATISTICS\n") - f.write("=" * 80 + "\n\n") - f.write(f"Total Customers: {overall['unique_customers'][0]:,}\n") - f.write(f"Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}\n") - f.write(f"Mean kWh per 30-min: {overall['mean_kwh'][0]:.4f}\n") - f.write(f"Estimated Daily Usage: {daily_kwh:.1f} kWh/day\n") - f.write(f"Estimated Annual Usage: {annual_kwh:,.0f} kWh/year\n") - f.write("\nMonthly Breakdown:\n") - f.write(monthly.write_csv()) - - print("\n✅ Summary saved to: summary_statistics.txt") - - return overall, monthly, weekend_stats, hourly_stats - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_seaborn.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating professional seaborn heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure with seaborn - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", # Professional diverging colormap - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - annot=False, - ) - - # Professional styling - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, 
fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Random Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - # Rotate labels - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) - - # Invert y-axis - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_seaborn(data_path: str, output_path: str = "hourly_profile_seaborn.png"): - """Create professional seaborn hourly profile.""" - print("\n📊 Creating professional seaborn hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Create figure - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Use seaborn lineplot style - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range (25th-75th percentile)") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - # Styling - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min interval)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Illinois Households • June 2024 - June 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight (12-6 AM)", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak (5-9 PM)", zorder=0) - - # Annotations with improved styling - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak Demand\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=0.3"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=-0.3"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - 
-def create_weekend_comparison_seaborn(data_path: str, output_path: str = "weekend_comparison_seaborn.png"): - """Compare weekday vs weekend with peak/baseload annotations.""" - print("\n📊 Creating professional weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - # Create figure - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - # Seaborn style lines - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", fontsize=18, fontweight="bold", pad=25 - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Peak annotations for BOTH weekday and weekend - # Weekday peak - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekend peak - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Baseload annotations - # Weekday baseload - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Baseload\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - # Weekend baseload - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Baseload\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", 
framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("PROFESSIONAL RESIDENTIAL LOAD ANALYSIS & VISUALIZATION") - print("=" * 80) - - data_file = "../zip60622_2024/final/sample_60622_202406_202506_CLIPPED.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - # 1. Generate comprehensive summary statistics - generate_summary_statistics(data_file) - - # 2. Create professional visualizations - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile_seaborn(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison_seaborn(data_file, "residential_weekend_comparison_professional.png") - - print("\n" + "=" * 80) - print("✅ ANALYSIS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📊 summary_statistics.txt") - print(" 📈 residential_load_heatmap_professional.png") - print(" 📈 residential_hourly_profile_professional.png") - print(" 📈 residential_weekend_comparison_professional.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_final.py b/archive/residential_load_viz_2024/create_visualizations_final.py deleted file mode 100644 index 48275df..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_final.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env python -""" -Final professional visualizations with all corrections. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def create_monthly_heatmap(data_path: str, output_path: str = "heatmap_final.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating heatmap...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 only - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - print(f" Unique households: {n_customers:,}") - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - # Check what months we actually have - available_months = sorted(monthly_hourly["sample_month"].unique().to_list()) - print(f" Available months: {', '.join(available_months)}") - if "202507" not in available_months: - print(" ⚠️ Note: July 2025 (202507) missing from source data") - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if 
c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str = "hourly_profile_final.png"): - """Create hourly profile with seaborn.""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Seaborn fill and line - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage\n{n_customers:,} Chicago Households • August 2024 - August 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak", zorder=0) - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - 
fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str = "weekend_comparison_final.png"): - """Compare weekday vs weekend with seaborn.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) # Taller to avoid overlap - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - # Seaborn lines - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", - fontsize=18, - fontweight="bold", - pad=30, # More padding - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - # Set y-limit to give more headroom for annotations - max_val = max(max(weekday_kwh), max(weekend_kwh)) - ax.set_ylim(0, max_val * 1.25) # 25% headroom - - # Annotations - adjusted positions to avoid overlap - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 4, weekday_peak + 0.03), # Closer to point - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 3, weekend_peak + 0.03), # Closer to point - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Baseload annotations - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Low\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, 
weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.04), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Low\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.04), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("FINAL PROFESSIONAL VISUALIZATIONS") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - print("\n✓ Using seaborn for all visualizations") - print("✓ Filtering to August 2024 - August 2025 only") - - create_monthly_heatmap(data_file, "residential_heatmap_final.png") - create_hourly_profile(data_file, "residential_hourly_final.png") - create_weekend_comparison(data_file, "residential_weekend_final.png") - - print("\n" + "=" * 80) - print("✅ VISUALIZATIONS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📈 residential_heatmap_final.png") - print(" 📈 residential_hourly_final.png") - print(" 📈 residential_weekend_final.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_with_qc.py b/archive/residential_load_viz_2024/create_visualizations_with_qc.py deleted file mode 100644 index d1c52e9..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_with_qc.py +++ /dev/null @@ -1,593 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations with QC validation. 
-""" - -import sys -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Import your transformation QC functions -sys.path.insert(0, str(Path(__file__).resolve().parents[2])) -from smart_meter_analysis.transformation import daily_interval_qc, dst_transition_dates - -# Set seaborn style -sns.set_style("whitegrid") -sns.set_palette("husl") - -# Professional typography -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def validate_data_quality(data_path: str): - """Run comprehensive data quality checks using transformation QC functions.""" - print("\n" + "=" * 80) - print("DATA QUALITY VALIDATION") - print("=" * 80) - - lf = pl.scan_parquet(data_path) - - # Load a representative sample (faster QC) - print("\n🔍 Loading sample for quality checks...") - sample_size = 500000 # Half million rows should be representative - df_sample = lf.limit(sample_size).collect() - print(f" Analyzing {len(df_sample):,} rows (sample)") - - # Run interval count QC - print("\n📊 Running Interval Count Analysis...") - qc = daily_interval_qc(df_sample) - - # Day type distribution - day_types = ( - qc.group_by("day_type") - .agg([pl.len().alias("count"), pl.col("n_intervals").mean().alias("avg_intervals")]) - .sort("day_type") - ) - - print("\n Day Type Distribution:") - print(" " + "-" * 60) - print(f" {'Type':<20} {'Count':>10} {'Avg Intervals':>15}") - print(" " + "-" * 60) - for row in day_types.iter_rows(): - print(f" {row[0]:<20} {row[1]:>10,} {row[2]:>15.1f}") - - # Check for odd days - odd_days = qc.filter(pl.col("is_odd_count")) - total_days = qc.height - pct_valid = ((total_days - odd_days.height) / total_days) * 100 - - print("\n Quality Metrics:") - print(f" {'Total customer-days analyzed:':<40} {total_days:>10,}") - print(f" {'Days with valid counts (46/48/50):':<40} {total_days - odd_days.height:>10,}") - print(f" {'Days with odd counts:':<40} {odd_days.height:>10,}") - print(f" {'Data quality score:':<40} {pct_valid:>9.2f}%") - - if odd_days.height > 0: - print(f"\n ⚠️ WARNING: {odd_days.height} days have unusual interval counts") - print(" Sample of odd days:") - print(odd_days.select(["account_identifier", "date", "n_intervals"]).head(5)) - else: - print("\n ✅ EXCELLENT: All days have valid interval counts!") - - # DST transition validation - print("\n📅 DST Transition Analysis...") - try: - # Get full dataset for DST check (need complete date range) - dst_dates = dst_transition_dates(df_sample) - - if dst_dates.height > 0: - print("\n Detected DST Transition Dates:") - print(" " + "-" * 60) - for row in dst_dates.iter_rows(): - print(f" {row[0]} - {row[1].upper()}") - print("\n ✅ DST transitions properly detected and handled") - else: - print(" ℹ️ No DST transitions in sampled data") - except Exception as e: - print(f" ⚠️ Could not validate DST: {e}") - - # Interval distribution across hours - print("\n⏰ Hourly Interval Distribution...") - hourly_counts = df_sample.group_by("hour").agg(pl.len().alias("count")).sort("hour") - - avg_per_hour = hourly_counts["count"].mean() - std_per_hour = hourly_counts["count"].std() - print(f" Average intervals per hour: 
{avg_per_hour:,.0f} (±{std_per_hour:,.0f})") - - # Check for missing hours - expected_hours = set(range(24)) - actual_hours = set(hourly_counts["hour"].to_list()) - missing_hours = expected_hours - actual_hours - - if missing_hours: - print(f" ⚠️ Missing hours: {sorted(missing_hours)}") - else: - print(" ✅ All 24 hours represented") - - # kWh value validation - print("\n⚡ Energy Value (kWh) Validation...") - kwh_stats = df_sample.select([ - pl.col("kwh").min().alias("min"), - pl.col("kwh").quantile(0.01).alias("p01"), - pl.col("kwh").median().alias("median"), - pl.col("kwh").quantile(0.99).alias("p99"), - pl.col("kwh").max().alias("max"), - (pl.col("kwh") < 0).sum().alias("negative_count"), - pl.col("kwh").is_null().sum().alias("null_count"), - ]) - - print(f" Min kWh: {kwh_stats['min'][0]:>10.4f}") - print(f" 1st percentile: {kwh_stats['p01'][0]:>10.4f}") - print(f" Median kWh: {kwh_stats['median'][0]:>10.4f}") - print(f" 99th percentile: {kwh_stats['p99'][0]:>10.4f}") - print(f" Max kWh: {kwh_stats['max'][0]:>10.4f}") - print(f" Negative values: {kwh_stats['negative_count'][0]:>10,}") - print(f" Null values: {kwh_stats['null_count'][0]:>10,}") - - if kwh_stats["negative_count"][0] > 0: - print(" ⚠️ WARNING: Negative kWh values detected!") - if kwh_stats["null_count"][0] > 0: - print(" ⚠️ WARNING: Null kWh values detected!") - if kwh_stats["negative_count"][0] == 0 and kwh_stats["null_count"][0] == 0: - print(" ✅ All kWh values are valid (no negatives or nulls)") - - # Overall quality assessment - print("\n" + "=" * 80) - print("OVERALL DATA QUALITY ASSESSMENT") - print("=" * 80) - - issues = [] - if odd_days.height > 0: - issues.append(f"{odd_days.height} days with odd interval counts") - if kwh_stats["negative_count"][0] > 0: - issues.append("Negative kWh values present") - if kwh_stats["null_count"][0] > 0: - issues.append("Null kWh values present") - if missing_hours: - issues.append("Some hours missing from data") - - if not issues: - print("✅ EXCELLENT: Data quality is very high!") - print(" - All interval counts valid (46/48/50)") - print(" - No negative or null kWh values") - print(" - All hours represented") - print(" - Ready for analysis") - quality_grade = "A+" - elif len(issues) == 1 and "odd interval" in issues[0] and odd_days.height < total_days * 0.01: - print("✅ GOOD: Data quality is acceptable") - print(f" - Minor issues: {issues[0]}") - print(" - Should not significantly impact analysis") - quality_grade = "B+" - else: - print("⚠️ FAIR: Some data quality issues detected:") - for issue in issues: - print(f" - {issue}") - print(" - Review issues before drawing conclusions") - quality_grade = "C" - - print(f"\n Quality Grade: {quality_grade}") - print("=" * 80 + "\n") - - return quality_grade - - -def generate_summary_statistics(data_path: str): - """Generate comprehensive summary statistics table.""" - print("\n" + "=" * 80) - print("COMPREHENSIVE SUMMARY STATISTICS") - print("=" * 80) - - lf = pl.scan_parquet(data_path) - - # Overall statistics - overall = lf.select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_observations"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.col("kwh").min().alias("min_kwh"), - pl.col("kwh").max().alias("max_kwh"), - pl.col("kwh").quantile(0.25).alias("q25_kwh"), - pl.col("kwh").quantile(0.75).alias("q75_kwh"), - 
]).collect(engine="streaming") - - # Monthly breakdown - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.len().alias("observations"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").sum().alias("total_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - # Weekday vs Weekend - weekend_stats = ( - lf.group_by("is_weekend") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .collect(engine="streaming") - ) - - # Hourly statistics - hourly_stats = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Print comprehensive report - print("\n📋 OVERALL STATISTICS") - print("-" * 80) - print(f" Total Unique Customers: {overall['unique_customers'][0]:>12,}") - print(f" Total Observations: {overall['total_observations'][0]:>12,}") - print(f" Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}") - print(f" Average kWh per 30-min: {overall['mean_kwh'][0]:>12.4f}") - print(f" Median kWh per 30-min: {overall['median_kwh'][0]:>12.4f}") - print(f" Std Dev kWh: {overall['std_kwh'][0]:>12.4f}") - print(f" Min kWh: {overall['min_kwh'][0]:>12.4f}") - print(f" Max kWh: {overall['max_kwh'][0]:>12.4f}") - print(f" 25th Percentile: {overall['q25_kwh'][0]:>12.4f}") - print(f" 75th Percentile: {overall['q75_kwh'][0]:>12.4f}") - - # Estimated daily/annual usage - daily_kwh = overall["mean_kwh"][0] * 48 # 48 half-hour intervals per day - annual_kwh = daily_kwh * 365 - print(f"\n Est. Daily Usage per Home: {daily_kwh:>12.1f} kWh/day") - print(f" Est. Annual Usage per Home: {annual_kwh:>12,.0f} kWh/year") - - print("\n📅 MONTHLY BREAKDOWN") - print("-" * 80) - print(f"{'Month':<12} {'Customers':>10} {'Observations':>15} {'Avg kWh':>12} {'Total kWh':>15}") - print("-" * 80) - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - print(f"{month_label:<12} {row[1]:>10,} {row[2]:>15,} {row[3]:>12.4f} {row[4]:>15,.0f}") - - print("\n📊 WEEKDAY vs WEEKEND COMPARISON") - print("-" * 80) - print(f"{'Period':<15} {'Mean kWh':>12} {'Median kWh':>12} {'Std Dev':>12}") - print("-" * 80) - for row in weekend_stats.iter_rows(): - period = "Weekend" if row[0] else "Weekday" - print(f"{period:<15} {row[1]:>12.4f} {row[2]:>12.4f} {row[3]:>12.4f}") - - print("\n⏰ HOURLY STATISTICS (Peak Hours)") - print("-" * 80) - # Show top 5 and bottom 5 hours - top5 = hourly_stats.sort("mean_kwh", descending=True).head(5) - bottom5 = hourly_stats.sort("mean_kwh").head(5) - - print(" Highest Usage Hours:") - for row in top5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n Lowest Usage Hours:") - for row in bottom5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n" + "=" * 80) - - # Save to file - with open("summary_statistics.txt", "w") as f: - f.write("=" * 80 + "\n") - f.write("COMPREHENSIVE SUMMARY STATISTICS\n") - f.write("=" * 80 + "\n\n") - f.write(f"Total Customers: {overall['unique_customers'][0]:,}\n") - f.write(f"Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}\n") - f.write(f"Mean kWh per 30-min: {overall['mean_kwh'][0]:.4f}\n") - f.write(f"Estimated Daily Usage: {daily_kwh:.1f} kWh/day\n") - f.write(f"Estimated Annual Usage: {annual_kwh:,.0f} kWh/year\n") - f.write("\nMonthly 
Breakdown:\n") - f.write(monthly.write_csv()) - - print("\n✅ Summary saved to: summary_statistics.txt") - - return overall, monthly, weekend_stats, hourly_stats - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_seaborn.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating professional seaborn heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure with seaborn - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - annot=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Random Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_seaborn(data_path: str, output_path: str = "hourly_profile_seaborn.png"): - """Create professional seaborn hourly profile.""" - print("\n📊 Creating professional seaborn hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range (25th-75th percentile)") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min interval)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage 
Profile\n{n_customers:,} Illinois Households • June 2024 - June 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight (12-6 AM)", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak (5-9 PM)", zorder=0) - - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak Demand\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=0.3"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=-0.3"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison_seaborn(data_path: str, output_path: str = "weekend_comparison_seaborn.png"): - """Compare weekday vs weekend with peak/baseload annotations.""" - print("\n📊 Creating professional weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", fontsize=18, fontweight="bold", pad=25 - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Weekday peak - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - 
arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekend peak - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekday baseload - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Baseload\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - # Weekend baseload - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Baseload\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("PROFESSIONAL RESIDENTIAL LOAD ANALYSIS & VISUALIZATION") - print("With Integrated Data Quality Validation") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - # 1. DATA QUALITY VALIDATION (using your QC functions) - quality_grade = validate_data_quality(data_file) - - # 2. Generate comprehensive summary statistics - generate_summary_statistics(data_file) - - # 3. 
Create professional visualizations - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile_seaborn(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison_seaborn(data_file, "residential_weekend_comparison_professional.png") - - print("\n" + "=" * 80) - print("✅ ANALYSIS COMPLETE!") - print("=" * 80) - print(f"\nData Quality Grade: {quality_grade}") - print("\nGenerated files:") - print(" 📊 summary_statistics.txt") - print(" 📈 residential_load_heatmap_professional.png") - print(" 📈 residential_hourly_profile_professional.png") - print(" 📈 residential_weekend_comparison_professional.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/generate_summary_table.py b/archive/residential_load_viz_2024/generate_summary_table.py deleted file mode 100644 index a498311..0000000 --- a/archive/residential_load_viz_2024/generate_summary_table.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -"""Generate formatted summary table for paper.""" - -from pathlib import Path - -import polars as pl - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -lf = pl.scan_parquet(DATA_FILE) - -monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zip_codes"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.len().alias("observations"), - ]) - .sort("sample_month") - .collect(engine="streaming") -) - -print("=" * 80) -print("TABLE 1: Monthly Sample Summary") -print("=" * 80) -print() -print(f"{'Month':<12} {'Customers':<12} {'ZIP Codes':<12} {'Mean kWh':<12} {'Std Dev':<12} {'Observations':<15}") -print("-" * 80) - -for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - asterisk = "*" if row[0] == "202404" else " " - print(f"{month_label:<11}{asterisk} {row[1]:<12,} {row[2]:<12} {row[3]:<12.4f} {row[4]:<12.4f} {row[5]:<15,}") - -total_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] -total_obs = lf.select(pl.len()).collect(engine="streaming")[0, 0] - -print("-" * 80) -print(f"{'TOTAL':<12} {total_customers:<12,} {'-':<12} {'-':<12} {'-':<12} {total_obs:<15,}") -print() -print("*April 2024: Limited data availability due to utility data quality issues.") -print(" Sample size reduced to 229 customers (24% of typical) despite city-wide") -print(" sampling effort across 47 ZIP codes.") -print("=" * 80) - -# Save to file -output_file = OUTPUT_DIR / "summary_table.txt" -with open(output_file, "w") as f: - f.write("=" * 80 + "\n") - f.write("TABLE 1: Monthly Sample Summary\n") - f.write("=" * 80 + "\n\n") - f.write( - f"{'Month':<12} {'Customers':<12} {'ZIP Codes':<12} {'Mean kWh':<12} {'Std Dev':<12} {'Observations':<15}\n" - ) - f.write("-" * 80 + "\n") - - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - asterisk = "*" if row[0] == "202404" else " " - f.write( - f"{month_label:<11}{asterisk} {row[1]:<12,} {row[2]:<12} {row[3]:<12.4f} {row[4]:<12.4f} {row[5]:<15,}\n" - ) - - f.write("-" * 80 + "\n") - f.write(f"{'TOTAL':<12} {total_customers:<12,} {'-':<12} {'-':<12} {'-':<12} {total_obs:<15,}\n") - f.write("\n*April 2024: Limited data availability due to utility data quality issues.\n") - f.write(" Sample size reduced to 
229 customers (24% of typical) despite city-wide\n") - f.write(" sampling effort across 47 ZIP codes.\n") - -print(f"\n✅ Table saved to: {output_file}") - -# Also save CSV -csv_file = OUTPUT_DIR / "summary_table.csv" -monthly.write_csv(csv_file) -print(f"✅ CSV saved to: {csv_file}") diff --git a/archive/residential_load_viz_2024/sample_april_citywide.py b/archive/residential_load_viz_2024/sample_april_citywide.py deleted file mode 100755 index 5807864..0000000 --- a/archive/residential_load_viz_2024/sample_april_citywide.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python -""" -Sample ALL 55 Chicago ZIPs specifically for April 2024. -Target: Get enough customers so that even with 90% CM90 dropout, we still have ~1000. -""" - -import os -import subprocess - -# All 55 ZIPs with substantial April data (from investigation) -APRIL_ZIPS = [ - "60601", - "60602", - "60603", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Already sampled in pilot -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Only sample new ZIPs -NEW_ZIPS = [z for z in APRIL_ZIPS if z not in PILOT_ZIPS] - -# Sample 200 per ZIP to account for ~90% CM90 dropout -TARGET_PER_ZIP = 200 - -print("=" * 80) -print("APRIL 2024 CITY-WIDE BOOST") -print("=" * 80) -print(f"Sampling {len(NEW_ZIPS)} additional ZIPs for April only") -print(f"Target: {TARGET_PER_ZIP} customers per ZIP") -print(f"Expected raw: {len(NEW_ZIPS) * TARGET_PER_ZIP:,} samples") -print(f"Expected after CM90 (~10% pass): {len(NEW_ZIPS) * TARGET_PER_ZIP * 0.1:,.0f} customers") -print(f"Combined with pilot April: ~{len(NEW_ZIPS) * TARGET_PER_ZIP * 0.1 + 12:.0f} total April customers") -print("=" * 80) - -input("\nPress ENTER to start or Ctrl+C to cancel...") - -successful = [] -failed = [] - -for i, zipcode in enumerate(NEW_ZIPS, 1): - print(f"\n{'=' * 80}") - print(f"[{i}/{len(NEW_ZIPS)}] ZIP {zipcode} - April 2024 only") - print(f"{'=' * 80}") - - output_dir = f"analysis/chicago_april_citywide/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": "202404", # April only - "END_YYYYMM": "202404", # April only - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - ) - - if result.returncode == 0: - print(f"\n✅ ZIP {zipcode} completed") - successful.append(zipcode) - else: - print(f"\n⚠️ ZIP {zipcode} failed") - failed.append(zipcode) - - except KeyboardInterrupt: - print("\n\n⚠️ Interrupted!") - print(f"Completed: {len(successful)}") - print(f"Remaining: {len(NEW_ZIPS) - i}") - exit(1) - except Exception as e: - print(f"\n❌ ZIP {zipcode} error: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("APRIL BOOST COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(NEW_ZIPS)}") -if failed: - print(f"❌ Failed: {len(failed)} ZIPs") - print(f" {', '.join(failed)}") - -print("\nNext: python 
combine_with_april_boost.py") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_chicago_all_2024.py b/archive/residential_load_viz_2024/sample_chicago_all_2024.py deleted file mode 100755 index d09d3a4..0000000 --- a/archive/residential_load_viz_2024/sample_chicago_all_2024.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python -""" -Sample ALL 12 months of 2024 from multiple Chicago ZIP codes. -""" - -import os -import subprocess - -# All 53 Chicago ZIPs with substantial data -ALL_CHICAGO_ZIPS = [ - "60601", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Pilot: 10 diverse neighborhoods -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Configuration -USE_PILOT = True # Set to False for all 53 ZIPs -TARGET_PER_ZIP = 100 # 100 customers per ZIP per month -START_MONTH = "202412" # December (descending order, so this is "start") -END_MONTH = "202401" # January - ALL 12 MONTHS OF 2024 -BASE_OUTPUT = "analysis/chicago_2024_full_year" - -ZIPS_TO_SAMPLE = PILOT_ZIPS if USE_PILOT else ALL_CHICAGO_ZIPS - -print("=" * 80) -print("CHICAGO CITY-WIDE: ALL 12 MONTHS OF 2024") -print("=" * 80) -print(f"Mode: {'PILOT (10 ZIPs)' if USE_PILOT else 'FULL CITY (53 ZIPs)'}") -print(f"Customers per ZIP: {TARGET_PER_ZIP}") -print(f"Total target: {len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP:,} customers per month") -print("Months: January 2024 through December 2024 (ALL 12 MONTHS)") -print(f"Expected total rows: ~{len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP * 12 * 48 * 30:,}") -print("=" * 80) - -input(f"\nThis will sample {len(ZIPS_TO_SAMPLE)} ZIPs. 
Press ENTER to start or Ctrl+C to cancel...") - -successful = [] -failed = [] - -for i, zipcode in enumerate(ZIPS_TO_SAMPLE, 1): - print(f"\n{'=' * 80}") - print(f"[{i}/{len(ZIPS_TO_SAMPLE)}] Processing ZIP {zipcode}...") - print(f"{'=' * 80}") - - output_dir = f"{BASE_OUTPUT}/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": START_MONTH, - "END_YYYYMM": END_MONTH, - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - ) - - if result.returncode == 0: - print(f"\n✅ ZIP {zipcode} completed successfully") - successful.append(zipcode) - else: - print(f"\n⚠️ ZIP {zipcode} failed (code: {result.returncode})") - failed.append(zipcode) - - except KeyboardInterrupt: - print("\n\n⚠️ Interrupted by user!") - print(f"Completed: {len(successful)} ZIPs") - print(f"Failed: {len(failed)} ZIPs") - print(f"Remaining: {len(ZIPS_TO_SAMPLE) - i} ZIPs") - exit(1) - except Exception as e: - print(f"\n❌ ZIP {zipcode} exception: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("SAMPLING COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(ZIPS_TO_SAMPLE)} ZIPs") -if successful: - print(f" {', '.join(successful)}") - -if failed: - print(f"\n❌ Failed: {len(failed)} ZIPs") - print(f" {', '.join(failed)}") - -print("\nNext step: python combine_chicago_zips.py") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_chicago_multizip.py b/archive/residential_load_viz_2024/sample_chicago_multizip.py deleted file mode 100755 index 4c4e92e..0000000 --- a/archive/residential_load_viz_2024/sample_chicago_multizip.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python -""" -Sample from multiple Chicago ZIP codes for city-wide analysis. 
-""" - -import os -import subprocess - -# All 53 Chicago ZIPs with substantial data -ALL_CHICAGO_ZIPS = [ - "60601", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Pilot: representative sample of neighborhoods -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Configuration -USE_PILOT = True # Set to False for full city -TARGET_PER_ZIP = 500 if USE_PILOT else 100 -START_MONTH = "202412" # Dec 2024 (descending, so this is start) -END_MONTH = "202406" # Jun 2024 -BASE_OUTPUT = "analysis/chicago_citywide" - -ZIPS_TO_SAMPLE = PILOT_ZIPS if USE_PILOT else ALL_CHICAGO_ZIPS - -print("=" * 80) -print("CHICAGO CITY-WIDE ELECTRICITY LOAD SAMPLING") -print("=" * 80) -print(f"Mode: {'PILOT (10 ZIPs)' if USE_PILOT else 'FULL CITY (53 ZIPs)'}") -print(f"Customers per ZIP: {TARGET_PER_ZIP}") -print(f"Total target: {len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP:,} customers") -print(f"Months: {END_MONTH} through {START_MONTH}") -print("=" * 80) - -successful = [] -failed = [] - -for i, zipcode in enumerate(ZIPS_TO_SAMPLE, 1): - print(f"\n[{i}/{len(ZIPS_TO_SAMPLE)}] Sampling ZIP {zipcode}...") - - output_dir = f"{BASE_OUTPUT}/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": START_MONTH, - "END_YYYYMM": END_MONTH, - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - capture_output=True, - text=True, - ) - - if result.returncode == 0: - print(f" ✅ ZIP {zipcode} completed") - successful.append(zipcode) - else: - print(f" ⚠️ ZIP {zipcode} failed (code: {result.returncode})") - print(f" Error: {result.stderr[-200:]}") - failed.append(zipcode) - - except Exception as e: - print(f" ❌ ZIP {zipcode} exception: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("SAMPLING COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(ZIPS_TO_SAMPLE)} ZIPs") -print(f"❌ Failed: {len(failed)} ZIPs") - -if failed: - print(f"\nFailed ZIPs: {', '.join(failed)}") - -print("\nNext step: Run combine script to merge all ZIPs into one dataset") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_customers_60622.py b/archive/residential_load_viz_2024/sample_customers_60622.py deleted file mode 100644 index d283de1..0000000 --- a/archive/residential_load_viz_2024/sample_customers_60622.py +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/env python -""" -Sample up to 1,000 customers in ZIP (default 60622) for each month from START_YYYYMM -down to END_YYYYMM (descending), combine to one Parquet, then: - -1) CLIP: keep only rows whose calendar month matches sample_month (safety net for uploads that - include the next month’s first day). -2) CM90 (optional): keep customer-months with >=90% "OK days" (rows/day in {46,48,50}). 
- -Env (with defaults): - BUCKET=smart-meter-data-sb - PREFIX_BASE=sharepoint-files/Zip4 - ZIP5=60622 - TARGET_CUSTOMERS=1000 - START_YYYYMM=202509 - END_YYYYMM=202409 - OUTPUT_DIR=analysis/sample_zip60622_recent - RANDOM_SEED=42 - MAX_RETRIES=5 - RETRY_DELAY=1.5 - MAX_WORKERS=6 -""" - -from __future__ import annotations - -import csv -import logging -import os -import random -import sys -import tempfile -import time -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Optional, Set - -import boto3 -import polars as pl -from botocore.config import Config as BotoConfig -from botocore.exceptions import ClientError - -# ---------------- Config ---------------- -BUCKET = os.getenv("BUCKET", "smart-meter-data-sb") -PREFIX_BASE = os.getenv("PREFIX_BASE", "sharepoint-files/Zip4") -ZIP5 = os.getenv("ZIP5", "60622") -TARGET_CUSTOMERS = int(os.getenv("TARGET_CUSTOMERS", "1000")) - -START_YYYYMM = os.getenv("START_YYYYMM", "202509") -END_YYYYMM = os.getenv("END_YYYYMM", "202409") - -OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", f"analysis/sample_zip{ZIP5}_recent")) -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) -CHECKPOINT = OUTPUT_DIR / "checkpoints" -SHARDS_DIR = CHECKPOINT / "shards" -FINAL_DIR = OUTPUT_DIR / "final" -FINAL_DIR.mkdir(parents=True, exist_ok=True) - -RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42")) -MAX_RETRIES = int(os.getenv("MAX_RETRIES", "5")) -RETRY_DELAY = float(os.getenv("RETRY_DELAY", "1.5")) -MAX_WORKERS = int(os.getenv("MAX_WORKERS", "6")) -BATCH_LOG_EVERY = int(os.getenv("BATCH_LOG_EVERY", "400")) - -random.seed(RANDOM_SEED) - -# output file names -RANGE_TAG = f"{END_YYYYMM}_{START_YYYYMM}" -FINAL_RAW = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}.parquet" -FINAL_CLIPPED = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}_CLIPPED.parquet" -FINAL_CLIPPED_CM90 = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}_CLIPPED_CM90.parquet" - -# -------------- Logging --------------- -LOG_FILE = OUTPUT_DIR / f"run_{datetime.now():%Y%m%d_%H%M%S}.log" -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()], -) -logger = logging.getLogger("zip-monthly-sampler") - -logger.info("=" * 86) -logger.info(f"ZIP {ZIP5} – sample up to {TARGET_CUSTOMERS:,}/month from {START_YYYYMM} → {END_YYYYMM} (descending)") -logger.info("=" * 86) -logger.info(f"S3 bucket/base: s3://{BUCKET}/{PREFIX_BASE}/") -logger.info(f"Output dir: {OUTPUT_DIR}") -logger.info(f"Final RAW: {FINAL_RAW}") -logger.info(f"Final CLIPPED: {FINAL_CLIPPED}") -logger.info(f"Final CM90: {FINAL_CLIPPED_CM90}") -logger.info("=" * 86) - -# -------------- Project transforms -------------- -# Make repo root importable if this script lives in analysis/... 
-sys.path.insert(0, str(Path(__file__).resolve().parents[2])) -from smart_meter_analysis.transformation import add_time_columns, transform_wide_to_long # noqa: E402 - -# -------------- S3 client -------------- -s3 = boto3.client( - "s3", - config=BotoConfig( - retries={"max_attempts": MAX_RETRIES, "mode": "standard"}, - max_pool_connections=max(8, MAX_WORKERS), - ), -) - - -# -------------- Helpers -------------- -def months_desc(start_yyyymm: str, end_yyyymm: str) -> List[str]: - """Inclusive descending list from start → end (e.g., 202509..202409).""" - sy, sm = int(start_yyyymm[:4]), int(start_yyyymm[4:]) - ey, em = int(end_yyyymm[:4]), int(end_yyyymm[4:]) - seq = [] - y, m = sy, sm - while (y > ey) or (y == ey and m >= em): - seq.append(f"{y:04d}{m:02d}") - # step back one month - m -= 1 - if m == 0: - y -= 1 - m = 12 - return seq - - -def _norm(name: str) -> str: - return "".join(ch.lower() for ch in name if ch.isalnum()) - - -def detect_id_col(columns: List[str]) -> Optional[str]: - candidates = {"accountidentifier", "accountid", "acctid", "acctidentifier"} - m = {_norm(c): c for c in columns} - for key in candidates: - if key in m: - return m[key] - if "ACCOUNT_IDENTIFIER" in columns: - return "ACCOUNT_IDENTIFIER" - return None - - -def month_keys(yyyymm: str, zip5: str) -> List[str]: - prefix = f"{PREFIX_BASE}/{yyyymm}/" - out: List[str] = [] - for page in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET, Prefix=prefix): - for obj in page.get("Contents", []): - k = obj["Key"] - # FIXED: Validate filename month matches folder month - if k.endswith(".csv") and f"_{zip5}-" in k and f"_{yyyymm}_" in k: - out.append(k) - return out - - -def download_tmp(key: str) -> Path: - fd, path = tempfile.mkstemp(prefix="s3_", suffix=".csv") - os.close(fd) - p = Path(path) - attempt = 0 - while True: - attempt += 1 - try: - resp = s3.get_object(Bucket=BUCKET, Key=key) - with open(p, "wb") as f: - for chunk in resp["Body"].iter_chunks(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - return p - except ClientError as e: - code = e.response.get("Error", {}).get("Code", "") - msg = e.response.get("Error", {}).get("Message", str(e)) - if code == "RequestTimeTooSkewed": - logger.error("Clock skew detected; sync system time and re-run. AWS: %s", msg) - raise - if attempt >= MAX_RETRIES: - logger.warning("get_object failed for %s: %s %s — giving up.", key, code, msg) - raise - time.sleep(RETRY_DELAY * attempt) - - -def ids_in_file(key: str) -> List[str]: - """Read ONLY the ID column from one CSV.""" - tmp = download_tmp(key) - try: - with open(tmp, newline="") as f: - header = next(csv.reader(f)) - id_col = detect_id_col(header) - if not id_col: - logger.warning("Skipping %s: no account id column. header=%s", key, header) - return [] - ids = ( - pl.read_csv(tmp, columns=[id_col]) - .select(pl.col(id_col).cast(pl.Utf8).str.strip_chars().alias("account_identifier")) - .drop_nulls() - .unique() - .to_series() - .to_list() - ) - return ids - except Exception as e: - logger.warning("Skipping %s: %s", key, e) - return [] - finally: - try: - tmp.unlink(missing_ok=True) - except Exception: - pass - - -def sample_ids_for_month(yyyymm: str, zip5: str, k: int, rng: random.Random) -> Dict[str, Set[str]]: - """Scan files in random order and collect up to k unique IDs. 
Returns {key -> set(ids)}.""" - keys = month_keys(yyyymm, zip5) - rng.shuffle(keys) - selected: Set[str] = set() - picked_by_key: Dict[str, Set[str]] = {} - logger.info("%s: %d candidate CSVs in ZIP%s", yyyymm, len(keys), zip5) - - for i, key in enumerate(keys, 1): - if len(selected) >= k: - break - cand = ids_in_file(key) - if not cand: - continue - new = [cid for cid in cand if cid not in selected] - if not new: - continue - need = k - len(selected) - take = new if len(new) <= need else rng.sample(new, need) - selected.update(take) - picked_by_key.setdefault(key, set()).update(take) - - if i % BATCH_LOG_EVERY == 0: - logger.info("%s: scanned %d/%d files — have %d/%d IDs", yyyymm, i, len(keys), len(selected), k) - - logger.info("%s: sampled %d unique IDs (target %d)", yyyymm, len(selected), k) - return picked_by_key - - -def transform_and_filter(local_csv: Path, keep_ids: Set[str]) -> Optional[pl.DataFrame]: - """Read CSV → early ID filter (wide) → project transforms → normalize ID → final filter.""" - try: - df_wide = pl.read_csv(local_csv) - except Exception as e: - logger.warning("Failed to read %s: %s", local_csv, e) - return None - - id_col = detect_id_col(df_wide.columns) - if id_col: - df_wide = df_wide.filter(pl.col(id_col).cast(pl.Utf8).str.strip_chars().is_in(keep_ids)) - if df_wide.height == 0: - return None - - df_long = transform_wide_to_long(df_wide) - df_long = add_time_columns(df_long) - - out_id = detect_id_col(df_long.columns) or "account_identifier" - if out_id != "account_identifier": - df_long = df_long.rename({out_id: "account_identifier"}) - - df_long = df_long.with_columns(pl.col("account_identifier").cast(pl.Utf8).str.strip_chars()).filter( - pl.col("account_identifier").is_in(keep_ids) - ) - return df_long if df_long.height > 0 else None - - -# -------------- Pipeline -------------- -def run() -> int: - rng = random.Random(RANDOM_SEED) - months = months_desc(START_YYYYMM, END_YYYYMM) - logger.info("Months to process (desc): %s", ", ".join(months)) - - FINAL_RAW.unlink(missing_ok=True) - total_rows = 0 - - for yyyymm in months: - picked_by_key = sample_ids_for_month(yyyymm, ZIP5, TARGET_CUSTOMERS, rng) - if not picked_by_key: - logger.warning("%s: no IDs sampled; skipping month.", yyyymm) - continue - - shard_dir = SHARDS_DIR / yyyymm - shard_dir.mkdir(parents=True, exist_ok=True) - wrote = 0 - - for idx, (key, ids) in enumerate(picked_by_key.items(), 1): - shard_path = shard_dir / (Path(key).name.replace(".csv", ".parquet")) - if shard_path.exists(): - wrote += 1 - continue - try: - tmp = download_tmp(key) - try: - df = transform_and_filter(tmp, ids) - if df is None: - continue - df = df.with_columns( - pl.lit(yyyymm).alias("sample_month"), - pl.lit(key).alias("source_key"), - ) - df.write_parquet(shard_path) - total_rows += df.height - wrote += 1 - finally: - try: - tmp.unlink(missing_ok=True) - except Exception: - pass - except Exception as e: - logger.warning("%s: failed %s: %s", yyyymm, key, e) - - if idx % BATCH_LOG_EVERY == 0: - logger.info("%s: processed %d files; shards: %d", yyyymm, idx, wrote) - - logger.info("%s: wrote %d shard(s) for sampled IDs", yyyymm, wrote) - - # Final sink (RAW) - logger.info("Sinking all shards to RAW Parquet...") - pl.scan_parquet(str(SHARDS_DIR / "*" / "*.parquet")).sink_parquet(FINAL_RAW) - - # CLIP to matching calendar month - logger.info("Clipping to matching calendar month per sample_month...") - lf = ( - pl.scan_parquet(FINAL_RAW) - .with_columns(pl.col("date").dt.strftime("%Y%m").alias("calendar_month")) - 
.filter(pl.col("calendar_month") == pl.col("sample_month")) - .drop("calendar_month") - ) - lf.sink_parquet(FINAL_CLIPPED) - - # CM90 (>=90% OK days, OK = 46/48/50 intervals) - logger.info("Building CM90 (>=90%% OK days; OK rows/day in {46,48,50})...") - lf_c = pl.scan_parquet(FINAL_CLIPPED) - daily = lf_c.group_by(["sample_month", "account_identifier", "date"]).agg(pl.len().alias("rows")) - ok_daily = ( - daily.filter(pl.col("rows").is_in([46, 48, 50])) - .group_by(["sample_month", "account_identifier"]) - .agg(pl.len().alias("ok_days")) - ) - days_in_month = ( - lf_c.select(["sample_month", "date"]).unique().group_by("sample_month").agg(pl.len().alias("days_in_month")) - ) - cm_keep = ( - ok_daily.join(days_in_month, on="sample_month", how="left") - .with_columns((pl.col("ok_days") / pl.col("days_in_month")).alias("pct_ok")) - .filter(pl.col("pct_ok") >= 0.90) - .select(["sample_month", "account_identifier"]) - ) - lf_c.join(cm_keep, on=["sample_month", "account_identifier"], how="inner").sink_parquet(FINAL_CLIPPED_CM90) - - # Quick summaries - def summary(path: Path, tag: str): - tbl = ( - pl.scan_parquet(path) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("unique_ids"), - pl.len().alias("rows"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]) - .collect() - .sort("sample_month") - ) - logger.info("== %s ==", tag) - logger.info("\n%s", tbl) - - summary(FINAL_RAW, "RAW") - summary(FINAL_CLIPPED, "CLIPPED") - summary(FINAL_CLIPPED_CM90, "CLIPPED_CM90") - - logger.info("✅ Done.") - logger.info("RAW: %s", FINAL_RAW) - logger.info("CLIPPED: %s", FINAL_CLIPPED) - logger.info("CLIPPED_CM90:%s", FINAL_CLIPPED_CM90) - return 0 - - -if __name__ == "__main__": - try: - raise SystemExit(run()) - except KeyboardInterrupt: - logger.warning("Interrupted by user. Exiting.") - raise diff --git a/archive/residential_load_viz_2024/visualize_chicago_12months.py b/archive/residential_load_viz_2024/visualize_chicago_12months.py deleted file mode 100644 index ba52b4a..0000000 --- a/archive/residential_load_viz_2024/visualize_chicago_12months.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python -""" -Final visualizations: All 12 months of 2024 Chicago-wide data. -April clearly marked with caveat about limited data availability. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - -def create_heatmap(data_path: str, output_path: str): - """Monthly-hourly heatmap with April footnote.""" - print("\n📊 Creating heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - # Mark April with asterisk - xtick_labels = ax.get_xticklabels() - for label in xtick_labels: - if "2024-04" in label.get_text(): - label.set_text(label.get_text() + "*") - label.set_fontweight("bold") - - ax.set_xticklabels(xtick_labels, rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - # Add footnote about April - fig.text( - 0.5, - 0.02, - "*April 2024: Limited data availability (n=229 vs. 
~950 typical) due to utility data quality issues", - ha="center", - fontsize=10, - style="italic", - color="#666666", - ) - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str): - """Average hourly profile.""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Chicago Households • 2024", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Peak annotation - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - # Baseload annotation - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_monthly_profile(data_path: str, output_path: str): - """Monthly average with April clearly marked.""" - print("\n📊 Creating monthly profile...") - - lf = pl.scan_parquet(data_path) - - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - months = monthly["sample_month"].to_list() - mean_kwh = monthly["mean_kwh"].to_list() - customers = monthly["customers"].to_list() - - # Month labels - month_labels = [f"{m[4:]}/{m[2:4]}" for m in months] # MM/YY format - - # Color bars - April in 
different color - colors = ["#e74c3c" if m == "202404" else "#3498db" for m in months] - - bars = ax.bar(range(len(months)), mean_kwh, color=colors, edgecolor="black", linewidth=1.5) - - # Add customer count on top of each bar - for i, (bar, count) in enumerate(zip(bars, customers)): - height = bar.get_height() - label = f"n={count}" - if months[i] == "202404": - label += "*" - ax.text( - bar.get_x() + bar.get_width() / 2.0, - height + 0.01, - label, - ha="center", - va="bottom", - fontsize=9, - fontweight="bold", - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Monthly Average Electricity Consumption\nChicago 2024", fontsize=18, fontweight="bold", pad=25) - - ax.set_xticks(range(len(months))) - ax.set_xticklabels(month_labels, fontsize=12) - ax.set_ylim(0, max(mean_kwh) * 1.15) - ax.grid(True, alpha=0.3, axis="y") - - # Legend - from matplotlib.patches import Patch - - legend_elements = [ - Patch(facecolor="#3498db", edgecolor="black", label="Normal months"), - Patch(facecolor="#e74c3c", edgecolor="black", label="April (limited data)"), - ] - ax.legend(handles=legend_elements, loc="upper left", fontsize=11) - - # Footnote - fig.text( - 0.5, - 0.02, - "*April 2024: n=229 (24% of typical) due to utility data quality issues", - ha="center", - fontsize=10, - style="italic", - color="#666666", - ) - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str): - """Weekday vs weekend.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Weekday vs Weekend Load Profiles\nChicago 2024", fontsize=18, fontweight="bold", pad=30) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - max_val = max(max(weekday_kwh), max(weekend_kwh)) - ax.set_ylim(0, max_val * 1.25) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CHICAGO 2024 FINAL VISUALIZATIONS") - print("12 Months • 10,053 Unique Customers • 55 ZIP Codes") - print("=" * 80) - - if not Path(DATA_FILE).exists(): - 
print(f"❌ File not found: {DATA_FILE}") - exit(1) - - create_heatmap(DATA_FILE, OUTPUT_DIR / "chicago_2024_heatmap.png") - create_hourly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_hourly_profile.png") - create_monthly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_monthly_profile.png") - create_weekend_comparison(DATA_FILE, OUTPUT_DIR / "chicago_2024_weekend_comparison.png") - - print("\n" + "=" * 80) - print("✅ ALL VISUALIZATIONS COMPLETE!") - print("=" * 80) - print(f"\nOutput directory: {OUTPUT_DIR}") - print("\nGenerated files:") - print(" 📈 chicago_2024_heatmap.png") - print(" 📈 chicago_2024_hourly_profile.png") - print(" 📈 chicago_2024_monthly_profile.png") - print(" 📈 chicago_2024_weekend_comparison.png") - print("\nAll figures include April 2024 data with appropriate caveats.") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py b/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py deleted file mode 100644 index a88d889..0000000 --- a/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python -""" -Final visualizations for Chicago 2024 (CM90 dataset): -- Heatmap shows MEAN kWh per 30-min per customer. -- Monthly bar chart annotates each bar with the month’s mean kWh (not sample size). -- No April-specific asterisks or footnotes anywhere. - -Inputs/outputs: - DATA_FILE -> path to combined CM90 parquet - OUTPUT_DIR -> folder for PNGs -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Style -sns.set_style("whitegrid") -plt.rcParams.update({ - "font.family": "sans-serif", - "font.size": 11, - "axes.labelsize": 13, - "axes.titlesize": 16, - "legend.fontsize": 11, - "figure.titlesize": 18, - "axes.grid": True, - "grid.alpha": 0.3, -}) - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - -def create_heatmap(data_path: str, output_path: str): - """Monthly-hourly heatmap (MEAN kWh per customer).""" - print("\n📊 Creating heatmap (MEAN kWh per customer)...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="mean_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - fig, ax = plt.subplots(figsize=(16, 9)) - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Mean Energy Consumption per Customer (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - 
ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str): - """Average hourly profile across the year (mean and IQR).""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Chicago Households • 2024", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Peak annotation - peak_idx = mean.index(max(mean)) - peak_val = mean[peak_idx] - ax.annotate( - f"Peak\n{peak_val:.3f} kWh\nat {peak_idx}:00", - xy=(peak_idx, peak_val), - xytext=(peak_idx - 4, peak_val + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - # Baseload annotation - min_val = min(mean) - min_idx = mean.index(min_val) - ax.annotate( - f"Baseload\n{min_val:.3f} kWh\nat {min_idx}:00", - xy=(min_idx, min_val), - xytext=(min_idx + 3, min_val + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_monthly_profile(data_path: str, output_path: str): - """ - Monthly average bar chart. - - Bars represent mean kWh per 30-min (as before). - - Annotation above each bar now shows the same average (kWh), NOT the sample count. - - No April asterisk/footnote. 
- """ - print("\n📊 Creating monthly profile (annotate with mean kWh)...") - - lf = pl.scan_parquet(data_path) - - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - months = monthly["sample_month"].to_list() - mean_kwh = monthly["mean_kwh"].to_list() - - month_labels = [f"{m[4:]}/{m[2:4]}" for m in months] - colors = ["#3498db"] * len(months) - - bars = ax.bar(range(len(months)), mean_kwh, color=colors, edgecolor="black", linewidth=1.5) - - # Annotate each bar with its average energy - for i, bar in enumerate(bars): - h = bar.get_height() - ax.text( - bar.get_x() + bar.get_width() / 2.0, - h + 0.01, - f"{mean_kwh[i]:.3f} kWh", - ha="center", - va="bottom", - fontsize=9, - fontweight="bold", - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Monthly Average Electricity Consumption\nChicago 2024", fontsize=18, fontweight="bold", pad=25) - - ax.set_xticks(range(len(months))) - ax.set_xticklabels(month_labels, fontsize=12) - ax.set_ylim(0, max(mean_kwh) * 1.15) - ax.grid(True, alpha=0.3, axis="y") - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str): - """Weekday vs weekend mean kWh per 30-min.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) - - sns.lineplot( - x=weekday["hour"].to_list(), - y=weekday["mean_kwh"].to_list(), - linewidth=3.5, - color="#2980b9", - marker="o", - markersize=8, - ax=ax, - label="Weekday", - ) - sns.lineplot( - x=weekend["hour"].to_list(), - y=weekend["mean_kwh"].to_list(), - linewidth=3.5, - color="#e67e22", - marker="s", - markersize=8, - ax=ax, - label="Weekend", - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Weekday vs Weekend Load Profiles\nChicago 2024", fontsize=18, fontweight="bold", pad=30) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - max_val = max(weekday["mean_kwh"].max(), weekend["mean_kwh"].max()) - ax.set_ylim(0, max_val * 1.25) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CHICAGO 2024 FINAL VISUALIZATIONS") - print("=" * 80) - - if not Path(DATA_FILE).exists(): - print(f"❌ File not found: {DATA_FILE}") - raise SystemExit(1) - - create_heatmap(DATA_FILE, OUTPUT_DIR / "chicago_2024_heatmap.png") - create_hourly_profile(DATA_FILE, OUTPUT_DIR 
/ "chicago_2024_hourly_profile.png") - create_monthly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_monthly_profile.png") - create_weekend_comparison(DATA_FILE, OUTPUT_DIR / "chicago_2024_weekend_comparison.png") - - print("\n" + "=" * 80) - print("✅ ALL VISUALIZATIONS COMPLETE!") - print("=" * 80) - print(f"\nOutput directory: {OUTPUT_DIR}") - print("=" * 80) diff --git a/archive/test_files/fix_final.py b/archive/test_files/fix_final.py deleted file mode 100644 index b12ee10..0000000 --- a/archive/test_files/fix_final.py +++ /dev/null @@ -1,11 +0,0 @@ -with open("smart_meter_analysis/census.py") as f: - lines = f.readlines() - -# Find and fix the _import_cenpy line -for i, line in enumerate(lines): - if "def _import_cenpy()" in line: - lines[i] = "def _import_cenpy(): # type: ignore[no-untyped-def]\n" - break - -with open("smart_meter_analysis/census.py", "w") as f: - f.writelines(lines) diff --git a/docs/index.html b/docs/index.html index 84fecc5..b829477 100644 --- a/docs/index.html +++ b/docs/index.html @@ -155,29 +155,91 @@
-We begin by aggregating load profile clusters at the block group level. If \(j\) is the block group, and \(q\) is the cluster (\(q\in\{1,\ldots,k\}\)), then
+We begin by aggregating load profile clusters at the block group level. If \(j\) is the block group, and \(q\) is the cluster (\(q\in\{0,1,2,3\}\)), then
\[ C_{jq} = \sum_{i\in \mathrm{bg}(j)} \mathrm{I}(c_i = q) \]
is the count of household-day observations assigned to cluster \(q\) in block group \(j\), where \(c_i\) denotes the cluster assignment for household-day observation \(i\). Under this aggregation, a single household with multiple sampled days contributes multiple observations to the block group totals.
We further normalize these to \(\pi_{jq} = \frac{C_{jq}}{\sum_q C_{jq}}\), the proportion of cluster assignments in each block group.
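As a concrete illustration, a minimal polars sketch of this aggregation step is shown below. The column names (block_group, cluster) and the toy rows are hypothetical placeholders, not the project's actual schema.

import polars as pl

# Long table of household-day cluster assignments (toy data; hypothetical columns).
assignments = pl.DataFrame({
    "block_group": ["170310101001", "170310101001", "170310101002", "170310101002"],
    "cluster": [0, 2, 2, 3],
})

counts = (
    assignments.group_by(["block_group", "cluster"])
    .agg(pl.len().alias("C_jq"))  # C_jq: household-day observations per cluster
    .with_columns(
        # pi_jq: share of each block group's observations falling in cluster q
        (pl.col("C_jq") / pl.col("C_jq").sum().over("block_group")).alias("pi_jq")
    )
    .sort(["block_group", "cluster"])
)
print(counts)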
-Our aim is to understand how block group level demographics predict the load cluster mix. Because \((\pi_{j1},\ldots,\pi_{jk})\) are compositional proportions that must sum to 1, we model log-ratios of cluster proportions. To prevent numerical instability from zero proportions, we apply Laplace smoothing (\(\alpha = 0.5\) pseudocount) before computing log-ratios. Specifically, we define smoothed proportions
+Our aim is to understand how block group level demographics predict the load cluster mix. Because \((\pi_{j0},\pi_{j1},\pi_{j2},\pi_{j3})\) are compositional proportions that must sum to 1, we use multinomial logistic regression to model the distribution of cluster assignments. This approach properly accounts for the compositional constraint, handles zero counts naturally through the multinomial likelihood, and provides statistically correct standard errors and hypothesis tests.
+We model the counts \((C_{j0}, C_{j1}, C_{j2}, C_{j3})\) as following a multinomial distribution:
\[
-\tilde{\pi}_{jq} = \frac{C_{jq} + \alpha}{\sum_{q=1}^{k} C_{jq} + k\alpha},
-\qquad \alpha = 0.5,
+(C_{j0}, C_{j1}, C_{j2}, C_{j3}) \sim \mathrm{Multinomial}\left(\sum_q C_{jq}; \pi_{j0}, \pi_{j1}, \pi_{j2}, \pi_{j3}\right)
\]
-which ensures \(\tilde{\pi}_{jq} > 0\) for all \(q\) and prevents \(\log(0) = -\infty\).
-We fit one model per cluster outcome (except the baseline), modeling the log-ratio of each cluster’s share relative to baseline cluster 1. Specifically, for each \(q\in\{2,\ldots,k\}\), we fit:
+We fit a single unified multinomial logit model with Cluster 2 as the baseline (reference) category. This cluster contains the largest share of household-day observations and exhibits the most stable, representative load-shape pattern, making it a natural reference point for the log-odds contrasts. The model estimates coefficients for Clusters 0, 1, and 3 relative to this baseline, parameterizing the log-odds as:
\[
-\log\left(\frac{\tilde{\pi}_{jq}}{\tilde{\pi}_{j1}}\right)
-= \beta_{0q}
-+ \beta_{1q} X_{1j}
-+ \beta_{2q} X_{2j}
-+ \cdots
-+ \varepsilon_{jq}.
+\log\left(\frac{\pi_{jq}}{\pi_{j2}}\right) = \beta_{0q} + \beta_{1q} X_{1j} + \beta_{2q} X_{2j} + \cdots + \beta_{pq} X_{pj}
\]
-We estimate these as separate weighted least squares (WLS) regressions, weighting each block group \(j\) by \(\mathrm{total\_obs}_j\), the total number of household-day observations available for that block group. This weighting gives greater influence to block groups with more observed data. As a robustness check, we also estimate unweighted OLS versions of the same log-ratio regressions; large differences between WLS and OLS estimates would indicate results are disproportionately driven by high-observation block groups.
-Coefficients are interpreted on the log-ratio scale: \(\beta_{pq}\) is the expected change in \(\log(\tilde{\pi}_{jq}/\tilde{\pi}_{j1})\) for a one-unit increase in predictor \(X_{pj}\), holding other predictors constant. Equivalently, \(\exp(\beta_{pq})\) is the multiplicative effect on the proportion ratio \(\tilde{\pi}_{jq}/\tilde{\pi}_{j1}\) for a one-unit increase in \(X_{pj}\).
+for \(q \in \{0, 1, 3\}\) (i.e., all non-baseline clusters).
+This parameterization ensures that the predicted cluster shares are strictly positive and sum to 1 within each block group, and that every coefficient has a direct interpretation as a log-odds contrast against the baseline cluster.
+We estimate this model via maximum likelihood using the VGAM package in R, which implements iteratively reweighted least squares (IRLS) optimization. The multinomial likelihood naturally weights observations by their sample size—block groups with more household-day observations contribute more to the likelihood and thus have greater influence on parameter estimates.
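For readers working in Python rather than R, the grouped multinomial fit can be approximated as sketched below: each block group is expanded into one weighted row per cluster and passed to scikit-learn's multinomial logistic regression. This is only an illustrative analogue of the VGAM estimation described above; the simulated arrays are placeholders, and penalty=None (scikit-learn >= 1.2) is set so the fit is an unpenalized maximum-likelihood estimate.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n_bg, n_pred, k = 200, 5, 4
X = rng.normal(size=(n_bg, n_pred))            # block-group predictors (placeholder)
counts = rng.integers(1, 60, size=(n_bg, k))   # C_jq: household-day counts per cluster

# Expand to one row per (block group, cluster), weighted by that cluster's count,
# so the weighted log-likelihood matches the grouped multinomial log-likelihood
# up to an additive constant.
X_rep = np.repeat(X, k, axis=0)
y_rep = np.tile(np.arange(k), n_bg)
w_rep = counts.ravel().astype(float)

model = LogisticRegression(penalty=None, max_iter=2000)  # multinomial softmax via lbfgs
model.fit(X_rep, y_rep, sample_weight=w_rep)

# scikit-learn uses a symmetric (softmax) parameterization; subtracting the row for
# the baseline cluster (index 2) recovers the reference-coded log-odds beta_q - beta_2.
baseline = 2
coef_vs_baseline = model.coef_ - model.coef_[baseline]
intercept_vs_baseline = model.intercept_ - model.intercept_[baseline]
print(coef_vs_baseline.shape)  # (4, 5); the baseline row is identically zero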
+An alternative approach would be to compute empirical log-ratios \(\log(C_{jq}/C_{j2})\) and model them using separate weighted ordinary least squares (OLS) regressions. However, this approach has several statistical problems: the log-ratio is undefined whenever a cluster count is zero (forcing ad hoc pseudocount smoothing), the separate equations ignore the correlation among a block group's cluster shares, and the resulting standard errors do not reflect the multinomial sampling variability of the underlying counts.
+Multinomial logit avoids all these issues by using the correct likelihood for multinomial count data.
+Our analysis includes 47 demographic predictor variables across five categories (spatial, economic, housing, household, and demographic characteristics). However, several of these variables form compositional groups where predictors sum to a constant (e.g., income brackets sum to 100% of households). This creates linear dependencies (rank deficiency) in the design matrix.
+To handle this, we drop one reference category from each compositional group before estimation, so that the remaining predictors are linearly independent.
+The final model includes 43 terms per cluster equation (42 predictors + 1 intercept), yielding 129 total coefficients across the 3 non-baseline cluster equations.
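The effect of the compositional constraint can be verified with a small numerical check like the one below; the group sizes and column layout are made up purely to illustrate why one category per group must be dropped.

import numpy as np

rng = np.random.default_rng(1)
n = 50
shares = rng.dirichlet(np.ones(4), size=n)     # e.g., four income-bracket shares summing to 1
other = rng.normal(size=(n, 2))                # unrelated predictors
intercept = np.ones((n, 1))

X_full = np.hstack([intercept, shares, other])          # keeps all 4 brackets
X_drop = np.hstack([intercept, shares[:, 1:], other])   # drops one reference bracket

print(np.linalg.matrix_rank(X_full), X_full.shape[1])   # rank 6 < 7 columns (deficient)
print(np.linalg.matrix_rank(X_drop), X_drop.shape[1])   # rank 6 == 6 columns (full rank)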
+Coefficients are interpreted on the log-odds scale: \(\beta_{pq}\) is the expected change in \(\log(\pi_{jq}/\pi_{j2})\) for a one-unit increase in predictor \(X_{pj}\), holding other predictors constant.
+Equivalently, \(\exp(\beta_{pq})\) is the multiplicative effect on the ratio \(\pi_{jq}/\pi_{j2}\) for a one-unit increase in \(X_{pj}\). For example, \(\beta_{pq} = 0.5\) implies that a one-unit increase in \(X_{pj}\) multiplies \(\pi_{jq}/\pi_{j2}\) by \(\exp(0.5) \approx 1.65\), a roughly 65% increase in cluster \(q\)'s share relative to the baseline cluster.
+To obtain predicted probabilities for a block group with covariates \(X_j\), we use the softmax transformation:
+\[
+\hat{\pi}_{jq} = \begin{cases}
+\frac{\exp(\hat{\beta}_{0q} + \hat{\beta}_{1q} X_{1j} + \cdots)}
+{1 + \sum_{m \in \{0,1,3\}} \exp(\hat{\beta}_{0m} + \hat{\beta}_{1m} X_{1j} + \cdots)}
+& \text{if } q \in \{0,1,3\} \\[1em]
+\frac{1}
+{1 + \sum_{m \in \{0,1,3\}} \exp(\hat{\beta}_{0m} + \hat{\beta}_{1m} X_{1j} + \cdots)}
+& \text{if } q = 2 \text{ (baseline)}
+\end{cases}
+\]
+These predicted probabilities automatically sum to 1 and are guaranteed to be positive.
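A minimal NumPy sketch of this prediction step, with made-up coefficient values and the baseline row fixed at zero, is given below.

import numpy as np

def predict_shares(x, intercepts, coefs):
    """intercepts: (k,), coefs: (k, p); the baseline cluster's row is zero."""
    eta = intercepts + coefs @ x            # linear predictors; eta[baseline] == 0
    expeta = np.exp(eta - eta.max())        # subtract max for numerical stability
    return expeta / expeta.sum()            # positive probabilities that sum to 1

# Hypothetical estimates: cluster 2 (index 2) is the baseline, so its row is zero.
intercepts = np.array([0.3, -0.1, 0.0, 0.5])
coefs = np.array([[0.2, -0.4], [0.1, 0.0], [0.0, 0.0], [-0.3, 0.6]])
x = np.array([1.2, -0.5])                   # hypothetical block-group covariates

pi_hat = predict_shares(x, intercepts, coefs)
print(pi_hat, pi_hat.sum())                 # the shares sum to 1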
+Our final model achieves a McFadden pseudo-R² of 0.567, a large improvement in log-likelihood relative to an intercept-only model (McFadden’s measure is likelihood-based rather than a literal share of variance explained; values in the 0.2–0.4 range are already considered an excellent fit for discrete-choice models). The model converged using VGAM’s IRLS optimizer, and all coefficients have valid standard errors for hypothesis testing.
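For reference, McFadden's pseudo-R² is computed as \(1 - \ell_{\text{model}} / \ell_{\text{null}}\); the two log-likelihood values below are hypothetical numbers chosen only to show the arithmetic.

ll_model = -2150.0  # hypothetical log-likelihood of the fitted multinomial logit
ll_null = -4965.0   # hypothetical log-likelihood of the intercept-only model
pseudo_r2 = 1.0 - ll_model / ll_null
print(round(pseudo_r2, 3))  # 0.567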
+