diff --git a/.gitignore b/.gitignore index 15eb7d1..fc6880b 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,15 @@ results/ profiles/ docs/*.html docs/index_files/ + +# --- Local junk / scratch artifacts (root) --- +sample*.csv +*.log +Untitled +tl_2023_17_bg.zip + +# --- Quarantine area (local) --- +archive_quarantine/ + +# Legacy artifacts (do not track) +archive/ diff --git a/analysis/clustering/euclidean_clustering_k_search.py b/analysis/clustering/euclidean_clustering_k_search.py index b655065..5ab66f2 100644 --- a/analysis/clustering/euclidean_clustering_k_search.py +++ b/analysis/clustering/euclidean_clustering_k_search.py @@ -228,7 +228,7 @@ def predict_and_write_assignments_streaming( pf = pq.ParquetFile(input_path) out_schema = pf.schema_arrow.append(pa.field("cluster", pa.int32())) - writer = pq.ParquetWriter(output_path, out_schema, compression="zstd") + writer = pq.ParquetWriter(output_path, out_schema, compression="snappy") k = int(model.n_clusters) counts = np.zeros(k, dtype=np.int64) @@ -273,7 +273,7 @@ def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: x = np.arange(n_timepoints) xlabel = "Time Interval" - fig, ax = plt.subplots(figsize=(12, 6)) + _fig, ax = plt.subplots(figsize=(12, 6)) for i in range(k): ax.plot(x, centroids[i], label=f"Cluster {i}", linewidth=2) diff --git a/analysis/clustering/euclidean_clustering_minibatch.py b/analysis/clustering/euclidean_clustering_minibatch.py index 49a690d..8ddd1b3 100644 --- a/analysis/clustering/euclidean_clustering_minibatch.py +++ b/analysis/clustering/euclidean_clustering_minibatch.py @@ -16,10 +16,14 @@ from __future__ import annotations import argparse +import gc import json import logging +import os +import time from collections.abc import Iterable from pathlib import Path +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -29,15 +33,118 @@ from sklearn.cluster import MiniBatchKMeans from sklearn.metrics import silhouette_score +try: + import psutil # optional +except Exception: + psutil = None + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) +CGROUP_ROOT = Path("/sys/fs/cgroup") +_BASELINE_CGROUP_EVENTS: dict[str, int] = {} + # ============================================================================= -# IO + batch utilities +# MEMORY TELEMETRY (unprivileged container-safe) # ============================================================================= +def _read_text_file(path: Path) -> str | None: + try: + return path.read_text(encoding="utf-8").strip() + except Exception: + return None + + +def _read_int_file(path: Path) -> int | None: + s = _read_text_file(path) + if s is None: + return None + try: + return int(s) + except Exception: + return None + +def _get_rss_bytes() -> int | None: + try: + if psutil is not None: + return int(psutil.Process(os.getpid()).memory_info().rss) + except Exception: # noqa: S110 + pass + try: + with open("/proc/self/status", encoding="utf-8") as f: + for line in f: + if line.startswith("VmRSS:"): + parts = line.split() + return int(parts[1]) * 1024 + except Exception: + return None + return None + +def _read_cgroup_memory_bytes() -> dict[str, int | None]: + cur = _read_int_file(CGROUP_ROOT / "memory.current") + peak = _read_int_file(CGROUP_ROOT / "memory.peak") + maxv = _read_text_file(CGROUP_ROOT / "memory.max") + limit = None + if maxv is not None and maxv != "max": + try: + limit = int(maxv) + except Exception: + limit = None + return {"current": 
cur, "peak": peak, "limit": limit} + + +def _read_cgroup_memory_events() -> dict[str, int]: + out: dict[str, int] = {} + try: + txt = (CGROUP_ROOT / "memory.events").read_text(encoding="utf-8") + for line in txt.splitlines(): + parts = line.split() + if len(parts) == 2: + out[parts[0]] = int(parts[1]) + except Exception: # noqa: S110 + pass + return out + + +def _mb(x: int | None) -> float | None: + if x is None: + return None + return round(x / (1024.0 * 1024.0), 3) + + +def log_memory(stage: str, extra: dict[str, Any] | None = None) -> None: + global _BASELINE_CGROUP_EVENTS + if not _BASELINE_CGROUP_EVENTS: + _BASELINE_CGROUP_EVENTS = _read_cgroup_memory_events() + + rss_b = _get_rss_bytes() + cg = _read_cgroup_memory_bytes() + ev = _read_cgroup_memory_events() + keys = set(_BASELINE_CGROUP_EVENTS) | set(ev) + delta = {k: ev.get(k, 0) - _BASELINE_CGROUP_EVENTS.get(k, 0) for k in keys} + + payload: dict[str, Any] = { + "ts": round(time.time(), 3), + "event": "mem", + "stage": stage, + "rss_mb": _mb(rss_b), + "cgroup_current_mb": _mb(cg.get("current")), + "cgroup_peak_mb": _mb(cg.get("peak")), + "cgroup_limit_mb": _mb(cg.get("limit")), + "cgroup_oom_kill_delta": delta.get("oom_kill", 0), + "cgroup_events_delta": delta, + } + if extra: + payload.update(extra) + + logger.info("[MEMORY] %s", json.dumps(payload, separators=(",", ":"), sort_keys=True)) + + +# ============================================================================= +# IO + batch utilities +# ============================================================================= def parquet_num_rows(path: Path) -> int: """Return Parquet row count from file metadata (no full scan).""" pf = pq.ParquetFile(path) @@ -57,15 +164,82 @@ def iter_profile_batches(path: Path, batch_size: int, columns: list[str] | None) yield from pf.iter_batches(batch_size=batch_size, columns=columns) +def _list_array_to_numpy_matrix(arr: pa.Array) -> np.ndarray: + """ + Convert Arrow list-like array (List/LargeList/FixedSizeList) to a 2D NumPy array + WITHOUT materializing Python lists. + + Assumes lists are uniform length within the batch (true for 48-pt profiles). + """ + if isinstance(arr, pa.FixedSizeListArray): + n = len(arr) + list_size = int(arr.type.list_size) + values = arr.values + v = values.to_numpy(zero_copy_only=False) + if v.size != n * list_size: + raise ValueError(f"Unexpected FixedSizeList flatten size={v.size} for n={n}, list_size={list_size}") + X = v.reshape((n, list_size)) + return X + + if not (pa.types.is_list(arr.type) or pa.types.is_large_list(arr.type)): + raise TypeError(f"Expected List/LargeList/FixedSizeList; got {arr.type}") + + n = len(arr) + if n == 0: + return np.empty((0, 0), dtype=np.float64) + + # ListArray / LargeListArray: use offsets + values + # Offsets length = n+1; differences must be constant (48) for this dataset. 
+ offsets = arr.offsets.to_numpy(zero_copy_only=False) + diffs = np.diff(offsets) + if diffs.size == 0: + return np.empty((0, 0), dtype=np.float64) + + first = int(diffs[0]) + if not np.all(diffs == first): + raise ValueError("Non-uniform list lengths detected in profile column within a batch; cannot reshape safely.") + if first <= 0: + raise ValueError(f"Invalid list length inferred from offsets: {first}") + + values = arr.values + v = values.to_numpy(zero_copy_only=False) + expected = int(n * first) + if v.size < expected: + raise ValueError(f"Flattened values too short: {v.size} < expected {expected}") + if v.size != expected: + # This should not happen for uniform lists; fail loud to avoid silent corruption. + raise ValueError(f"Flattened values size mismatch: {v.size} != expected {expected}") + + X = v.reshape((n, first)) + return X + + def recordbatch_profiles_to_numpy(rb: pa.RecordBatch, profile_col: str = "profile") -> np.ndarray: - """Convert RecordBatch `profile` list column into a 2D float64 NumPy array.""" + """ + Convert RecordBatch `profile` list column into a 2D float64 NumPy array, + avoiding Python object materialization. + """ idx = rb.schema.get_field_index(profile_col) if idx < 0: raise ValueError(f"RecordBatch missing required column '{profile_col}'") - profiles = rb.column(idx).to_pylist() - X = np.asarray(profiles, dtype=np.float64) + col = rb.column(idx) + + # RecordBatch columns are typically Array, not ChunkedArray; handle defensively. + if isinstance(col, pa.ChunkedArray): + if col.num_chunks != 1: + # Concatenate chunks cheaply in Arrow-space (still bounded by batch size). + col = pa.chunked_array(col.chunks).combine_chunks() + col_arr = col.chunk(0) + else: + col_arr = col + + X = _list_array_to_numpy_matrix(col_arr).astype(np.float64, copy=False) if X.ndim != 2: raise ValueError(f"Expected 2D profile array; got shape={X.shape}") + + if not np.isfinite(X).all(): + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) + return X @@ -96,8 +270,6 @@ def parse_output_columns(spec: str) -> list[str] | None: # ============================================================================= # Normalization # ============================================================================= - - def normalize_batch(X: np.ndarray, method: str) -> np.ndarray: """Row-wise normalization: minmax, zscore, or none. 
Constant rows -> zeros.""" if method in ("none", "", None): @@ -126,8 +298,6 @@ def normalize_batch(X: np.ndarray, method: str) -> np.ndarray: # ============================================================================= # Clustering # ============================================================================= - - def fit_minibatch_kmeans( input_path: Path, k: int, @@ -139,6 +309,7 @@ def fit_minibatch_kmeans( ) -> MiniBatchKMeans: """Fit MiniBatchKMeans by streaming over `profile` batches and calling partial_fit().""" logger.info("Fitting MiniBatchKMeans (k=%d, batch_size=%s, n_init=%d)...", k, f"{batch_size:,}", n_init) + log_memory("fit_start", {"k": int(k), "batch_size": int(batch_size), "n_init": int(n_init)}) model = MiniBatchKMeans( n_clusters=k, @@ -159,10 +330,15 @@ def fit_minibatch_kmeans( X = normalize_batch(X, normalize_method) model.partial_fit(X) seen += int(X.shape[0]) - if bi % 10 == 0 or bi == n_batches: - logger.info(" Trained batch %d/%d (seen=%s)", bi, n_batches, f"{seen:,}") + + if bi == 1 or bi == n_batches or bi % 10 == 0: + log_memory("fit_progress", {"batch_i": int(bi), "n_batches": int(n_batches), "seen": int(seen)}) + + del X, rb + gc.collect() logger.info(" Training complete. Inertia: %s", f"{float(model.inertia_):,.2f}") + log_memory("fit_done", {"seen": int(seen), "inertia": float(model.inertia_)}) return model @@ -200,6 +376,7 @@ def predict_write_and_optional_silhouette( # noqa: C901 """ logger.info("Predicting labels + writing assignments streaming: %s", output_path) output_path.parent.mkdir(parents=True, exist_ok=True) + log_memory("predict_start", {"batch_size": int(batch_size)}) k = int(model.n_clusters) counts = np.zeros(k, dtype=np.int64) @@ -211,15 +388,15 @@ def predict_write_and_optional_silhouette( # noqa: C901 global_row = 0 writer: pq.ParquetWriter | None = None - out_schema: pa.Schema | None = None try: - for rb in iter_profile_batches(input_path, batch_size=batch_size, columns=read_columns): + for bi, rb in enumerate(iter_profile_batches(input_path, batch_size=batch_size, columns=read_columns), start=1): # Compute labels (requires profile) X = recordbatch_profiles_to_numpy(rb, profile_col="profile") if normalize and normalize_method != "none": X = normalize_batch(X, normalize_method) - labels = model.predict(X).astype(np.int32) + + labels = model.predict(X).astype(np.int32, copy=False) counts += np.bincount(labels, minlength=k) # Silhouette sampling (uses labels on this pass; avoids a second scan) @@ -236,9 +413,10 @@ def predict_write_and_optional_silhouette( # noqa: C901 if sample_pos > start: idx_in_batch = silhouette_sample_idx[start:sample_pos] - global_row + # Copy rows into a compact array later; these are small (<=5000). 
for j in idx_in_batch: jj = int(j) - sample_X.append(X[jj]) + sample_X.append(X[jj].copy()) sample_y.append(int(labels[jj])) # Build output batch: select write columns (or all columns), then append cluster @@ -255,14 +433,18 @@ def predict_write_and_optional_silhouette( # noqa: C901 out_rb = out_rb.append_column("cluster", pa.array(labels, type=pa.int32())) - # Initialize writer lazily from first out_rb schema if writer is None: - out_schema = out_rb.schema - writer = pq.ParquetWriter(output_path, out_schema, compression="zstd") - writer.write_batch(out_rb) + writer = pq.ParquetWriter(output_path, out_rb.schema, compression="snappy") + writer.write_batch(out_rb) global_row += int(labels.shape[0]) + if bi == 1 or (bi % 10 == 0): + log_memory("predict_progress", {"batch_i": int(bi), "rows_written": int(global_row)}) + + del out_rb, labels, X, rb + gc.collect() + finally: if writer is not None: writer.close() @@ -279,21 +461,22 @@ def predict_write_and_optional_silhouette( # noqa: C901 if np.unique(ys).size >= 2: Xs = np.asarray(sample_X, dtype=np.float64) logger.info("Computing silhouette on sample_size=%s ...", f"{len(ys):,}") + log_memory("silhouette_start", {"sample_size": len(ys)}) sil = float(silhouette_score(Xs, ys, metric="euclidean")) logger.info(" Silhouette score (sample): %.3f", sil) + log_memory("silhouette_done", {"silhouette": float(sil)}) else: logger.info("Silhouette: skipped (sample fell into a single cluster)") elif use_sil: logger.info("Silhouette: skipped (insufficient sample)") + log_memory("predict_done", {"rows_written": int(total)}) return counts, sil # ============================================================================= # Plotting + outputs # ============================================================================= - - def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: """Save a line plot of cluster centroids.""" k = int(centroids.shape[0]) @@ -306,7 +489,7 @@ def plot_centroids(centroids: np.ndarray, output_path: Path) -> None: x = np.arange(n_timepoints) xlabel = "Time Interval" - fig, ax = plt.subplots(figsize=(12, 6)) + _fig, ax = plt.subplots(figsize=(12, 6)) for i in range(k): ax.plot(x, centroids[i], label=f"Cluster {i}", linewidth=2) @@ -348,8 +531,6 @@ def save_metadata(metadata: dict[str, object], output_path: Path) -> None: # ============================================================================= # Main # ============================================================================= - - def build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( description="Memory-Efficient K-Means Clustering (MiniBatch, PyArrow-batched)", @@ -412,18 +593,16 @@ def main() -> int: n_profiles = parquet_num_rows(args.input) logger.info("Profiles (from parquet metadata): %s", f"{n_profiles:,}") + log_memory("start", {"n_profiles": int(n_profiles)}) - # Output column handling requested_out_cols = parse_output_columns(args.output_columns) - # Read columns must include profile for prediction; if requested_out_cols is None -> read all columns. - # If a subset was requested, read only: requested_out_cols + profile (deduped). 
if requested_out_cols is None: read_columns = None - write_columns = None # write everything we read (which is everything) + write_columns = None logger.info("Assignments output columns: ALL (default)") else: - read_columns = [] + read_columns: list[str] = [] seen = set() for c in [*requested_out_cols, "profile"]: if c not in seen: @@ -432,7 +611,6 @@ def main() -> int: write_columns = requested_out_cols logger.info("Assignments output columns: %s (+cluster)", ",".join(write_columns)) - # Fit model = fit_minibatch_kmeans( input_path=args.input, k=int(args.k), @@ -444,14 +622,12 @@ def main() -> int: ) centroids = model.cluster_centers_ - # Deterministic silhouette sample indices (optional) sample_idx = make_silhouette_sample_idx(int(n_profiles), int(args.silhouette_sample_size), int(args.seed)) if sample_idx.size > 0: logger.info("Silhouette sample size: %s", f"{sample_idx.size:,}") else: logger.info("Silhouette: disabled") - # Predict + write assignments (and optional silhouette) assignments_path = args.output_dir / "cluster_assignments.parquet" counts, sil_score = predict_write_and_optional_silhouette( model=model, @@ -465,7 +641,6 @@ def main() -> int: silhouette_sample_idx=sample_idx, ) - # Save artifacts centroids_plot_path = args.output_dir / "cluster_centroids.png" centroids_parquet_path = args.output_dir / "cluster_centroids.parquet" metadata_path = args.output_dir / "clustering_metadata.json" @@ -503,6 +678,7 @@ def main() -> int: if sil_score is not None: logger.info("Silhouette (sample): %.3f", float(sil_score)) logger.info("Output: %s", args.output_dir) + log_memory("end") return 0 diff --git a/analysis/clustering/prepare_clustering_data_households.py b/analysis/clustering/prepare_clustering_data_households.py index d285d6b..451ae3b 100644 --- a/analysis/clustering/prepare_clustering_data_households.py +++ b/analysis/clustering/prepare_clustering_data_households.py @@ -6,115 +6,311 @@ Transforms interval-level energy data into daily load profiles at the HOUSEHOLD (account) level for k-means clustering. -MODIFIED: Now accepts multiple input parquet files (e.g., split by time period). +MODIFIED: Accepts multiple input parquet files (e.g., split by time period). +ROBUST: Also accepts *_accounts.parquet and *_dates.parquet as inputs (as produced +by the pipeline), and will not attempt to treat them as interval data. What this does: - 1. Validate schema (no full data scan) - 2. Build/load manifests for accounts and dates (streaming, memory-safe) + 1. Validate schema (schema-only checks; no full scan) + 2. Build/load manifests for accounts and dates (memory-safe) 3. Sample households + dates from manifests 4. Create daily 48-point load profiles per household 5. Output profiles ready for clustering Design notes: - Uses MANIFESTS to avoid OOM on unique().collect() for large files - - Manifests are built once via sink_parquet (streaming) and cached - - Standard mode: single filtered collect() for selected households/dates - * good for up to ~5k-10k households sampled (depending on memory) - - Chunked streaming mode: fully streaming pipeline with per-chunk sinks - * required for 10k+ households on constrained hardware - - Profiles are 48 half-hourly kWh values in chronological order (00:30-24:00) - - Incomplete days (not 48 intervals) are dropped as "missing/irregular data" + - Chunked streaming mode writes per-chunk parquet files and then performs a + bounded-memory merge via PyArrow iter_batches + ParquetWriter. 
+ - In streaming mode we DO NOT read the full merged sampled_profiles.parquet + into memory; we derive the household map and stats via lazy scans. Output files: - sampled_profiles.parquet: - One row per (account_identifier, date) with 'profile' = list[float] of length 48 + One row per (account_identifier, date) with 'profile' = list[float] length 48 - household_zip4_map.parquet: Unique account_identifier → zip_code mapping for later joins - -Manifest files (auto-generated alongside input, via smart_meter_analysis.manifests): - - {input_stem}_accounts.parquet: unique (account_identifier, zip_code) pairs - - {input_stem}_dates.parquet: unique (date, is_weekend, weekday) tuples """ from __future__ import annotations import argparse import gc +import json import logging +import os +import time +from calendar import monthrange +from collections.abc import Sequence from pathlib import Path from typing import Any, Literal import polars as pl +import pyarrow.parquet as pq from smart_meter_analysis.manifests import ensure_account_manifest, ensure_date_manifest +try: + import psutil # optional +except Exception: + psutil = None + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) -REQUIRED_ENERGY_COLS = ["zip_code", "account_identifier", "datetime", "kwh"] -REQUIRED_TIME_COLS = ["date", "hour", "is_weekend", "weekday"] +REQUIRED_INTERVAL_COLS = ["zip_code", "account_identifier", "datetime", "kwh", "date", "hour", "is_weekend", "weekday"] +REQUIRED_ACCOUNT_MANIFEST_COLS = ["account_identifier", "zip_code"] +REQUIRED_DATE_MANIFEST_COLS = ["date", "is_weekend", "weekday"] + +CGROUP_ROOT = Path("/sys/fs/cgroup") + +# No CLI flags per requirement: keep bounded-memory merge settings internal. 
+MERGE_BATCH_SIZE_ROWS = 65_536 +FINAL_PARQUET_COMPRESSION = "snappy" # ============================================================================= -# MEMORY INSTRUMENTATION +# MEMORY TELEMETRY (unprivileged container-safe) # ============================================================================= -def log_memory(label: str) -> None: - """Log current RSS memory usage (Linux only).""" +_BASELINE_CGROUP_EVENTS: dict[str, int] = {} + + +def _read_text_file(path: Path) -> str | None: try: - with open("/proc/self/status") as f: + return path.read_text(encoding="utf-8").strip() + except Exception: + return None + + +def _read_int_file(path: Path) -> int | None: + s = _read_text_file(path) + if s is None: + return None + try: + return int(s) + except Exception: + return None + + +def _get_rss_bytes() -> int | None: + try: + if psutil is not None: + return int(psutil.Process(os.getpid()).memory_info().rss) + except Exception: # noqa: S110 + pass + + try: + with open("/proc/self/status", encoding="utf-8") as f: for line in f: if line.startswith("VmRSS:"): - mem_mb = int(line.split()[1]) / 1024 - logger.info("[MEMORY] %s: %.0f MB", label, mem_mb) - break - except Exception as exc: - logger.debug("Skipping memory log for %s: %s", label, exc) + parts = line.split() + return int(parts[1]) * 1024 + except Exception: + return None + return None + + +def _read_cgroup_memory_bytes() -> dict[str, int | None]: + cur = _read_int_file(CGROUP_ROOT / "memory.current") + peak = _read_int_file(CGROUP_ROOT / "memory.peak") + maxv = _read_text_file(CGROUP_ROOT / "memory.max") + limit = None + if maxv is not None and maxv != "max": + try: + limit = int(maxv) + except Exception: + limit = None + return {"current": cur, "peak": peak, "limit": limit} + + +def _read_cgroup_memory_events() -> dict[str, int]: + out: dict[str, int] = {} + try: + txt = (CGROUP_ROOT / "memory.events").read_text(encoding="utf-8") + for line in txt.splitlines(): + parts = line.split() + if len(parts) == 2: + out[parts[0]] = int(parts[1]) + except Exception: # noqa: S110 + pass + return out + + +def _mb(x: int | None) -> float | None: + if x is None: + return None + return round(x / (1024.0 * 1024.0), 3) + + +def log_memory(label: str, extra: dict[str, Any] | None = None) -> None: + global _BASELINE_CGROUP_EVENTS + if not _BASELINE_CGROUP_EVENTS: + _BASELINE_CGROUP_EVENTS = _read_cgroup_memory_events() + + rss_b = _get_rss_bytes() + cg = _read_cgroup_memory_bytes() + ev = _read_cgroup_memory_events() + keys = set(_BASELINE_CGROUP_EVENTS) | set(ev) + delta = {k: ev.get(k, 0) - _BASELINE_CGROUP_EVENTS.get(k, 0) for k in keys} + + payload: dict[str, Any] = { + "ts": round(time.time(), 3), + "event": "mem", + "stage": label, + "rss_mb": _mb(rss_b), + "cgroup_current_mb": _mb(cg.get("current")), + "cgroup_peak_mb": _mb(cg.get("peak")), + "cgroup_limit_mb": _mb(cg.get("limit")), + "cgroup_oom_kill_delta": delta.get("oom_kill", 0), + "cgroup_events_delta": delta, + } + if extra: + payload.update(extra) + + logger.info("[MEMORY] %s", json.dumps(payload, separators=(",", ":"), sort_keys=True)) # ============================================================================= -# MULTI-FILE SUPPORT +# INPUT CLASSIFICATION # ============================================================================= +def _schema_names_for_path(p: Path) -> list[str]: + try: + return pl.scan_parquet(p).collect_schema().names() + except Exception as exc: + raise RuntimeError(f"Failed to read parquet schema for {p}: {exc}") from exc + + +def classify_inputs(paths: 
list[Path]) -> tuple[list[Path], list[Path], list[Path]]: + """ + Classify inputs into: + - interval_paths: full interval dataset parts + - account_manifest_paths: *_accounts.parquet (or schema matches) + - date_manifest_paths: *_dates.parquet (or schema matches) + + This makes the script robust to the orchestrator passing mixed inputs. + """ + interval_paths: list[Path] = [] + account_manifest_paths: list[Path] = [] + date_manifest_paths: list[Path] = [] + + for p in paths: + name = p.name.lower() + + # Fast filename heuristics first + if name.endswith("_accounts.parquet"): + account_manifest_paths.append(p) + continue + if name.endswith("_dates.parquet"): + date_manifest_paths.append(p) + continue + + cols = set(_schema_names_for_path(p)) + + if set(REQUIRED_INTERVAL_COLS).issubset(cols): + interval_paths.append(p) + continue + + # Schema-based fallback for manifests + if set(REQUIRED_ACCOUNT_MANIFEST_COLS).issubset(cols) and "datetime" not in cols and "kwh" not in cols: + account_manifest_paths.append(p) + continue + if set(REQUIRED_DATE_MANIFEST_COLS).issubset(cols) and "datetime" not in cols and "kwh" not in cols: + date_manifest_paths.append(p) + continue + + raise ValueError( + "Unrecognized parquet input (neither interval data nor expected manifest schema): " + f"{p} with columns={sorted(cols)}" + ) + + if not interval_paths: + raise ValueError("No interval parquet inputs detected. Expected at least one file with interval schema.") + + logger.info( + "Classified inputs: interval=%d, accounts_manifest=%d, dates_manifest=%d", + len(interval_paths), + len(account_manifest_paths), + len(date_manifest_paths), + ) + return interval_paths, account_manifest_paths, date_manifest_paths + + def scan_multiple_parquet(paths: list[Path]) -> pl.LazyFrame: - """Create a unified LazyFrame from multiple parquet files.""" + """Create a unified LazyFrame from multiple interval parquet files.""" if len(paths) == 1: return pl.scan_parquet(paths[0]) - logger.info("Combining %d input files...", len(paths)) + logger.info("Combining %d interval input files...", len(paths)) return pl.concat([pl.scan_parquet(p) for p in paths]) # ============================================================================= -# DATA INSPECTION & SAMPLING +# VALIDATION # ============================================================================= -def validate_schema(paths: list[Path]) -> dict[str, Any]: - """Validate input data has required columns (schema-only check).""" - lf = scan_multiple_parquet(paths) - schema = lf.collect_schema() - +def validate_interval_schema(interval_paths: list[Path]) -> dict[str, Any]: + """ + Validate required columns exist and all interval parts share the same schema. + Schema-only checks; does not scan full data. 
+ """ errors: list[str] = [] - missing_energy = [c for c in REQUIRED_ENERGY_COLS if c not in schema.names()] - missing_time = [c for c in REQUIRED_TIME_COLS if c not in schema.names()] - if missing_energy: - errors.append(f"Missing energy columns: {missing_energy}") - if missing_time: - errors.append(f"Missing time columns: {missing_time}") + first_schema = pl.scan_parquet(interval_paths[0]).collect_schema() + first_names = first_schema.names() + missing = [c for c in REQUIRED_INTERVAL_COLS if c not in first_names] + if missing: + errors.append(f"Missing interval columns in first file {interval_paths[0]}: {missing}") - return {"valid": len(errors) == 0, "errors": errors, "columns": schema.names()} + # Ensure all interval parts match schema (fail-loud with useful diagnostics) + for p in interval_paths[1:]: + s = pl.scan_parquet(p).collect_schema() + if s.names() != first_names: + errors.append( + "Interval schema mismatch:\n" + f" first={interval_paths[0]} cols={first_names}\n" + f" this ={p} cols={s.names()}" + ) + return {"valid": len(errors) == 0, "errors": errors, "columns": first_names} + +# ============================================================================= +# SAMPLING HELPERS +# ============================================================================= def _as_polars_date_list(dates: list[Any]) -> list[Any]: - """ - Ensure Python/Polars date values are normalized to Polars Date dtype - before using in `.is_in()`. - """ if not dates: return dates - # This handles python datetime.date, python datetime.datetime, strings, etc. return pl.Series("dates", dates).cast(pl.Date).to_list() +def _sum_parquet_rows(paths: Sequence[Path]) -> int: + total = 0 + for p in paths: + pf = pq.ParquetFile(str(p)) + md = pf.metadata + if md is None: + raise RuntimeError(f"Missing parquet metadata for input: {p}") + total += int(md.num_rows) + return total + + +def _load_union_account_manifest(paths: list[Path]) -> pl.DataFrame: + return ( + pl.concat([pl.scan_parquet(p) for p in paths]) + .unique(subset=["account_identifier", "zip_code"]) + .collect(engine="streaming") + ) + + +def _load_union_date_manifest(paths: list[Path]) -> pl.DataFrame: + return ( + pl.concat([pl.scan_parquet(p) for p in paths]) + .unique(subset=["date", "is_weekend", "weekday"]) + .collect(engine="streaming") + ) + + def get_metadata_and_samples( # noqa: C901 - input_paths: list[Path], + interval_paths: list[Path], + account_manifest_paths: list[Path], + date_manifest_paths: list[Path], sample_households: int | None, sample_days: int, day_strategy: Literal["stratified", "random"], @@ -123,52 +319,27 @@ def get_metadata_and_samples( # noqa: C901 month: int | None = None, ) -> dict[str, Any]: """ - Get summary statistics and sample households + dates using MANIFESTS. + Get summary statistics and sample households + dates using manifests. + + If manifests are provided via CLI inputs (as the orchestrator does), use them directly. + Otherwise, fall back to ensure_*_manifest(interval_file) for each interval file. 
""" - logger.info("Scanning input data for metadata and sampling...") - log_memory("start of get_metadata_and_samples") - - lf = scan_multiple_parquet(input_paths) - - logger.info(" Computing summary statistics...") - summary_df = lf.select([ - pl.len().alias("n_rows"), - pl.col("zip_code").n_unique().alias("n_zip_codes"), - pl.col("account_identifier").n_unique().alias("n_accounts"), - pl.col("date").min().alias("min_date"), - pl.col("date").max().alias("max_date"), - ]).collect(engine="streaming") - summary = summary_df.to_dicts()[0] - - if summary["n_rows"] == 0: - raise ValueError("Input files contain no rows.") - - logger.info(" %s rows", f"{summary['n_rows']:,}") - logger.info(" %s households", f"{summary['n_accounts']:,}") - logger.info(" %s ZIP+4 codes", f"{summary['n_zip_codes']:,}") - logger.info(" Date range: %s to %s", summary["min_date"], summary["max_date"]) - - # Load manifests from first input - logger.info(" Loading manifests from first input file...") - account_manifest0 = ensure_account_manifest(input_paths[0]) - date_manifest0 = ensure_date_manifest(input_paths[0]) - - accounts_df = pl.read_parquet(account_manifest0) - dates_df = pl.read_parquet(date_manifest0) - - # If multiple files, combine manifests from all files first - if len(input_paths) > 1: - logger.info(" Loading manifests from additional input files...") - for path in input_paths[1:]: - acc_manifest = ensure_account_manifest(path) - date_manifest_extra = ensure_date_manifest(path) - accounts_df = pl.concat([accounts_df, pl.read_parquet(acc_manifest)]).unique() - dates_df = pl.concat([dates_df, pl.read_parquet(date_manifest_extra)]).unique() - - # Apply month filter if year/month are specified (after all dates are assembled) - if year is not None and month is not None: - from calendar import monthrange + logger.info("Sampling from manifests...") + log_memory("start_of_get_metadata_and_samples") + + if account_manifest_paths: + accounts_df = _load_union_account_manifest(account_manifest_paths) + else: + manifests = [ensure_account_manifest(p) for p in interval_paths] + accounts_df = _load_union_account_manifest([Path(m) for m in manifests]) + + if date_manifest_paths: + dates_df = _load_union_date_manifest(date_manifest_paths) + else: + manifests = [ensure_date_manifest(p) for p in interval_paths] + dates_df = _load_union_date_manifest([Path(m) for m in manifests]) + if year is not None and month is not None: _, last_day = monthrange(year, month) start_date = pl.date(year, month, 1) end_date = pl.date(year, month, last_day) @@ -178,18 +349,31 @@ def get_metadata_and_samples( # noqa: C901 logger.info(" No month filter applied (using all available dates): %d", dates_df.height) if accounts_df.height == 0: - raise ValueError("No account_identifier values found in manifest.") + raise ValueError("No account_identifier values found in account manifest(s).") if dates_df.height == 0: - raise ValueError("No dates found in manifest.") + raise ValueError("No dates found in date manifest(s).") + + summary = { + "n_rows": _sum_parquet_rows(interval_paths), + "n_accounts": int(accounts_df.height), + "n_zip_codes": int(accounts_df.select(pl.col("zip_code").n_unique()).item()), + "min_date": dates_df.select(pl.col("date").min()).item(), + "max_date": dates_df.select(pl.col("date").max()).item(), + } - log_memory("after loading manifests") + logger.info(" %s rows (from interval parquet metadata)", f"{summary['n_rows']:,}") + logger.info(" %s households (from manifests)", f"{summary['n_accounts']:,}") + logger.info(" %s ZIP+4 
codes (from manifests)", f"{summary['n_zip_codes']:,}") + logger.info(" Date range: %s to %s (from manifests)", summary["min_date"], summary["max_date"]) + + log_memory("after_loading_manifests", {"accounts_rows": accounts_df.height, "dates_rows": dates_df.height}) # Sample households - if sample_households is not None and sample_households < len(accounts_df): + if sample_households is not None and sample_households < accounts_df.height: accounts_df = accounts_df.sample(n=sample_households, shuffle=True, seed=seed) - logger.info(" Sampled %s households", f"{len(accounts_df):,}") + logger.info(" Sampled %s households", f"{accounts_df.height:,}") else: - logger.info(" Using all %s households", f"{len(accounts_df):,}") + logger.info(" Using all %s households", f"{accounts_df.height:,}") accounts = accounts_df["account_identifier"].to_list() @@ -202,11 +386,14 @@ def get_metadata_and_samples( # noqa: C901 day_strategy = "random" if day_strategy == "stratified": + weekday_df = dates_df.filter(~pl.col("is_weekend")) + weekend_df = dates_df.filter(pl.col("is_weekend")) + n_weekdays = int(sample_days * 0.7) n_weekends = sample_days - n_weekdays - n_weekdays = min(n_weekdays, len(weekday_df)) - n_weekends = min(n_weekends, len(weekend_df)) + n_weekdays = min(n_weekdays, weekday_df.height) + n_weekends = min(n_weekends, weekend_df.height) sampled_weekdays = ( weekday_df.sample(n=n_weekdays, shuffle=True, seed=seed)["date"].to_list() if n_weekdays > 0 else [] @@ -217,107 +404,44 @@ def get_metadata_and_samples( # noqa: C901 dates = sorted(sampled_weekdays + sampled_weekends) logger.info( - " Sampled %d weekdays + %d weekend days (stratified)", - len(sampled_weekdays), - len(sampled_weekends), + " Sampled %d weekdays + %d weekend days (stratified)", len(sampled_weekdays), len(sampled_weekends) ) else: - n_sample = min(sample_days, len(dates_df)) + n_sample = min(sample_days, dates_df.height) dates = dates_df.sample(n=n_sample, shuffle=True, seed=seed)["date"].to_list() logger.info(" Sampled %d days (random)", len(dates)) if not dates: raise ValueError("No dates were sampled; check input data and sampling settings.") - # Normalize date types now (so downstream `.is_in()` is reliable) dates = _as_polars_date_list(dates) - log_memory("end of get_metadata_and_samples") - logger.info(" Sampled days requested: %d; sampled days returned: %d", sample_days, len(dates)) - logger.info(" Sampled dates: %s", dates) + del accounts_df, dates_df + gc.collect() + log_memory("end_of_get_metadata_and_samples", {"sampled_accounts": len(accounts), "sampled_dates": len(dates)}) return {"summary": summary, "accounts": accounts, "dates": dates} # ============================================================================= -# PROFILE CREATION +# PROFILE CREATION (STREAMING) # ============================================================================= -def create_household_profiles(input_paths: list[Path], accounts: list[str], dates: list[Any]) -> pl.DataFrame: - """ - Create daily load profiles for selected households and dates (in-memory). 
- """ - logger.info("Creating profiles for %s households x %d days...", f"{len(accounts):,}", len(dates)) - log_memory("start of create_household_profiles") - - if not accounts: - raise ValueError("No accounts provided for profile creation.") - if not dates: - raise ValueError("No dates provided for profile creation.") - - # ensure dates are Polars Date-compatible - dates = _as_polars_date_list(dates) - - lf = scan_multiple_parquet(input_paths).filter( - pl.col("account_identifier").is_in(accounts) & pl.col("date").is_in(dates) - ) - - # Diagnostic: how many of the sampled dates actually appear after filtering? - present_dates = lf.select(pl.col("date").unique().sort()).collect(engine="streaming")["date"].to_list() - logger.info(" Sampled dates present in filtered interval data: %d / %d", len(present_dates), len(dates)) - if len(present_dates) < len(dates): - missing = sorted(set(dates) - set(present_dates)) - logger.warning(" Missing sampled dates after interval filter: %s", missing) - - df = lf.sort(["account_identifier", "datetime"]).collect() - - if df.is_empty(): - logger.warning(" No data found for selected households/dates") - return pl.DataFrame() - - logger.info(" Loaded %s interval records", f"{len(df):,}") - log_memory("after loading filtered intervals") - - profiles_df = df.group_by(["account_identifier", "zip_code", "date"]).agg([ - pl.col("kwh").alias("profile"), - pl.col("is_weekend").first(), - pl.col("weekday").first(), - pl.len().alias("num_intervals"), - ]) - - n_before = len(profiles_df) - profiles_df = profiles_df.filter(pl.col("num_intervals") == 48) - n_dropped = n_before - len(profiles_df) - - if n_dropped > 0: - logger.info(" Dropped %s incomplete profiles (missing or irregular data)", f"{n_dropped:,}") - - logger.info(" Created %s complete profiles", f"{len(profiles_df):,}") - log_memory("end of create_household_profiles") - - return profiles_df.drop("num_intervals") - - def _create_profiles_for_chunk_streaming( - input_paths: list[Path], + interval_paths: list[Path], accounts_chunk: list[str], dates: list[Any], chunk_idx: int, total_chunks: int, tmp_dir: Path, ) -> Path: - """ - Create profiles for a single chunk of households and write directly to parquet. 
- """ logger.info(" Chunk %d/%d: %s households...", chunk_idx + 1, total_chunks, f"{len(accounts_chunk):,}") - log_memory(f"chunk {chunk_idx + 1} start") + log_memory(f"chunk_{chunk_idx + 1}_start", {"chunk_accounts": len(accounts_chunk), "n_chunks": total_chunks}) - # ensure dates are Polars Date-compatible dates = _as_polars_date_list(dates) - chunk_path = tmp_dir / f"sampled_profiles_chunk_{chunk_idx:03d}.parquet" ( - scan_multiple_parquet(input_paths) + scan_multiple_parquet(interval_paths) .filter(pl.col("account_identifier").is_in(accounts_chunk) & pl.col("date").is_in(dates)) .group_by(["account_identifier", "zip_code", "date"]) .agg([ @@ -331,29 +455,89 @@ def _create_profiles_for_chunk_streaming( .sink_parquet(chunk_path) ) - log_memory(f"chunk {chunk_idx + 1} done") + log_memory(f"chunk_{chunk_idx + 1}_done", {"chunk_path": str(chunk_path)}) logger.info(" Wrote chunk parquet: %s", chunk_path) return chunk_path +def _merge_parquet_chunks_bounded_memory(chunk_paths: Sequence[Path], output_path: Path) -> int: + if not chunk_paths: + raise ValueError("No chunk files provided for merge.") + + chunk_files = sorted((Path(p) for p in chunk_paths), key=lambda p: p.name) + output_path = Path(output_path) + tmp_out = output_path.with_suffix(output_path.suffix + ".tmp") + + if tmp_out.exists(): + tmp_out.unlink() + + log_memory("merge_start", {"n_chunks": len(chunk_files), "batch_rows": MERGE_BATCH_SIZE_ROWS}) + + expected_rows = 0 + for p in chunk_files: + pf = pq.ParquetFile(str(p)) + md = pf.metadata + if md is None: + raise RuntimeError(f"Missing parquet metadata for chunk: {p}") + expected_rows += int(md.num_rows) + + first_pf = pq.ParquetFile(str(chunk_files[0])) + schema = first_pf.schema_arrow + + writer = pq.ParquetWriter( + where=str(tmp_out), + schema=schema, + compression=FINAL_PARQUET_COMPRESSION, + use_dictionary=True, + write_statistics=True, + ) + + rows_written = 0 + t0 = time.time() + try: + for i, p in enumerate(chunk_files, start=1): + pf = pq.ParquetFile(str(p)) + if not pf.schema_arrow.equals(schema, check_metadata=False): + raise RuntimeError( + "Schema mismatch across chunk files; refusing to merge.\n" + f"First schema: {schema}\n" + f"Mismatch file: {p}\n" + f"File schema: {pf.schema_arrow}" + ) + + for rb in pf.iter_batches(batch_size=MERGE_BATCH_SIZE_ROWS): + writer.write_batch(rb) + rows_written += int(rb.num_rows) + + del pf + gc.collect() + + if i == 1 or i == len(chunk_files) or (i % 10 == 0): + log_memory("merge_progress", {"chunk_i": i, "n_chunks": len(chunk_files), "rows_written": rows_written}) + finally: + writer.close() + + if rows_written != expected_rows: + raise RuntimeError(f"Merged row count mismatch: wrote {rows_written} rows but expected {expected_rows} rows.") + + os.replace(str(tmp_out), str(output_path)) + log_memory("merge_done", {"rows_written": rows_written, "seconds": round(time.time() - t0, 3)}) + return rows_written + + def create_household_profiles_chunked_streaming( - input_paths: list[Path], + interval_paths: list[Path], accounts: list[str], dates: list[Any], output_path: Path, chunk_size: int = 5000, ) -> int: - """ - Create daily load profiles using chunked streaming. 
- """ if not accounts: raise ValueError("No accounts provided for chunked streaming profile creation.") if not dates: raise ValueError("No dates provided for chunked streaming profile creation.") - # Defensive: ensure dates are Polars Date-compatible dates = _as_polars_date_list(dates) - n_accounts = len(accounts) n_chunks = (n_accounts + chunk_size - 1) // chunk_size @@ -364,7 +548,7 @@ def create_household_profiles_chunked_streaming( f"{n_accounts:,}", len(dates), ) - log_memory("before chunked streaming") + log_memory("before_chunked_streaming", {"n_accounts": n_accounts, "n_chunks": n_chunks, "chunk_size": chunk_size}) output_path.parent.mkdir(parents=True, exist_ok=True) tmp_dir = output_path.parent @@ -376,7 +560,7 @@ def create_household_profiles_chunked_streaming( accounts_chunk = accounts[start_idx:end_idx] chunk_path = _create_profiles_for_chunk_streaming( - input_paths=input_paths, + interval_paths=interval_paths, accounts_chunk=accounts_chunk, dates=dates, chunk_idx=i, @@ -393,14 +577,10 @@ def create_household_profiles_chunked_streaming( logger.warning("No profiles created in chunked streaming mode!") return 0 - logger.info("Combining %d chunk files into %s", len(chunk_paths), output_path) - log_memory("before streaming concat") + logger.info("Combining %d chunk files into %s (bounded-memory merge)", len(chunk_paths), output_path) + gc.collect() + n_profiles = _merge_parquet_chunks_bounded_memory(chunk_paths=chunk_paths, output_path=output_path) - pl.concat([pl.scan_parquet(p) for p in chunk_paths]).sink_parquet(output_path) - - log_memory("after streaming concat") - - n_profiles = pl.scan_parquet(output_path).select(pl.len()).collect()[0, 0] logger.info(" Created %s complete profiles (chunked streaming)", f"{n_profiles:,}") logger.info(" Saved to %s", output_path) @@ -410,6 +590,7 @@ def create_household_profiles_chunked_streaming( except OSError as exc: logger.warning("Failed to delete temp chunk file %s: %s", p, exc) + log_memory("after_chunk_cleanup", {"deleted_chunks": len(chunk_paths)}) return int(n_profiles) @@ -428,7 +609,6 @@ def prepare_clustering_data( year: int | None = None, month: int | None = None, ) -> dict[str, Any]: - """Prepare household-level clustering data from interval parquet.""" logger.info("=" * 70) logger.info("PREPARING HOUSEHOLD-LEVEL CLUSTERING DATA") if len(input_paths) > 1: @@ -436,14 +616,18 @@ def prepare_clustering_data( if streaming: logger.info("(STREAMING MODE ENABLED, chunk_size=%d)", chunk_size) logger.info("=" * 70) - log_memory("start of prepare_clustering_data") + log_memory("start_of_prepare_clustering_data", {"streaming": streaming, "n_inputs": len(input_paths)}) - validation = validate_schema(input_paths) + interval_paths, account_manifest_paths, date_manifest_paths = classify_inputs(input_paths) + + validation = validate_interval_schema(interval_paths) if not validation["valid"]: raise ValueError(f"Input validation failed: {validation['errors']}") metadata = get_metadata_and_samples( - input_paths=input_paths, + interval_paths=interval_paths, + account_manifest_paths=account_manifest_paths, + date_manifest_paths=date_manifest_paths, sample_households=sample_households, sample_days=sample_days, day_strategy=day_strategy, @@ -457,10 +641,11 @@ def prepare_clustering_data( output_dir.mkdir(parents=True, exist_ok=True) profiles_path = output_dir / "sampled_profiles.parquet" + map_path = output_dir / "household_zip4_map.parquet" if streaming: n_profiles = create_household_profiles_chunked_streaming( - input_paths=input_paths, + 
interval_paths=interval_paths, accounts=accounts, dates=dates, output_path=profiles_path, @@ -468,12 +653,48 @@ def prepare_clustering_data( ) if n_profiles == 0: raise ValueError("No profiles created in chunked streaming mode - check input data and sampling settings.") - profiles_df = pl.read_parquet(profiles_path) + + # Memory-safe: do not read entire profiles parquet + log_memory("build_household_map_start") + (pl.scan_parquet(profiles_path).select(["account_identifier", "zip_code"]).unique().sink_parquet(map_path)) + log_memory("build_household_map_done", {"map_path": str(map_path)}) + + stats_row = ( + pl.scan_parquet(profiles_path) + .select([ + pl.len().alias("n_profiles"), + pl.col("account_identifier").n_unique().alias("n_households"), + pl.col("zip_code").n_unique().alias("n_zip4s"), + pl.col("date").n_unique().alias("n_dates"), + ]) + .collect(engine="streaming") + .to_dicts()[0] + ) + stats = {k: int(stats_row[k]) for k in ["n_profiles", "n_households", "n_zip4s", "n_dates"]} else: - profiles_df = create_household_profiles(input_paths=input_paths, accounts=accounts, dates=dates) + # Non-streaming mode unchanged from original implementation. + profiles_df = ( + scan_multiple_parquet(interval_paths) + .filter(pl.col("account_identifier").is_in(accounts) & pl.col("date").is_in(_as_polars_date_list(dates))) + .sort(["account_identifier", "datetime"]) + .collect() + ) if profiles_df.is_empty(): raise ValueError("No profiles created - check input data and sampling settings.") - profiles_df.select([ + + profiles_out = ( + profiles_df.group_by(["account_identifier", "zip_code", "date"]) + .agg([ + pl.col("kwh").alias("profile"), + pl.col("is_weekend").first(), + pl.col("weekday").first(), + pl.len().alias("num_intervals"), + ]) + .filter(pl.col("num_intervals") == 48) + .drop("num_intervals") + ) + + profiles_out.select([ "account_identifier", "zip_code", "date", @@ -483,17 +704,16 @@ def prepare_clustering_data( ]).write_parquet(profiles_path) logger.info(" Saved profiles: %s", profiles_path) - household_map = profiles_df.select(["account_identifier", "zip_code"]).unique() - map_path = output_dir / "household_zip4_map.parquet" - household_map.write_parquet(map_path) - logger.info(" Saved household-ZIP+4 map: %s (%s households)", map_path, f"{len(household_map):,}") - - stats = { - "n_profiles": len(profiles_df), - "n_households": profiles_df["account_identifier"].n_unique(), - "n_zip4s": profiles_df["zip_code"].n_unique(), - "n_dates": profiles_df["date"].n_unique(), - } + household_map = profiles_out.select(["account_identifier", "zip_code"]).unique() + household_map.write_parquet(map_path) + logger.info(" Saved household-ZIP+4 map: %s (%s households)", map_path, f"{household_map.height:,}") + + stats = { + "n_profiles": int(profiles_out.height), + "n_households": int(profiles_out["account_identifier"].n_unique()), + "n_zip4s": int(profiles_out["zip_code"].n_unique()), + "n_dates": int(profiles_out["date"].n_unique()), + } logger.info("") logger.info("=" * 70) @@ -504,7 +724,7 @@ def prepare_clustering_data( logger.info(" ZIP+4s represented: %s", f"{stats['n_zip4s']:,}") logger.info(" Days: %d", stats["n_dates"]) logger.info(" Output: %s", output_dir) - log_memory("end of prepare_clustering_data") + log_memory("end_of_prepare_clustering_data", stats) return stats diff --git a/analysis/stage2/stage2_multinom_blockgroup_weighted.R b/analysis/stage2/stage2_multinom_blockgroup_weighted.R index 3efb410..5e595bc 100644 --- a/analysis/stage2/stage2_multinom_blockgroup_weighted.R +++ 
b/analysis/stage2/stage2_multinom_blockgroup_weighted.R @@ -4,36 +4,24 @@ # Stage 2 Multinomial Logit (Block Group): Cluster Composition ~ Census Predictors # ============================================================================= # -# What this script does (high-level) -# ---------------------------------- -# 1) Reads household-day cluster assignments (ZIP+4 resolution) from a Parquet file. -# 2) Aggregates these household-days into ZIP+4 × cluster counts using Arrow compute -# (i.e., without loading the full row-level data into memory). -# 3) Joins ZIP+4 to Census Block Group using a crosswalk, then aggregates to -# Block Group × cluster counts and total household-days per Block Group. -# 4) Joins Block Group-level census predictors (Parquet output from upstream census pipeline). -# 5) Fits a multinomial logit model with COUNT response: -# Y_bg = (n_bg,clusterA, n_bg,clusterB, ..., n_bg,clusterBaseline) -# where cluster probabilities are modeled as a function of BG predictors. -# -# Why this structure -# ------------------ -# - The regression is formulated at the Block Group level to avoid per-row modeling at the -# household-day level while still leveraging the full count information (multinomial likelihood). -# - Arrow Dataset aggregation avoids memory blowups when the input Parquet is very large. -# - Predictors are inferred from the census parquet to reduce coupling / hardcoding. -# - VGAM is used by default because it is typically more numerically stable at large scale than -# nnet::multinom, but VGAM requires a full-rank design matrix; rank-deficient terms are dropped -# deterministically to satisfy that constraint. +# Key principles for regulatory defensibility +# ------------------------------------------- +# - Predictors are inferred from the census parquet to avoid fragile hard-coding. +# - Some predictors are "shares" that sum to 100 within a group (compositional). +# To ensure a unique model, one pre-specified reference share per group is omitted +# from estimation. Omitted shares are documented explicitly in outputs. +# - Predictors are NOT mean-centered. Optional scaling divides by SD only. +# - If VGAM is used, any remaining rank-deficient terms are dropped deterministically. # # Outputs # ------- -# - regression_results.parquet: coefficient table with cluster, predictor, estimate, SE, z, p, q. -# - regression_diagnostics.json: model fit diagnostics (LL, deviance, pseudo-R2, AIC/BIC, etc.) -# - stage2_input_qc.json: data lineage + drop counts + inferred/used predictor lists -# - regression_data_blockgroups_wide.parquet: the modeled BG dataset (counts + predictors) -# - stage2_manifest.json: paths of all outputs -# - stage2_metadata.json: runtime + package versions (provenance) +# - regression_results.parquet +# - regression_diagnostics.json +# - stage2_input_qc.json +# - regression_data_blockgroups_wide.parquet +# - omitted_predictors.parquet <-- NEW: documents reference omissions + rank drops +# - stage2_manifest.json +# - stage2_metadata.json # # ============================================================================= @@ -41,9 +29,6 @@ # ----------------------------- # CLI args (no external deps) # ----------------------------- -# Notes: -# - Avoids argparse-style dependencies in R, keeping the script self-contained. -# - Supports both "--flag value" and "--flag=value". 
print_help_and_exit <- function(exit_code = 0) { cat( paste0( @@ -59,24 +44,17 @@ print_help_and_exit <- function(exit_code = 0) { " --baseline-cluster K Baseline cluster label (default: choose most frequent)\n", " --min-obs-per-bg N Drop BGs with total household-days < N (default: 50)\n", " --allow-missing-predictors 0|1 If 0, abort if predictor NA would drop any BGs (default: 0)\n", - " --standardize 0|1 Z-score standardize predictors (default: 0)\n", + " --standardize 0|1 If 1, divide predictors by SD (NO mean-centering) (default: 0)\n", " --use-vgam 0|1 Use VGAM::vglm() (IRLS) instead of nnet::multinom (default: 1)\n", " --verbose 0|1 Verbose logging (default: 1)\n", " --no-emoji 0|1 Disable unicode icons (default: 0)\n", " --help Print this help and exit\n\n", "Notes:\n", " - Predictors are inferred from the census parquet columns.\n", + " - Some predictors are compositional shares; one reference share per group is omitted\n", + " deterministically and documented in omitted_predictors.parquet.\n", " - Model uses COUNT response: cbind(count_clusterA, count_clusterB, ...)\n", - " - Zeros are handled naturally by the multinomial likelihood; no smoothing is applied.\n", - " - Standardization (--standardize=1) is STRONGLY RECOMMENDED for numerical stability.\n", - " - VGAM requires a full-rank design matrix; this script drops rank-deficient terms deterministically.\n", - " - Outputs written under out-dir:\n", - " regression_results.parquet\n", - " regression_diagnostics.json\n", - " stage2_input_qc.json\n", - " regression_data_blockgroups_wide.parquet\n", - " stage2_manifest.json\n", - " stage2_metadata.json\n\n" + " - Zeros are handled naturally by the multinomial likelihood; no smoothing is applied.\n\n" ) ) quit(status = exit_code) @@ -134,12 +112,6 @@ if (is.null(CLUSTERS_PATH) || is.null(CROSSWALK_PATH) || is.null(CENSUS_PATH) || # ----------------------------- # Deps # ----------------------------- -# The approach uses: -# - arrow: efficient parquet I/O and dataset aggregation -# - dplyr/tibble: data manipulation -# - jsonlite: writing QC/diagnostics -# - nnet: fallback multinomial logit solver -# - VGAM: default multinomial logit solver for stability (full-rank requirement) require_pkg <- function(pkg) requireNamespace(pkg, quietly = TRUE) if (!require_pkg("arrow")) stopf("Missing R package 'arrow'. Install with: install.packages('arrow')") @@ -176,13 +148,72 @@ logi("%s Config: standardize=%d use_vgam=%d", IC$ok, STANDARDIZE, USE_VGAM) t_total_start <- Sys.time() +# ----------------------------- +# Reference-category specification (deterministic omissions) +# ----------------------------- +REFERENCE_GROUPS <- list( + list( + group_id = "income_3bin", + group_name = "Household income distribution (3 bins)", + variables = c("pct_income_under_25k", "pct_income_25k_to_75k", "pct_income_75k_plus"), + reference = "pct_income_75k_plus", + rationale = "Shares sum to 100; omit one bin to ensure a unique model. Highest-income share is a stable reference for policy interpretation." + ), + list( + group_id = "tenure_2bin", + group_name = "Housing tenure (occupied units)", + variables = c("pct_owner_occupied", "pct_renter_occupied"), + reference = "pct_owner_occupied", + rationale = "Owner + renter shares sum to 100; omit owners so renter share is interpreted relative to owners." 
+ ), + list( + group_id = "vintage_3bin", + group_name = "Housing vintage distribution (3 bins)", + variables = c("pct_housing_built_2000_plus", "pct_housing_built_1980_1999", "old_building_pct"), + reference = "pct_housing_built_2000_plus", + rationale = "Shares sum to 100; omit newest stock so older stock shares are interpreted relative to newer construction." + ), + list( + group_id = "home_value_3bin", + group_name = "Owner-occupied home value distribution (3 bins)", + variables = c("pct_home_value_under_150k", "pct_home_value_150k_to_299k", "pct_home_value_300k_plus"), + reference = "pct_home_value_300k_plus", + rationale = "Shares sum to 100; omit highest-value share so lower/middle value shares are interpreted relative to highest-value homes." + ), + list( + group_id = "age_6bin", + group_name = "Population age distribution (6 bins)", + variables = c( + "pct_population_under_5", "pct_population_5_to_17", "pct_population_18_to_24", + "pct_population_25_to_44", "pct_population_45_to_64", "pct_population_65_plus" + ), + reference = "pct_population_25_to_44", + rationale = "Shares sum to 100; omit 25–44 as a central working-age reference group." + ) +) + +REFERENCE_VARS <- unique(vapply(REFERENCE_GROUPS, function(g) g$reference, character(1))) + +implied_definitions <- lapply(REFERENCE_GROUPS, function(g) { + incl <- setdiff(g$variables, g$reference) + list( + group_id = g$group_id, + group_name = g$group_name, + reference = g$reference, + implied_as = paste0(g$reference, " = 100 - (", paste(incl, collapse = " + "), ")") + ) +}) + +REFERENCE_CAVEAT <- paste( + "Some predictors are shares that sum to 100 within a group (compositional).", + "To ensure a unique multinomial logit model, one pre-specified reference share per group is omitted from estimation.", + "The omitted share is not independently estimated; it is implied by the included shares via the identity in the diagnostics." +) + + # ----------------------------- # Helpers: keys + inference # ----------------------------- -# ZIP+4 normalization: -# - Cluster parquet uses "zip_code" which can appear as: -# - "60601-1234" or "606011234" or other string forms. -# - Crosswalk expects Zip + Zip4 columns; we standardize to "#####-####". normalize_zip4 <- function(x) { s <- as.character(x) s <- trimws(s) @@ -193,7 +224,6 @@ normalize_zip4 <- function(x) { out } -# Ensure Zip4 is exactly 4 digits, leading zeros preserved. zfill4 <- function(x) { s <- as.character(x) s <- trimws(s) @@ -203,8 +233,6 @@ zfill4 <- function(x) { s } -# Census GEOID column inference: -# - Upstream data may name the key GEOID, CensusKey2023, CensusKey2020, etc. infer_geoid_col <- function(df) { nms <- names(df) low <- tolower(nms) @@ -214,10 +242,6 @@ infer_geoid_col <- function(df) { NULL } -# Predictor inference: -# - Uses numeric/integer/logical columns as candidate predictors. -# - Excludes id-like columns (GEOID/NAME + the inferred geoid key). -# - Drops columns that are entirely NA. infer_predictors <- function(census_df) { geoid_col <- infer_geoid_col(census_df) if (is.null(geoid_col)) { @@ -240,16 +264,6 @@ infer_predictors <- function(census_df) { list(geoid_col = geoid_col, predictors = preds, dropped_all_na = all_na) } -# Rank-deficiency handling for VGAM: -# - VGAM::vglm(multinomial()) requires a full-rank model matrix. 
-# - We build a standard model.matrix with intercept and remove: -# (a) constant columns (excluding intercept) -# (b) columns beyond QR rank using pivot ordering -# -# Note: This is deterministic and reproducible, but may drop terms that are -# substantively meaningful if predictors are highly collinear. Treat as a -# pragmatic requirement for full-rank MLE and revisit for a more principled -# approach if needed. drop_rank_deficient_terms <- function(model_df, predictors) { ftmp <- stats::as.formula(paste0("~ ", paste(predictors, collapse = " + "))) Xmm <- stats::model.matrix(ftmp, data = model_df) @@ -311,9 +325,6 @@ drop_rank_deficient_terms <- function(model_df, predictors) { # ----------------------------- # Read + aggregate clusters (memory-safe) # ----------------------------- -# Key point: -# - We do not read the full household-day dataset into memory. -# - Instead we open it as an Arrow dataset and aggregate "zip_code × cluster" counts. if (!file.exists(CLUSTERS_PATH)) stopf("Clusters parquet not found: %s", CLUSTERS_PATH) if (!file.exists(CROSSWALK_PATH)) stopf("Crosswalk file not found: %s", CROSSWALK_PATH) if (!file.exists(CENSUS_PATH)) stopf("Census predictors parquet not found: %s", CENSUS_PATH) @@ -333,11 +344,6 @@ zip_cluster_counts <- clusters_ds %>% dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% dplyr::collect() -if (!("zip_code" %in% names(zip_cluster_counts))) stopf("Clusters parquet must include column 'zip_code'") -if (!("cluster" %in% names(zip_cluster_counts))) stopf("Clusters parquet must include column 'cluster'") -if (!("n" %in% names(zip_cluster_counts))) stopf("Internal error: missing 'n' after aggregation") - -# Normalize and sanitize types zip_cluster_counts <- zip_cluster_counts %>% dplyr::mutate( zip4 = normalize_zip4(zip_code), @@ -353,14 +359,6 @@ household_day_rows_total <- sum(zip_cluster_counts$n, na.rm = TRUE) # ----------------------------- # Read crosswalk # ----------------------------- -# Crosswalk requirements: -# - Must include Zip, Zip4, and CensusKey2023 (or equivalent). -# - We standardize to "zip4" = "#####-####" and "block_group_geoid" = 12-char BG GEOID. -# -# Design choice: -# - If multiple BGs map to the same ZIP+4, we deterministically pick the first after sorting. -# This enforces a single mapping and avoids row inflation, but it should be reviewed for whether -# a probabilistic or fractional allocation is preferable. logi("%s Reading crosswalk: %s", IC$ok, CROSSWALK_PATH) cw_tbl <- tryCatch( @@ -403,7 +401,6 @@ cw <- cw %>% if (nrow(cw) == 0) stopf("Crosswalk produced 0 usable rows after cleaning/filtering.") -# Deterministic one-to-one ZIP+4 → BG mapping cw <- cw %>% dplyr::arrange(zip4, block_group_geoid) %>% dplyr::group_by(zip4) %>% @@ -417,10 +414,6 @@ if (nrow(dup_zip4) > 0) stopf("Crosswalk still has non-unique zip4 after determi # ----------------------------- # Join + aggregate to BG counts # ----------------------------- -# After joining the crosswalk, we can compute: -# - BG×cluster counts -# - BG total household-days -# and optionally drop BGs with too few observations (min_obs_per_bg). 
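The crosswalk cleanup above enforces a single block group per ZIP+4 by sorting and keeping the first match. A minimal sketch of that rule on toy rows (the ZIP+4 values, GEOIDs, and the slice(1) step are illustrative; the script's exact statement falls outside this hunk):

```r
library(dplyr)

# Toy crosswalk rows: one ZIP+4 maps to two block groups, so a
# deterministic rule is needed to pick exactly one of them.
cw_toy <- tibble::tibble(
  zip4              = c("60601-1234", "60601-1234", "60622-0007"),
  block_group_geoid = c("170310001001", "170310001002", "170310002003")
)

cw_dedup <- cw_toy %>%
  arrange(zip4, block_group_geoid) %>%  # fixed sort order => reproducible choice
  group_by(zip4) %>%
  slice(1) %>%                          # keep the first BG per ZIP+4
  ungroup()

cw_dedup
# 60601-1234 -> 170310001001 (lexicographically first GEOID wins)
# 60622-0007 -> 170310002003
```

This keeps counts from inflating at the join, at the cost of ignoring secondary BG matches; a fractional allocation across matched BGs would be the main alternative.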
logi("%s Joining ZIP+4×cluster counts to crosswalk...", IC$ok) clusters2 <- zip_cluster_counts %>% @@ -445,7 +438,6 @@ total_by_bg <- clusters2 %>% clusters_observed <- sort(unique(bg_counts$cluster)) if (length(clusters_observed) < 2) stopf("Need at least 2 clusters observed after aggregation; found: %s", paste(clusters_observed, collapse = ",")) -# Wide BG frame: GEOID + total + one column per cluster count bg_wide <- total_by_bg %>% dplyr::rename(GEOID = block_group_geoid) for (k in clusters_observed) { @@ -460,11 +452,9 @@ for (k in clusters_observed) { dplyr::select(-n) } -# Observation floor: avoids fragile inference for tiny BG totals. bg_wide <- bg_wide %>% dplyr::filter(total_household_days >= as.integer(MIN_OBS_PER_BG)) if (nrow(bg_wide) == 0) stopf("No block groups remain after --min-obs-per-bg filtering (N=%d).", MIN_OBS_PER_BG) -# Zero count diagnostic: proportion of BGs with zero count in each cluster column. zero_stats <- list() for (k in clusters_observed) { colname <- paste0("cluster_", k) @@ -479,31 +469,26 @@ for (k in clusters_observed) { # ----------------------------- # Read census predictors + infer predictors # ----------------------------- -# This step establishes the modeling covariate set at the BG level. -# Predictors are inferred (numeric/integer/logical columns) rather than hard-coded. logi("%s Reading census predictors: %s", IC$ok, CENSUS_PATH) census <- arrow::read_parquet(CENSUS_PATH, as_data_frame = TRUE) inf <- infer_predictors(census) CENSUS_GEOID_COL <- inf$geoid_col -PREDICTORS <- inf$predictors +PREDICTORS_RAW <- inf$predictors DROPPED_ALL_NA <- inf$dropped_all_na -if (length(PREDICTORS) == 0) stopf("No usable numeric predictors inferred from census parquet.") +if (length(PREDICTORS_RAW) == 0) stopf("No usable numeric predictors inferred from census parquet.") -# Deduplicate by GEOID defensively; keep only inferred predictors. census <- census %>% dplyr::mutate(GEOID = as.character(.data[[CENSUS_GEOID_COL]])) %>% - dplyr::select(GEOID, dplyr::any_of(PREDICTORS)) %>% + dplyr::select(GEOID, dplyr::any_of(PREDICTORS_RAW)) %>% dplyr::distinct(GEOID, .keep_all = TRUE) logi("%s Joining census predictors to BG counts...", IC$ok) bg_model <- bg_wide %>% dplyr::inner_join(census, by = "GEOID") -# Missing predictor handling: -# - If allow_missing_predictors=0: fail fast if any BG would be dropped by NA. -# - If allow_missing_predictors=1: drop incomplete BGs (complete-case analysis). 
-pred_mat <- bg_model %>% dplyr::select(dplyr::any_of(PREDICTORS)) +# Missing predictor handling +pred_mat <- bg_model %>% dplyr::select(dplyr::any_of(PREDICTORS_RAW)) any_na <- apply(is.na(pred_mat), 1, any) drop_missing_pred_bg <- sum(any_na) @@ -522,17 +507,26 @@ if (nrow(bg_model) == 0) stopf("No block groups remain after predictor missingne # ----------------------------- -# Optional: standardize predictors (z-score) +# Deterministic reference-variable omission (if present) +# ----------------------------- +omit_refs_present <- intersect(REFERENCE_VARS, PREDICTORS_RAW) +PREDICTORS <- setdiff(PREDICTORS_RAW, omit_refs_present) + +if (length(omit_refs_present) > 0) { + logi("%s Omitted %d reference-share predictor(s): %s", IC$warn, length(omit_refs_present), paste(omit_refs_present, collapse = ", ")) +} + +if (length(PREDICTORS) == 0) stopf("No predictors remain after reference omissions.") + + +# ----------------------------- +# Optional: scale by SD only (NO mean-centering) # ----------------------------- -# Rationale: -# - Improves numerical conditioning for optimization / IRLS. -# - Makes coefficients more comparable (effect per 1 SD change). -# - Retains original scale parameters in scaling_info for provenance. scaling_info <- NULL zero_var_predictors <- character(0) if (STANDARDIZE == 1L) { - logi("%s Standardizing %d predictors (z-score: mean=0, sd=1)...", IC$ok, length(PREDICTORS)) + logi("%s Scaling %d predictors by SD only (no mean-centering)...", IC$ok, length(PREDICTORS)) scaling_info <- list() for (col in PREDICTORS) { @@ -540,16 +534,14 @@ if (STANDARDIZE == 1L) { x <- bg_model[[col]] if (is.logical(x)) x <- as.numeric(x) - mu <- mean(x, na.rm = TRUE) sigma <- stats::sd(x, na.rm = TRUE) - scaling_info[[col]] <- list(mean = as.numeric(mu), sd = as.numeric(sigma)) + scaling_info[[col]] <- list(sd = as.numeric(sigma), centered = FALSE) if (is.finite(sigma) && sigma > 1e-10) { - bg_model[[col]] <- (x - mu) / sigma + bg_model[[col]] <- x / sigma } else { - # Retain original scale if SD ~ 0; keep a list for QC outputs. zero_var_predictors <- c(zero_var_predictors, col) - cat(sprintf("%s Predictor '%s' has ~zero variance; not standardized.\n", IC$warn, col)) + cat(sprintf("%s Predictor '%s' has ~zero variance; not scaled.\n", IC$warn, col)) bg_model[[col]] <- x } } @@ -559,11 +551,6 @@ if (STANDARDIZE == 1L) { # ----------------------------- # Choose baseline + response matrix # ----------------------------- -# The response is a matrix of counts per BG across clusters: -# - Columns are ordered so the baseline cluster is last, matching the refLevel config for VGAM. -# - Baseline selection: -# - If user specifies --baseline-cluster, use it. -# - Otherwise choose the most frequent cluster by total count. resp_cols <- paste0("cluster_", clusters_observed) resp_cols <- resp_cols[resp_cols %in% names(bg_model)] if (length(resp_cols) < 2) stopf("Need >=2 cluster count columns; found: %s", paste(resp_cols, collapse = ",")) @@ -586,7 +573,6 @@ if (!paste0("cluster_", baseline_cluster) %in% resp_cols) { resp_cols_ordered <- c(setdiff(resp_cols, paste0("cluster_", baseline_cluster)), paste0("cluster_", baseline_cluster)) Y <- as.matrix(bg_model[, resp_cols_ordered, drop = FALSE]) -# Basic response validation if (any(is.na(Y))) stopf("NA values detected in response matrix.") if (any(Y < 0, na.rm = TRUE)) stopf("Negative counts detected in response matrix.") rs <- rowSums(Y) @@ -595,13 +581,9 @@ if (any(rs <= 0, na.rm = TRUE)) { stopf("Block groups with non-positive total counts. 
Example row indices: %s", paste(head(bad_rows, 10), collapse = ", ")) } -# Model frame for fitting model_df <- bg_model[, c("GEOID", "total_household_days", PREDICTORS), drop = FALSE] model_df$Y <- I(Y) -# Map “equations” to cluster labels: -# - Multinomial logit has one linear predictor per non-baseline outcome. -# - Here, eq index i corresponds to nonbase_clusters[i]. nonbase_cols <- resp_cols_ordered[resp_cols_ordered != paste0("cluster_", baseline_cluster)] nonbase_clusters <- as.integer(sub("^cluster_", "", nonbase_cols)) @@ -620,21 +602,16 @@ if (USE_VGAM == 1L) { design_rank <- as.integer(rk$rank) design_ncol <- as.integer(rk$ncol_design) - # Rebuild with kept predictors only model_df <- bg_model[, c("GEOID", "total_household_days", PREDICTORS), drop = FALSE] model_df$Y <- I(Y) } +if (length(PREDICTORS) == 0) stopf("No predictors remain after rank-deficiency handling.") + # ----------------------------- # Fit model # ----------------------------- -# Model form: -# Y ~ X1 + X2 + ... + Xp -# -# For VGAM::vglm: -# - family = multinomial(refLevel = K) where K is the last column (baseline). -# - IRLS typically handles scale better than nnet::multinom in large-count settings. logi( "%s Fitting multinomial logit (counts) with %d BGs, %d predictors, %d clusters (baseline=%d)...", IC$ok, nrow(model_df), length(PREDICTORS), length(resp_cols_ordered), baseline_cluster @@ -658,7 +635,6 @@ if (USE_VGAM == 1L) { error = function(e) stopf("Model fit failed (VGAM::vglm): %s", e$message) ) - # Null model (intercept-only): provides baseline for pseudo-R2 and deviance comparisons fit0 <- tryCatch( withCallingHandlers( VGAM::vglm(Y ~ 1, family = VGAM::multinomial(refLevel = length(resp_cols_ordered)), data = model_df), @@ -692,10 +668,6 @@ logi("%s Model fit completed in %.1f seconds", IC$ok, fit_duration) # ----------------------------- # Convergence heuristics # ----------------------------- -# Approach: -# - Primary: check warnings that suggest iteration/step issues. -# - Secondary: compare deviance_full vs deviance_null; if too close, signal weak fit or instability. -# Note: VGAM is S4; we avoid `$iter` or similar direct slot assumptions here. convergence_ok <- TRUE convergence_message <- "Model converged successfully" @@ -732,43 +704,9 @@ if (is.finite(deviance_ratio) && deviance_ratio > 0.95) { } -# ----------------------------- -# Correlations (diagnostic) -# ----------------------------- -# This is a quick heuristic to identify predictors correlated with observed cluster proportions. -# It is not used for modeling; it is logged for reviewer sanity-checking and interpretation guidance. 
-if (all(c("cluster_0", "cluster_1", "cluster_3") %in% names(bg_model))) { - bg_tmp <- bg_model %>% - dplyr::mutate( - prop_cluster_0 = cluster_0 / total_household_days, - prop_cluster_1 = cluster_1 / total_household_days, - prop_cluster_3 = cluster_3 / total_household_days - ) - - cor_matrix <- tryCatch( - stats::cor( - bg_tmp[, PREDICTORS, drop = FALSE], - bg_tmp[, c("prop_cluster_0", "prop_cluster_1", "prop_cluster_3"), drop = FALSE], - use = "complete.obs" - ), - error = function(e) NULL - ) - - if (!is.null(cor_matrix)) { - strong_cors <- apply(abs(cor_matrix), 1, max) - cat("\nPredictors with |correlation| > 0.1 to any cluster proportion:\n") - print(sort(strong_cors[strong_cors > 0.1], decreasing = TRUE)) - } -} - - # ----------------------------- # Core stats # ----------------------------- -# McFadden pseudo-R2: -# 1 - (LL_full / LL_null) -# Interpreted as improvement over intercept-only baseline on the log-likelihood scale. -# Not directly comparable to OLS R^2; treat as a relative fit measure. ll_full <- as.numeric(stats::logLik(fit)) ll_null <- as.numeric(stats::logLik(fit0)) pseudo_r2 <- 1.0 - (ll_full / ll_null) @@ -777,15 +715,6 @@ pseudo_r2 <- 1.0 - (ll_full / ll_null) # ----------------------------- # Coefficients table (nnet + VGAM) -> ALWAYS returns 'cluster' # ----------------------------- -# This function normalizes output format across engines: -# - Each row: (cluster, predictor, coefficient, std_err, z_stat, p_value) -# - cluster indicates the non-baseline outcome corresponding to the equation. -# -# For VGAM: -# - We use vcov(fit) for SEs and align by exact coefficient names. -# - We robustly parse equation indices from coefficient naming conventions: -# A) "term:1" style -# B) "log(mu[,1]/mu[,K]):term" style extract_coef_table <- function(fit_obj, nonbase_clusters) { if (inherits(fit_obj, "multinom")) { coefs <- summary(fit_obj)$coefficients @@ -841,7 +770,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { eq <- rep(NA_integer_, length(nm_vec)) term <- rep(NA_character_, length(nm_vec)) - # A) "term:1" m_a <- regexec("^(.*):([0-9]+)$", nm_vec) r_a <- regmatches(nm_vec, m_a) has_a <- lengths(r_a) == 3 @@ -850,7 +778,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { eq[has_a] <- suppressWarnings(as.integer(vapply(r_a[has_a], function(x) x[[3]], character(1)))) } - # B) "log(mu[,k]/mu[,K]):term" need_b <- !has_a if (any(need_b)) { nm_b <- nm_vec[need_b] @@ -898,9 +825,6 @@ extract_coef_table <- function(fit_obj, nonbase_clusters) { res_tbl <- extract_coef_table(fit, nonbase_clusters) -# Multiple testing control: -# - BH q-values computed within each cluster equation (i.e., within each non-baseline outcome), -# which is a sensible default for “per-equation” screening. 
res_tbl <- res_tbl %>% dplyr::group_by(cluster) %>% dplyr::mutate(q_value = p.adjust(p_value, method = "BH")) %>% @@ -918,12 +842,39 @@ res_tbl <- res_tbl %>% # ----------------------------- # QC + metadata outputs # ----------------------------- -# This section writes: -# - input_qc: what was dropped and why; predictor inference outcomes; configuration -# - diag: fit diagnostics and cluster marginals -# - manifest: file path registry for downstream automation household_day_rows_modeled <- sum(bg_model$total_household_days, na.rm = TRUE) +# New omitted predictors artifact (reference omissions + rank drops) +omitted_tbl <- tibble::tibble( + predictor = c(omit_refs_present, dropped_predictors_rank), + reason = c( + rep("omitted_reference_share", length(omit_refs_present)), + rep("dropped_rank_deficient", length(dropped_predictors_rank)) + ) +) %>% + dplyr::mutate( + caveat = dplyr::case_when( + reason == "omitted_reference_share" ~ REFERENCE_CAVEAT, + TRUE ~ "Dropped because the fitted design matrix was not full rank (VGAM full-rank requirement)." + ) + ) + +# Add group metadata for reference omissions +if (nrow(omitted_tbl) > 0) { + ref_map <- do.call(rbind.data.frame, lapply(REFERENCE_GROUPS, function(g) { + data.frame( + predictor = g$reference, + group_id = g$group_id, + group_name = g$group_name, + implied_definition = implied_definitions[[which(vapply(implied_definitions, function(x) x$group_id, character(1)) == g$group_id)[1]]]$implied_as, + rationale = g$rationale, + stringsAsFactors = FALSE + ) + })) + omitted_tbl <- omitted_tbl %>% + dplyr::left_join(ref_map, by = "predictor") +} + input_qc <- list( inputs = list( clusters = CLUSTERS_PATH, @@ -934,7 +885,15 @@ input_qc <- list( "Predictors are inferred from the census parquet columns (numeric/logical), excluding GEOID/NAME.", "Counts are computed memory-safely using Arrow Dataset aggregation; the full household-day parquet is not read into RAM.", "Zero counts are expected in BG×cluster composition; multinomial likelihood handles zeros naturally (no smoothing/alpha).", - "If VGAM is used, rank-deficient predictors are dropped to satisfy full-rank design requirement." + "Reference-share variables (compositional groups) are omitted deterministically when present to ensure identifiability.", + "If VGAM is used, remaining rank-deficient predictors are dropped deterministically to satisfy full-rank design requirement." 
+ ), + reference_categories = list( + reference_groups = REFERENCE_GROUPS, + reference_variables = REFERENCE_VARS, + reference_variables_present_in_input = omit_refs_present, + implied_definitions = implied_definitions, + caveat = REFERENCE_CAVEAT ), counts = list( household_day_rows_total_after_basic_filter = as.integer(household_day_rows_total), @@ -950,9 +909,10 @@ input_qc <- list( geoid_column = CENSUS_GEOID_COL, predictors_inferred = inf$predictors, predictors_used = PREDICTORS, + predictors_omitted_reference_share = omit_refs_present, predictors_dropped_all_na = DROPPED_ALL_NA, predictors_dropped_rank_deficient = dropped_predictors_rank, - predictors_zero_variance_not_standardized = zero_var_predictors + predictors_zero_variance_not_scaled = zero_var_predictors ), model = list( clusters_observed = as.integer(clusters_observed), @@ -998,6 +958,17 @@ diag <- list( clusters_observed = as.integer(clusters_observed), baseline_cluster = as.integer(baseline_cluster) ), + reference_categories = list( + reference_groups = REFERENCE_GROUPS, + reference_variables = REFERENCE_VARS, + reference_variables_present_in_input = omit_refs_present, + implied_definitions = implied_definitions, + caveat = REFERENCE_CAVEAT + ), + omitted_predictors_summary = list( + n_omitted_reference_share = as.integer(length(omit_refs_present)), + n_dropped_rank_deficient = as.integer(length(dropped_predictors_rank)) + ), cluster_marginal_distributions = lapply(names(cluster_totals), function(cn) { k <- as.integer(sub("^cluster_", "", cn)) list(cluster = k, household_days = as.integer(cluster_totals[[cn]]), proportion = as.numeric(cluster_props[[cn]])) @@ -1011,6 +982,7 @@ manifest <- list( regression_diagnostics = file.path(OUT_DIR, "regression_diagnostics.json"), stage2_input_qc = file.path(OUT_DIR, "stage2_input_qc.json"), regression_data_blockgroups_wide = file.path(OUT_DIR, "regression_data_blockgroups_wide.parquet"), + omitted_predictors = file.path(OUT_DIR, "omitted_predictors.parquet"), stage2_manifest = file.path(OUT_DIR, "stage2_manifest.json"), stage2_metadata = file.path(OUT_DIR, "stage2_metadata.json") ) @@ -1020,6 +992,7 @@ logi("%s Writing outputs to: %s", IC$ok, OUT_DIR) arrow::write_parquet(res_tbl, file.path(OUT_DIR, "regression_results.parquet")) arrow::write_parquet(bg_model, file.path(OUT_DIR, "regression_data_blockgroups_wide.parquet")) +arrow::write_parquet(omitted_tbl, file.path(OUT_DIR, "omitted_predictors.parquet")) safe_write_json(diag, file.path(OUT_DIR, "regression_diagnostics.json")) safe_write_json(input_qc, file.path(OUT_DIR, "stage2_input_qc.json")) safe_write_json(manifest, file.path(OUT_DIR, "stage2_manifest.json")) @@ -1047,6 +1020,8 @@ cat(sprintf("%s Stage 2 multinomial logit complete.\n", IC$ok)) cat(sprintf(" Block Groups (modeled): %d\n", nrow(bg_model))) cat(sprintf(" Household-days (modeled): %s\n", format(household_day_rows_modeled, big.mark = ","))) cat(sprintf(" Predictors (used): %d\n", length(PREDICTORS))) +cat(sprintf(" Omitted reference predictors (present): %d\n", length(omit_refs_present))) +cat(sprintf(" Rank-deficient predictors dropped: %d\n", length(dropped_predictors_rank))) cat(sprintf(" Pseudo R^2 (McFadden): %.4f\n", pseudo_r2)) cat(sprintf(" Converged: %s\n", ifelse(convergence_ok, "true", "false"))) cat("\n") diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png deleted file 
mode 100644 index 8f606f0..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_heatmap.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png deleted file mode 100644 index 97d6ebc..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_hourly_profile.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png deleted file mode 100644 index 8c5511c..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_monthly_profile.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png deleted file mode 100644 index a62da58..0000000 Binary files a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/chicago_2024_weekend_comparison.png and /dev/null differ diff --git a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt b/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt deleted file mode 100644 index d455a91..0000000 --- a/archive/residential_load_viz_2024/analysis/chicago_2024_full_year/final_visualizations/summary_table.txt +++ /dev/null @@ -1,24 +0,0 @@ -================================================================================ -TABLE 1: Monthly Sample Summary -================================================================================ - -Month Customers ZIP Codes Mean kWh Std Dev Observations --------------------------------------------------------------------------------- -2024-01 995 10 0.4011 0.5584 1,479,469 -2024-02 979 10 0.2643 0.3245 1,361,693 -2024-03 987 10 0.2252 0.3126 1,465,503 -2024-04 * 229 47 0.3035 0.6872 318,930 -2024-05 781 8 0.2611 0.3299 1,156,931 -2024-06 964 10 0.4038 0.5014 1,387,196 -2024-07 962 10 0.4036 0.4819 1,428,964 -2024-08 908 10 0.5073 1.0055 1,349,911 -2024-09 958 10 0.3175 0.3892 1,372,516 -2024-10 954 10 0.2242 0.2682 1,418,168 -2024-11 963 10 0.2669 0.3413 1,387,539 -2024-12 977 10 0.2838 0.3554 1,443,393 --------------------------------------------------------------------------------- -TOTAL 10,270 - - - 15,570,213 - -*April 2024: Limited data availability due to utility data quality issues. - Sample size reduced to 229 customers (24% of typical) despite city-wide - sampling effort across 47 ZIP codes. diff --git a/archive/residential_load_viz_2024/combine_chicago_zips.py b/archive/residential_load_viz_2024/combine_chicago_zips.py deleted file mode 100755 index 54364be..0000000 --- a/archive/residential_load_viz_2024/combine_chicago_zips.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -""" -Combine all sampled ZIP codes into one Chicago-wide dataset. -Checks for April 2024 data quality issues. 
-""" - -from pathlib import Path - -import polars as pl - -BASE_DIR = Path("analysis/chicago_2024_full_year") -OUTPUT_DIR = BASE_DIR / "combined" -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -print("=" * 80) -print("COMBINING ALL CHICAGO ZIP CODES - 2024 FULL YEAR") -print("=" * 80) - -# Find all CLIPPED_CM90 files -cm90_files = sorted(BASE_DIR.glob("zip*/final/*_CLIPPED_CM90.parquet")) - -print(f"Found {len(cm90_files)} ZIP datasets") - -if not cm90_files: - print("❌ No files found! Run sampling first.") - exit(1) - -# Combine all ZIPs -print("\nLoading and combining...") -all_data = [] - -for file in cm90_files: - zipcode = file.parent.parent.name.replace("zip", "") - print(f" ZIP {zipcode}...", end=" ") - - df = pl.read_parquet(file) - df = df.with_columns(pl.lit(zipcode).alias("zipcode")) - all_data.append(df) - print(f"{len(df):,} rows") - -combined = pl.concat(all_data) - -print("\n✅ Combined dataset:") -print(f" Total rows: {len(combined):,}") -print(f" Unique customers: {combined['account_identifier'].n_unique():,}") -print(f" ZIP codes: {combined['zipcode'].n_unique()}") -print(f" Months: {combined['sample_month'].n_unique()}") -print(f" Date range: {combined['date'].min()} to {combined['date'].max()}") - -# Summary by month -monthly = ( - combined.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.len().alias("rows"), - ]) - .sort("sample_month") -) - -print("\n📊 Monthly Summary (ALL 12 months of 2024):") -print(monthly) - -# Check April specifically -april = monthly.filter(pl.col("sample_month") == "202404") -if april.height > 0: - april_customers = april["customers"][0] - other_avg = monthly.filter(pl.col("sample_month") != "202404")["customers"].mean() - print("\n⚠️ April 2024 check:") - print(f" April customers: {april_customers}") - print(f" Other months avg: {other_avg:.0f}") - if april_customers < other_avg * 0.3: - print(" 🚨 WARNING: April has significantly fewer customers!") - print(" Consider excluding April from analysis.") - -# Summary by ZIP -zip_summary = ( - combined.group_by("zipcode") - .agg([ - pl.col("sample_month").n_unique().alias("months"), - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("zipcode") -) - -print("\n📍 ZIP Code Summary:") -print(zip_summary) - -# Save -output_file = OUTPUT_DIR / "chicago_all_zips_2024_CM90.parquet" -combined.write_parquet(output_file) - -print(f"\n✅ Saved: {output_file}") -print(f" File size: {output_file.stat().st_size / 1e9:.2f} GB") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py b/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py deleted file mode 100644 index f2d849c..0000000 --- a/archive/residential_load_viz_2024/combine_chicago_zips_streaming.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python -""" -Combine all sampled ZIP codes using streaming (lazy frames). 
-""" - -from pathlib import Path - -import polars as pl - -BASE_DIR = Path("analysis/chicago_2024_full_year") -OUTPUT_DIR = BASE_DIR / "combined" -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -print("=" * 80) -print("COMBINING ALL CHICAGO ZIP CODES - 2024 FULL YEAR (STREAMING)") -print("=" * 80) - -# Find all CLIPPED_CM90 files -cm90_files = sorted(BASE_DIR.glob("zip*/final/*_CLIPPED_CM90.parquet")) -print(f"Found {len(cm90_files)} ZIP datasets\n") - -if not cm90_files: - print("❌ No files found!") - exit(1) - -# Add zipcode column to each file and save temporarily -temp_dir = OUTPUT_DIR / "temp" -temp_dir.mkdir(exist_ok=True) - -print("Adding zipcode column to each file...") -temp_files = [] - -for file in cm90_files: - zipcode = file.parent.parent.name.replace("zip", "") - temp_file = temp_dir / f"{zipcode}.parquet" - - if not temp_file.exists(): - print(f" Processing ZIP {zipcode}...", end=" ") - df = pl.read_parquet(file).with_columns(pl.lit(zipcode).alias("zipcode")) - df.write_parquet(temp_file) - print(f"{len(df):,} rows") - else: - print(f" ZIP {zipcode} already processed") - - temp_files.append(temp_file) - -# Now use lazy scan to combine WITHOUT loading into memory -print("\nCombining all ZIPs (streaming)...") -output_file = OUTPUT_DIR / "chicago_all_zips_2024_CM90.parquet" - -# Scan all temp files lazily and sink to final file -lf = pl.scan_parquet(temp_files) -lf.sink_parquet(output_file) - -print(f"✅ Saved: {output_file}") -print(f" File size: {output_file.stat().st_size / 1e9:.2f} GB") - -# Now compute summaries using streaming -print("\n📊 Computing monthly summary (streaming)...") -monthly = ( - pl.scan_parquet(output_file) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.len().alias("rows"), - ]) - .sort("sample_month") - .collect(streaming=True) -) - -print("\nMonthly Summary (ALL 12 months of 2024):") -print(monthly) - -# Check April -april = monthly.filter(pl.col("sample_month") == "202404") -if april.height > 0: - april_customers = april["customers"][0] - other_months = monthly.filter(pl.col("sample_month") != "202404") - other_avg = other_months["customers"].mean() - - print("\n⚠️ April 2024 Data Quality Check:") - print(f" April customers: {april_customers:,}") - print(f" Other months avg: {other_avg:.0f}") - - if april_customers < other_avg * 0.5: - print(f" 🚨 WARNING: April has {(1 - april_customers / other_avg) * 100:.0f}% fewer customers!") - print(" Recommendation: Exclude April from analysis") - else: - print(" ✅ April looks okay!") - -# ZIP summary -print("\n📍 Computing ZIP summary (streaming)...") -zip_summary = ( - pl.scan_parquet(output_file) - .group_by("zipcode") - .agg([ - pl.col("sample_month").n_unique().alias("months"), - pl.col("account_identifier").n_unique().alias("customers_total"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("zipcode") - .collect(streaming=True) -) - -print("\nZIP Code Summary:") -print(zip_summary) - -# Overall stats -print("\n📈 Overall Statistics:") -overall = ( - pl.scan_parquet(output_file) - .select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_rows"), - pl.col("zipcode").n_unique().alias("zip_count"), - pl.col("sample_month").n_unique().alias("month_count"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - ]) - .collect(streaming=True) -) - -print(f" 
Total customers: {overall['unique_customers'][0]:,}") -print(f" Total rows: {overall['total_rows'][0]:,}") -print(f" ZIP codes: {overall['zip_count'][0]}") -print(f" Months: {overall['month_count'][0]}") -print(f" Mean kWh: {overall['mean_kwh'][0]:.4f}") -print(f" Date range: {overall['start_date'][0]} to {overall['end_date'][0]}") - -print("\n" + "=" * 80) -print("✅ COMPLETE! Ready for visualization.") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/combine_with_april_boost.py b/archive/residential_load_viz_2024/combine_with_april_boost.py deleted file mode 100755 index 0338ccd..0000000 --- a/archive/residential_load_viz_2024/combine_with_april_boost.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -""" -Combine original 10-ZIP dataset with April city-wide boost. -""" - -from pathlib import Path - -import polars as pl - -print("=" * 80) -print("COMBINING: Original + April City-Wide Boost") -print("=" * 80) - -# Original 10 ZIPs (all 12 months) -original = Path("analysis/chicago_2024_full_year/combined/chicago_all_zips_2024_CM90.parquet") - -# April boost from 45 additional ZIPs -april_files = list(Path("analysis/chicago_april_citywide").glob("zip*/final/*_CLIPPED_CM90.parquet")) - -print(f"Original dataset: {original}") -print(f"April boost files: {len(april_files)} ZIPs") - -if not april_files: - print("\n❌ No April boost files! Run sample_april_citywide.py first") - exit(1) - -# Load April boost data -print("\nLoading April boost data...") -april_frames = [] -for file in april_files: - zipcode = file.parent.parent.name.replace("zip", "") - df = pl.read_parquet(file).with_columns(pl.lit(zipcode).alias("zipcode")) - customers = df["account_identifier"].n_unique() - print(f" ZIP {zipcode}: {customers} customers (after CM90)") - april_frames.append(df) - -if april_frames: - april_boost = pl.concat(april_frames) - total_april_boost = april_boost["account_identifier"].n_unique() - print(f"\n✅ April boost total: {total_april_boost:,} customers from {len(april_files)} ZIPs") -else: - print("\n❌ No April data passed CM90!") - exit(1) - -# Combine -print("\nCombining with original dataset...") -combined = pl.concat([pl.scan_parquet(original), pl.LazyFrame(april_boost)], how="vertical_relaxed") - -output = Path("analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet") -combined.sink_parquet(output) - -print(f"\n✅ Saved: {output}") -print(f" Size: {output.stat().st_size / 1e9:.2f} GB") - -# Monthly summary -monthly = ( - pl.scan_parquet(output) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zips"), - pl.col("kwh").mean().alias("mean_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") -) - -print("\n📊 FINAL Monthly Summary (with April boost):") -print(monthly) - -april_row = monthly.filter(pl.col("sample_month") == "202404") -if april_row.height > 0: - april_customers = april_row["customers"][0] - april_zips = april_row["zips"][0] - other_avg = monthly.filter(pl.col("sample_month") != "202404")["customers"].mean() - - print(f"\n{'=' * 80}") - print("APRIL 2024 FINAL STATUS:") - print(f" Customers: {april_customers:,}") - print(f" ZIPs: {april_zips}") - print(f" Other months avg: {other_avg:.0f}") - print(f" April is {april_customers / other_avg * 100:.0f}% of typical month") - - if april_customers >= other_avg * 0.8: - print(" ✅ EXCELLENT - April is now comparable!") - elif april_customers >= other_avg * 0.5: - print(" ✅ GOOD - April is usable 
with minor caveat") - else: - print(" ⚠️ FAIR - April still lower but much better") - print(f"{'=' * 80}") - -print("\n✅ Ready for visualization!") diff --git a/archive/residential_load_viz_2024/create_heatmap.py b/archive/residential_load_viz_2024/create_heatmap.py deleted file mode 100644 index 66cc270..0000000 --- a/archive/residential_load_viz_2024/create_heatmap.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python -""" -Create load heatmap: Hour × Day of Year -Aggregated across all customers (200 residential accounts). -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - - -def create_load_heatmap(usage_df: pl.DataFrame, output_path: str = "load_heatmap.png"): - """Create beautiful seaborn heatmap of usage patterns.""" - print("📊 Creating heatmap...") - - # Add day_of_year and hour - df = usage_df.with_columns([ - pl.col("datetime").dt.ordinal_day().alias("day_of_year"), - pl.col("datetime").dt.hour().alias("hour"), - ]) - - # Aggregate: total kWh for each (day_of_year, hour) combination - heatmap_data = ( - df.group_by(["day_of_year", "hour"]).agg(pl.col("kwh").sum().alias("total_kwh")).sort(["day_of_year", "hour"]) - ) - - print(f" Data points: {heatmap_data.height:,} hour×day combinations") - - # Pivot to matrix: rows=hours (0-23), cols=days (1-366) - matrix = heatmap_data.pivot(index="hour", columns="day_of_year", values="total_kwh").fill_null(0) - - # Convert to numpy - hour_labels = matrix.select("hour").to_series().to_list() - day_cols = [c for c in matrix.columns if c != "hour"] - data_matrix = matrix.select(day_cols).to_numpy() - - print(f" Matrix: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} days") - - # Create figure - fig, ax = plt.subplots(figsize=(24, 10)) - - # Seaborn heatmap with YlOrRd colormap - sns.heatmap( - data_matrix, - cmap="YlOrRd", # Yellow-Orange-Red - cbar_kws={"label": "Total kWh (200 homes)", "shrink": 0.8}, - xticklabels=30, # Show every 30th day - yticklabels=hour_labels, - linewidths=0, - ax=ax, - robust=True, # Use robust quantiles for color limits - ) - - # Labels and title - ax.set_xlabel("Day of Year 2024", fontsize=16, fontweight="bold") - ax.set_ylabel("Hour of Day", fontsize=16, fontweight="bold") - ax.set_title( - "Illinois Residential Electricity Load Patterns - 2024\n200 Random Households", - fontsize=18, - fontweight="bold", - pad=20, - ) - - # Invert y-axis so midnight is at top - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(usage_df: pl.DataFrame, output_path: str = "hourly_profile.png"): - """Create average hourly load profile.""" - print("\n📊 Creating hourly profile...") - - # Average by hour across all days - hourly = ( - usage_df.with_columns(pl.col("datetime").dt.hour().alias("hour")) - .group_by("hour") - .agg([pl.col("kwh").mean().alias("avg_kwh"), pl.col("kwh").std().alias("std_kwh")]) - .sort("hour") - ) - - # Plot - fig, ax = plt.subplots(figsize=(12, 6)) - - hours = hourly["hour"].to_list() - avg = hourly["avg_kwh"].to_list() - std = hourly["std_kwh"].to_list() - - ax.plot(hours, avg, linewidth=3, color="#d62728", marker="o") - ax.fill_between( - hours, [a - s for a, s in zip(avg, std)], [a + s for a, s in zip(avg, std)], alpha=0.3, color="#d62728" - ) - - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold") - ax.set_ylabel("Average kWh", fontsize=14, fontweight="bold") - ax.set_title("Average Hourly Electricity 
Usage - 2024\n200 Illinois Homes", fontsize=16, fontweight="bold") - ax.grid(True, alpha=0.3) - ax.set_xticks(range(0, 24, 2)) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 60) - print("RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - # Load data - data_file = Path("sampled_customers_2024.parquet") - print(f"\n📂 Loading: {data_file}") - df = pl.read_parquet(data_file) - - print(f" Rows: {df.height:,}") - print(f" Date range: {df['date'].min()} to {df['date'].max()}") - print(f" ZIP+4 codes: {df['zip_code'].n_unique()}") - - # Create visualizations - create_load_heatmap(df, "residential_load_heatmap_2024.png") - create_hourly_profile(df, "residential_hourly_profile_2024.png") - - print("\n" + "=" * 60) - print("✅ VISUALIZATIONS COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_heatmap_professional.py b/archive/residential_load_viz_2024/create_heatmap_professional.py deleted file mode 100644 index 92b6e7e..0000000 --- a/archive/residential_load_viz_2024/create_heatmap_professional.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations for presentation. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set Garamond font -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 - - -def create_load_heatmap(usage_df: pl.DataFrame, output_path: str = "load_heatmap_pro.png"): - """Create professional heatmap with month labels.""" - print("📊 Creating professional heatmap...") - - # Prepare data - df = usage_df.with_columns([ - pl.col("datetime").dt.ordinal_day().alias("day_of_year"), - pl.col("datetime").dt.hour().alias("hour"), - ]) - - # Aggregate - heatmap_data = ( - df.group_by(["day_of_year", "hour"]).agg(pl.col("kwh").sum().alias("total_kwh")).sort(["day_of_year", "hour"]) - ) - - # Pivot to matrix - matrix = heatmap_data.pivot(index="hour", columns="day_of_year", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - day_cols = [c for c in matrix.columns if c != "hour"] - data_matrix = matrix.select(day_cols).to_numpy() - - print(f" Data range: {data_matrix.min():.0f} to {data_matrix.max():.0f} kWh") - print(f" Matrix shape: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} days") - - # Create figure - fig, ax = plt.subplots(figsize=(20, 8)) - - # Heatmap - sns.heatmap( - data_matrix, - cmap="rocket_r", - cbar_kws={"label": "Total kWh (8,131 homes)", "shrink": 0.8, "pad": 0.02}, - xticklabels=False, # We'll add custom labels - yticklabels=hour_labels, - linewidths=0, - ax=ax, - robust=True, - ) - - # Add month labels on x-axis - # 2024 is a leap year - month_starts = [1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336] - month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] - month_midpoints = [(month_starts[i] + (month_starts[i + 1] if i < 11 else 367)) / 2 - 1 for i in range(12)] - - # Set month names as x-tick labels - ax.set_xticks(month_midpoints) - ax.set_xticklabels(month_names, fontsize=11) - - # Add 
subtle month divider lines - for day in month_starts[1:]: - ax.axvline(x=day - 1, color="white", linewidth=0.8, alpha=0.4) - - # Labels - ax.set_xlabel("Month (2024)", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - "Illinois Residential Electricity Load Patterns - 2024\n8,131 Random Households", - fontsize=17, - fontweight="bold", - pad=20, - ) - - # Invert y-axis so midnight is at top - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(usage_df: pl.DataFrame, output_path: str = "hourly_profile_pro.png"): - """Create hourly profile with IQR.""" - print("\n📊 Creating professional hourly profile...") - - # Calculate stats - hourly = ( - usage_df.with_columns(pl.col("datetime").dt.hour().alias("hour")) - .group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - ) - - # Reality check output - print("\n 📊 REALITY CHECK:") - print(f" Lowest avg: {hourly['mean_kwh'].min():.3f} kWh per 30-min") - print(f" Highest avg: {hourly['mean_kwh'].max():.3f} kWh per 30-min") - print(f" Daily total: ~{hourly['mean_kwh'].sum():.1f} kWh per home per day") - - # Create figure - fig, ax = plt.subplots(figsize=(14, 7)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Interquartile range (tighter than std dev) - ax.fill_between(hours, p25, p75, alpha=0.25, color="#d62728", label="Interquartile range (25th-75th %ile)") - ax.plot(hours, mean, linewidth=3, color="#d62728", marker="o", markersize=6, label="Average usage", zorder=5) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Average kWh per 30-min interval", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - "Average Hourly Electricity Usage - 2024\n8,131 Illinois Homes", fontsize=17, fontweight="bold", pad=20 - ) - ax.grid(True, alpha=0.3, linestyle="--", linewidth=0.5) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.08, color="blue", label="Overnight (12-6 AM)") - ax.axvspan(17, 21, alpha=0.08, color="red", label="Evening peak (5-9 PM)") - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak: {peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 3, peak_value + 0.03), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round", facecolor="yellow", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload: {min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 2, min_value + 0.05), - fontsize=11, - bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="black", fancybox=False) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() 
- - -if __name__ == "__main__": - print("=" * 60) - print("PROFESSIONAL RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - data_file = Path("sampled_customers_2024.parquet") - print(f"\n📂 Loading: {data_file}") - df = pl.read_parquet(data_file) - - print(f" Rows: {df.height:,}") - print(f" Customers: {df['account_identifier'].n_unique():,}") - print(f" Date range: {df['date'].min()} to {df['date'].max()}") - - create_load_heatmap(df, "residential_load_heatmap_professional.png") - create_hourly_profile(df, "residential_hourly_profile_professional.png") - - print("\n" + "=" * 60) - print("✅ COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_visualization.py b/archive/residential_load_viz_2024/create_visualization.py deleted file mode 100644 index 2ff9f6c..0000000 --- a/archive/residential_load_viz_2024/create_visualization.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations - FIXED for June 2024-June 2025 data. -Memory-safe for Codespaces. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set Garamond font -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_monthly.png"): - """ - Create professional heatmap using MONTHLY aggregation (memory-safe). - Shows hour-of-day vs month (not day-of-year). - """ - print("📊 Creating monthly heatmap (memory-safe)...") - - # LAZY LOAD - aggregate in-database - lf = pl.scan_parquet(data_path) - - # Get stats - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - print(f" Customers: {n_customers:,}") - print(f" Date range: {date_range}") - - # Aggregate by month and hour (streaming) - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - print(f" Aggregated to {len(monthly_hourly):,} rows") - - # Pivot to matrix - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - print(f" Matrix: {data_matrix.shape[0]} hours × {data_matrix.shape[1]} months") - print(f" Range: {data_matrix.min():,.0f} to {data_matrix.max():,.0f} kWh") - - # Create figure - fig, ax = plt.subplots(figsize=(16, 8)) - - # Heatmap - sns.heatmap( - data_matrix, - cmap="rocket_r", - cbar_kws={"label": f"Total kWh (~{n_customers:,} homes)", "shrink": 0.8, "pad": 0.02}, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], # Format as YYYY-MM - yticklabels=hour_labels, - linewidths=0.5, - linecolor="white", - ax=ax, - robust=True, - ) - - # Labels - ax.set_xlabel("Month", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Hour of Day", 
fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - f"Illinois Residential Electricity Load Patterns\n{date_range} • {n_customers:,} Random Households", - fontsize=17, - fontweight="bold", - pad=20, - ) - - # Rotate x-labels for readability - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - - # Invert y-axis - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str = "hourly_profile_pro.png"): - """Create hourly profile with IQR (memory-safe).""" - print("\n📊 Creating hourly profile (memory-safe)...") - - # LAZY LOAD - lf = pl.scan_parquet(data_path) - - # Get customer count - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - # Calculate stats (streaming aggregation) - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - print("\n 📊 Statistics:") - print(f" Customers: {n_customers:,}") - print(f" Lowest avg: {hourly['mean_kwh'].min():.3f} kWh per 30-min") - print(f" Highest avg: {hourly['mean_kwh'].max():.3f} kWh per 30-min") - print(f" Daily total: ~{hourly['mean_kwh'].sum():.1f} kWh per home per day") - - # Create figure - fig, ax = plt.subplots(figsize=(14, 7)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Interquartile range - ax.fill_between(hours, p25, p75, alpha=0.25, color="#d62728", label="Interquartile range (25th-75th %ile)") - ax.plot(hours, mean, linewidth=3, color="#d62728", marker="o", markersize=6, label="Average usage", zorder=5) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold", labelpad=10) - ax.set_ylabel("Average kWh per 30-min interval", fontsize=14, fontweight="bold", labelpad=10) - ax.set_title( - f"Average Hourly Electricity Usage\n{n_customers:,} Illinois Homes • June 2024 - June 2025", - fontsize=17, - fontweight="bold", - pad=20, - ) - ax.grid(True, alpha=0.3, linestyle="--", linewidth=0.5) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.08, color="blue", label="Overnight (12-6 AM)") - ax.axvspan(17, 21, alpha=0.08, color="red", label="Evening peak (5-9 PM)") - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak: {peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 3, peak_value + 0.03), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round", facecolor="yellow", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload: {min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 2, min_value + 0.05), - fontsize=11, - bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8, edgecolor="black", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=2, color="black"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="black", fancybox=False) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", 
facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str = "weekend_comparison.png"): - """Compare weekday vs weekend load profiles.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - # Aggregate by hour and weekend status - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(14, 7)) - - ax.plot( - weekday["hour"], weekday["mean_kwh"], linewidth=3, color="#1f77b4", marker="o", label="Weekday", markersize=6 - ) - ax.plot( - weekend["hour"], weekend["mean_kwh"], linewidth=3, color="#ff7f0e", marker="s", label="Weekend", markersize=6 - ) - - ax.set_xlabel("Hour of Day", fontsize=14, fontweight="bold") - ax.set_ylabel("Average kWh per 30-min", fontsize=14, fontweight="bold") - ax.set_title("Weekday vs Weekend Load Profiles", fontsize=17, fontweight="bold", pad=20) - ax.grid(True, alpha=0.3, linestyle="--") - ax.legend(fontsize=13, framealpha=0.95) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 60) - print("PROFESSIONAL RESIDENTIAL LOAD VISUALIZATION") - print("=" * 60) - - # YOUR ACTUAL FILE - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED.parquet" - - print(f"\n📂 Using: {data_file}") - - # Check file exists - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - print("Please update the path!") - exit(1) - - # Create visualizations (all memory-safe) - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison(data_file, "residential_weekend_comparison.png") - - print("\n" + "=" * 60) - print("✅ COMPLETE!") - print("=" * 60) diff --git a/archive/residential_load_viz_2024/create_visualizations_cm90.py b/archive/residential_load_viz_2024/create_visualizations_cm90.py deleted file mode 100644 index b3df259..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_cm90.py +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env python -""" -Professional visualizations using CM90 (quality-filtered) data. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def create_monthly_heatmap_cm90(data_path: str, output_path: str = "heatmap_cm90.png"): - """Create professional seaborn heatmap from CM90 data.""" - print("\n📊 Creating CM90 heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Quality-Filtered Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns (CM90 Quality-Filtered)\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Households\n" - f"Natural Gas Heating | Electric Air Conditioning", - fontsize=17, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_cm90(data_path: str, output_path: str = "hourly_profile_cm90.png"): - """Create hourly profile from CM90 data.""" - print("\n📊 Creating CM90 hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of 
Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage (CM90 Quality-Filtered)\n" - f"{n_customers:,} Chicago Households • June 2024 - June 2025", - fontsize=17, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak", zorder=0) - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison_cm90(data_path: str, output_path: str = "weekend_comparison_cm90.png"): - """Compare weekday vs weekend from CM90 data.""" - print("\n📊 Creating CM90 weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles (CM90 Quality-Filtered)\nTemporal Consumption Patterns", - fontsize=17, - fontweight="bold", - pad=25, - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Annotations - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - 
fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Low\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Low\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CM90 QUALITY-FILTERED VISUALIZATIONS") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - create_monthly_heatmap_cm90(data_file, "residential_heatmap_cm90.png") - create_hourly_profile_cm90(data_file, "residential_hourly_cm90.png") - create_weekend_comparison_cm90(data_file, "residential_weekend_cm90.png") - - print("\n" + "=" * 80) - print("✅ CM90 VISUALIZATIONS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📈 residential_heatmap_cm90.png") - print(" 📈 residential_hourly_cm90.png") - print(" 📈 residential_weekend_cm90.png") - print("\nNote: These use quality-filtered data (≥90% valid days)") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_enhanced.py b/archive/residential_load_viz_2024/create_visualizations_enhanced.py deleted file mode 100644 index 5868fbf..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_enhanced.py +++ /dev/null @@ -1,447 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations with summary statistics. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -sns.set_palette("husl") - -# Professional typography -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def generate_summary_statistics(data_path: str): - """Generate comprehensive summary statistics table.""" - print("\n📊 Generating Summary Statistics...") - - lf = pl.scan_parquet(data_path) - - # Overall statistics - overall = lf.select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_observations"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.col("kwh").min().alias("min_kwh"), - pl.col("kwh").max().alias("max_kwh"), - pl.col("kwh").quantile(0.25).alias("q25_kwh"), - pl.col("kwh").quantile(0.75).alias("q75_kwh"), - ]).collect(engine="streaming") - - # Monthly breakdown - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.len().alias("observations"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").sum().alias("total_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - # Weekday vs Weekend - weekend_stats = ( - lf.group_by("is_weekend") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .collect(engine="streaming") - ) - - # Hourly statistics - hourly_stats = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Print comprehensive report - print("\n" + "=" * 80) - print("COMPREHENSIVE SUMMARY STATISTICS") - print("=" * 80) - - print("\n📋 OVERALL STATISTICS") - print("-" * 80) - print(f" Total Unique Customers: {overall['unique_customers'][0]:>12,}") - print(f" Total Observations: {overall['total_observations'][0]:>12,}") - print(f" Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}") - print(f" Average kWh per 30-min: {overall['mean_kwh'][0]:>12.4f}") - print(f" Median kWh per 30-min: {overall['median_kwh'][0]:>12.4f}") - print(f" Std Dev kWh: {overall['std_kwh'][0]:>12.4f}") - print(f" Min kWh: {overall['min_kwh'][0]:>12.4f}") - print(f" Max kWh: {overall['max_kwh'][0]:>12.4f}") - print(f" 25th Percentile: {overall['q25_kwh'][0]:>12.4f}") - print(f" 75th Percentile: {overall['q75_kwh'][0]:>12.4f}") - - # Estimated daily/annual usage - daily_kwh = overall["mean_kwh"][0] * 48 # 48 half-hour intervals per day - annual_kwh = daily_kwh * 365 - print(f"\n Est. Daily Usage per Home: {daily_kwh:>12.1f} kWh/day") - print(f" Est. 
Annual Usage per Home: {annual_kwh:>12,.0f} kWh/year") - - print("\n📅 MONTHLY BREAKDOWN") - print("-" * 80) - print(f"{'Month':<12} {'Customers':>10} {'Observations':>15} {'Avg kWh':>12} {'Total kWh':>15}") - print("-" * 80) - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - print(f"{month_label:<12} {row[1]:>10,} {row[2]:>15,} {row[3]:>12.4f} {row[4]:>15,.0f}") - - print("\n📊 WEEKDAY vs WEEKEND COMPARISON") - print("-" * 80) - print(f"{'Period':<15} {'Mean kWh':>12} {'Median kWh':>12} {'Std Dev':>12}") - print("-" * 80) - for row in weekend_stats.iter_rows(): - period = "Weekend" if row[0] else "Weekday" - print(f"{period:<15} {row[1]:>12.4f} {row[2]:>12.4f} {row[3]:>12.4f}") - - print("\n⏰ HOURLY STATISTICS (Peak Hours)") - print("-" * 80) - # Show top 5 and bottom 5 hours - top5 = hourly_stats.sort("mean_kwh", descending=True).head(5) - bottom5 = hourly_stats.sort("mean_kwh").head(5) - - print(" Highest Usage Hours:") - for row in top5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n Lowest Usage Hours:") - for row in bottom5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n" + "=" * 80) - - # Save to file - with open("summary_statistics.txt", "w") as f: - f.write("=" * 80 + "\n") - f.write("COMPREHENSIVE SUMMARY STATISTICS\n") - f.write("=" * 80 + "\n\n") - f.write(f"Total Customers: {overall['unique_customers'][0]:,}\n") - f.write(f"Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}\n") - f.write(f"Mean kWh per 30-min: {overall['mean_kwh'][0]:.4f}\n") - f.write(f"Estimated Daily Usage: {daily_kwh:.1f} kWh/day\n") - f.write(f"Estimated Annual Usage: {annual_kwh:,.0f} kWh/year\n") - f.write("\nMonthly Breakdown:\n") - f.write(monthly.write_csv()) - - print("\n✅ Summary saved to: summary_statistics.txt") - - return overall, monthly, weekend_stats, hourly_stats - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_seaborn.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating professional seaborn heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure with seaborn - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", # Professional diverging colormap - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - annot=False, - ) - - # Professional styling - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, 
fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Random Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - # Rotate labels - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) - - # Invert y-axis - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_seaborn(data_path: str, output_path: str = "hourly_profile_seaborn.png"): - """Create professional seaborn hourly profile.""" - print("\n📊 Creating professional seaborn hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Create figure - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Use seaborn lineplot style - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range (25th-75th percentile)") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - # Styling - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min interval)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Illinois Households • June 2024 - June 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Time-of-day shading - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight (12-6 AM)", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak (5-9 PM)", zorder=0) - - # Annotations with improved styling - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak Demand\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=0.3"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=-0.3"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - 
-def create_weekend_comparison_seaborn(data_path: str, output_path: str = "weekend_comparison_seaborn.png"): - """Compare weekday vs weekend with peak/baseload annotations.""" - print("\n📊 Creating professional weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - # Create figure - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - # Seaborn style lines - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - # Styling - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", fontsize=18, fontweight="bold", pad=25 - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Peak annotations for BOTH weekday and weekend - # Weekday peak - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekend peak - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Baseload annotations - # Weekday baseload - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Baseload\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - # Weekend baseload - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Baseload\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", 
framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("PROFESSIONAL RESIDENTIAL LOAD ANALYSIS & VISUALIZATION") - print("=" * 80) - - data_file = "../zip60622_2024/final/sample_60622_202406_202506_CLIPPED.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - # 1. Generate comprehensive summary statistics - generate_summary_statistics(data_file) - - # 2. Create professional visualizations - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile_seaborn(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison_seaborn(data_file, "residential_weekend_comparison_professional.png") - - print("\n" + "=" * 80) - print("✅ ANALYSIS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📊 summary_statistics.txt") - print(" 📈 residential_load_heatmap_professional.png") - print(" 📈 residential_hourly_profile_professional.png") - print(" 📈 residential_weekend_comparison_professional.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_final.py b/archive/residential_load_viz_2024/create_visualizations_final.py deleted file mode 100644 index 48275df..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_final.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env python -""" -Final professional visualizations with all corrections. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Set seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def create_monthly_heatmap(data_path: str, output_path: str = "heatmap_final.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating heatmap...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 only - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - print(f" Unique households: {n_customers:,}") - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - # Check what months we actually have - available_months = sorted(monthly_hourly["sample_month"].unique().to_list()) - print(f" Available months: {', '.join(available_months)}") - if "202507" not in available_months: - print(" ⚠️ Note: July 2025 (202507) missing from source data") - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if 
c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str = "hourly_profile_final.png"): - """Create hourly profile with seaborn.""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - # Seaborn fill and line - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage\n{n_customers:,} Chicago Households • August 2024 - August 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak", zorder=0) - - # Annotations - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - 
fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str = "weekend_comparison_final.png"): - """Compare weekday vs weekend with seaborn.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - # Filter to Aug 2024 - Aug 2025 - desired_months = [f"2024{m:02d}" for m in range(8, 13)] + [f"2025{m:02d}" for m in range(1, 9)] - lf = lf.filter(pl.col("sample_month").is_in(desired_months)) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) # Taller to avoid overlap - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - # Seaborn lines - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", - fontsize=18, - fontweight="bold", - pad=30, # More padding - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - # Set y-limit to give more headroom for annotations - max_val = max(max(weekday_kwh), max(weekend_kwh)) - ax.set_ylim(0, max_val * 1.25) # 25% headroom - - # Annotations - adjusted positions to avoid overlap - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 4, weekday_peak + 0.03), # Closer to point - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 3, weekend_peak + 0.03), # Closer to point - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Baseload annotations - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Low\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, 
weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.04), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Low\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.04), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("FINAL PROFESSIONAL VISUALIZATIONS") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - print("\n✓ Using seaborn for all visualizations") - print("✓ Filtering to August 2024 - August 2025 only") - - create_monthly_heatmap(data_file, "residential_heatmap_final.png") - create_hourly_profile(data_file, "residential_hourly_final.png") - create_weekend_comparison(data_file, "residential_weekend_final.png") - - print("\n" + "=" * 80) - print("✅ VISUALIZATIONS COMPLETE!") - print("=" * 80) - print("\nGenerated files:") - print(" 📈 residential_heatmap_final.png") - print(" 📈 residential_hourly_final.png") - print(" 📈 residential_weekend_final.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/create_visualizations_with_qc.py b/archive/residential_load_viz_2024/create_visualizations_with_qc.py deleted file mode 100644 index d1c52e9..0000000 --- a/archive/residential_load_viz_2024/create_visualizations_with_qc.py +++ /dev/null @@ -1,593 +0,0 @@ -#!/usr/bin/env python -""" -Professional-quality load visualizations with QC validation. 
-""" - -import sys -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Import your transformation QC functions -sys.path.insert(0, str(Path(__file__).resolve().parents[2])) -from smart_meter_analysis.transformation import daily_interval_qc, dst_transition_dates - -# Set seaborn style -sns.set_style("whitegrid") -sns.set_palette("husl") - -# Professional typography -plt.rcParams["font.family"] = "serif" -plt.rcParams["font.serif"] = ["Garamond", "Georgia", "Times New Roman"] -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["xtick.labelsize"] = 10 -plt.rcParams["ytick.labelsize"] = 10 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - - -def validate_data_quality(data_path: str): - """Run comprehensive data quality checks using transformation QC functions.""" - print("\n" + "=" * 80) - print("DATA QUALITY VALIDATION") - print("=" * 80) - - lf = pl.scan_parquet(data_path) - - # Load a representative sample (faster QC) - print("\n🔍 Loading sample for quality checks...") - sample_size = 500000 # Half million rows should be representative - df_sample = lf.limit(sample_size).collect() - print(f" Analyzing {len(df_sample):,} rows (sample)") - - # Run interval count QC - print("\n📊 Running Interval Count Analysis...") - qc = daily_interval_qc(df_sample) - - # Day type distribution - day_types = ( - qc.group_by("day_type") - .agg([pl.len().alias("count"), pl.col("n_intervals").mean().alias("avg_intervals")]) - .sort("day_type") - ) - - print("\n Day Type Distribution:") - print(" " + "-" * 60) - print(f" {'Type':<20} {'Count':>10} {'Avg Intervals':>15}") - print(" " + "-" * 60) - for row in day_types.iter_rows(): - print(f" {row[0]:<20} {row[1]:>10,} {row[2]:>15.1f}") - - # Check for odd days - odd_days = qc.filter(pl.col("is_odd_count")) - total_days = qc.height - pct_valid = ((total_days - odd_days.height) / total_days) * 100 - - print("\n Quality Metrics:") - print(f" {'Total customer-days analyzed:':<40} {total_days:>10,}") - print(f" {'Days with valid counts (46/48/50):':<40} {total_days - odd_days.height:>10,}") - print(f" {'Days with odd counts:':<40} {odd_days.height:>10,}") - print(f" {'Data quality score:':<40} {pct_valid:>9.2f}%") - - if odd_days.height > 0: - print(f"\n ⚠️ WARNING: {odd_days.height} days have unusual interval counts") - print(" Sample of odd days:") - print(odd_days.select(["account_identifier", "date", "n_intervals"]).head(5)) - else: - print("\n ✅ EXCELLENT: All days have valid interval counts!") - - # DST transition validation - print("\n📅 DST Transition Analysis...") - try: - # Get full dataset for DST check (need complete date range) - dst_dates = dst_transition_dates(df_sample) - - if dst_dates.height > 0: - print("\n Detected DST Transition Dates:") - print(" " + "-" * 60) - for row in dst_dates.iter_rows(): - print(f" {row[0]} - {row[1].upper()}") - print("\n ✅ DST transitions properly detected and handled") - else: - print(" ℹ️ No DST transitions in sampled data") - except Exception as e: - print(f" ⚠️ Could not validate DST: {e}") - - # Interval distribution across hours - print("\n⏰ Hourly Interval Distribution...") - hourly_counts = df_sample.group_by("hour").agg(pl.len().alias("count")).sort("hour") - - avg_per_hour = hourly_counts["count"].mean() - std_per_hour = hourly_counts["count"].std() - print(f" Average intervals per hour: 
{avg_per_hour:,.0f} (±{std_per_hour:,.0f})") - - # Check for missing hours - expected_hours = set(range(24)) - actual_hours = set(hourly_counts["hour"].to_list()) - missing_hours = expected_hours - actual_hours - - if missing_hours: - print(f" ⚠️ Missing hours: {sorted(missing_hours)}") - else: - print(" ✅ All 24 hours represented") - - # kWh value validation - print("\n⚡ Energy Value (kWh) Validation...") - kwh_stats = df_sample.select([ - pl.col("kwh").min().alias("min"), - pl.col("kwh").quantile(0.01).alias("p01"), - pl.col("kwh").median().alias("median"), - pl.col("kwh").quantile(0.99).alias("p99"), - pl.col("kwh").max().alias("max"), - (pl.col("kwh") < 0).sum().alias("negative_count"), - pl.col("kwh").is_null().sum().alias("null_count"), - ]) - - print(f" Min kWh: {kwh_stats['min'][0]:>10.4f}") - print(f" 1st percentile: {kwh_stats['p01'][0]:>10.4f}") - print(f" Median kWh: {kwh_stats['median'][0]:>10.4f}") - print(f" 99th percentile: {kwh_stats['p99'][0]:>10.4f}") - print(f" Max kWh: {kwh_stats['max'][0]:>10.4f}") - print(f" Negative values: {kwh_stats['negative_count'][0]:>10,}") - print(f" Null values: {kwh_stats['null_count'][0]:>10,}") - - if kwh_stats["negative_count"][0] > 0: - print(" ⚠️ WARNING: Negative kWh values detected!") - if kwh_stats["null_count"][0] > 0: - print(" ⚠️ WARNING: Null kWh values detected!") - if kwh_stats["negative_count"][0] == 0 and kwh_stats["null_count"][0] == 0: - print(" ✅ All kWh values are valid (no negatives or nulls)") - - # Overall quality assessment - print("\n" + "=" * 80) - print("OVERALL DATA QUALITY ASSESSMENT") - print("=" * 80) - - issues = [] - if odd_days.height > 0: - issues.append(f"{odd_days.height} days with odd interval counts") - if kwh_stats["negative_count"][0] > 0: - issues.append("Negative kWh values present") - if kwh_stats["null_count"][0] > 0: - issues.append("Null kWh values present") - if missing_hours: - issues.append("Some hours missing from data") - - if not issues: - print("✅ EXCELLENT: Data quality is very high!") - print(" - All interval counts valid (46/48/50)") - print(" - No negative or null kWh values") - print(" - All hours represented") - print(" - Ready for analysis") - quality_grade = "A+" - elif len(issues) == 1 and "odd interval" in issues[0] and odd_days.height < total_days * 0.01: - print("✅ GOOD: Data quality is acceptable") - print(f" - Minor issues: {issues[0]}") - print(" - Should not significantly impact analysis") - quality_grade = "B+" - else: - print("⚠️ FAIR: Some data quality issues detected:") - for issue in issues: - print(f" - {issue}") - print(" - Review issues before drawing conclusions") - quality_grade = "C" - - print(f"\n Quality Grade: {quality_grade}") - print("=" * 80 + "\n") - - return quality_grade - - -def generate_summary_statistics(data_path: str): - """Generate comprehensive summary statistics table.""" - print("\n" + "=" * 80) - print("COMPREHENSIVE SUMMARY STATISTICS") - print("=" * 80) - - lf = pl.scan_parquet(data_path) - - # Overall statistics - overall = lf.select([ - pl.col("account_identifier").n_unique().alias("unique_customers"), - pl.len().alias("total_observations"), - pl.min("date").alias("start_date"), - pl.max("date").alias("end_date"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.col("kwh").min().alias("min_kwh"), - pl.col("kwh").max().alias("max_kwh"), - pl.col("kwh").quantile(0.25).alias("q25_kwh"), - pl.col("kwh").quantile(0.75).alias("q75_kwh"), - 
]).collect(engine="streaming") - - # Monthly breakdown - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.len().alias("observations"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").sum().alias("total_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - # Weekday vs Weekend - weekend_stats = ( - lf.group_by("is_weekend") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").median().alias("median_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .collect(engine="streaming") - ) - - # Hourly statistics - hourly_stats = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - # Print comprehensive report - print("\n📋 OVERALL STATISTICS") - print("-" * 80) - print(f" Total Unique Customers: {overall['unique_customers'][0]:>12,}") - print(f" Total Observations: {overall['total_observations'][0]:>12,}") - print(f" Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}") - print(f" Average kWh per 30-min: {overall['mean_kwh'][0]:>12.4f}") - print(f" Median kWh per 30-min: {overall['median_kwh'][0]:>12.4f}") - print(f" Std Dev kWh: {overall['std_kwh'][0]:>12.4f}") - print(f" Min kWh: {overall['min_kwh'][0]:>12.4f}") - print(f" Max kWh: {overall['max_kwh'][0]:>12.4f}") - print(f" 25th Percentile: {overall['q25_kwh'][0]:>12.4f}") - print(f" 75th Percentile: {overall['q75_kwh'][0]:>12.4f}") - - # Estimated daily/annual usage - daily_kwh = overall["mean_kwh"][0] * 48 # 48 half-hour intervals per day - annual_kwh = daily_kwh * 365 - print(f"\n Est. Daily Usage per Home: {daily_kwh:>12.1f} kWh/day") - print(f" Est. Annual Usage per Home: {annual_kwh:>12,.0f} kWh/year") - - print("\n📅 MONTHLY BREAKDOWN") - print("-" * 80) - print(f"{'Month':<12} {'Customers':>10} {'Observations':>15} {'Avg kWh':>12} {'Total kWh':>15}") - print("-" * 80) - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - print(f"{month_label:<12} {row[1]:>10,} {row[2]:>15,} {row[3]:>12.4f} {row[4]:>15,.0f}") - - print("\n📊 WEEKDAY vs WEEKEND COMPARISON") - print("-" * 80) - print(f"{'Period':<15} {'Mean kWh':>12} {'Median kWh':>12} {'Std Dev':>12}") - print("-" * 80) - for row in weekend_stats.iter_rows(): - period = "Weekend" if row[0] else "Weekday" - print(f"{period:<15} {row[1]:>12.4f} {row[2]:>12.4f} {row[3]:>12.4f}") - - print("\n⏰ HOURLY STATISTICS (Peak Hours)") - print("-" * 80) - # Show top 5 and bottom 5 hours - top5 = hourly_stats.sort("mean_kwh", descending=True).head(5) - bottom5 = hourly_stats.sort("mean_kwh").head(5) - - print(" Highest Usage Hours:") - for row in top5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n Lowest Usage Hours:") - for row in bottom5.iter_rows(): - print(f" Hour {row[0]:2d}:00 - {row[1]:.4f} kWh (±{row[2]:.4f})") - - print("\n" + "=" * 80) - - # Save to file - with open("summary_statistics.txt", "w") as f: - f.write("=" * 80 + "\n") - f.write("COMPREHENSIVE SUMMARY STATISTICS\n") - f.write("=" * 80 + "\n\n") - f.write(f"Total Customers: {overall['unique_customers'][0]:,}\n") - f.write(f"Date Range: {overall['start_date'][0]} to {overall['end_date'][0]}\n") - f.write(f"Mean kWh per 30-min: {overall['mean_kwh'][0]:.4f}\n") - f.write(f"Estimated Daily Usage: {daily_kwh:.1f} kWh/day\n") - f.write(f"Estimated Annual Usage: {annual_kwh:,.0f} kWh/year\n") - f.write("\nMonthly 
Breakdown:\n") - f.write(monthly.write_csv()) - - print("\n✅ Summary saved to: summary_statistics.txt") - - return overall, monthly, weekend_stats, hourly_stats - - -def create_monthly_heatmap(data_path: str, output_path: str = "load_heatmap_seaborn.png"): - """Create professional seaborn heatmap.""" - print("\n📊 Creating professional seaborn heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - # Create figure with seaborn - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n~{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - annot=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago ZIP 60622 • {date_range} • {n_customers:,} Random Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) - ax.invert_yaxis() - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile_seaborn(data_path: str, output_path: str = "hourly_profile_seaborn.png"): - """Create professional seaborn hourly profile.""" - print("\n📊 Creating professional seaborn hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range (25th-75th percentile)") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min interval)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage 
Profile\n{n_customers:,} Illinois Households • June 2024 - June 2025", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - ax.axvspan(0, 6, alpha=0.1, color="#3498db", label="Overnight (12-6 AM)", zorder=0) - ax.axvspan(17, 21, alpha=0.1, color="#e74c3c", label="Evening Peak (5-9 PM)", zorder=0) - - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak Demand\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=0.3"), - ) - - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000", connectionstyle="arc3,rad=-0.3"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison_seaborn(data_path: str, output_path: str = "weekend_comparison_seaborn.png"): - """Compare weekday vs weekend with peak/baseload annotations.""" - print("\n📊 Creating professional weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 8)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Weekday vs Weekend Load Profiles\nTemporal Consumption Patterns", fontsize=18, fontweight="bold", pad=25 - ) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - - # Weekday peak - weekday_peak_hour = weekday_kwh.index(max(weekday_kwh)) - weekday_peak = max(weekday_kwh) - ax.annotate( - f"Weekday Peak\n{weekday_peak:.3f} kWh\nat {weekday_peak_hour}:00", - xy=(weekday_peak_hour, weekday_peak), - xytext=(weekday_peak_hour - 3, weekday_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - 
arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekend peak - weekend_peak_hour = weekend_kwh.index(max(weekend_kwh)) - weekend_peak = max(weekend_kwh) - ax.annotate( - f"Weekend Peak\n{weekend_peak:.3f} kWh\nat {weekend_peak_hour}:00", - xy=(weekend_peak_hour, weekend_peak), - xytext=(weekend_peak_hour + 2, weekend_peak + 0.04), - fontsize=11, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.6", facecolor="#e67e22", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2, color="#000"), - ) - - # Weekday baseload - weekday_min = min(weekday_kwh) - weekday_min_hour = weekday_kwh.index(weekday_min) - ax.annotate( - f"Weekday Baseload\n{weekday_min:.3f} kWh", - xy=(weekday_min_hour, weekday_min), - xytext=(weekday_min_hour + 2, weekday_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#bdc3c7", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - # Weekend baseload - weekend_min = min(weekend_kwh) - weekend_min_hour = weekend_kwh.index(weekend_min) - ax.annotate( - f"Weekend Baseload\n{weekend_min:.3f} kWh", - xy=(weekend_min_hour, weekend_min), - xytext=(weekend_min_hour - 4, weekend_min + 0.05), - fontsize=10, - bbox=dict(boxstyle="round,pad=0.5", facecolor="#ecf0f1", alpha=0.9, edgecolor="#000", linewidth=1.5), - arrowprops=dict(arrowstyle="->", lw=1.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("PROFESSIONAL RESIDENTIAL LOAD ANALYSIS & VISUALIZATION") - print("With Integrated Data Quality Validation") - print("=" * 80) - - data_file = "analysis/zip60622_2024/final/sample_60622_202406_202506_CLIPPED_CM90.parquet" - - if not Path(data_file).exists(): - print(f"❌ File not found: {data_file}") - exit(1) - - # 1. DATA QUALITY VALIDATION (using your QC functions) - quality_grade = validate_data_quality(data_file) - - # 2. Generate comprehensive summary statistics - generate_summary_statistics(data_file) - - # 3. 
Create professional visualizations - create_monthly_heatmap(data_file, "residential_load_heatmap_professional.png") - create_hourly_profile_seaborn(data_file, "residential_hourly_profile_professional.png") - create_weekend_comparison_seaborn(data_file, "residential_weekend_comparison_professional.png") - - print("\n" + "=" * 80) - print("✅ ANALYSIS COMPLETE!") - print("=" * 80) - print(f"\nData Quality Grade: {quality_grade}") - print("\nGenerated files:") - print(" 📊 summary_statistics.txt") - print(" 📈 residential_load_heatmap_professional.png") - print(" 📈 residential_hourly_profile_professional.png") - print(" 📈 residential_weekend_comparison_professional.png") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/generate_summary_table.py b/archive/residential_load_viz_2024/generate_summary_table.py deleted file mode 100644 index a498311..0000000 --- a/archive/residential_load_viz_2024/generate_summary_table.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -"""Generate formatted summary table for paper.""" - -from pathlib import Path - -import polars as pl - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - -lf = pl.scan_parquet(DATA_FILE) - -monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("zipcode").n_unique().alias("zip_codes"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - pl.len().alias("observations"), - ]) - .sort("sample_month") - .collect(engine="streaming") -) - -print("=" * 80) -print("TABLE 1: Monthly Sample Summary") -print("=" * 80) -print() -print(f"{'Month':<12} {'Customers':<12} {'ZIP Codes':<12} {'Mean kWh':<12} {'Std Dev':<12} {'Observations':<15}") -print("-" * 80) - -for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - asterisk = "*" if row[0] == "202404" else " " - print(f"{month_label:<11}{asterisk} {row[1]:<12,} {row[2]:<12} {row[3]:<12.4f} {row[4]:<12.4f} {row[5]:<15,}") - -total_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] -total_obs = lf.select(pl.len()).collect(engine="streaming")[0, 0] - -print("-" * 80) -print(f"{'TOTAL':<12} {total_customers:<12,} {'-':<12} {'-':<12} {'-':<12} {total_obs:<15,}") -print() -print("*April 2024: Limited data availability due to utility data quality issues.") -print(" Sample size reduced to 229 customers (24% of typical) despite city-wide") -print(" sampling effort across 47 ZIP codes.") -print("=" * 80) - -# Save to file -output_file = OUTPUT_DIR / "summary_table.txt" -with open(output_file, "w") as f: - f.write("=" * 80 + "\n") - f.write("TABLE 1: Monthly Sample Summary\n") - f.write("=" * 80 + "\n\n") - f.write( - f"{'Month':<12} {'Customers':<12} {'ZIP Codes':<12} {'Mean kWh':<12} {'Std Dev':<12} {'Observations':<15}\n" - ) - f.write("-" * 80 + "\n") - - for row in monthly.iter_rows(): - month_label = f"{row[0][:4]}-{row[0][4:]}" - asterisk = "*" if row[0] == "202404" else " " - f.write( - f"{month_label:<11}{asterisk} {row[1]:<12,} {row[2]:<12} {row[3]:<12.4f} {row[4]:<12.4f} {row[5]:<15,}\n" - ) - - f.write("-" * 80 + "\n") - f.write(f"{'TOTAL':<12} {total_customers:<12,} {'-':<12} {'-':<12} {'-':<12} {total_obs:<15,}\n") - f.write("\n*April 2024: Limited data availability due to utility data quality issues.\n") - f.write(" Sample size reduced to 
229 customers (24% of typical) despite city-wide\n") - f.write(" sampling effort across 47 ZIP codes.\n") - -print(f"\n✅ Table saved to: {output_file}") - -# Also save CSV -csv_file = OUTPUT_DIR / "summary_table.csv" -monthly.write_csv(csv_file) -print(f"✅ CSV saved to: {csv_file}") diff --git a/archive/residential_load_viz_2024/sample_april_citywide.py b/archive/residential_load_viz_2024/sample_april_citywide.py deleted file mode 100755 index 5807864..0000000 --- a/archive/residential_load_viz_2024/sample_april_citywide.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python -""" -Sample ALL 55 Chicago ZIPs specifically for April 2024. -Target: Get enough customers so that even with 90% CM90 dropout, we still have ~1000. -""" - -import os -import subprocess - -# All 55 ZIPs with substantial April data (from investigation) -APRIL_ZIPS = [ - "60601", - "60602", - "60603", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Already sampled in pilot -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Only sample new ZIPs -NEW_ZIPS = [z for z in APRIL_ZIPS if z not in PILOT_ZIPS] - -# Sample 200 per ZIP to account for ~90% CM90 dropout -TARGET_PER_ZIP = 200 - -print("=" * 80) -print("APRIL 2024 CITY-WIDE BOOST") -print("=" * 80) -print(f"Sampling {len(NEW_ZIPS)} additional ZIPs for April only") -print(f"Target: {TARGET_PER_ZIP} customers per ZIP") -print(f"Expected raw: {len(NEW_ZIPS) * TARGET_PER_ZIP:,} samples") -print(f"Expected after CM90 (~10% pass): {len(NEW_ZIPS) * TARGET_PER_ZIP * 0.1:,.0f} customers") -print(f"Combined with pilot April: ~{len(NEW_ZIPS) * TARGET_PER_ZIP * 0.1 + 12:.0f} total April customers") -print("=" * 80) - -input("\nPress ENTER to start or Ctrl+C to cancel...") - -successful = [] -failed = [] - -for i, zipcode in enumerate(NEW_ZIPS, 1): - print(f"\n{'=' * 80}") - print(f"[{i}/{len(NEW_ZIPS)}] ZIP {zipcode} - April 2024 only") - print(f"{'=' * 80}") - - output_dir = f"analysis/chicago_april_citywide/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": "202404", # April only - "END_YYYYMM": "202404", # April only - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - ) - - if result.returncode == 0: - print(f"\n✅ ZIP {zipcode} completed") - successful.append(zipcode) - else: - print(f"\n⚠️ ZIP {zipcode} failed") - failed.append(zipcode) - - except KeyboardInterrupt: - print("\n\n⚠️ Interrupted!") - print(f"Completed: {len(successful)}") - print(f"Remaining: {len(NEW_ZIPS) - i}") - exit(1) - except Exception as e: - print(f"\n❌ ZIP {zipcode} error: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("APRIL BOOST COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(NEW_ZIPS)}") -if failed: - print(f"❌ Failed: {len(failed)} ZIPs") - print(f" {', '.join(failed)}") - -print("\nNext: python 
combine_with_april_boost.py") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_chicago_all_2024.py b/archive/residential_load_viz_2024/sample_chicago_all_2024.py deleted file mode 100755 index d09d3a4..0000000 --- a/archive/residential_load_viz_2024/sample_chicago_all_2024.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python -""" -Sample ALL 12 months of 2024 from multiple Chicago ZIP codes. -""" - -import os -import subprocess - -# All 53 Chicago ZIPs with substantial data -ALL_CHICAGO_ZIPS = [ - "60601", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Pilot: 10 diverse neighborhoods -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Configuration -USE_PILOT = True # Set to False for all 53 ZIPs -TARGET_PER_ZIP = 100 # 100 customers per ZIP per month -START_MONTH = "202412" # December (descending order, so this is "start") -END_MONTH = "202401" # January - ALL 12 MONTHS OF 2024 -BASE_OUTPUT = "analysis/chicago_2024_full_year" - -ZIPS_TO_SAMPLE = PILOT_ZIPS if USE_PILOT else ALL_CHICAGO_ZIPS - -print("=" * 80) -print("CHICAGO CITY-WIDE: ALL 12 MONTHS OF 2024") -print("=" * 80) -print(f"Mode: {'PILOT (10 ZIPs)' if USE_PILOT else 'FULL CITY (53 ZIPs)'}") -print(f"Customers per ZIP: {TARGET_PER_ZIP}") -print(f"Total target: {len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP:,} customers per month") -print("Months: January 2024 through December 2024 (ALL 12 MONTHS)") -print(f"Expected total rows: ~{len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP * 12 * 48 * 30:,}") -print("=" * 80) - -input(f"\nThis will sample {len(ZIPS_TO_SAMPLE)} ZIPs. 
Press ENTER to start or Ctrl+C to cancel...") - -successful = [] -failed = [] - -for i, zipcode in enumerate(ZIPS_TO_SAMPLE, 1): - print(f"\n{'=' * 80}") - print(f"[{i}/{len(ZIPS_TO_SAMPLE)}] Processing ZIP {zipcode}...") - print(f"{'=' * 80}") - - output_dir = f"{BASE_OUTPUT}/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": START_MONTH, - "END_YYYYMM": END_MONTH, - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - ) - - if result.returncode == 0: - print(f"\n✅ ZIP {zipcode} completed successfully") - successful.append(zipcode) - else: - print(f"\n⚠️ ZIP {zipcode} failed (code: {result.returncode})") - failed.append(zipcode) - - except KeyboardInterrupt: - print("\n\n⚠️ Interrupted by user!") - print(f"Completed: {len(successful)} ZIPs") - print(f"Failed: {len(failed)} ZIPs") - print(f"Remaining: {len(ZIPS_TO_SAMPLE) - i} ZIPs") - exit(1) - except Exception as e: - print(f"\n❌ ZIP {zipcode} exception: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("SAMPLING COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(ZIPS_TO_SAMPLE)} ZIPs") -if successful: - print(f" {', '.join(successful)}") - -if failed: - print(f"\n❌ Failed: {len(failed)} ZIPs") - print(f" {', '.join(failed)}") - -print("\nNext step: python combine_chicago_zips.py") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_chicago_multizip.py b/archive/residential_load_viz_2024/sample_chicago_multizip.py deleted file mode 100755 index 4c4e92e..0000000 --- a/archive/residential_load_viz_2024/sample_chicago_multizip.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python -""" -Sample from multiple Chicago ZIP codes for city-wide analysis. 
-""" - -import os -import subprocess - -# All 53 Chicago ZIPs with substantial data -ALL_CHICAGO_ZIPS = [ - "60601", - "60605", - "60606", - "60607", - "60608", - "60609", - "60610", - "60611", - "60612", - "60613", - "60614", - "60615", - "60616", - "60617", - "60618", - "60619", - "60620", - "60621", - "60622", - "60623", - "60624", - "60625", - "60626", - "60628", - "60629", - "60630", - "60631", - "60632", - "60633", - "60634", - "60636", - "60637", - "60638", - "60639", - "60640", - "60641", - "60642", - "60643", - "60644", - "60645", - "60646", - "60647", - "60649", - "60651", - "60652", - "60653", - "60654", - "60655", - "60656", - "60657", - "60659", - "60660", - "60661", -] - -# Pilot: representative sample of neighborhoods -PILOT_ZIPS = ["60622", "60614", "60615", "60625", "60629", "60639", "60647", "60653", "60657", "60660"] - -# Configuration -USE_PILOT = True # Set to False for full city -TARGET_PER_ZIP = 500 if USE_PILOT else 100 -START_MONTH = "202412" # Dec 2024 (descending, so this is start) -END_MONTH = "202406" # Jun 2024 -BASE_OUTPUT = "analysis/chicago_citywide" - -ZIPS_TO_SAMPLE = PILOT_ZIPS if USE_PILOT else ALL_CHICAGO_ZIPS - -print("=" * 80) -print("CHICAGO CITY-WIDE ELECTRICITY LOAD SAMPLING") -print("=" * 80) -print(f"Mode: {'PILOT (10 ZIPs)' if USE_PILOT else 'FULL CITY (53 ZIPs)'}") -print(f"Customers per ZIP: {TARGET_PER_ZIP}") -print(f"Total target: {len(ZIPS_TO_SAMPLE) * TARGET_PER_ZIP:,} customers") -print(f"Months: {END_MONTH} through {START_MONTH}") -print("=" * 80) - -successful = [] -failed = [] - -for i, zipcode in enumerate(ZIPS_TO_SAMPLE, 1): - print(f"\n[{i}/{len(ZIPS_TO_SAMPLE)}] Sampling ZIP {zipcode}...") - - output_dir = f"{BASE_OUTPUT}/zip{zipcode}" - - env = os.environ.copy() - env.update({ - "START_YYYYMM": START_MONTH, - "END_YYYYMM": END_MONTH, - "OUTPUT_DIR": output_dir, - "TARGET_CUSTOMERS": str(TARGET_PER_ZIP), - "ZIP5": zipcode, - }) - - try: - result = subprocess.run( - ["python", "sample_customers_60622.py"], - env=env, - capture_output=True, - text=True, - ) - - if result.returncode == 0: - print(f" ✅ ZIP {zipcode} completed") - successful.append(zipcode) - else: - print(f" ⚠️ ZIP {zipcode} failed (code: {result.returncode})") - print(f" Error: {result.stderr[-200:]}") - failed.append(zipcode) - - except Exception as e: - print(f" ❌ ZIP {zipcode} exception: {e}") - failed.append(zipcode) - -print("\n" + "=" * 80) -print("SAMPLING COMPLETE") -print("=" * 80) -print(f"✅ Successful: {len(successful)}/{len(ZIPS_TO_SAMPLE)} ZIPs") -print(f"❌ Failed: {len(failed)} ZIPs") - -if failed: - print(f"\nFailed ZIPs: {', '.join(failed)}") - -print("\nNext step: Run combine script to merge all ZIPs into one dataset") -print("=" * 80) diff --git a/archive/residential_load_viz_2024/sample_customers_60622.py b/archive/residential_load_viz_2024/sample_customers_60622.py deleted file mode 100644 index d283de1..0000000 --- a/archive/residential_load_viz_2024/sample_customers_60622.py +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/env python -""" -Sample up to 1,000 customers in ZIP (default 60622) for each month from START_YYYYMM -down to END_YYYYMM (descending), combine to one Parquet, then: - -1) CLIP: keep only rows whose calendar month matches sample_month (safety net for uploads that - include the next month’s first day). -2) CM90 (optional): keep customer-months with >=90% "OK days" (rows/day in {46,48,50}). 
- -Env (with defaults): - BUCKET=smart-meter-data-sb - PREFIX_BASE=sharepoint-files/Zip4 - ZIP5=60622 - TARGET_CUSTOMERS=1000 - START_YYYYMM=202509 - END_YYYYMM=202409 - OUTPUT_DIR=analysis/sample_zip60622_recent - RANDOM_SEED=42 - MAX_RETRIES=5 - RETRY_DELAY=1.5 - MAX_WORKERS=6 -""" - -from __future__ import annotations - -import csv -import logging -import os -import random -import sys -import tempfile -import time -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Optional, Set - -import boto3 -import polars as pl -from botocore.config import Config as BotoConfig -from botocore.exceptions import ClientError - -# ---------------- Config ---------------- -BUCKET = os.getenv("BUCKET", "smart-meter-data-sb") -PREFIX_BASE = os.getenv("PREFIX_BASE", "sharepoint-files/Zip4") -ZIP5 = os.getenv("ZIP5", "60622") -TARGET_CUSTOMERS = int(os.getenv("TARGET_CUSTOMERS", "1000")) - -START_YYYYMM = os.getenv("START_YYYYMM", "202509") -END_YYYYMM = os.getenv("END_YYYYMM", "202409") - -OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", f"analysis/sample_zip{ZIP5}_recent")) -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) -CHECKPOINT = OUTPUT_DIR / "checkpoints" -SHARDS_DIR = CHECKPOINT / "shards" -FINAL_DIR = OUTPUT_DIR / "final" -FINAL_DIR.mkdir(parents=True, exist_ok=True) - -RANDOM_SEED = int(os.getenv("RANDOM_SEED", "42")) -MAX_RETRIES = int(os.getenv("MAX_RETRIES", "5")) -RETRY_DELAY = float(os.getenv("RETRY_DELAY", "1.5")) -MAX_WORKERS = int(os.getenv("MAX_WORKERS", "6")) -BATCH_LOG_EVERY = int(os.getenv("BATCH_LOG_EVERY", "400")) - -random.seed(RANDOM_SEED) - -# output file names -RANGE_TAG = f"{END_YYYYMM}_{START_YYYYMM}" -FINAL_RAW = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}.parquet" -FINAL_CLIPPED = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}_CLIPPED.parquet" -FINAL_CLIPPED_CM90 = FINAL_DIR / f"sample_{ZIP5}_{RANGE_TAG}_CLIPPED_CM90.parquet" - -# -------------- Logging --------------- -LOG_FILE = OUTPUT_DIR / f"run_{datetime.now():%Y%m%d_%H%M%S}.log" -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()], -) -logger = logging.getLogger("zip-monthly-sampler") - -logger.info("=" * 86) -logger.info(f"ZIP {ZIP5} – sample up to {TARGET_CUSTOMERS:,}/month from {START_YYYYMM} → {END_YYYYMM} (descending)") -logger.info("=" * 86) -logger.info(f"S3 bucket/base: s3://{BUCKET}/{PREFIX_BASE}/") -logger.info(f"Output dir: {OUTPUT_DIR}") -logger.info(f"Final RAW: {FINAL_RAW}") -logger.info(f"Final CLIPPED: {FINAL_CLIPPED}") -logger.info(f"Final CM90: {FINAL_CLIPPED_CM90}") -logger.info("=" * 86) - -# -------------- Project transforms -------------- -# Make repo root importable if this script lives in analysis/... 
-sys.path.insert(0, str(Path(__file__).resolve().parents[2])) -from smart_meter_analysis.transformation import add_time_columns, transform_wide_to_long # noqa: E402 - -# -------------- S3 client -------------- -s3 = boto3.client( - "s3", - config=BotoConfig( - retries={"max_attempts": MAX_RETRIES, "mode": "standard"}, - max_pool_connections=max(8, MAX_WORKERS), - ), -) - - -# -------------- Helpers -------------- -def months_desc(start_yyyymm: str, end_yyyymm: str) -> List[str]: - """Inclusive descending list from start → end (e.g., 202509..202409).""" - sy, sm = int(start_yyyymm[:4]), int(start_yyyymm[4:]) - ey, em = int(end_yyyymm[:4]), int(end_yyyymm[4:]) - seq = [] - y, m = sy, sm - while (y > ey) or (y == ey and m >= em): - seq.append(f"{y:04d}{m:02d}") - # step back one month - m -= 1 - if m == 0: - y -= 1 - m = 12 - return seq - - -def _norm(name: str) -> str: - return "".join(ch.lower() for ch in name if ch.isalnum()) - - -def detect_id_col(columns: List[str]) -> Optional[str]: - candidates = {"accountidentifier", "accountid", "acctid", "acctidentifier"} - m = {_norm(c): c for c in columns} - for key in candidates: - if key in m: - return m[key] - if "ACCOUNT_IDENTIFIER" in columns: - return "ACCOUNT_IDENTIFIER" - return None - - -def month_keys(yyyymm: str, zip5: str) -> List[str]: - prefix = f"{PREFIX_BASE}/{yyyymm}/" - out: List[str] = [] - for page in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET, Prefix=prefix): - for obj in page.get("Contents", []): - k = obj["Key"] - # FIXED: Validate filename month matches folder month - if k.endswith(".csv") and f"_{zip5}-" in k and f"_{yyyymm}_" in k: - out.append(k) - return out - - -def download_tmp(key: str) -> Path: - fd, path = tempfile.mkstemp(prefix="s3_", suffix=".csv") - os.close(fd) - p = Path(path) - attempt = 0 - while True: - attempt += 1 - try: - resp = s3.get_object(Bucket=BUCKET, Key=key) - with open(p, "wb") as f: - for chunk in resp["Body"].iter_chunks(chunk_size=8 * 1024 * 1024): - if chunk: - f.write(chunk) - return p - except ClientError as e: - code = e.response.get("Error", {}).get("Code", "") - msg = e.response.get("Error", {}).get("Message", str(e)) - if code == "RequestTimeTooSkewed": - logger.error("Clock skew detected; sync system time and re-run. AWS: %s", msg) - raise - if attempt >= MAX_RETRIES: - logger.warning("get_object failed for %s: %s %s — giving up.", key, code, msg) - raise - time.sleep(RETRY_DELAY * attempt) - - -def ids_in_file(key: str) -> List[str]: - """Read ONLY the ID column from one CSV.""" - tmp = download_tmp(key) - try: - with open(tmp, newline="") as f: - header = next(csv.reader(f)) - id_col = detect_id_col(header) - if not id_col: - logger.warning("Skipping %s: no account id column. header=%s", key, header) - return [] - ids = ( - pl.read_csv(tmp, columns=[id_col]) - .select(pl.col(id_col).cast(pl.Utf8).str.strip_chars().alias("account_identifier")) - .drop_nulls() - .unique() - .to_series() - .to_list() - ) - return ids - except Exception as e: - logger.warning("Skipping %s: %s", key, e) - return [] - finally: - try: - tmp.unlink(missing_ok=True) - except Exception: - pass - - -def sample_ids_for_month(yyyymm: str, zip5: str, k: int, rng: random.Random) -> Dict[str, Set[str]]: - """Scan files in random order and collect up to k unique IDs. 
Returns {key -> set(ids)}.""" - keys = month_keys(yyyymm, zip5) - rng.shuffle(keys) - selected: Set[str] = set() - picked_by_key: Dict[str, Set[str]] = {} - logger.info("%s: %d candidate CSVs in ZIP%s", yyyymm, len(keys), zip5) - - for i, key in enumerate(keys, 1): - if len(selected) >= k: - break - cand = ids_in_file(key) - if not cand: - continue - new = [cid for cid in cand if cid not in selected] - if not new: - continue - need = k - len(selected) - take = new if len(new) <= need else rng.sample(new, need) - selected.update(take) - picked_by_key.setdefault(key, set()).update(take) - - if i % BATCH_LOG_EVERY == 0: - logger.info("%s: scanned %d/%d files — have %d/%d IDs", yyyymm, i, len(keys), len(selected), k) - - logger.info("%s: sampled %d unique IDs (target %d)", yyyymm, len(selected), k) - return picked_by_key - - -def transform_and_filter(local_csv: Path, keep_ids: Set[str]) -> Optional[pl.DataFrame]: - """Read CSV → early ID filter (wide) → project transforms → normalize ID → final filter.""" - try: - df_wide = pl.read_csv(local_csv) - except Exception as e: - logger.warning("Failed to read %s: %s", local_csv, e) - return None - - id_col = detect_id_col(df_wide.columns) - if id_col: - df_wide = df_wide.filter(pl.col(id_col).cast(pl.Utf8).str.strip_chars().is_in(keep_ids)) - if df_wide.height == 0: - return None - - df_long = transform_wide_to_long(df_wide) - df_long = add_time_columns(df_long) - - out_id = detect_id_col(df_long.columns) or "account_identifier" - if out_id != "account_identifier": - df_long = df_long.rename({out_id: "account_identifier"}) - - df_long = df_long.with_columns(pl.col("account_identifier").cast(pl.Utf8).str.strip_chars()).filter( - pl.col("account_identifier").is_in(keep_ids) - ) - return df_long if df_long.height > 0 else None - - -# -------------- Pipeline -------------- -def run() -> int: - rng = random.Random(RANDOM_SEED) - months = months_desc(START_YYYYMM, END_YYYYMM) - logger.info("Months to process (desc): %s", ", ".join(months)) - - FINAL_RAW.unlink(missing_ok=True) - total_rows = 0 - - for yyyymm in months: - picked_by_key = sample_ids_for_month(yyyymm, ZIP5, TARGET_CUSTOMERS, rng) - if not picked_by_key: - logger.warning("%s: no IDs sampled; skipping month.", yyyymm) - continue - - shard_dir = SHARDS_DIR / yyyymm - shard_dir.mkdir(parents=True, exist_ok=True) - wrote = 0 - - for idx, (key, ids) in enumerate(picked_by_key.items(), 1): - shard_path = shard_dir / (Path(key).name.replace(".csv", ".parquet")) - if shard_path.exists(): - wrote += 1 - continue - try: - tmp = download_tmp(key) - try: - df = transform_and_filter(tmp, ids) - if df is None: - continue - df = df.with_columns( - pl.lit(yyyymm).alias("sample_month"), - pl.lit(key).alias("source_key"), - ) - df.write_parquet(shard_path) - total_rows += df.height - wrote += 1 - finally: - try: - tmp.unlink(missing_ok=True) - except Exception: - pass - except Exception as e: - logger.warning("%s: failed %s: %s", yyyymm, key, e) - - if idx % BATCH_LOG_EVERY == 0: - logger.info("%s: processed %d files; shards: %d", yyyymm, idx, wrote) - - logger.info("%s: wrote %d shard(s) for sampled IDs", yyyymm, wrote) - - # Final sink (RAW) - logger.info("Sinking all shards to RAW Parquet...") - pl.scan_parquet(str(SHARDS_DIR / "*" / "*.parquet")).sink_parquet(FINAL_RAW) - - # CLIP to matching calendar month - logger.info("Clipping to matching calendar month per sample_month...") - lf = ( - pl.scan_parquet(FINAL_RAW) - .with_columns(pl.col("date").dt.strftime("%Y%m").alias("calendar_month")) - 
.filter(pl.col("calendar_month") == pl.col("sample_month")) - .drop("calendar_month") - ) - lf.sink_parquet(FINAL_CLIPPED) - - # CM90 (>=90% OK days, OK = 46/48/50 intervals) - logger.info("Building CM90 (>=90%% OK days; OK rows/day in {46,48,50})...") - lf_c = pl.scan_parquet(FINAL_CLIPPED) - daily = lf_c.group_by(["sample_month", "account_identifier", "date"]).agg(pl.len().alias("rows")) - ok_daily = ( - daily.filter(pl.col("rows").is_in([46, 48, 50])) - .group_by(["sample_month", "account_identifier"]) - .agg(pl.len().alias("ok_days")) - ) - days_in_month = ( - lf_c.select(["sample_month", "date"]).unique().group_by("sample_month").agg(pl.len().alias("days_in_month")) - ) - cm_keep = ( - ok_daily.join(days_in_month, on="sample_month", how="left") - .with_columns((pl.col("ok_days") / pl.col("days_in_month")).alias("pct_ok")) - .filter(pl.col("pct_ok") >= 0.90) - .select(["sample_month", "account_identifier"]) - ) - lf_c.join(cm_keep, on=["sample_month", "account_identifier"], how="inner").sink_parquet(FINAL_CLIPPED_CM90) - - # Quick summaries - def summary(path: Path, tag: str): - tbl = ( - pl.scan_parquet(path) - .group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("unique_ids"), - pl.len().alias("rows"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]) - .collect() - .sort("sample_month") - ) - logger.info("== %s ==", tag) - logger.info("\n%s", tbl) - - summary(FINAL_RAW, "RAW") - summary(FINAL_CLIPPED, "CLIPPED") - summary(FINAL_CLIPPED_CM90, "CLIPPED_CM90") - - logger.info("✅ Done.") - logger.info("RAW: %s", FINAL_RAW) - logger.info("CLIPPED: %s", FINAL_CLIPPED) - logger.info("CLIPPED_CM90:%s", FINAL_CLIPPED_CM90) - return 0 - - -if __name__ == "__main__": - try: - raise SystemExit(run()) - except KeyboardInterrupt: - logger.warning("Interrupted by user. Exiting.") - raise diff --git a/archive/residential_load_viz_2024/visualize_chicago_12months.py b/archive/residential_load_viz_2024/visualize_chicago_12months.py deleted file mode 100644 index ba52b4a..0000000 --- a/archive/residential_load_viz_2024/visualize_chicago_12months.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python -""" -Final visualizations: All 12 months of 2024 Chicago-wide data. -April clearly marked with caveat about limited data availability. 
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Seaborn style -sns.set_style("whitegrid") -plt.rcParams["font.family"] = "sans-serif" -plt.rcParams["font.size"] = 11 -plt.rcParams["axes.labelsize"] = 13 -plt.rcParams["axes.titlesize"] = 16 -plt.rcParams["legend.fontsize"] = 11 -plt.rcParams["figure.titlesize"] = 18 -plt.rcParams["axes.grid"] = True -plt.rcParams["grid.alpha"] = 0.3 - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - -def create_heatmap(data_path: str, output_path: str): - """Monthly-hourly heatmap with April footnote.""" - print("\n📊 Creating heatmap...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").sum().alias("total_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="total_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - fig, ax = plt.subplots(figsize=(16, 9)) - - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Total Energy Consumption (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - - # Mark April with asterisk - xtick_labels = ax.get_xticklabels() - for label in xtick_labels: - if "2024-04" in label.get_text(): - label.set_text(label.get_text() + "*") - label.set_fontweight("bold") - - ax.set_xticklabels(xtick_labels, rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - # Add footnote about April - fig.text( - 0.5, - 0.02, - "*April 2024: Limited data availability (n=229 vs. 
~950 typical) due to utility data quality issues", - ha="center", - fontsize=10, - style="italic", - color="#666666", - ) - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str): - """Average hourly profile.""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Chicago Households • 2024", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Peak annotation - peak_hour = mean.index(max(mean)) - peak_value = max(mean) - ax.annotate( - f"Peak\n{peak_value:.3f} kWh\nat {peak_hour}:00", - xy=(peak_hour, peak_value), - xytext=(peak_hour - 4, peak_value + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - # Baseload annotation - min_value = min(mean) - min_hour = mean.index(min_value) - ax.annotate( - f"Baseload\n{min_value:.3f} kWh\nat {min_hour}:00", - xy=(min_hour, min_value), - xytext=(min_hour + 3, min_value + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_monthly_profile(data_path: str, output_path: str): - """Monthly average with April clearly marked.""" - print("\n📊 Creating monthly profile...") - - lf = pl.scan_parquet(data_path) - - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - - months = monthly["sample_month"].to_list() - mean_kwh = monthly["mean_kwh"].to_list() - customers = monthly["customers"].to_list() - - # Month labels - month_labels = [f"{m[4:]}/{m[2:4]}" for m in months] # MM/YY format - - # Color bars - April in 
different color - colors = ["#e74c3c" if m == "202404" else "#3498db" for m in months] - - bars = ax.bar(range(len(months)), mean_kwh, color=colors, edgecolor="black", linewidth=1.5) - - # Add customer count on top of each bar - for i, (bar, count) in enumerate(zip(bars, customers)): - height = bar.get_height() - label = f"n={count}" - if months[i] == "202404": - label += "*" - ax.text( - bar.get_x() + bar.get_width() / 2.0, - height + 0.01, - label, - ha="center", - va="bottom", - fontsize=9, - fontweight="bold", - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Monthly Average Electricity Consumption\nChicago 2024", fontsize=18, fontweight="bold", pad=25) - - ax.set_xticks(range(len(months))) - ax.set_xticklabels(month_labels, fontsize=12) - ax.set_ylim(0, max(mean_kwh) * 1.15) - ax.grid(True, alpha=0.3, axis="y") - - # Legend - from matplotlib.patches import Patch - - legend_elements = [ - Patch(facecolor="#3498db", edgecolor="black", label="Normal months"), - Patch(facecolor="#e74c3c", edgecolor="black", label="April (limited data)"), - ] - ax.legend(handles=legend_elements, loc="upper left", fontsize=11) - - # Footnote - fig.text( - 0.5, - 0.02, - "*April 2024: n=229 (24% of typical) due to utility data quality issues", - ha="center", - fontsize=10, - style="italic", - color="#666666", - ) - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str): - """Weekday vs weekend.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) - - weekday_hours = weekday["hour"].to_list() - weekday_kwh = weekday["mean_kwh"].to_list() - weekend_hours = weekend["hour"].to_list() - weekend_kwh = weekend["mean_kwh"].to_list() - - sns.lineplot( - x=weekday_hours, y=weekday_kwh, linewidth=3.5, color="#2980b9", marker="o", markersize=8, ax=ax, label="Weekday" - ) - sns.lineplot( - x=weekend_hours, y=weekend_kwh, linewidth=3.5, color="#e67e22", marker="s", markersize=8, ax=ax, label="Weekend" - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Weekday vs Weekend Load Profiles\nChicago 2024", fontsize=18, fontweight="bold", pad=30) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - max_val = max(max(weekday_kwh), max(weekend_kwh)) - ax.set_ylim(0, max_val * 1.25) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CHICAGO 2024 FINAL VISUALIZATIONS") - print("12 Months • 10,053 Unique Customers • 55 ZIP Codes") - print("=" * 80) - - if not Path(DATA_FILE).exists(): - 
print(f"❌ File not found: {DATA_FILE}") - exit(1) - - create_heatmap(DATA_FILE, OUTPUT_DIR / "chicago_2024_heatmap.png") - create_hourly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_hourly_profile.png") - create_monthly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_monthly_profile.png") - create_weekend_comparison(DATA_FILE, OUTPUT_DIR / "chicago_2024_weekend_comparison.png") - - print("\n" + "=" * 80) - print("✅ ALL VISUALIZATIONS COMPLETE!") - print("=" * 80) - print(f"\nOutput directory: {OUTPUT_DIR}") - print("\nGenerated files:") - print(" 📈 chicago_2024_heatmap.png") - print(" 📈 chicago_2024_hourly_profile.png") - print(" 📈 chicago_2024_monthly_profile.png") - print(" 📈 chicago_2024_weekend_comparison.png") - print("\nAll figures include April 2024 data with appropriate caveats.") - print("=" * 80) diff --git a/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py b/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py deleted file mode 100644 index a88d889..0000000 --- a/archive/residential_load_viz_2024/visualize_chicago_12months_fixed.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python -""" -Final visualizations for Chicago 2024 (CM90 dataset): -- Heatmap shows MEAN kWh per 30-min per customer. -- Monthly bar chart annotates each bar with the month’s mean kWh (not sample size). -- No April-specific asterisks or footnotes anywhere. - -Inputs/outputs: - DATA_FILE -> path to combined CM90 parquet - OUTPUT_DIR -> folder for PNGs -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import polars as pl -import seaborn as sns - -# Style -sns.set_style("whitegrid") -plt.rcParams.update({ - "font.family": "sans-serif", - "font.size": 11, - "axes.labelsize": 13, - "axes.titlesize": 16, - "legend.fontsize": 11, - "figure.titlesize": 18, - "axes.grid": True, - "grid.alpha": 0.3, -}) - -DATA_FILE = "analysis/chicago_2024_full_year/combined/chicago_2024_with_april_boost_CM90.parquet" -OUTPUT_DIR = Path("analysis/chicago_2024_full_year/final_visualizations") -OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - -def create_heatmap(data_path: str, output_path: str): - """Monthly-hourly heatmap (MEAN kWh per customer).""" - print("\n📊 Creating heatmap (MEAN kWh per customer)...") - - lf = pl.scan_parquet(data_path) - - stats = lf.select([ - pl.col("account_identifier").n_unique().alias("n_customers"), - pl.min("date").alias("min_date"), - pl.max("date").alias("max_date"), - ]).collect(engine="streaming") - n_customers = stats["n_customers"][0] - date_range = f"{stats['min_date'][0]} to {stats['max_date'][0]}" - - monthly_hourly = ( - lf.group_by(["sample_month", "hour"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["sample_month", "hour"]) - .collect(engine="streaming") - ) - - matrix = monthly_hourly.pivot(index="hour", columns="sample_month", values="mean_kwh").fill_null(0) - - hour_labels = matrix.select("hour").to_series().to_list() - month_cols = sorted([c for c in matrix.columns if c != "hour"]) - data_matrix = matrix.select(month_cols).to_numpy() - - fig, ax = plt.subplots(figsize=(16, 9)) - sns.heatmap( - data_matrix, - cmap="RdYlBu_r", - cbar_kws={ - "label": f"Mean Energy Consumption per Customer (kWh)\n{n_customers:,} Households", - "shrink": 0.75, - "pad": 0.02, - "aspect": 30, - }, - xticklabels=[f"{m[:4]}-{m[4:]}" for m in month_cols], - yticklabels=hour_labels, - linewidths=0.8, - linecolor="white", - ax=ax, - robust=True, - square=False, - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - 
ax.set_ylabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - "Residential Electricity Load Patterns: Temporal Heat Map\n" - f"Chicago • {date_range} • {n_customers:,} Households", - fontsize=18, - fontweight="bold", - pad=25, - ) - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11) - ax.invert_yaxis() - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_hourly_profile(data_path: str, output_path: str): - """Average hourly profile across the year (mean and IQR).""" - print("\n📊 Creating hourly profile...") - - lf = pl.scan_parquet(data_path) - n_customers = lf.select(pl.col("account_identifier").n_unique()).collect(engine="streaming")[0, 0] - - hourly = ( - lf.group_by("hour") - .agg([ - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").quantile(0.25).alias("p25_kwh"), - pl.col("kwh").quantile(0.75).alias("p75_kwh"), - ]) - .sort("hour") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - hours = hourly["hour"].to_list() - mean = hourly["mean_kwh"].to_list() - p25 = hourly["p25_kwh"].to_list() - p75 = hourly["p75_kwh"].to_list() - - ax.fill_between(hours, p25, p75, alpha=0.3, color="#e74c3c", label="Interquartile Range") - sns.lineplot(x=hours, y=mean, linewidth=3.5, color="#c0392b", marker="o", markersize=8, ax=ax, label="Mean Usage") - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Energy Consumption (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title( - f"Average Hourly Electricity Usage Profile\n{n_customers:,} Chicago Households • 2024", - fontsize=18, - fontweight="bold", - pad=25, - ) - - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - ax.set_ylim(bottom=0) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - - # Peak annotation - peak_idx = mean.index(max(mean)) - peak_val = mean[peak_idx] - ax.annotate( - f"Peak\n{peak_val:.3f} kWh\nat {peak_idx}:00", - xy=(peak_idx, peak_val), - xytext=(peak_idx - 4, peak_val + 0.04), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#f39c12", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - # Baseload annotation - min_val = min(mean) - min_idx = mean.index(min_val) - ax.annotate( - f"Baseload\n{min_val:.3f} kWh\nat {min_idx}:00", - xy=(min_idx, min_val), - xytext=(min_idx + 3, min_val + 0.06), - fontsize=12, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.7", facecolor="#3498db", alpha=0.9, edgecolor="#000", linewidth=2), - arrowprops=dict(arrowstyle="->", lw=2.5, color="#000"), - ) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=12) - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_monthly_profile(data_path: str, output_path: str): - """ - Monthly average bar chart. - - Bars represent mean kWh per 30-min (as before). - - Annotation above each bar now shows the same average (kWh), NOT the sample count. - - No April asterisk/footnote. 
- """ - print("\n📊 Creating monthly profile (annotate with mean kWh)...") - - lf = pl.scan_parquet(data_path) - - monthly = ( - lf.group_by("sample_month") - .agg([ - pl.col("account_identifier").n_unique().alias("customers"), - pl.col("kwh").mean().alias("mean_kwh"), - pl.col("kwh").std().alias("std_kwh"), - ]) - .sort("sample_month") - .collect(engine="streaming") - ) - - fig, ax = plt.subplots(figsize=(15, 8)) - months = monthly["sample_month"].to_list() - mean_kwh = monthly["mean_kwh"].to_list() - - month_labels = [f"{m[4:]}/{m[2:4]}" for m in months] - colors = ["#3498db"] * len(months) - - bars = ax.bar(range(len(months)), mean_kwh, color=colors, edgecolor="black", linewidth=1.5) - - # Annotate each bar with its average energy - for i, bar in enumerate(bars): - h = bar.get_height() - ax.text( - bar.get_x() + bar.get_width() / 2.0, - h + 0.01, - f"{mean_kwh[i]:.3f} kWh", - ha="center", - va="bottom", - fontsize=9, - fontweight="bold", - ) - - ax.set_xlabel("Month", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Monthly Average Electricity Consumption\nChicago 2024", fontsize=18, fontweight="bold", pad=25) - - ax.set_xticks(range(len(months))) - ax.set_xticklabels(month_labels, fontsize=12) - ax.set_ylim(0, max(mean_kwh) * 1.15) - ax.grid(True, alpha=0.3, axis="y") - - plt.tight_layout(rect=[0, 0.03, 1, 1]) - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -def create_weekend_comparison(data_path: str, output_path: str): - """Weekday vs weekend mean kWh per 30-min.""" - print("\n📊 Creating weekend comparison...") - - lf = pl.scan_parquet(data_path) - comparison = ( - lf.group_by(["hour", "is_weekend"]) - .agg(pl.col("kwh").mean().alias("mean_kwh")) - .sort(["is_weekend", "hour"]) - .collect(engine="streaming") - ) - - weekday = comparison.filter(pl.col("is_weekend") == False) - weekend = comparison.filter(pl.col("is_weekend") == True) - - fig, ax = plt.subplots(figsize=(15, 9)) - - sns.lineplot( - x=weekday["hour"].to_list(), - y=weekday["mean_kwh"].to_list(), - linewidth=3.5, - color="#2980b9", - marker="o", - markersize=8, - ax=ax, - label="Weekday", - ) - sns.lineplot( - x=weekend["hour"].to_list(), - y=weekend["mean_kwh"].to_list(), - linewidth=3.5, - color="#e67e22", - marker="s", - markersize=8, - ax=ax, - label="Weekend", - ) - - ax.set_xlabel("Hour of Day", fontsize=15, fontweight="bold", labelpad=12) - ax.set_ylabel("Average Energy (kWh per 30-min)", fontsize=15, fontweight="bold", labelpad=12) - ax.set_title("Weekday vs Weekend Load Profiles\nChicago 2024", fontsize=18, fontweight="bold", pad=30) - ax.grid(True, alpha=0.4, linestyle="--", linewidth=0.8) - ax.set_xticks(range(0, 24, 2)) - ax.set_xlim(-0.5, 23.5) - - max_val = max(weekday["mean_kwh"].max(), weekend["mean_kwh"].max()) - ax.set_ylim(0, max_val * 1.25) - - ax.legend(loc="upper left", framealpha=0.95, edgecolor="#000", fancybox=True, shadow=True, fontsize=13) - - plt.tight_layout() - plt.savefig(output_path, dpi=300, bbox_inches="tight", facecolor="white") - print(f"✅ Saved: {output_path}") - plt.close() - - -if __name__ == "__main__": - print("=" * 80) - print("CHICAGO 2024 FINAL VISUALIZATIONS") - print("=" * 80) - - if not Path(DATA_FILE).exists(): - print(f"❌ File not found: {DATA_FILE}") - raise SystemExit(1) - - create_heatmap(DATA_FILE, OUTPUT_DIR / "chicago_2024_heatmap.png") - create_hourly_profile(DATA_FILE, OUTPUT_DIR 
/ "chicago_2024_hourly_profile.png") - create_monthly_profile(DATA_FILE, OUTPUT_DIR / "chicago_2024_monthly_profile.png") - create_weekend_comparison(DATA_FILE, OUTPUT_DIR / "chicago_2024_weekend_comparison.png") - - print("\n" + "=" * 80) - print("✅ ALL VISUALIZATIONS COMPLETE!") - print("=" * 80) - print(f"\nOutput directory: {OUTPUT_DIR}") - print("=" * 80) diff --git a/archive/test_files/fix_final.py b/archive/test_files/fix_final.py deleted file mode 100644 index b12ee10..0000000 --- a/archive/test_files/fix_final.py +++ /dev/null @@ -1,11 +0,0 @@ -with open("smart_meter_analysis/census.py") as f: - lines = f.readlines() - -# Find and fix the _import_cenpy line -for i, line in enumerate(lines): - if "def _import_cenpy()" in line: - lines[i] = "def _import_cenpy(): # type: ignore[no-untyped-def]\n" - break - -with open("smart_meter_analysis/census.py", "w") as f: - f.writelines(lines) diff --git a/docs/index.html b/docs/index.html index 84fecc5..b829477 100644 --- a/docs/index.html +++ b/docs/index.html @@ -155,29 +155,91 @@
-We begin by aggregating load profile clusters at the block group level. If \(j\) is the block group, and \(q\) is the cluster (\(q\in\{1,\ldots,k\}\)), then
+We begin by aggregating load profile clusters at the block group level. If \(j\) is the block group, and \(q\) is the cluster (\(q\in\{0,1,2,3\}\)), then
\[ C_{jq} = \sum_{i\in \mathrm{bg}(j)} \mathrm{I}(c_i = q) \]
is the count of household-day observations assigned to cluster \(q\) in block group \(j\), where \(c_i\) denotes the cluster assignment for household-day observation \(i\). Under this aggregation, a single household with multiple sampled days contributes multiple observations to the block group totals.
We further normalize these to \(\pi_{jq} = \frac{C_{jq}}{\sum_q C_{jq}}\), the proportion of cluster assignments in each block group.
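As a concrete illustration, a minimal polars sketch of this aggregation step is shown below. The column names (block_group, cluster) and the toy rows are hypothetical placeholders, not the project's actual schema.

import polars as pl

# Long table of household-day cluster assignments (toy data; hypothetical columns).
assignments = pl.DataFrame({
    "block_group": ["170310101001", "170310101001", "170310101002", "170310101002"],
    "cluster": [0, 2, 2, 3],
})

counts = (
    assignments.group_by(["block_group", "cluster"])
    .agg(pl.len().alias("C_jq"))  # C_jq: household-day observations per cluster
    .with_columns(
        # pi_jq: share of each block group's observations falling in cluster q
        (pl.col("C_jq") / pl.col("C_jq").sum().over("block_group")).alias("pi_jq")
    )
    .sort(["block_group", "cluster"])
)
print(counts)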
-Our aim is to understand how block group level demographics predict the load cluster mix. Because \((\pi_{j1},\ldots,\pi_{jk})\) are compositional proportions that must sum to 1, we model log-ratios of cluster proportions. To prevent numerical instability from zero proportions, we apply Laplace smoothing (\(\alpha = 0.5\) pseudocount) before computing log-ratios. Specifically, we define smoothed proportions
+Our aim is to understand how block group level demographics predict the load cluster mix. Because \((\pi_{j0},\pi_{j1},\pi_{j2},\pi_{j3})\) are compositional proportions that must sum to 1, we use multinomial logistic regression to model the distribution of cluster assignments. This approach properly accounts for the compositional constraint, handles zero counts naturally through the multinomial likelihood, and provides statistically correct standard errors and hypothesis tests.
+We model the counts \((C_{j0}, C_{j1}, C_{j2}, C_{j3})\) as following a multinomial distribution:
\[
-\tilde{\pi}_{jq} = \frac{C_{jq} + \alpha}{\sum_{q=1}^{k} C_{jq} + k\alpha},
-\qquad \alpha = 0.5,
+(C_{j0}, C_{j1}, C_{j2}, C_{j3}) \sim \mathrm{Multinomial}\left(\sum_q C_{jq}; \pi_{j0}, \pi_{j1}, \pi_{j2}, \pi_{j3}\right)
\]
-which ensures \(\tilde{\pi}_{jq} > 0\) for all \(q\) and prevents \(\log(0) = -\infty\).
-We fit one model per cluster outcome (except the baseline), modeling the log-ratio of each cluster’s share relative to baseline cluster 1. Specifically, for each \(q\in\{2,\ldots,k\}\), we fit:
+We fit a single unified multinomial logit model with Cluster 2 as the baseline (reference) category. This cluster contains the largest share of household-day observations and exhibits the most stable, representative load-shape pattern, making it a natural reference point for the log-odds contrasts. The model estimates coefficients for Clusters 0, 1, and 3 relative to this baseline, parameterizing the log-odds as:
\[
-\log\left(\frac{\tilde{\pi}_{jq}}{\tilde{\pi}_{j1}}\right)
-= \beta_{0q}
-+ \beta_{1q} X_{1j}
-+ \beta_{2q} X_{2j}
-+ \cdots
-+ \varepsilon_{jq}.
+\log\left(\frac{\pi_{jq}}{\pi_{j2}}\right) = \beta_{0q} + \beta_{1q} X_{1j} + \beta_{2q} X_{2j} + \cdots + \beta_{pq} X_{pj}
\]
-We estimate these as separate weighted least squares (WLS) regressions, weighting each block group \(j\) by \(\mathrm{total\_obs}_j\), the total number of household-day observations available for that block group. This weighting gives greater influence to block groups with more observed data. As a robustness check, we also estimate unweighted OLS versions of the same log-ratio regressions; large differences between WLS and OLS estimates would indicate results are disproportionately driven by high-observation block groups.
-Coefficients are interpreted on the log-ratio scale: \(\beta_{pq}\) is the expected change in \(\log(\tilde{\pi}_{jq}/\tilde{\pi}_{j1})\) for a one-unit increase in predictor \(X_{pj}\), holding other predictors constant. Equivalently, \(\exp(\beta_{pq})\) is the multiplicative effect on the proportion ratio \(\tilde{\pi}_{jq}/\tilde{\pi}_{j1}\) for a one-unit increase in \(X_{pj}\).
+for \(q \in \{0, 1, 3\}\) (i.e., all non-baseline clusters).
+This parameterization ensures that the predicted cluster shares are strictly positive and sum to 1 within each block group, and that every coefficient has a direct interpretation as a log-odds contrast against the baseline cluster.
+We estimate this model via maximum likelihood using the VGAM package in R, which implements iteratively reweighted least squares (IRLS) optimization. The multinomial likelihood naturally weights observations by their sample size—block groups with more household-day observations contribute more to the likelihood and thus have greater influence on parameter estimates.
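For readers working in Python rather than R, the grouped multinomial fit can be approximated as sketched below: each block group is expanded into one weighted row per cluster and passed to scikit-learn's multinomial logistic regression. This is only an illustrative analogue of the VGAM estimation described above; the simulated arrays are placeholders, and penalty=None (scikit-learn >= 1.2) is set so the fit is an unpenalized maximum-likelihood estimate.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n_bg, n_pred, k = 200, 5, 4
X = rng.normal(size=(n_bg, n_pred))            # block-group predictors (placeholder)
counts = rng.integers(1, 60, size=(n_bg, k))   # C_jq: household-day counts per cluster

# Expand to one row per (block group, cluster), weighted by that cluster's count,
# so the weighted log-likelihood matches the grouped multinomial log-likelihood
# up to an additive constant.
X_rep = np.repeat(X, k, axis=0)
y_rep = np.tile(np.arange(k), n_bg)
w_rep = counts.ravel().astype(float)

model = LogisticRegression(penalty=None, max_iter=2000)  # multinomial softmax via lbfgs
model.fit(X_rep, y_rep, sample_weight=w_rep)

# scikit-learn uses a symmetric (softmax) parameterization; subtracting the row for
# the baseline cluster (index 2) recovers the reference-coded log-odds beta_q - beta_2.
baseline = 2
coef_vs_baseline = model.coef_ - model.coef_[baseline]
intercept_vs_baseline = model.intercept_ - model.intercept_[baseline]
print(coef_vs_baseline.shape)  # (4, 5); the baseline row is identically zero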
+An alternative approach would be to compute empirical log-ratios \(\log(C_{jq}/C_{j2})\) and model them using separate weighted ordinary least squares (OLS) regressions. However, this approach has several statistical problems: the log-ratio is undefined whenever a cluster count is zero (forcing ad hoc pseudocount smoothing), the separate equations ignore the correlation among a block group's cluster shares, and the resulting standard errors do not reflect the multinomial sampling variability of the underlying counts.
+Multinomial logit avoids all these issues by using the correct likelihood for multinomial count data.
+Our analysis includes 47 demographic predictor variables across five categories (spatial, economic, housing, household, and demographic characteristics). However, several of these variables form compositional groups where predictors sum to a constant (e.g., income brackets sum to 100% of households). This creates linear dependencies (rank deficiency) in the design matrix.
+To handle this, we drop one reference category from each compositional group before estimation, so that the remaining predictors are linearly independent.
+The final model includes 43 terms per cluster equation (42 predictors + 1 intercept), yielding 129 total coefficients across the 3 non-baseline cluster equations.
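The effect of the compositional constraint can be verified with a small numerical check like the one below; the group sizes and column layout are made up purely to illustrate why one category per group must be dropped.

import numpy as np

rng = np.random.default_rng(1)
n = 50
shares = rng.dirichlet(np.ones(4), size=n)     # e.g., four income-bracket shares summing to 1
other = rng.normal(size=(n, 2))                # unrelated predictors
intercept = np.ones((n, 1))

X_full = np.hstack([intercept, shares, other])          # keeps all 4 brackets
X_drop = np.hstack([intercept, shares[:, 1:], other])   # drops one reference bracket

print(np.linalg.matrix_rank(X_full), X_full.shape[1])   # rank 6 < 7 columns (deficient)
print(np.linalg.matrix_rank(X_drop), X_drop.shape[1])   # rank 6 == 6 columns (full rank)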
+Coefficients are interpreted on the log-odds scale: \(\beta_{pq}\) is the expected change in \(\log(\pi_{jq}/\pi_{j2})\) for a one-unit increase in predictor \(X_{pj}\), holding other predictors constant.
+Equivalently, \(\exp(\beta_{pq})\) is the multiplicative effect on the ratio \(\pi_{jq}/\pi_{j2}\) for a one-unit increase in \(X_{pj}\). For example, \(\beta_{pq} = 0.5\) implies that a one-unit increase in \(X_{pj}\) multiplies \(\pi_{jq}/\pi_{j2}\) by \(\exp(0.5) \approx 1.65\), a roughly 65% increase in cluster \(q\)'s share relative to the baseline cluster.
+To obtain predicted probabilities for a block group with covariates \(X_j\), we use the softmax transformation:
+\[
+\hat{\pi}_{jq} = \begin{cases}
+\frac{\exp(\hat{\beta}_{0q} + \hat{\beta}_{1q} X_{1j} + \cdots)}
+{1 + \sum_{m \in \{0,1,3\}} \exp(\hat{\beta}_{0m} + \hat{\beta}_{1m} X_{1j} + \cdots)}
+& \text{if } q \in \{0,1,3\} \\[1em]
+\frac{1}
+{1 + \sum_{m \in \{0,1,3\}} \exp(\hat{\beta}_{0m} + \hat{\beta}_{1m} X_{1j} + \cdots)}
+& \text{if } q = 2 \text{ (baseline)}
+\end{cases}
+\]
+These predicted probabilities automatically sum to 1 and are guaranteed to be positive.
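A minimal NumPy sketch of this prediction step, with made-up coefficient values and the baseline row fixed at zero, is given below.

import numpy as np

def predict_shares(x, intercepts, coefs):
    """intercepts: (k,), coefs: (k, p); the baseline cluster's row is zero."""
    eta = intercepts + coefs @ x            # linear predictors; eta[baseline] == 0
    expeta = np.exp(eta - eta.max())        # subtract max for numerical stability
    return expeta / expeta.sum()            # positive probabilities that sum to 1

# Hypothetical estimates: cluster 2 (index 2) is the baseline, so its row is zero.
intercepts = np.array([0.3, -0.1, 0.0, 0.5])
coefs = np.array([[0.2, -0.4], [0.1, 0.0], [0.0, 0.0], [-0.3, 0.6]])
x = np.array([1.2, -0.5])                   # hypothetical block-group covariates

pi_hat = predict_shares(x, intercepts, coefs)
print(pi_hat, pi_hat.sum())                 # the shares sum to 1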
+Our final model achieves a McFadden pseudo-R² of 0.567, a large improvement in log-likelihood relative to an intercept-only model (McFadden’s measure is likelihood-based rather than a literal share of variance explained; values in the 0.2–0.4 range are already considered an excellent fit for discrete-choice models). The model converged using VGAM’s IRLS optimizer, and all coefficients have valid standard errors for hypothesis testing.
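For reference, McFadden's pseudo-R² is computed as \(1 - \ell_{\text{model}} / \ell_{\text{null}}\); the two log-likelihood values below are hypothetical numbers chosen only to show the arithmetic.

ll_model = -2150.0  # hypothetical log-likelihood of the fitted multinomial logit
ll_null = -4965.0   # hypothetical log-likelihood of the intercept-only model
pseudo_r2 = 1.0 - ll_model / ll_null
print(round(pseudo_r2, 3))  # 0.567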
+