Skip to content

Commit e2e64ed

Browse files
ryan-williams and claude committed
Implement monthly parquet sharding to reduce Lambda write amplification
**Problem**: Each Lambda invocation downloaded the entire growing parquet file (~4MB+), appended 1 row, and re-uploaded. This becomes increasingly wasteful as files grow.

**Solution**: Shard data by month into `awair-{id}/{YYYY-MM}.parquet` files. Lambda now only touches the current month's file (~640KB max). Historical months are immutable.

Backend changes:
- `awair data shard`: New CLI command to split existing files into monthly shards
- Lambda `updater.py`: Write to monthly files via `get_monthly_s3_config()`
- Lambda `app.py`: IAM permissions use wildcard (`{key_base}/*`)
- `config.py`: Add `list_monthly_files()`, `load_monthly_data()` helpers
- CLI `info`/`gaps`/`hist`: Auto-detect and aggregate across monthly files

Frontend changes:
- `hyparquetSource.ts`: Parallel multi-file fetching with 404 handling
- `awairService.ts`: Add `getMonthlyDataUrl()`, `getMonthsInRange()`
- Refresh only polls current month's file (historical months immutable)

Row group size: 5000 rows (~3.5 days, ~80KB) for good cache granularity.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent a3a232d commit e2e64ed

File tree

14 files changed

+584
-180
lines changed

14 files changed

+584
-180
lines changed

CLAUDE.md

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,23 @@ pnpm run test # Run tests
8080
```
8181
Awair API
8282
83-
Lambda (every 1 min) → S3 (device-specific parquet files)
83+
Lambda (every 1 min) → S3 (monthly parquet shards)
8484
↑ ↓
8585
Python CLI Web Dashboard (reads directly from S3)
8686
8787
Multi-Device Example:
88-
Gym (17617): EventBridge (1min) → Lambda → s3://380nwk/awair-17617.parquet
89-
BR (137496): EventBridge (1min) → Lambda → s3://380nwk/awair-137496.parquet
88+
Gym (17617): EventBridge (1min) → Lambda → s3://380nwk/awair-17617/{YYYY-MM}.parquet
89+
BR (137496): EventBridge (1min) → Lambda → s3://380nwk/awair-137496/{YYYY-MM}.parquet
9090
```
9191

92+
### Monthly Sharding
93+
94+
Data is stored in monthly shards to reduce Lambda write amplification:
95+
- Each Lambda invocation only downloads/uploads the current month's file (~14-44k rows)
96+
- Historical months are immutable (never modified after month ends)
97+
- Frontend fetches from multiple monthly files in parallel for longer time ranges
98+
- CLI commands auto-detect and aggregate across monthly files
99+
92100
### Key Components
93101

94102
#### 1. Lambda Data Updater (`src/awair/lmbda/`)
@@ -149,8 +157,12 @@ All data files follow a fixed structure under the S3 root:
149157
```
150158
{S3_ROOT}/
151159
├── devices.parquet # Device registry (cached from API)
152-
├── awair-17617.parquet # Device data files
153-
├── awair-137496.parquet
160+
├── awair-17617/ # Device data (monthly shards)
161+
│ ├── 2025-06.parquet
162+
│ ├── 2025-07.parquet
163+
│ └── ...
164+
├── awair-137496/
165+
│ └── ...
154166
└── ...
155167
```
156168

src/awair/cli/config.py

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,18 @@ def get_devices_path() -> str:
6363

6464

6565
def get_data_path(device_id: int) -> str:
    """Return the path of a device's legacy single-file parquet.

    Kept for backward compatibility with pre-sharding deployments; new
    data is written to monthly shard files instead.
    """
    root = get_s3_root()
    return f'{root}/awair-{device_id}.parquet'
6868

6969

70+
def get_data_base_path(device_id: int) -> str:
    """Return the directory-style base path holding a device's monthly shards.

    Individual shard files live at: {base_path}/{YYYY-MM}.parquet
    """
    root = get_s3_root()
    return f'{root}/awair-{device_id}'
76+
77+
7078
def parse_s3_path(s3_path: str) -> tuple[str, str]:
7179
"""Parse S3 path into bucket and key components."""
7280
if not s3_path.startswith('s3://'):
@@ -85,6 +93,65 @@ def parse_s3_path(s3_path: str) -> tuple[str, str]:
8593
return bucket, key
8694

8795

96+
def list_monthly_files(base_path: str) -> list[str]:
    """List all monthly parquet files in a device data directory.

    Handles both S3 prefixes and local directories. Only files whose name
    matches the {YYYY-MM}.parquet shard pattern are kept; any other objects
    under the prefix are ignored.

    Args:
        base_path: Base path for device data
            (e.g., s3://bucket/awair-17617 or ./awair-17617)

    Returns:
        Sorted list of paths to monthly parquet files
        (e.g., ['s3://.../2024-11.parquet', ...])
    """
    import re

    # Shard filenames look like "2025-06.parquet"
    month_re = re.compile(r'\d{4}-\d{2}\.parquet')

    # Strip trailing slash so path components below are well-formed
    base_path = base_path.rstrip('/')

    if base_path.startswith('s3://'):
        import boto3

        # Derive the directory prefix directly from the base path
        bucket, key_base = parse_s3_path(base_path)
        prefix = f'{key_base}/'

        # Paginate: list_objects_v2 returns at most 1000 keys per call,
        # so a single un-paginated call would silently drop shards once
        # enough objects accumulate under the prefix.
        s3 = boto3.client('s3')
        paginator = s3.get_paginator('list_objects_v2')

        files = []
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for obj in page.get('Contents', []):
                key = obj['Key']
                # Match on the final path component only
                name = key.rsplit('/', 1)[-1]
                if month_re.fullmatch(name):
                    files.append(f's3://{bucket}/{key}')
        return sorted(files)
    else:
        from pathlib import Path

        base = Path(base_path)
        if not base.is_dir():
            return []
        files = [str(f) for f in base.glob('*.parquet') if month_re.fullmatch(f.name)]
        return sorted(files)
132+
133+
134+
def load_monthly_data(base_path: str):
    """Read every monthly shard under *base_path* and merge them.

    Args:
        base_path: Base path for device data (e.g., s3://bucket/awair-17617)

    Returns:
        A single DataFrame ordered by timestamp; empty if no shards exist.
    """
    import pandas as pd

    paths = list_monthly_files(base_path)
    if not paths:
        return pd.DataFrame()

    frames = [pd.read_parquet(p) for p in paths]
    merged = pd.concat(frames, ignore_index=True)
    return merged.sort_values('timestamp').reset_index(drop=True)
153+
154+
88155
def get_default_data_path(device_id: int | None = None) -> str:
89156
"""Get data file path for a device.
90157

src/awair/cli/data.py

Lines changed: 168 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@
99
from ..storage import ParquetStorage
1010
from .base import awair
1111
from .common_opts import device_id_opt
12-
from .config import data_path_opt, err
12+
from .config import (
13+
data_path_opt,
14+
err,
15+
get_data_base_path,
16+
list_monthly_files,
17+
load_monthly_data,
18+
resolve_device_by_name_or_id,
19+
)
1320

1421

1522
@awair.group
@@ -18,21 +25,82 @@ def data():
1825
pass
1926

2027

28+
def load_device_data(device_id: str | None, data_path: str) -> tuple[pd.DataFrame, str, bool]:
    """Load device data, trying monthly files first then falling back to single file.

    Args:
        device_id: Device name, numeric ID string, int, or None
        data_path: Data path (may be single file or base directory)

    Returns:
        Tuple of (DataFrame, source_description, is_monthly)
    """
    import re

    # If device_id not provided, try to extract it from data_path.
    # Pattern: awair-{deviceId}.parquet or awair-{deviceId}/
    if device_id is None:
        match = re.search(r'awair-(\d+)(?:\.parquet|/|$)', data_path)
        if match:
            device_id = match.group(1)

    # Try monthly files first
    if device_id is not None:
        device_id_int: int | None
        if isinstance(device_id, str):
            try:
                _, device_id_int = resolve_device_by_name_or_id(device_id)
            except ValueError:
                # Unknown device name: accept a plain numeric ID; for
                # anything non-numeric, fall back to the single-file
                # path instead of letting int() raise.
                try:
                    device_id_int = int(device_id)
                except ValueError:
                    device_id_int = None
        else:
            device_id_int = device_id

        if device_id_int is not None:
            base_path = get_data_base_path(device_id_int)
            monthly_files = list_monthly_files(base_path)

            if monthly_files:
                df = load_monthly_data(base_path)
                source = f'{base_path}/ ({len(monthly_files)} monthly files)'
                return df, source, True

    # Fall back to single file
    storage = ParquetStorage(data_path)
    df = storage.read_data()
    return df, data_path, False
69+
70+
2171
@data.command
@device_id_opt
@data_path_opt
def info(device_id: str | None, data_path: str):
    """Show data file information.

    Automatically detects and reads from monthly sharded files if available,
    falling back to single-file format.
    """
    df, source, is_monthly = load_device_data(device_id, data_path)

    echo(f'Data source: {source}')

    if df.empty:
        echo('No data found')
        return

    echo(f'Total records: {len(df):,}')

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    earliest = df['timestamp'].min()
    latest = df['timestamp'].max()
    echo(f'Date range: {earliest} to {latest}')

    if is_monthly:
        # Per-month breakdown computed from the already-loaded data —
        # avoids re-downloading every shard a second time and avoids
        # parsing the base path back out of the display string.
        echo('\nMonthly files:')
        monthly_counts = df['timestamp'].dt.strftime('%Y-%m').value_counts().sort_index()
        for month_name, n in monthly_counts.items():
            echo(f'  {month_name}: {n:,} records')
36104

37105

38106
@data.command
@@ -49,14 +117,14 @@ def gaps(
49117
count: int,
50118
min_gap: int | None,
51119
):
52-
"""Find and report the largest timing gaps in the data."""
120+
"""Find and report the largest timing gaps in the data.
53121
54-
# Read data
55-
storage = ParquetStorage(data_path)
56-
df = storage.read_data()
122+
Automatically detects and reads from monthly sharded files if available.
123+
"""
124+
df, source, _ = load_device_data(device_id, data_path)
57125

58126
if df.empty:
59-
err('No data in file')
127+
err('No data found')
60128
return
61129

62130
# Filter by date range if specified (parsing already handled by option callbacks)
@@ -100,7 +168,7 @@ def gaps(
100168
# Show summary
101169
date_range = f'{df["timestamp"].min().date()} to {df["timestamp"].max().date()}'
102170

103-
echo(f'Gap analysis for {data_path}')
171+
echo(f'Gap analysis for {source}')
104172
echo(f'Date range: {date_range}')
105173
echo(f'Total records: {len(df)}')
106174

@@ -132,13 +200,14 @@ def hist(
132200
from_dt: str | None,
133201
to_dt: str | None,
134202
):
135-
"""Generate histogram of record counts per day."""
203+
"""Generate histogram of record counts per day.
136204
137-
storage = ParquetStorage(data_path)
138-
df = storage.read_data()
205+
Automatically detects and reads from monthly sharded files if available.
206+
"""
207+
df, _, _ = load_device_data(device_id, data_path)
139208

140209
if df.empty:
141-
err('No data in file')
210+
err('No data found')
142211
return
143212

144213
# Ensure timestamp is datetime
@@ -166,3 +235,81 @@ def hist(
166235

167236
for _, row in daily_counts.iterrows():
168237
echo(f'{row["count"]:7d} {row["date"]}')
238+
239+
240+
# Default row group size for monthly shards.
# 5000 rows = ~3.5 days at 1-minute intervals = ~80KB per RG.
# Monthly files have ~40-44k rows = ~8-9 RGs, good granularity for caching.
DEFAULT_MONTHLY_ROW_GROUP_SIZE = 5000


@data.command
@device_id_opt
@data_path_opt
@option('-n', '--dry-run', is_flag=True, help='Show what would be done without writing files')
@option('-r', '--row-group-size', type=int, default=DEFAULT_MONTHLY_ROW_GROUP_SIZE,
        help=f'Row group size for output files (default: {DEFAULT_MONTHLY_ROW_GROUP_SIZE})')
def shard(device_id: str | None, data_path: str, dry_run: bool, row_group_size: int):
    """Split single parquet file into monthly shards.

    Reads the existing awair-{deviceId}.parquet file and splits it into
    monthly files: awair-{deviceId}/{YYYY-MM}.parquet

    This reduces Lambda write amplification by allowing updates to only
    touch the current month's file.

    Default row group size is 5000 rows (~3.5 days, ~80KB) for good cache
    granularity. Use --row-group-size to customize.
    """
    # Read existing data
    echo(f'Reading: {data_path}')
    storage = ParquetStorage(data_path)
    df = storage.read_data()

    if df.empty:
        err('No data in file')
        return

    echo(f'Using row_group_size: {row_group_size}')

    # Group rows by calendar month. Grouping on a derived Series (rather
    # than adding a helper column to df) keeps each per-month frame at the
    # original schema, so nothing needs dropping before the write.
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    month_key = df['timestamp'].dt.strftime('%Y-%m')
    groups = df.groupby(month_key)
    echo(f'Found {len(groups)} months of data:')

    # Determine output base path (directory),
    # e.g. s3://380nwk/awair-17617.parquet -> s3://380nwk/awair-17617/
    output_base = data_path.removesuffix('.parquet')

    # groupby sorts its (string) keys, so months iterate chronologically
    for year_month, group_df in groups:
        count = len(group_df)
        output_path = f'{output_base}/{year_month}.parquet'

        date_range = f'{group_df["timestamp"].min().date()} to {group_df["timestamp"].max().date()}'
        echo(f'  {year_month}: {count:,} records ({date_range})')

        if dry_run:
            echo(f'    Would write: {output_path}')
        else:
            write_df = group_df.sort_values('timestamp').reset_index(drop=True)
            write_df.to_parquet(output_path, index=False, engine='pyarrow', row_group_size=row_group_size)
            echo(f'    Wrote: {output_path}')

    total_records = len(df)
    if dry_run:
        echo(f'\nDry run complete. Would shard {total_records:,} records into {len(groups)} monthly files.')
        echo('Run without --dry-run to execute.')
    else:
        echo(f'\nSharded {total_records:,} records into {len(groups)} monthly files.')
        echo(f'Original file preserved: {data_path}')
        echo('After verifying shards, you can delete the original file.')

src/awair/lmbda/app.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ def __init__(
5050
if not s3_bucket or not s3_key:
5151
raise ValueError(f'Invalid S3 path: {data_path}. Expected format: s3://bucket/key')
5252

53+
# Normalize key for IAM (remove .parquet suffix if present)
54+
# Monthly sharding uses directory structure: awair-{deviceId}/{YYYY-MM}.parquet
55+
s3_key_base = s3_key[:-8] if s3_key.endswith('.parquet') else s3_key
56+
5357
# IAM role for Lambda
5458
lambda_role = iam.Role(
5559
self, "LambdaExecutionRole",
@@ -67,12 +71,18 @@ def __init__(
6771
"s3:PutObject",
6872
"s3:DeleteObject"
6973
],
70-
resources=[f"arn:aws:s3:::{s3_bucket}/{s3_key}"]
74+
# Wildcard for all monthly files: awair-{id}/*.parquet
75+
resources=[f"arn:aws:s3:::{s3_bucket}/{s3_key_base}/*"]
7176
),
7277
iam.PolicyStatement(
7378
effect=iam.Effect.ALLOW,
7479
actions=["s3:ListBucket"],
75-
resources=[f"arn:aws:s3:::{s3_bucket}"]
80+
resources=[f"arn:aws:s3:::{s3_bucket}"],
81+
conditions={
82+
"StringLike": {
83+
"s3:prefix": [f"{s3_key_base}/*"]
84+
}
85+
}
7686
)
7787
]
7888
)

0 commit comments

Comments
 (0)