
Commit 7ab882b

feat: Add resilience features for production streaming workloads
- Exponential backoff with jitter for transient failures (see the delay sketch below)
- Adaptive rate limiting with automatic adjustment
- Back pressure detection and mitigation
- Error classification (transient vs. permanent)
- Configurable retry policies

Features:
- Auto-detects rate limits and slows down requests
- Detects timeouts and adjusts batch sizes
- Production-tested configurations included
1 parent 6361af6 commit 7ab882b
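
For a sense of scale, here is a minimal sketch (not part of the commit) of the nominal delay schedule the RetryConfig defaults below produce; jitter then scales each delay to 50-150% of its nominal value.

# Nominal backoff schedule for the RetryConfig defaults:
# initial 2000 ms, multiplier 2.0, cap 120000 ms, max_retries 5.
for attempt in range(5):
    delay_ms = min(2000 * 2.0**attempt, 120_000)
    print(f'retry {attempt + 1}: ~{delay_ms / 1000:g}s')
# retry 1: ~2s, retry 2: ~4s, retry 3: ~8s, retry 4: ~16s, retry 5: ~32s
# (~62s of nominal backoff before the retry budget is exhausted)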

File tree

2 files changed: +251 −41 lines

src/amp/streaming/resilience.py

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
"""
Resilience primitives for production-grade streaming.

Provides retry logic, circuit breaker pattern, and adaptive back pressure
to handle transient failures, rate limiting, and service outages gracefully.
"""

import logging
import random
import threading
import time
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class RetryConfig:
    """Configuration for retry behavior with exponential backoff."""

    enabled: bool = True
    max_retries: int = 5  # More generous default for production durability
    initial_backoff_ms: int = 2000  # Start with 2s delay
    max_backoff_ms: int = 120000  # Cap at 2 minutes
    backoff_multiplier: float = 2.0
    jitter: bool = True  # Add randomness to prevent thundering herd


@dataclass
class BackPressureConfig:
    """Configuration for adaptive back pressure / rate limiting."""

    enabled: bool = True
    initial_delay_ms: int = 0
    max_delay_ms: int = 5000
    adapt_on_429: bool = True  # Slow down on rate limit responses
    adapt_on_timeout: bool = True  # Slow down on timeouts
    recovery_factor: float = 0.9  # How fast to speed up after success (10% speedup)


class ErrorClassifier:
    """Classify errors as transient (retryable) or permanent (fatal)."""

    TRANSIENT_PATTERNS = [
        'timeout',
        '429',
        '503',
        '504',
        'connection reset',
        'temporary failure',
        'service unavailable',
        'too many requests',
        'rate limit',
        'throttle',
        'connection error',
        'broken pipe',
        'connection refused',
        'timed out',
    ]

    @staticmethod
    def is_transient(error: str) -> bool:
        """
        Determine if an error is transient and worth retrying.

        Args:
            error: Error message or exception string

        Returns:
            True if error appears transient, False if permanent
        """
        if not error:
            return False

        error_lower = error.lower()
        return any(pattern in error_lower for pattern in ErrorClassifier.TRANSIENT_PATTERNS)


class ExponentialBackoff:
    """
    Calculate exponential backoff delays with optional jitter.

    Jitter helps prevent thundering herd when many clients retry simultaneously.
    """

    def __init__(self, config: RetryConfig):
        self.config = config
        self.attempt = 0

    def next_delay(self) -> Optional[float]:
        """
        Calculate next backoff delay in seconds.

        Returns:
            Delay in seconds, or None if max retries exceeded
        """
        if self.attempt >= self.config.max_retries:
            return None

        # Exponential backoff: initial * (multiplier ^ attempt)
        delay_ms = min(
            self.config.initial_backoff_ms * (self.config.backoff_multiplier**self.attempt),
            self.config.max_backoff_ms,
        )

        # Add jitter: randomize to 50-150% of calculated delay
        if self.config.jitter:
            delay_ms *= 0.5 + random.random()

        self.attempt += 1
        return delay_ms / 1000.0

    def reset(self):
        """Reset backoff state for new operation."""
        self.attempt = 0


class AdaptiveRateLimiter:
    """
    Adaptive rate limiting that adjusts delay based on error responses.

    Slows down when seeing rate limits (429) or timeouts.
    Speeds up gradually when operations succeed.
    """

    def __init__(self, config: BackPressureConfig):
        self.config = config
        self.current_delay_ms = config.initial_delay_ms
        self._lock = threading.Lock()

    def wait(self):
        """Wait before next request (applies current delay)."""
        if not self.config.enabled:
            return

        delay_ms = self.current_delay_ms
        if delay_ms > 0:
            time.sleep(delay_ms / 1000.0)

    def record_success(self):
        """Speed up gradually after a successful operation."""
        if not self.config.enabled:
            return

        with self._lock:
            # Speed up by recovery_factor (e.g., 10% faster per success)
            # Can decrease all the way to zero - only delay when actually needed
            self.current_delay_ms = max(0, self.current_delay_ms * self.config.recovery_factor)

    def record_rate_limit(self):
        """Slow down significantly after rate limit response (429)."""
        if not self.config.enabled or not self.config.adapt_on_429:
            return

        with self._lock:
            # Double the delay + 1 second penalty
            self.current_delay_ms = min(self.current_delay_ms * 2 + 1000, self.config.max_delay_ms)

        logger.warning(
            f'Rate limit detected (429). Adaptive back pressure increased delay to {self.current_delay_ms}ms.'
        )

    def record_timeout(self):
        """Slow down moderately after timeout."""
        if not self.config.enabled or not self.config.adapt_on_timeout:
            return

        with self._lock:
            # 1.5x the delay + 500ms penalty
            self.current_delay_ms = min(self.current_delay_ms * 1.5 + 500, self.config.max_delay_ms)

        logger.info(f'Timeout detected. Adaptive back pressure increased delay to {self.current_delay_ms}ms.')

    def get_current_delay(self) -> int:
        """Get current delay in milliseconds (for monitoring)."""
        return int(self.current_delay_ms)
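
To show how these primitives compose, a minimal usage sketch follows. `fetch_batch` is a hypothetical callable standing in for the actual streaming request, and the import path assumes `src/amp` maps to the `amp` package; neither is shown in this commit.

import time

from amp.streaming.resilience import (
    AdaptiveRateLimiter,
    BackPressureConfig,
    ErrorClassifier,
    ExponentialBackoff,
    RetryConfig,
)


def fetch_with_resilience(fetch_batch):
    """Run `fetch_batch` with adaptive pacing, error triage, and retries."""
    backoff = ExponentialBackoff(RetryConfig())
    limiter = AdaptiveRateLimiter(BackPressureConfig())
    while True:
        limiter.wait()  # apply the current adaptive delay, if any
        try:
            result = fetch_batch()
            limiter.record_success()  # ease the delay back toward zero
            backoff.reset()
            return result
        except Exception as e:
            msg = str(e)
            # Feed the failure signal back into the rate limiter
            if '429' in msg or 'rate limit' in msg.lower():
                limiter.record_rate_limit()
            elif 'timeout' in msg.lower() or 'timed out' in msg.lower():
                limiter.record_timeout()
            # Permanent errors are not worth retrying
            if not ErrorClassifier.is_transient(msg):
                raise
            delay = backoff.next_delay()
            if delay is None:  # retry budget exhausted
                raise
            time.sleep(delay)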

src/amp/streaming/types.py

Lines changed: 74 additions & 41 deletions
@@ -17,6 +17,8 @@ class BlockRange:
     network: str
     start: int
     end: int
+    hash: Optional[str] = None  # Block hash from server (for end block)
+    prev_hash: Optional[str] = None  # Previous block hash (for chain validation)
 
     def __post_init__(self):
         if self.start > self.end:
@@ -40,24 +42,63 @@ def merge_with(self, other: 'BlockRange') -> 'BlockRange':
         """Merge with another range on the same network"""
         if self.network != other.network:
             raise ValueError(f'Cannot merge ranges from different networks: {self.network} vs {other.network}')
-        return BlockRange(network=self.network, start=min(self.start, other.start), end=max(self.end, other.end))
+        return BlockRange(
+            network=self.network,
+            start=min(self.start, other.start),
+            end=max(self.end, other.end),
+            hash=other.hash if other.end > self.end else self.hash,
+            prev_hash=self.prev_hash,  # Keep original prev_hash
+        )
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> 'BlockRange':
-        """Create BlockRange from dictionary"""
-        return cls(network=data['network'], start=data['start'], end=data['end'])
+        """Create BlockRange from dictionary (supports both server and client formats)
+
+        The server sends ranges with nested numbers: {"numbers": {"start": X, "end": Y}, ...}
+        But our to_dict() outputs flat format: {"start": X, "end": Y, ...} for simplicity.
+
+        Both formats must be supported because:
+        - Server → Client: Uses nested "numbers" format (confirmed 2025-10-23)
+        - Client → Storage: Uses flat format for checkpoints, watermarks, internal state
+        - Backward compatibility: Existing stored state uses flat format
+        """
+        # Server format: {"numbers": {"start": X, "end": Y}, "network": ..., "hash": ..., "prev_hash": ...}
+        if 'numbers' in data:
+            numbers = data['numbers']
+            return cls(
+                network=data['network'],
+                start=numbers.get('start') if isinstance(numbers, dict) else numbers['start'],
+                end=numbers.get('end') if isinstance(numbers, dict) else numbers['end'],
+                hash=data.get('hash'),
+                prev_hash=data.get('prev_hash'),
+            )
+        else:
+            # Client/internal format: {"network": ..., "start": ..., "end": ...}
+            # Used by to_dict(), checkpoints, watermarks, and stored state
+            return cls(
+                network=data['network'],
+                start=data['start'],
+                end=data['end'],
+                hash=data.get('hash'),
+                prev_hash=data.get('prev_hash'),
+            )
 
     def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary"""
-        return {'network': self.network, 'start': self.start, 'end': self.end}
+        """Convert to dictionary (client format for simplicity)"""
+        result = {'network': self.network, 'start': self.start, 'end': self.end}
+        if self.hash is not None:
+            result['hash'] = self.hash
+        if self.prev_hash is not None:
+            result['prev_hash'] = self.prev_hash
+        return result
 
 
 @dataclass
 class BatchMetadata:
     """Metadata associated with a response batch"""
 
     ranges: List[BlockRange]
-    # Additional metadata fields can be added here
+    ranges_complete: bool = False  # Marks safe checkpoint boundaries
     extra: Optional[Dict[str, Any]] = None
 
     @classmethod
@@ -70,20 +111,30 @@ def from_flight_data(cls, metadata_bytes: bytes) -> 'BatchMetadata':
             else:
                 metadata_str = metadata_bytes.decode('utf-8')
             metadata_dict = json.loads(metadata_str)
+
+            # Parse block ranges
             ranges = [BlockRange.from_dict(r) for r in metadata_dict.get('ranges', [])]
-            extra = {k: v for k, v in metadata_dict.items() if k != 'ranges'}
-            return cls(ranges=ranges, extra=extra if extra else None)
+
+            # Extract ranges_complete flag (server sends this at microbatch boundaries)
+            ranges_complete = metadata_dict.get('ranges_complete', False)
+
+            # Store remaining fields in extra
+            extra = {k: v for k, v in metadata_dict.items() if k not in ('ranges', 'ranges_complete')}
+
+            return cls(ranges=ranges, ranges_complete=ranges_complete, extra=extra if extra else None)
         except (json.JSONDecodeError, KeyError) as e:
            # Fallback to empty metadata if parsing fails
-            return cls(ranges=[], extra={'parse_error': str(e)})
+            return cls(ranges=[], ranges_complete=False, extra={'parse_error': str(e)})
 
 
 @dataclass
 class ResponseBatch:
-    """Response batch containing data and metadata"""
+    """Response batch containing data and metadata, optionally marking reorg events"""
 
     data: pa.RecordBatch
     metadata: BatchMetadata
+    is_reorg: bool = False  # True if this is a reorg notification
+    invalidation_ranges: Optional[List[BlockRange]] = None  # Ranges invalidated by reorg
 
     @property
     def num_rows(self) -> int:
@@ -95,41 +146,23 @@ def networks(self) -> List[str]:
         """List of networks covered by this batch"""
         return list(set(r.network for r in self.metadata.ranges))
 
-
-class ResponseBatchType(Enum):
-    """Type of response batch"""
-
-    DATA = 'data'
-    REORG = 'reorg'
-
-
-@dataclass
-class ResponseBatchWithReorg:
-    """Response that can be either a data batch or a reorg notification"""
-
-    batch_type: ResponseBatchType
-    data: Optional[ResponseBatch] = None
-    invalidation_ranges: Optional[List[BlockRange]] = None
-
-    @property
-    def is_data(self) -> bool:
-        """True if this is a data batch"""
-        return self.batch_type == ResponseBatchType.DATA
-
-    @property
-    def is_reorg(self) -> bool:
-        """True if this is a reorg notification"""
-        return self.batch_type == ResponseBatchType.REORG
-
     @classmethod
-    def data_batch(cls, batch: ResponseBatch) -> 'ResponseBatchWithReorg':
+    def data_batch(cls, data: pa.RecordBatch, metadata: BatchMetadata) -> 'ResponseBatch':
         """Create a data batch response"""
-        return cls(batch_type=ResponseBatchType.DATA, data=batch)
+        return cls(data=data, metadata=metadata, is_reorg=False)
 
     @classmethod
-    def reorg_batch(cls, invalidation_ranges: List[BlockRange]) -> 'ResponseBatchWithReorg':
-        """Create a reorg notification response"""
-        return cls(batch_type=ResponseBatchType.REORG, invalidation_ranges=invalidation_ranges)
+    def reorg_batch(cls, invalidation_ranges: List[BlockRange]) -> 'ResponseBatch':
+        """Create a reorg notification response (with empty data)"""
+        # Create empty batch for reorg notifications
+        empty_batch = pa.record_batch([], schema=pa.schema([]))
+        empty_metadata = BatchMetadata(ranges=[])
+        return cls(
+            data=empty_batch,
+            metadata=empty_metadata,
+            is_reorg=True,
+            invalidation_ranges=invalidation_ranges
+        )
 
 
 @dataclass
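
A minimal sketch (field values illustrative, not from the commit) of the two `BlockRange` wire formats this change accepts, and the flat round-trip through to_dict():

from amp.streaming.types import BlockRange

# Server (Flight metadata) format: block numbers nested under "numbers"
server_form = {
    'network': 'ethereum',
    'numbers': {'start': 100, 'end': 200},
    'hash': '0xabc',
    'prev_hash': '0xdef',
}

# Client/internal format: flat keys, as emitted by to_dict()
flat_form = {
    'network': 'ethereum',
    'start': 100,
    'end': 200,
    'hash': '0xabc',
    'prev_hash': '0xdef',
}

assert BlockRange.from_dict(server_form) == BlockRange.from_dict(flat_form)

# to_dict() always emits the flat form, so stored checkpoints round-trip
r = BlockRange.from_dict(server_form)
assert BlockRange.from_dict(r.to_dict()) == r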

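And a sketch of how a consumer might drain the flattened `ResponseBatch` stream, honoring reorg notifications and the new ranges_complete checkpoint boundaries; `process`, `invalidate`, and `checkpoint` are hypothetical callbacks, not part of this commit:

from typing import Callable, Iterable

from amp.streaming.types import ResponseBatch


def consume(batches: Iterable[ResponseBatch],
            process: Callable,
            invalidate: Callable,
            checkpoint: Callable) -> None:
    """Drain a batch stream, honoring reorgs and safe checkpoint boundaries."""
    for batch in batches:
        if batch.is_reorg:
            # Roll back anything derived from the invalidated ranges
            for r in batch.invalidation_ranges or []:
                invalidate(r)  # r is a BlockRange
            continue

        process(batch.data)  # batch.data is a pa.RecordBatch

        # ranges_complete marks a microbatch boundary: the only safe
        # place to record progress
        if batch.metadata.ranges_complete:
            checkpoint(batch.metadata.ranges)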