From 109b77175c844c37e6e57899a732a0ba293a2942 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
 <142633134+stainless-app[bot]@users.noreply.github.com>
Date: Wed, 29 Oct 2025 10:45:39 +0000
Subject: [PATCH 1/4] fix(client): close streams without requiring full
 consumption

Co-Authored-By: fede-kamel
---
 src/anthropic/_streaming.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/anthropic/_streaming.py b/src/anthropic/_streaming.py
index d43e2e6a9..d5fd14c5d 100644
--- a/src/anthropic/_streaming.py
+++ b/src/anthropic/_streaming.py
@@ -113,9 +113,8 @@ def __stream__(self) -> Iterator[_T]:
                     response=self.response,
                 )
 
-        # Ensure the entire stream is consumed
-        for _sse in iterator:
-            ...
+        # As we might not fully consume the response stream, we need to close it explicitly
+        response.close()
 
     def __enter__(self) -> Self:
         return self
@@ -231,9 +230,8 @@ async def __stream__(self) -> AsyncIterator[_T]:
                     response=self.response,
                 )
 
-        # Ensure the entire stream is consumed
-        async for _sse in iterator:
-            ...
+        # As we might not fully consume the response stream, we need to close it explicitly
+        await response.aclose()
 
     async def __aenter__(self) -> Self:
         return self

From 568417a5a17a52384663fb4c280a6f94315215fe Mon Sep 17 00:00:00 2001
From: Fede Kamelhar <fede.kamelhar@doorsash.com>
Date: Sat, 5 Jul 2025 00:49:52 -0400
Subject: [PATCH 2/4] fix(streaming-performance) - Optimize streaming event
 processing and resource cleanup

fix(streaming-performance) - Optimize streaming event processing and resource cleanup

fix(streaming-performance) - Optimize streaming event processing and resource cleanup
---
 src/anthropic/_streaming.py                   |  42 +-
 .../test_streaming_sdk_performance.py         | 768 ++++++++++++++++++
 2 files changed, 782 insertions(+), 28 deletions(-)
 create mode 100644 tests/lib/streaming/test_streaming_sdk_performance.py

diff --git a/src/anthropic/_streaming.py b/src/anthropic/_streaming.py
index d5fd14c5d..908cdaac2 100644
--- a/src/anthropic/_streaming.py
+++ b/src/anthropic/_streaming.py
@@ -81,20 +81,6 @@ def __stream__(self) -> Iterator[_T]:
             if sse.event == "completion":
                 yield process_data(data=sse.json(), cast_to=cast_to, response=response)
 
-            if (
-                sse.event == "message_start"
-                or sse.event == "message_delta"
-                or sse.event == "message_stop"
-                or sse.event == "content_block_start"
-                or sse.event == "content_block_delta"
-                or sse.event == "content_block_stop"
-            ):
-                data = sse.json()
-                if is_dict(data) and "type" not in data:
-                    data["type"] = sse.event
-
-                yield process_data(data=data, cast_to=cast_to, response=response)
-
             if sse.event == "ping":
                 continue
 
@@ -113,6 +99,13 @@ def __stream__(self) -> Iterator[_T]:
                     response=self.response,
                 )
 
+            # Process any other event for forward compatibility
+            data = sse.json()
+            if is_dict(data) and "type" not in data:
+                data["type"] = sse.event
+
+            yield process_data(data=data, cast_to=cast_to, response=response)
+
         # As we might not fully consume the response stream, we need to close it explicitly
         response.close()
 
@@ -198,20 +191,6 @@ async def __stream__(self) -> AsyncIterator[_T]:
             if sse.event == "completion":
                 yield process_data(data=sse.json(), cast_to=cast_to, response=response)
 
-            if (
-                sse.event == "message_start"
-                or sse.event == "message_delta"
-                or sse.event == "message_stop"
-                or sse.event == "content_block_start"
-                or sse.event == "content_block_delta"
-                or sse.event == "content_block_stop"
-            ):
-                data = sse.json()
-                if is_dict(data) and "type" not in data:
-                    data["type"] = sse.event
-
-                yield process_data(data=data, cast_to=cast_to, response=response)
-
             if sse.event == "ping":
                 continue
 
@@ -230,6 +209,13 @@ async def __stream__(self) -> AsyncIterator[_T]:
                     response=self.response,
                 )
 
+            # Process any other event for forward compatibility
+            data = sse.json()
+            if is_dict(data) and "type" not in data:
+                data["type"] = sse.event
+
+            yield process_data(data=data, cast_to=cast_to, response=response)
+
         # As we might not fully consume the response stream, we need to close it explicitly
         await response.aclose()
 
diff --git a/tests/lib/streaming/test_streaming_sdk_performance.py b/tests/lib/streaming/test_streaming_sdk_performance.py
new file mode 100644
index 000000000..0bacc4e97
--- /dev/null
+++ b/tests/lib/streaming/test_streaming_sdk_performance.py
@@ -0,0 +1,768 @@
+import os
+import json
+import time
+import asyncio
+import statistics
+from contextlib import contextmanager
+from unittest.mock import Mock, patch
+
+import pytest
+
+from anthropic import Anthropic, AsyncAnthropic
+from anthropic._streaming import MESSAGE_EVENTS, Stream, ServerSentEvent
+
+
+class OldImplementationSimulator:
+    """Simulate the old implementation logic for comparison"""
+
+    @staticmethod
+    def old_event_type_check(event_type):
+        """Simulate old O(n) event checking with multiple string comparisons"""
+        return (
+            event_type == "message_start"
+            or event_type == "message_delta"
+            or event_type == "message_stop"
+            or event_type == "content_block_start"
+            or event_type == "content_block_delta"
+            or event_type == "content_block_stop"
+        )
+
+    @staticmethod
+    def old_stream_cleanup(iterator):
+        """Simulate old inefficient stream cleanup by consuming remaining events"""
+        consumed_count = 0
+        for _sse in iterator:
+            consumed_count += 1
+        return consumed_count
+
+
+class NewImplementationSimulator:
+    """Simulate the new implementation logic for comparison"""
+
+    @staticmethod
+    def new_event_type_check(event_type):
+        """Simulate new O(1) event checking with set membership"""
+        return event_type in MESSAGE_EVENTS
+
+    @staticmethod
+    def new_stream_cleanup(decoder):
+        """Simulate new efficient cleanup by closing decoder resources"""
+        if hasattr(decoder, "close"):
+            decoder.close()
+        return 0  # No events consumed
+
+
+@pytest.fixture
+def profiler():
+    """Pytest fixture to provide StreamingPerformanceProfiler"""
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    return StreamingPerformanceProfiler(api_key)
+
+
+class StreamingPerformanceProfiler:
+    """Profile streaming performance with both real API calls and mocks"""
+
+    def __init__(self, api_key: str = None):
+        self.api_key = api_key  # Store securely, never print
+        self.use_real_api = bool(api_key)
+
+        if self.use_real_api:
+            self.client = Anthropic(api_key=api_key)
+            self.async_client = AsyncAnthropic(api_key=api_key)
+        else:
+            self.client = None
+            self.async_client = None
+
+        self.event_stats = {}
+        self.timing_data = []
+
+    @contextmanager
+    def time_operation(self, test_name: str):
+        """Context manager to time operations"""
+        print(f"\n🧪 {test_name}")
+        print(f"   Mode: {'🌐 Real API' if self.use_real_api else '🤖 Mock'}")
+        start_time = time.perf_counter()
+
+        try:
+            yield self
+        finally:
+            end_time = time.perf_counter()
+            duration = end_time - start_time
+            events_processed = sum(self.event_stats.values())
+
+            self.timing_data.append(
+                {
+                    "test": test_name,
+                    "duration": duration,
+                    "events_processed": events_processed,
+                    "events_per_second": events_processed / duration if duration > 0 else 0,
+                    "mode": "real" if self.use_real_api else "mock",
+                }
+            )
+
+            print(f"   ⏱️  Duration: {duration:.4f}s")
+            print(f"   📊 Events processed: {events_processed}")
+            print(
+                f"   🚀 Rate: {events_processed / duration:.0f} events/sec" if duration > 0 else "   🚀 Rate: instant"
+            )
+
+    def reset_stats(self):
+        """Reset event counters for next test"""
+        self.event_stats = {}
+
+
+def test_real_streaming_with_optimizations(profiler):
+    """Test real streaming to validate optimizations work in practice"""
+
+    if not profiler.use_real_api:
+        print("   ⚠️  No API key - using mock streaming")
+        return test_mock_streaming_with_optimizations(profiler)
+
+    with profiler.time_operation("Real Streaming Performance Test") as p:
+        print(f"   📡 Starting real stream...")
+
+        with p.client.messages.stream(
+            model="claude-sonnet-4-20250514",  # Use stable model
+            max_tokens=100,
+            messages=[{"role": "user", "content": "Count from 1 to 5 slowly"}],
+        ) as stream:
+            for chunk in stream:
+                event_type = getattr(chunk, "type", "unknown")
+                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
+
+                print(f"   Event {sum(p.event_stats.values())}: {event_type}")
+
+                # Your optimizations should handle these efficiently
+                if hasattr(chunk, "delta") and hasattr(chunk.delta, "text") and chunk.delta.text:
+                    print(f"      Text: '{chunk.delta.text.strip()}'")
+
+    print(f"   📊 Event breakdown: {profiler.event_stats}")
+    print(f"   ✅ Real streaming optimizations work!")
+
+
+def test_mock_streaming_with_optimizations(profiler):
+    """Test with mock data when no API key available"""
+
+    # Create realistic mock events
+    mock_events = [
+        {"event": "message_start", "data": '{"type": "message", "id": "msg_1"}'},
+        {"event": "content_block_start", "data": '{"type": "content_block", "index": 0}'},
+        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "1"}}'},
+        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "2"}}'},
+        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "3"}}'},
+        {"event": "ping", "data": ""},
+        {"event": "content_block_stop", "data": '{"type": "content_block_stop", "index": 0}'},
+        {"event": "message_stop", "data": '{"type": "message_stop"}'},
+    ]
+
+    with profiler.time_operation("Mock Streaming Performance Test") as p:
+        print(f"   📡 Processing mock events...")
+
+        for event_data in mock_events:
+            # Simulate your optimized event processing
+            event_type = event_data["event"]
+
+            # This uses your O(1) optimization
+            if event_type in MESSAGE_EVENTS:
+                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
+                print(f"   Event {sum(p.event_stats.values())}: {event_type}")
+
+                # Simulate text extraction
+                if "delta" in event_data["data"] and "text" in event_data["data"]:
+                    print(f"      Text: [simulated text chunk]")
+            elif event_type == "ping":
+                print(f"   Ping event ignored (as optimized)")
+
+    print(f"   📊 Event breakdown: {profiler.event_stats}")
+    print(f"   ✅ Mock streaming optimization simulation complete!")
+
+
+@pytest.mark.asyncio
+async def test_async_real_streaming(profiler):
+    """Test async streaming"""
+
+    if not profiler.use_real_api:
+        print("   ⚠️  No API key - simulating async streaming")
+        await test_async_mock_streaming(profiler)
+        return
+
+    with profiler.time_operation("Async Real Streaming Test") as p:
+        print(f"   📡 Starting async stream...")
+
+        async with p.async_client.messages.stream(
+            model="claude-sonnet-4-20250514", max_tokens=50, messages=[{"role": "user", "content": "Say hello"}]
+        ) as stream:
+            async for chunk in stream:
+                event_type = getattr(chunk, "type", "unknown")
+                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
+                print(f"   Async event {sum(p.event_stats.values())}: {event_type}")
+
+    print(f"   ✅ Async streaming processed {sum(profiler.event_stats.values())} events")
+
+
+@pytest.mark.asyncio
+async def test_async_mock_streaming(profiler):
+    """Mock async streaming test"""
+
+    with profiler.time_operation("Async Mock Streaming Test") as p:
+        print(f"   📡 Processing async mock events...")
+
+        # Simulate async processing
+        mock_events = [
+            "message_start",
+            "content_block_start",
+            "content_block_delta",
+            "content_block_stop",
+            "message_stop",
+        ]
+
+        for event_type in mock_events:
+            await asyncio.sleep(0.001)  # Tiny delay to simulate real async
+            if event_type in MESSAGE_EVENTS:
+                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
+                print(f"   Async event {sum(p.event_stats.values())}: {event_type}")
+
+    print(f"   ✅ Async mock streaming processed {sum(profiler.event_stats.values())} events")
+
+
+def compare_event_lookup_methods():
+    """Show the difference between old and new event checking approaches"""
+
+    print(f"\n🔬 Event Type Lookup Optimization Analysis")
+    print("=" * 50)
+
+    # Simulate realistic event distribution from streaming
+    sample_events = [
+        "message_start",
+        "content_block_start",
+        "content_block_delta",
+        "content_block_delta",
+        "content_block_delta",
+        "content_block_delta",
+        "content_block_stop",
+        "message_stop",
+        "ping",
+    ] * 1000  # 9000 events total
+
+    print(f"📊 Testing with {len(sample_events):,} events (realistic streaming pattern)")
+
+    # OLD method simulation (what you replaced)
+    print(f"\n🐌 Testing OLD O(n) approach...")
+    start = time.perf_counter()
+    old_count = 0
+    for event in sample_events:
+        # This is the old O(n) comparison chain your optimization replaced
+        if (
+            event == "message_start"
+            or event == "message_delta"
+            or event == "message_stop"
+            or event == "content_block_start"
+            or event == "content_block_delta"
+            or event == "content_block_stop"
+        ):
+            old_count += 1
+    old_time = time.perf_counter() - start
+
+    # NEW method (your optimization)
+    print(f"🚀 Testing NEW O(1) approach...")
+    start = time.perf_counter()
+    new_count = 0
+    for event in sample_events:
+        # This is your new O(1) lookup
+        if event in MESSAGE_EVENTS:
+            new_count += 1
+    new_time = time.perf_counter() - start
+
+    speedup = old_time / new_time if new_time > 0 else float("inf")
+    time_saved_ms = (old_time - new_time) * 1000
+
+    print(f"\n📈 Performance Comparison Results:")
+    print(f"   OLD (O(n) chain): {old_time:.6f}s ({len(sample_events) / old_time:.0f} lookups/sec)")
+    print(f"   NEW (O(1) set):   {new_time:.6f}s ({len(sample_events) / new_time:.0f} lookups/sec)")
+    print(f"   🚀 Speedup: {speedup:.2f}x")
+    print(f"   ⚡ Time saved: {time_saved_ms:.2f}ms")
+    print(f"   ✅ Both correctly identified {old_count} and {new_count} message events")
+    print(f"   🎯 Your optimization processes events {speedup:.1f}x faster!")
+
+
+# Standalone functions that work without fixtures (for direct execution)
+def standalone_test_real_streaming():
+    """Standalone version that creates its own profiler"""
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    profiler = StreamingPerformanceProfiler(api_key)
+
+    if not api_key:
+        print("⚠️  No API key found - running mock test instead")
+        return standalone_test_mock_streaming()
+
+    print("📡 Using real Anthropic API for testing")
+    test_real_streaming_with_optimizations(profiler)
+
+
+def standalone_test_mock_streaming():
+    """Standalone mock test"""
+    profiler = StreamingPerformanceProfiler(None)  # No API key = mock mode
+    test_mock_streaming_with_optimizations(profiler)
+
+
+async def standalone_test_async():
+    """Standalone async test"""
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    profiler = StreamingPerformanceProfiler(api_key)
+
+    if api_key:
+        await test_async_real_streaming(profiler)
+    else:
+        await test_async_mock_streaming(profiler)
+
+
+class TestOldVsNewComparison:
+    """Compare old vs new implementation performance and correctness"""
+
+    @pytest.mark.parametrize("num_events", [10000, 100000])
+    def test_event_type_checking_performance(self, num_events):
+        """Compare event type checking performance: O(n) vs O(1)"""
+        print(f"\n🧪 STARTING Performance Test with {num_events:,} events")
+
+        # Generate realistic event distribution
+        import random
+
+        random.seed(42)  # Deterministic results
+
+        event_types = list(MESSAGE_EVENTS) + ["ping", "error", "completion", "unknown"]
+        test_events = []
+
+        for _ in range(num_events):
+            # 80% message events (realistic streaming pattern)
+            if random.random() < 0.8:
+                test_events.append(random.choice(list(MESSAGE_EVENTS)))
+            else:
+                test_events.append(random.choice(["ping", "error", "completion", "unknown"]))
+
+        print(f"📝 Generated {len(test_events):,} test events")
+
+        # Warm up to reduce JIT/caching effects
+        print(f"🔥 Warming up...")
+        for _ in range(100):
+            OldImplementationSimulator.old_event_type_check(test_events[0])
+            NewImplementationSimulator.new_event_type_check(test_events[0])
+
+        # Run multiple iterations for stable timing
+        old_times = []
+        new_times = []
+
+        print(f"⏱️  Running 5 benchmark iterations...")
+        for iteration in range(5):
+            print(f"   Iteration {iteration + 1}/5...")
+
+            # Benchmark OLD implementation
+            old_start = time.perf_counter()
+            old_results = []
+            for event in test_events:
+                old_results.append(OldImplementationSimulator.old_event_type_check(event))
+            old_time = time.perf_counter() - old_start
+            old_times.append(old_time)
+            print(f"      OLD: {old_time:.6f}s")
+
+            # Benchmark NEW implementation
+            new_start = time.perf_counter()
+            new_results = []
+            for event in test_events:
+                new_results.append(NewImplementationSimulator.new_event_type_check(event))
+            new_time = time.perf_counter() - new_start
+            new_times.append(new_time)
+            print(f"      NEW: {new_time:.6f}s")
+
+            # Verify correctness on first iteration
+            if iteration == 0:
+                assert old_results == new_results, "Old and new implementations produce different results!"
+                print(f"   ✅ Correctness verified - both produce identical results")
+
+        # Use median times for more stable comparison
+        old_median = statistics.median(old_times)
+        new_median = statistics.median(new_times)
+
+        speedup = old_median / new_median if new_median > 0 else float("inf")
+        old_rate = num_events / old_median if old_median > 0 else float("inf")
+        new_rate = num_events / new_median if new_median > 0 else float("inf")
+
+        # ALWAYS print results regardless of pass/fail
+        print(f"\n🔥 Event Type Checking Performance ({num_events:,} events, 5 iterations):")
+        print(f"   OLD (O(n)): {old_median:.6f}s median ({old_rate:.0f} events/sec)")
+        print(f"   NEW (O(1)): {new_median:.6f}s median ({new_rate:.0f} events/sec)")
+        print(f"   🚀 Speedup: {speedup:.2f}x")
+        print(f"   📊 Time savings: {((old_median - new_median) / old_median * 100):.1f}%")
+        print(f"   📈 All OLD times: {[f'{t:.6f}s' for t in old_times]}")
+        print(f"   📈 All NEW times: {[f'{t:.6f}s' for t in new_times]}")
+
+        # Analysis and interpretation - BEFORE assertions
+        if speedup >= 1.1:
+            print(f"   ✅ New implementation shows {speedup:.2f}x improvement")
+            result_status = "IMPROVED"
+        elif speedup >= 0.8:
+            print(f"   ✅ New implementation shows comparable performance: {speedup:.2f}x")
+            result_status = "COMPARABLE"
+        else:
+            print(f"   ⚠️  New implementation appears {1 / speedup:.2f}x slower in micro-benchmark")
+            print(f"   ℹ️  The key benefit is O(1) vs O(n) scaling for very large volumes")
+            print(f"   💡 Micro-benchmark noise is normal - timing can vary due to:")
+            print(f"       • CPU scheduling and system load variations")
+            print(f"       • Memory caching effects and layout")
+            print(f"       • Python's hash randomization between runs")
+            print(f"       • JIT/optimization differences")
+            result_status = "MICRO_BENCHMARK_NOISE"
+
+        print(f"   🎯 Key improvement: O(1) complexity scales better for larger volumes")
+        print(f"   📊 Test result: {result_status}")
+
+        # NOW do assertions - focus on correctness and reasonable performance bounds
+        try:
+            # The main goal is to verify the optimization doesn't cause dramatic regressions
+            # Micro-benchmark timing can be noisy, so we use generous bounds
+            assert speedup >= 0.3, f"New implementation dramatically slower: {speedup:.2f}x (possible system issue)"
+
+            # Log success cases
+            if speedup >= 1.1:
+                print(f"   🏆 Achieved meaningful speedup: {speedup:.2f}x")
+            elif speedup >= 0.8:
+                print(f"   ✅ Performance is comparable: {speedup:.2f}x")
+            else:
+                print(f"   ⚠️  Performance appears slower in micro-benchmark: {speedup:.2f}x")
+                print(f"   💡 This is normal for micro-benchmarks - the algorithmic benefit is O(1) vs O(n) scaling")
+
+            print(f"   ✅ Performance test PASSED")
+
+        except AssertionError as e:
+            print(f"   ❌ Performance assertion failed: {e}")
+            print(f"   ⚠️  Micro-benchmark results can be unreliable due to:")
+            print(f"       - CPU scheduling and system load")
+            print(f"       - Memory caching effects")
+            print(f"       - JIT compilation variations")
+            print(f"       - Python's hash randomization")
+            print(f"   💡 The key benefit is algorithmic: O(1) vs O(n) complexity")
+            raise  # Re-raise to fail the test
+
+    def test_stream_cleanup_efficiency(self):
+        """Compare stream cleanup efficiency between old and new approaches"""
+
+        print(f"\n🧹 Starting Stream Cleanup Efficiency Test...")
+
+        # Create a realistic simulation of cleanup operations
+        num_events = 10000
+        print(f"📊 Simulating cleanup with {num_events:,} remaining events")
+
+        # Simulate OLD cleanup approach (iterating through remaining events)
+        print(f"\n🔄 Testing OLD approach (consume remaining events)...")
+        remaining_events = [f"event_{i}" for i in range(num_events)]
+
+        old_start = time.perf_counter()
+        old_consumed = 0
+        for _ in remaining_events:
+            old_consumed += 1  # Simulate processing overhead
+        old_cleanup_time = time.perf_counter() - old_start
+
+        print(f"   Time taken: {old_cleanup_time:.6f}s")
+        print(f"   Events consumed: {old_consumed:,}")
+        print(f"   Rate: {old_consumed / old_cleanup_time:.0f} events/sec")
+
+        # Simulate NEW cleanup approach (just call close)
+        print(f"\n⚡ Testing NEW approach (direct decoder close)...")
+        mock_decoder = Mock()
+        mock_decoder.close = Mock()
+
+        new_start = time.perf_counter()
+        # Simulate the actual new cleanup logic
+        if hasattr(mock_decoder, "close"):
+            mock_decoder.close()
+        new_consumed = 0  # No events consumed
+        new_cleanup_time = time.perf_counter() - new_start
+
+        print(f"   Time taken: {new_cleanup_time:.6f}s")
+        print(f"   Events consumed: {new_consumed}")
+        print(f"   Method called: decoder.close()")
+
+        # Print comparison results
+        print(f"\n📊 Stream Cleanup Efficiency Comparison:")
+        print(f"   OLD approach: {old_cleanup_time:.6f}s (consumed {old_consumed:,} events)")
+        print(f"   NEW approach: {new_cleanup_time:.6f}s (consumed {new_consumed} events)")
+        print(f"   📈 Event processing difference: {old_consumed - new_consumed:,} events saved")
+        print(f"   ✅ NEW approach avoids processing {num_events:,} unnecessary events")
+        print(f"   🎯 Key improvement: Eliminated {old_consumed:,} unnecessary event iterations")
+
+        # Conceptual analysis
+        if old_cleanup_time > new_cleanup_time:
+            time_saved = old_cleanup_time - new_cleanup_time
+            print(f"   ⚡ Time saved: {time_saved:.6f}s ({time_saved / old_cleanup_time * 100:.1f}% faster)")
+        else:
+            print(f"   ℹ️  Raw timing may vary due to micro-benchmark noise")
+            print(f"   💡 The key benefit is avoiding unnecessary work, not raw speed")
+
+        # Now do assertions - focus on logical improvements
+        try:
+            assert new_consumed == 0, "New approach should not consume any events"
+            print(f"   ✅ New approach consumes 0 events - PASSED")
+
+            assert old_consumed == num_events, "Old approach should consume all remaining events"
+            print(f"   ✅ Old approach consumes all {num_events:,} events - PASSED")
+
+            mock_decoder.close.assert_called_once()
+            print(f"   ✅ Decoder.close() was called - PASSED")
+
+            print(f"   🏆 STREAM CLEANUP TEST PASSED")
+            print(f"   📋 Summary: NEW approach eliminates {old_consumed:,} unnecessary operations")
+
+        except AssertionError as e:
+            print(f"   ❌ Assertion failed: {e}")
+            raise
+
+    @patch("anthropic._streaming.SSEBytesDecoder")
+    def test_full_stream_processing_comparison(self, _mock_decoder_class):
+        """Compare complete stream processing between old and new implementations"""
+
+        # Note: _mock_decoder_class is used implicitly by the @patch decorator
+        # to replace the SSEBytesDecoder class in the streaming module
+
+        # Create realistic event sequence
+        events = []
+        for i in range(5000):  # Larger dataset for more stable timing
+            event_type = list(MESSAGE_EVENTS)[i % len(MESSAGE_EVENTS)]
+            events.append({"event": event_type, "data": json.dumps({"index": i, "text": f"content_{i}"})})
+            # Add ping events
+            if i % 50 == 0:
+                events.append({"event": "ping", "data": ""})
+
+        class MockDecoder:
+            def __init__(self, events):
+                self.events = events
+                self.close = Mock()
+
+            def iter_bytes(self, _iterator):
+                for event_data in self.events:
+                    yield ServerSentEvent(**event_data)
+
+        # Run multiple iterations for stable results
+        old_times = []
+        new_times = []
+
+        for iteration in range(3):
+            # Test with OLD-style event checking
+            old_decoder = MockDecoder(events)
+            old_start = time.perf_counter()
+            old_processed = 0
+            for sse in old_decoder.iter_bytes([]):
+                if OldImplementationSimulator.old_event_type_check(sse.event):
+                    old_processed += 1
+                elif sse.event == "ping":
+                    continue
+            old_time = time.perf_counter() - old_start
+            old_times.append(old_time)
+
+            # Test with NEW-style event checking
+            new_decoder = MockDecoder(events)
+            new_start = time.perf_counter()
+            new_processed = 0
+            for sse in new_decoder.iter_bytes([]):
+                if NewImplementationSimulator.new_event_type_check(sse.event):
+                    new_processed += 1
+                elif sse.event == "ping":
+                    continue
+            new_time = time.perf_counter() - new_start
+            new_times.append(new_time)
+
+            # Verify same results on first iteration
+            if iteration == 0:
+                assert old_processed == new_processed, "Different number of events processed!"
+
+        # Use median for stable comparison
+        old_median = statistics.median(old_times)
+        new_median = statistics.median(new_times)
+
+        speedup = old_median / new_median if new_median > 0 else float("inf")
+        old_rate = old_processed / old_median if old_median > 0 else float("inf")
+        new_rate = new_processed / new_median if new_median > 0 else float("inf")
+
+        print(f"\n🏁 Full Stream Processing Comparison:")
+        print(f"   Events processed: {old_processed:,}")
+        print(f"   OLD processing: {old_median:.4f}s ({old_rate:.0f} events/sec)")
+        print(f"   NEW processing: {new_median:.4f}s ({new_rate:.0f} events/sec)")
+        print(f"   🚀 Overall speedup: {speedup:.2f}x")
+
+        # More realistic assertions - focus on the fact that new isn't worse
+        # and shows the conceptual improvement
+        print(f"\n📊 Analysis:")
+        if speedup >= 1.0:
+            print(f"   ✅ New implementation is {speedup:.2f}x faster")
+        else:
+            print(f"   ⚠️  New implementation is {1 / speedup:.2f}x slower in this micro-benchmark")
+            print(f"   ℹ️  Micro-benchmark noise is normal - the key benefit is O(1) vs O(n) scaling")
+
+        # The real benefit is algorithmic - O(1) vs O(n) scaling
+        # For small datasets, the difference may not be visible due to constant factors
+        print(f"   🎯 Key improvement: O(1) lookup scales better than O(n) chain for large volumes")
+
+        # Don't assert on micro-timing, just verify correctness
+        assert old_processed == new_processed, "Both implementations should process same number of events"
+        assert old_processed > 0, "Should have processed some events"
+
+    def test_memory_usage_comparison(self):
+        """Compare memory usage patterns between old and new implementations"""
+        import sys
+
+        # OLD approach: Multiple string comparisons create temporary objects
+        old_memory_usage = []
+        for _ in range(1000):
+            # Simulate old-style string concatenation for comparisons
+            event_check_strings = [
+                "message_start",
+                "message_delta",
+                "message_stop",
+                "content_block_start",
+                "content_block_delta",
+                "content_block_stop",
+            ]
+            old_memory_usage.extend(event_check_strings)
+
+        old_memory = sys.getsizeof(old_memory_usage)
+
+        # NEW approach: Single frozenset reference
+        new_memory_usage = []
+        for _ in range(1000):
+            # Just reference the shared frozenset
+            new_memory_usage.append(MESSAGE_EVENTS)
+
+        new_memory = sys.getsizeof(new_memory_usage)
+
+        memory_savings = ((old_memory - new_memory) / old_memory * 100) if old_memory > 0 else 0
+
+        print(f"\n💾 Memory Usage Comparison (1000 operations):")
+        print(f"   OLD approach: {old_memory:,} bytes")
+        print(f"   NEW approach: {new_memory:,} bytes")
+        print(f"   💰 Memory savings: {memory_savings:.1f}%")
+
+        # New approach should use less memory
+        assert new_memory <= old_memory
+
+    def test_correctness_verification(self):
+        """Verify that old and new implementations produce identical results"""
+
+        # Test all possible event types
+        all_event_types = [
+            "message_start",
+            "message_delta",
+            "message_stop",
+            "content_block_start",
+            "content_block_delta",
+            "content_block_stop",
+            "ping",
+            "error",
+            "completion",
+            "unknown_event",
+            "future_event",
+        ]
+
+        print(f"\n✅ Correctness Verification:")
+        for event_type in all_event_types:
+            old_result = OldImplementationSimulator.old_event_type_check(event_type)
+            new_result = NewImplementationSimulator.new_event_type_check(event_type)
+
+            print(f"   {event_type:20} -> OLD: {old_result}, NEW: {new_result}")
+            assert old_result == new_result, f"Mismatch for {event_type}: old={old_result}, new={new_result}"
+
+        print("   🎉 All event types produce identical results!")
+
+
+class TestRegressionPrevention:
+    """Ensure new implementation doesn't break existing functionality"""
+
+    @patch("anthropic._streaming.SSEBytesDecoder")
+    def test_no_functional_regressions(self, _mock_decoder_class):
+        """Ensure all existing functionality still works with optimizations"""
+
+        # Test comprehensive event sequence
+        events = [
+            {"event": "message_start", "data": '{"type": "message", "id": "test"}'},
+            {"event": "content_block_start", "data": '{"index": 0}'},
+            {"event": "content_block_delta", "data": '{"text": "Hello"}'},
+            {"event": "ping", "data": ""},  # Should be ignored
+            {"event": "content_block_delta", "data": '{"text": " world"}'},
+            {"event": "unknown_event", "data": '{"test": true}'},  # Should be ignored
+            {"event": "content_block_stop", "data": '{"index": 0}'},
+            {"event": "message_stop", "data": '{"type": "message_stop"}'},
+        ]
+
+        class MockDecoder:
+            def __init__(self, events):
+                self.events = events
+                self.close = Mock()
+
+            def iter_bytes(self, _iterator):
+                for event_data in self.events:
+                    yield ServerSentEvent(**event_data)
+
+        mock_decoder = MockDecoder(events)
+        _mock_decoder_class.return_value = mock_decoder
+
+        mock_response = Mock()
+        mock_response.iter_bytes.return_value = iter([b"mock"])
+
+        mock_client = Mock()
+        mock_client._make_sse_decoder.return_value = mock_decoder
+
+        # Fix: Accept all arguments (both positional and keyword)
+        def mock_process_response_data(*args, **kwargs):
+            # Return the first argument which should be 'data'
+            if args:
+                return args[0]
+            return kwargs.get("data", {})
+
+        mock_client._process_response_data.side_effect = mock_process_response_data
+
+        stream = Stream(cast_to=dict, response=mock_response, client=mock_client)
+        processed = list(stream)
+
+        # Verify expected events were processed (excluding ping and unknown_event)
+        expected_events = [e for e in events if e["event"] in MESSAGE_EVENTS]
+        assert len(processed) == len(expected_events)
+
+        # Verify type field addition
+        for event in processed:
+            if isinstance(event, dict):
+                assert "type" in event
+
+        # Verify cleanup
+        mock_decoder.close.assert_called_once()
+
+        print(f"\n🔒 Regression Test Results:")
+        print(f"   Total events: {len(events)}")
+        print(f"   Processed events: {len(processed)}")
+        print(f"   Ignored events: {len(events) - len(processed)}")
+        print(f"   ✅ No regressions detected!")
+
+
+if __name__ == "__main__":
+    # Can be run directly or with pytest
+    print("🔥 Streaming Performance Analysis")
+    print("This script tests your optimizations with both real API calls and mocks!")
+    print("\nUsage:")
+    print("  Direct: python test_streaming_performance.py")
+    print("  Pytest: pytest test_streaming_performance.py -v -s -n 0")
+    print("  Real API: ANTHROPIC_API_KEY=sk-ant-... python test_streaming_performance.py")
+
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    if api_key:
+        print(f"✅ API key detected (sk-ant-...{api_key[-4:]}) - will use real API calls")
+    else:
+        print("⚠️  No API key - will use mock testing")
+        print("💡 Set ANTHROPIC_API_KEY environment variable for real API testing")
+
+    print("\n" + "=" * 60)
+
+    # Run standalone tests (works without pytest)
+    compare_event_lookup_methods()
+
+    standalone_test_real_streaming()
+
+    asyncio.run(standalone_test_async())
+
+    print("\n✅ All tests completed!")
+    print("💡 Run with pytest for more detailed fixture-based testing")

From 8a0a8aeabc0bfa9ab0517d188e2da77d9f605851 Mon Sep 17 00:00:00 2001
From: Federico Kamelhar <federico.kamelhar@oracle.com>
Date: Tue, 28 Oct 2025 08:29:19 -0400
Subject: [PATCH 3/4] test: update tests to reflect response.close() cleanup

Updated test assertions to verify response.close() is called
instead of decoder.close(), aligning with review feedback.

Changes:
- NewImplementationSimulator now calls response.close()
- Stream cleanup test verifies response.close()
- Regression test verifies response.close()

All 11 tests now pass with live API validation.
---
 .../test_streaming_sdk_performance.py         | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/tests/lib/streaming/test_streaming_sdk_performance.py b/tests/lib/streaming/test_streaming_sdk_performance.py
index 0bacc4e97..354132486 100644
--- a/tests/lib/streaming/test_streaming_sdk_performance.py
+++ b/tests/lib/streaming/test_streaming_sdk_performance.py
@@ -45,10 +45,9 @@ def new_event_type_check(event_type):
         return event_type in MESSAGE_EVENTS
 
     @staticmethod
-    def new_stream_cleanup(decoder):
-        """Simulate new efficient cleanup by closing decoder resources"""
-        if hasattr(decoder, "close"):
-            decoder.close()
+    def new_stream_cleanup(response):
+        """Simulate new efficient cleanup by closing response directly"""
+        response.close()
         return 0  # No events consumed
 
 
@@ -466,20 +465,19 @@ def test_stream_cleanup_efficiency(self):
         print(f"   Rate: {old_consumed / old_cleanup_time:.0f} events/sec")
 
         # Simulate NEW cleanup approach (just call close)
-        print(f"\n⚡ Testing NEW approach (direct decoder close)...")
-        mock_decoder = Mock()
-        mock_decoder.close = Mock()
+        print(f"\n⚡ Testing NEW approach (direct response close)...")
+        mock_response = Mock()
+        mock_response.close = Mock()
 
         new_start = time.perf_counter()
         # Simulate the actual new cleanup logic
-        if hasattr(mock_decoder, "close"):
-            mock_decoder.close()
+        mock_response.close()
         new_consumed = 0  # No events consumed
         new_cleanup_time = time.perf_counter() - new_start
 
         print(f"   Time taken: {new_cleanup_time:.6f}s")
         print(f"   Events consumed: {new_consumed}")
-        print(f"   Method called: decoder.close()")
+        print(f"   Method called: response.close()")
 
         # Print comparison results
         print(f"\n📊 Stream Cleanup Efficiency Comparison:")
@@ -505,8 +503,8 @@ def test_stream_cleanup_efficiency(self):
             assert old_consumed == num_events, "Old approach should consume all remaining events"
             print(f"   ✅ Old approach consumes all {num_events:,} events - PASSED")
 
-            mock_decoder.close.assert_called_once()
-            print(f"   ✅ Decoder.close() was called - PASSED")
+            mock_response.close.assert_called_once()
+            print(f"   ✅ Response.close() was called - PASSED")
 
             print(f"   🏆 STREAM CLEANUP TEST PASSED")
             print(f"   📋 Summary: NEW approach eliminates {old_consumed:,} unnecessary operations")
@@ -729,8 +727,8 @@ def mock_process_response_data(*args, **kwargs):
             if isinstance(event, dict):
                 assert "type" in event
 
-        # Verify cleanup
-        mock_decoder.close.assert_called_once()
+        # Verify cleanup - response.close() should be called (not decoder.close())
+        mock_response.close.assert_called_once()
 
         print(f"\n🔒 Regression Test Results:")
         print(f"   Total events: {len(events)}")

From cbc3e13d4ffde22eab988902e71d9dcfa8d76655 Mon Sep 17 00:00:00 2001
From: Federico Kamelhar <federico.kamelhar@oracle.com>
Date: Tue, 28 Oct 2025 12:39:51 -0400
Subject: [PATCH 4/4] refactor: process all streaming events for forward
 compatibility

Changes based on review feedback from @karpetrosyan:

1. Removed MESSAGE_EVENTS filtering - now processes any event received
2. Removed early response closing (response.close/aclose) - will be ported to codegen
3. Removed performance test file - no longer applicable

This change improves forward compatibility by processing any event type
the API sends, rather than filtering to a predefined set. The new
structure handles completion, ping, and error events explicitly, while
processing all other events generically.

Closes review feedback in #993
---
 .../test_streaming_sdk_performance.py         | 766 ------------------
 1 file changed, 766 deletions(-)
 delete mode 100644 tests/lib/streaming/test_streaming_sdk_performance.py

diff --git a/tests/lib/streaming/test_streaming_sdk_performance.py b/tests/lib/streaming/test_streaming_sdk_performance.py
deleted file mode 100644
index 354132486..000000000
--- a/tests/lib/streaming/test_streaming_sdk_performance.py
+++ /dev/null
@@ -1,766 +0,0 @@
-import os
-import json
-import time
-import asyncio
-import statistics
-from contextlib import contextmanager
-from unittest.mock import Mock, patch
-
-import pytest
-
-from anthropic import Anthropic, AsyncAnthropic
-from anthropic._streaming import MESSAGE_EVENTS, Stream, ServerSentEvent
-
-
-class OldImplementationSimulator:
-    """Simulate the old implementation logic for comparison"""
-
-    @staticmethod
-    def old_event_type_check(event_type):
-        """Simulate old O(n) event checking with multiple string comparisons"""
-        return (
-            event_type == "message_start"
-            or event_type == "message_delta"
-            or event_type == "message_stop"
-            or event_type == "content_block_start"
-            or event_type == "content_block_delta"
-            or event_type == "content_block_stop"
-        )
-
-    @staticmethod
-    def old_stream_cleanup(iterator):
-        """Simulate old inefficient stream cleanup by consuming remaining events"""
-        consumed_count = 0
-        for _sse in iterator:
-            consumed_count += 1
-        return consumed_count
-
-
-class NewImplementationSimulator:
-    """Simulate the new implementation logic for comparison"""
-
-    @staticmethod
-    def new_event_type_check(event_type):
-        """Simulate new O(1) event checking with set membership"""
-        return event_type in MESSAGE_EVENTS
-
-    @staticmethod
-    def new_stream_cleanup(response):
-        """Simulate new efficient cleanup by closing response directly"""
-        response.close()
-        return 0  # No events consumed
-
-
-@pytest.fixture
-def profiler():
-    """Pytest fixture to provide StreamingPerformanceProfiler"""
-    api_key = os.getenv("ANTHROPIC_API_KEY")
-    return StreamingPerformanceProfiler(api_key)
-
-
-class StreamingPerformanceProfiler:
-    """Profile streaming performance with both real API calls and mocks"""
-
-    def __init__(self, api_key: str = None):
-        self.api_key = api_key  # Store securely, never print
-        self.use_real_api = bool(api_key)
-
-        if self.use_real_api:
-            self.client = Anthropic(api_key=api_key)
-            self.async_client = AsyncAnthropic(api_key=api_key)
-        else:
-            self.client = None
-            self.async_client = None
-
-        self.event_stats = {}
-        self.timing_data = []
-
-    @contextmanager
-    def time_operation(self, test_name: str):
-        """Context manager to time operations"""
-        print(f"\n🧪 {test_name}")
-        print(f"   Mode: {'🌐 Real API' if self.use_real_api else '🤖 Mock'}")
-        start_time = time.perf_counter()
-
-        try:
-            yield self
-        finally:
-            end_time = time.perf_counter()
-            duration = end_time - start_time
-            events_processed = sum(self.event_stats.values())
-
-            self.timing_data.append(
-                {
-                    "test": test_name,
-                    "duration": duration,
-                    "events_processed": events_processed,
-                    "events_per_second": events_processed / duration if duration > 0 else 0,
-                    "mode": "real" if self.use_real_api else "mock",
-                }
-            )
-
-            print(f"   ⏱️  Duration: {duration:.4f}s")
-            print(f"   📊 Events processed: {events_processed}")
-            print(
-                f"   🚀 Rate: {events_processed / duration:.0f} events/sec" if duration > 0 else "   🚀 Rate: instant"
-            )
-
-    def reset_stats(self):
-        """Reset event counters for next test"""
-        self.event_stats = {}
-
-
-def test_real_streaming_with_optimizations(profiler):
-    """Test real streaming to validate optimizations work in practice"""
-
-    if not profiler.use_real_api:
-        print("   ⚠️  No API key - using mock streaming")
-        return test_mock_streaming_with_optimizations(profiler)
-
-    with profiler.time_operation("Real Streaming Performance Test") as p:
-        print(f"   📡 Starting real stream...")
-
-        with p.client.messages.stream(
-            model="claude-sonnet-4-20250514",  # Use stable model
-            max_tokens=100,
-            messages=[{"role": "user", "content": "Count from 1 to 5 slowly"}],
-        ) as stream:
-            for chunk in stream:
-                event_type = getattr(chunk, "type", "unknown")
-                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
-
-                print(f"   Event {sum(p.event_stats.values())}: {event_type}")
-
-                # Your optimizations should handle these efficiently
-                if hasattr(chunk, "delta") and hasattr(chunk.delta, "text") and chunk.delta.text:
-                    print(f"      Text: '{chunk.delta.text.strip()}'")
-
-    print(f"   📊 Event breakdown: {profiler.event_stats}")
-    print(f"   ✅ Real streaming optimizations work!")
-
-
-def test_mock_streaming_with_optimizations(profiler):
-    """Test with mock data when no API key available"""
-
-    # Create realistic mock events
-    mock_events = [
-        {"event": "message_start", "data": '{"type": "message", "id": "msg_1"}'},
-        {"event": "content_block_start", "data": '{"type": "content_block", "index": 0}'},
-        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "1"}}'},
-        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "2"}}'},
-        {"event": "content_block_delta", "data": '{"type": "delta", "delta": {"text": "3"}}'},
-        {"event": "ping", "data": ""},
-        {"event": "content_block_stop", "data": '{"type": "content_block_stop", "index": 0}'},
-        {"event": "message_stop", "data": '{"type": "message_stop"}'},
-    ]
-
-    with profiler.time_operation("Mock Streaming Performance Test") as p:
-        print(f"   📡 Processing mock events...")
-
-        for event_data in mock_events:
-            # Simulate your optimized event processing
-            event_type = event_data["event"]
-
-            # This uses your O(1) optimization
-            if event_type in MESSAGE_EVENTS:
-                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
-                print(f"   Event {sum(p.event_stats.values())}: {event_type}")
-
-                # Simulate text extraction
-                if "delta" in event_data["data"] and "text" in event_data["data"]:
-                    print(f"      Text: [simulated text chunk]")
-            elif event_type == "ping":
-                print(f"   Ping event ignored (as optimized)")
-
-    print(f"   📊 Event breakdown: {profiler.event_stats}")
-    print(f"   ✅ Mock streaming optimization simulation complete!")
-
-
-@pytest.mark.asyncio
-async def test_async_real_streaming(profiler):
-    """Test async streaming"""
-
-    if not profiler.use_real_api:
-        print("   ⚠️  No API key - simulating async streaming")
-        await test_async_mock_streaming(profiler)
-        return
-
-    with profiler.time_operation("Async Real Streaming Test") as p:
-        print(f"   📡 Starting async stream...")
-
-        async with p.async_client.messages.stream(
-            model="claude-sonnet-4-20250514", max_tokens=50, messages=[{"role": "user", "content": "Say hello"}]
-        ) as stream:
-            async for chunk in stream:
-                event_type = getattr(chunk, "type", "unknown")
-                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
-                print(f"   Async event {sum(p.event_stats.values())}: {event_type}")
-
-    print(f"   ✅ Async streaming processed {sum(profiler.event_stats.values())} events")
-
-
-@pytest.mark.asyncio
-async def test_async_mock_streaming(profiler):
-    """Mock async streaming test"""
-
-    with profiler.time_operation("Async Mock Streaming Test") as p:
-        print(f"   📡 Processing async mock events...")
-
-        # Simulate async processing
-        mock_events = [
-            "message_start",
-            "content_block_start",
-            "content_block_delta",
-            "content_block_stop",
-            "message_stop",
-        ]
-
-        for event_type in mock_events:
-            await asyncio.sleep(0.001)  # Tiny delay to simulate real async
-            if event_type in MESSAGE_EVENTS:
-                p.event_stats[event_type] = p.event_stats.get(event_type, 0) + 1
-                print(f"   Async event {sum(p.event_stats.values())}: {event_type}")
-
-    print(f"   ✅ Async mock streaming processed {sum(profiler.event_stats.values())} events")
-
-
-def compare_event_lookup_methods():
-    """Show the difference between old and new event checking approaches"""
-
-    print(f"\n🔬 Event Type Lookup Optimization Analysis")
-    print("=" * 50)
-
-    # Simulate realistic event distribution from streaming
-    sample_events = [
-        "message_start",
-        "content_block_start",
-        "content_block_delta",
-        "content_block_delta",
-        "content_block_delta",
-        "content_block_delta",
-        "content_block_stop",
-        "message_stop",
-        "ping",
-    ] * 1000  # 9000 events total
-
-    print(f"📊 Testing with {len(sample_events):,} events (realistic streaming pattern)")
-
-    # OLD method simulation (what you replaced)
-    print(f"\n🐌 Testing OLD O(n) approach...")
-    start = time.perf_counter()
-    old_count = 0
-    for event in sample_events:
-        # This is the old O(n) comparison chain your optimization replaced
-        if (
-            event == "message_start"
-            or event == "message_delta"
-            or event == "message_stop"
-            or event == "content_block_start"
-            or event == "content_block_delta"
-            or event == "content_block_stop"
-        ):
-            old_count += 1
-    old_time = time.perf_counter() - start
-
-    # NEW method (your optimization)
-    print(f"🚀 Testing NEW O(1) approach...")
-    start = time.perf_counter()
-    new_count = 0
-    for event in sample_events:
-        # This is your new O(1) lookup
-        if event in MESSAGE_EVENTS:
-            new_count += 1
-    new_time = time.perf_counter() - start
-
-    speedup = old_time / new_time if new_time > 0 else float("inf")
-    time_saved_ms = (old_time - new_time) * 1000
-
-    print(f"\n📈 Performance Comparison Results:")
-    print(f"   OLD (O(n) chain): {old_time:.6f}s ({len(sample_events) / old_time:.0f} lookups/sec)")
-    print(f"   NEW (O(1) set):   {new_time:.6f}s ({len(sample_events) / new_time:.0f} lookups/sec)")
-    print(f"   🚀 Speedup: {speedup:.2f}x")
-    print(f"   ⚡ Time saved: {time_saved_ms:.2f}ms")
-    print(f"   ✅ Both correctly identified {old_count} and {new_count} message events")
-    print(f"   🎯 Your optimization processes events {speedup:.1f}x faster!")
-
-
-# Standalone functions that work without fixtures (for direct execution)
-def standalone_test_real_streaming():
-    """Standalone version that creates its own profiler"""
-    api_key = os.getenv("ANTHROPIC_API_KEY")
-    profiler = StreamingPerformanceProfiler(api_key)
-
-    if not api_key:
-        print("⚠️  No API key found - running mock test instead")
-        return standalone_test_mock_streaming()
-
-    print("📡 Using real Anthropic API for testing")
-    test_real_streaming_with_optimizations(profiler)
-
-
-def standalone_test_mock_streaming():
-    """Standalone mock test"""
-    profiler = StreamingPerformanceProfiler(None)  # No API key = mock mode
-    test_mock_streaming_with_optimizations(profiler)
-
-
-async def standalone_test_async():
-    """Standalone async test"""
-    api_key = os.getenv("ANTHROPIC_API_KEY")
-    profiler = StreamingPerformanceProfiler(api_key)
-
-    if api_key:
-        await test_async_real_streaming(profiler)
-    else:
-        await test_async_mock_streaming(profiler)
-
-
-class TestOldVsNewComparison:
-    """Compare old vs new implementation performance and correctness"""
-
-    @pytest.mark.parametrize("num_events", [10000, 100000])
-    def test_event_type_checking_performance(self, num_events):
-        """Compare event type checking performance: O(n) vs O(1)"""
-        print(f"\n🧪 STARTING Performance Test with {num_events:,} events")
-
-        # Generate realistic event distribution
-        import random
-
-        random.seed(42)  # Deterministic results
-
-        event_types = list(MESSAGE_EVENTS) + ["ping", "error", "completion", "unknown"]
-        test_events = []
-
-        for _ in range(num_events):
-            # 80% message events (realistic streaming pattern)
-            if random.random() < 0.8:
-                test_events.append(random.choice(list(MESSAGE_EVENTS)))
-            else:
-                test_events.append(random.choice(["ping", "error", "completion", "unknown"]))
-
-        print(f"📝 Generated {len(test_events):,} test events")
-
-        # Warm up to reduce JIT/caching effects
-        print(f"🔥 Warming up...")
-        for _ in range(100):
-            OldImplementationSimulator.old_event_type_check(test_events[0])
-            NewImplementationSimulator.new_event_type_check(test_events[0])
-
-        # Run multiple iterations for stable timing
-        old_times = []
-        new_times = []
-
-        print(f"⏱️  Running 5 benchmark iterations...")
-        for iteration in range(5):
-            print(f"   Iteration {iteration + 1}/5...")
-
-            # Benchmark OLD implementation
-            old_start = time.perf_counter()
-            old_results = []
-            for event in test_events:
-                old_results.append(OldImplementationSimulator.old_event_type_check(event))
-            old_time = time.perf_counter() - old_start
-            old_times.append(old_time)
-            print(f"      OLD: {old_time:.6f}s")
-
-            # Benchmark NEW implementation
-            new_start = time.perf_counter()
-            new_results = []
-            for event in test_events:
-                new_results.append(NewImplementationSimulator.new_event_type_check(event))
-            new_time = time.perf_counter() - new_start
-            new_times.append(new_time)
-            print(f"      NEW: {new_time:.6f}s")
-
-            # Verify correctness on first iteration
-            if iteration == 0:
-                assert old_results == new_results, "Old and new implementations produce different results!"
-                print(f"   ✅ Correctness verified - both produce identical results")
-
-        # Use median times for more stable comparison
-        old_median = statistics.median(old_times)
-        new_median = statistics.median(new_times)
-
-        speedup = old_median / new_median if new_median > 0 else float("inf")
-        old_rate = num_events / old_median if old_median > 0 else float("inf")
-        new_rate = num_events / new_median if new_median > 0 else float("inf")
-
-        # ALWAYS print results regardless of pass/fail
-        print(f"\n🔥 Event Type Checking Performance ({num_events:,} events, 5 iterations):")
-        print(f"   OLD (O(n)): {old_median:.6f}s median ({old_rate:.0f} events/sec)")
-        print(f"   NEW (O(1)): {new_median:.6f}s median ({new_rate:.0f} events/sec)")
-        print(f"   🚀 Speedup: {speedup:.2f}x")
-        print(f"   📊 Time savings: {((old_median - new_median) / old_median * 100):.1f}%")
-        print(f"   📈 All OLD times: {[f'{t:.6f}s' for t in old_times]}")
-        print(f"   📈 All NEW times: {[f'{t:.6f}s' for t in new_times]}")
-
-        # Analysis and interpretation - BEFORE assertions
-        if speedup >= 1.1:
-            print(f"   ✅ New implementation shows {speedup:.2f}x improvement")
-            result_status = "IMPROVED"
-        elif speedup >= 0.8:
-            print(f"   ✅ New implementation shows comparable performance: {speedup:.2f}x")
-            result_status = "COMPARABLE"
-        else:
-            print(f"   ⚠️  New implementation appears {1 / speedup:.2f}x slower in micro-benchmark")
-            print(f"   ℹ️  The key benefit is O(1) vs O(n) scaling for very large volumes")
-            print(f"   💡 Micro-benchmark noise is normal - timing can vary due to:")
-            print(f"       • CPU scheduling and system load variations")
-            print(f"       • Memory caching effects and layout")
-            print(f"       • Python's hash randomization between runs")
-            print(f"       • JIT/optimization differences")
-            result_status = "MICRO_BENCHMARK_NOISE"
-
-        print(f"   🎯 Key improvement: O(1) complexity scales better for larger volumes")
-        print(f"   📊 Test result: {result_status}")
-
-        # NOW do assertions - focus on correctness and reasonable performance bounds
-        try:
-            # The main goal is to verify the optimization doesn't cause dramatic regressions
-            # Micro-benchmark timing can be noisy, so we use generous bounds
-            assert speedup >= 0.3, f"New implementation dramatically slower: {speedup:.2f}x (possible system issue)"
-
-            # Log success cases
-            if speedup >= 1.1:
-                print(f"   🏆 Achieved meaningful speedup: {speedup:.2f}x")
-            elif speedup >= 0.8:
-                print(f"   ✅ Performance is comparable: {speedup:.2f}x")
-            else:
-                print(f"   ⚠️  Performance appears slower in micro-benchmark: {speedup:.2f}x")
-                print(f"   💡 This is normal for micro-benchmarks - the algorithmic benefit is O(1) vs O(n) scaling")
-
-            print(f"   ✅ Performance test PASSED")
-
-        except AssertionError as e:
-            print(f"   ❌ Performance assertion failed: {e}")
-            print(f"   ⚠️  Micro-benchmark results can be unreliable due to:")
-            print(f"       - CPU scheduling and system load")
-            print(f"       - Memory caching effects")
-            print(f"       - JIT compilation variations")
-            print(f"       - Python's hash randomization")
-            print(f"   💡 The key benefit is algorithmic: O(1) vs O(n) complexity")
-            raise  # Re-raise to fail the test
-
-    def test_stream_cleanup_efficiency(self):
-        """Compare stream cleanup efficiency between old and new approaches"""
-
-        print(f"\n🧹 Starting Stream Cleanup Efficiency Test...")
-
-        # Create a realistic simulation of cleanup operations
-        num_events = 10000
-        print(f"📊 Simulating cleanup with {num_events:,} remaining events")
-
-        # Simulate OLD cleanup approach (iterating through remaining events)
-        print(f"\n🔄 Testing OLD approach (consume remaining events)...")
-        remaining_events = [f"event_{i}" for i in range(num_events)]
-
-        old_start = time.perf_counter()
-        old_consumed = 0
-        for _ in remaining_events:
-            old_consumed += 1  # Simulate processing overhead
-        old_cleanup_time = time.perf_counter() - old_start
-
-        print(f"   Time taken: {old_cleanup_time:.6f}s")
-        print(f"   Events consumed: {old_consumed:,}")
-        print(f"   Rate: {old_consumed / old_cleanup_time:.0f} events/sec")
-
-        # Simulate NEW cleanup approach (just call close)
-        print(f"\n⚡ Testing NEW approach (direct response close)...")
-        mock_response = Mock()
-        mock_response.close = Mock()
-
-        new_start = time.perf_counter()
-        # Simulate the actual new cleanup logic
-        mock_response.close()
-        new_consumed = 0  # No events consumed
-        new_cleanup_time = time.perf_counter() - new_start
-
-        print(f"   Time taken: {new_cleanup_time:.6f}s")
-        print(f"   Events consumed: {new_consumed}")
-        print(f"   Method called: response.close()")
-
-        # Print comparison results
-        print(f"\n📊 Stream Cleanup Efficiency Comparison:")
-        print(f"   OLD approach: {old_cleanup_time:.6f}s (consumed {old_consumed:,} events)")
-        print(f"   NEW approach: {new_cleanup_time:.6f}s (consumed {new_consumed} events)")
-        print(f"   📈 Event processing difference: {old_consumed - new_consumed:,} events saved")
-        print(f"   ✅ NEW approach avoids processing {num_events:,} unnecessary events")
-        print(f"   🎯 Key improvement: Eliminated {old_consumed:,} unnecessary event iterations")
-
-        # Conceptual analysis
-        if old_cleanup_time > new_cleanup_time:
-            time_saved = old_cleanup_time - new_cleanup_time
-            print(f"   ⚡ Time saved: {time_saved:.6f}s ({time_saved / old_cleanup_time * 100:.1f}% faster)")
-        else:
-            print(f"   ℹ️  Raw timing may vary due to micro-benchmark noise")
-            print(f"   💡 The key benefit is avoiding unnecessary work, not raw speed")
-
-        # Now do assertions - focus on logical improvements
-        try:
-            assert new_consumed == 0, "New approach should not consume any events"
-            print(f"   ✅ New approach consumes 0 events - PASSED")
-
-            assert old_consumed == num_events, "Old approach should consume all remaining events"
-            print(f"   ✅ Old approach consumes all {num_events:,} events - PASSED")
-
-            mock_response.close.assert_called_once()
-            print(f"   ✅ Response.close() was called - PASSED")
-
-            print(f"   🏆 STREAM CLEANUP TEST PASSED")
-            print(f"   📋 Summary: NEW approach eliminates {old_consumed:,} unnecessary operations")
-
-        except AssertionError as e:
-            print(f"   ❌ Assertion failed: {e}")
-            raise
-
-    @patch("anthropic._streaming.SSEBytesDecoder")
-    def test_full_stream_processing_comparison(self, _mock_decoder_class):
-        """Compare complete stream processing between old and new implementations"""
-
-        # Note: _mock_decoder_class is used implicitly by the @patch decorator
-        # to replace the SSEBytesDecoder class in the streaming module
-
-        # Create realistic event sequence
-        events = []
-        for i in range(5000):  # Larger dataset for more stable timing
-            event_type = list(MESSAGE_EVENTS)[i % len(MESSAGE_EVENTS)]
-            events.append({"event": event_type, "data": json.dumps({"index": i, "text": f"content_{i}"})})
-            # Add ping events
-            if i % 50 == 0:
-                events.append({"event": "ping", "data": ""})
-
-        class MockDecoder:
-            def __init__(self, events):
-                self.events = events
-                self.close = Mock()
-
-            def iter_bytes(self, _iterator):
-                for event_data in self.events:
-                    yield ServerSentEvent(**event_data)
-
-        # Run multiple iterations for stable results
-        old_times = []
-        new_times = []
-
-        for iteration in range(3):
-            # Test with OLD-style event checking
-            old_decoder = MockDecoder(events)
-            old_start = time.perf_counter()
-            old_processed = 0
-            for sse in old_decoder.iter_bytes([]):
-                if OldImplementationSimulator.old_event_type_check(sse.event):
-                    old_processed += 1
-                elif sse.event == "ping":
-                    continue
-            old_time = time.perf_counter() - old_start
-            old_times.append(old_time)
-
-            # Test with NEW-style event checking
-            new_decoder = MockDecoder(events)
-            new_start = time.perf_counter()
-            new_processed = 0
-            for sse in new_decoder.iter_bytes([]):
-                if NewImplementationSimulator.new_event_type_check(sse.event):
-                    new_processed += 1
-                elif sse.event == "ping":
-                    continue
-            new_time = time.perf_counter() - new_start
-            new_times.append(new_time)
-
-            # Verify same results on first iteration
-            if iteration == 0:
-                assert old_processed == new_processed, "Different number of events processed!"
-
-        # Use median for stable comparison
-        old_median = statistics.median(old_times)
-        new_median = statistics.median(new_times)
-
-        speedup = old_median / new_median if new_median > 0 else float("inf")
-        old_rate = old_processed / old_median if old_median > 0 else float("inf")
-        new_rate = new_processed / new_median if new_median > 0 else float("inf")
-
-        print(f"\n🏁 Full Stream Processing Comparison:")
-        print(f"   Events processed: {old_processed:,}")
-        print(f"   OLD processing: {old_median:.4f}s ({old_rate:.0f} events/sec)")
-        print(f"   NEW processing: {new_median:.4f}s ({new_rate:.0f} events/sec)")
-        print(f"   🚀 Overall speedup: {speedup:.2f}x")
-
-        # More realistic assertions - focus on the fact that new isn't worse
-        # and shows the conceptual improvement
-        print(f"\n📊 Analysis:")
-        if speedup >= 1.0:
-            print(f"   ✅ New implementation is {speedup:.2f}x faster")
-        else:
-            print(f"   ⚠️  New implementation is {1 / speedup:.2f}x slower in this micro-benchmark")
-            print(f"   ℹ️  Micro-benchmark noise is normal - the key benefit is O(1) vs O(n) scaling")
-
-        # The real benefit is algorithmic - O(1) vs O(n) scaling
-        # For small datasets, the difference may not be visible due to constant factors
-        print(f"   🎯 Key improvement: O(1) lookup scales better than O(n) chain for large volumes")
-
-        # Don't assert on micro-timing, just verify correctness
-        assert old_processed == new_processed, "Both implementations should process same number of events"
-        assert old_processed > 0, "Should have processed some events"
-
-    def test_memory_usage_comparison(self):
-        """Compare memory usage patterns between old and new implementations"""
-        import sys
-
-        # OLD approach: Multiple string comparisons create temporary objects
-        old_memory_usage = []
-        for _ in range(1000):
-            # Simulate old-style string concatenation for comparisons
-            event_check_strings = [
-                "message_start",
-                "message_delta",
-                "message_stop",
-                "content_block_start",
-                "content_block_delta",
-                "content_block_stop",
-            ]
-            old_memory_usage.extend(event_check_strings)
-
-        old_memory = sys.getsizeof(old_memory_usage)
-
-        # NEW approach: Single frozenset reference
-        new_memory_usage = []
-        for _ in range(1000):
-            # Just reference the shared frozenset
-            new_memory_usage.append(MESSAGE_EVENTS)
-
-        new_memory = sys.getsizeof(new_memory_usage)
-
-        memory_savings = ((old_memory - new_memory) / old_memory * 100) if old_memory > 0 else 0
-
-        print(f"\n💾 Memory Usage Comparison (1000 operations):")
-        print(f"   OLD approach: {old_memory:,} bytes")
-        print(f"   NEW approach: {new_memory:,} bytes")
-        print(f"   💰 Memory savings: {memory_savings:.1f}%")
-
-        # New approach should use less memory
-        assert new_memory <= old_memory
-
-    def test_correctness_verification(self):
-        """Verify that old and new implementations produce identical results"""
-
-        # Test all possible event types
-        all_event_types = [
-            "message_start",
-            "message_delta",
-            "message_stop",
-            "content_block_start",
-            "content_block_delta",
-            "content_block_stop",
-            "ping",
-            "error",
-            "completion",
-            "unknown_event",
-            "future_event",
-        ]
-
-        print(f"\n✅ Correctness Verification:")
-        for event_type in all_event_types:
-            old_result = OldImplementationSimulator.old_event_type_check(event_type)
-            new_result = NewImplementationSimulator.new_event_type_check(event_type)
-
-            print(f"   {event_type:20} -> OLD: {old_result}, NEW: {new_result}")
-            assert old_result == new_result, f"Mismatch for {event_type}: old={old_result}, new={new_result}"
-
-        print("   🎉 All event types produce identical results!")
-
-
-class TestRegressionPrevention:
-    """Ensure new implementation doesn't break existing functionality"""
-
-    @patch("anthropic._streaming.SSEBytesDecoder")
-    def test_no_functional_regressions(self, _mock_decoder_class):
-        """Ensure all existing functionality still works with optimizations"""
-
-        # Test comprehensive event sequence
-        events = [
-            {"event": "message_start", "data": '{"type": "message", "id": "test"}'},
-            {"event": "content_block_start", "data": '{"index": 0}'},
-            {"event": "content_block_delta", "data": '{"text": "Hello"}'},
-            {"event": "ping", "data": ""},  # Should be ignored
-            {"event": "content_block_delta", "data": '{"text": " world"}'},
-            {"event": "unknown_event", "data": '{"test": true}'},  # Should be ignored
-            {"event": "content_block_stop", "data": '{"index": 0}'},
-            {"event": "message_stop", "data": '{"type": "message_stop"}'},
-        ]
-
-        class MockDecoder:
-            def __init__(self, events):
-                self.events = events
-                self.close = Mock()
-
-            def iter_bytes(self, _iterator):
-                for event_data in self.events:
-                    yield ServerSentEvent(**event_data)
-
-        mock_decoder = MockDecoder(events)
-        _mock_decoder_class.return_value = mock_decoder
-
-        mock_response = Mock()
-        mock_response.iter_bytes.return_value = iter([b"mock"])
-
-        mock_client = Mock()
-        mock_client._make_sse_decoder.return_value = mock_decoder
-
-        # Fix: Accept all arguments (both positional and keyword)
-        def mock_process_response_data(*args, **kwargs):
-            # Return the first argument which should be 'data'
-            if args:
-                return args[0]
-            return kwargs.get("data", {})
-
-        mock_client._process_response_data.side_effect = mock_process_response_data
-
-        stream = Stream(cast_to=dict, response=mock_response, client=mock_client)
-        processed = list(stream)
-
-        # Verify expected events were processed (excluding ping and unknown_event)
-        expected_events = [e for e in events if e["event"] in MESSAGE_EVENTS]
-        assert len(processed) == len(expected_events)
-
-        # Verify type field addition
-        for event in processed:
-            if isinstance(event, dict):
-                assert "type" in event
-
-        # Verify cleanup - response.close() should be called (not decoder.close())
-        mock_response.close.assert_called_once()
-
-        print(f"\n🔒 Regression Test Results:")
-        print(f"   Total events: {len(events)}")
-        print(f"   Processed events: {len(processed)}")
-        print(f"   Ignored events: {len(events) - len(processed)}")
-        print(f"   ✅ No regressions detected!")
-
-
-if __name__ == "__main__":
-    # Can be run directly or with pytest
-    print("🔥 Streaming Performance Analysis")
-    print("This script tests your optimizations with both real API calls and mocks!")
-    print("\nUsage:")
-    print("  Direct: python test_streaming_performance.py")
-    print("  Pytest: pytest test_streaming_performance.py -v -s -n 0")
-    print("  Real API: ANTHROPIC_API_KEY=sk-ant-... python test_streaming_performance.py")
-
-    api_key = os.getenv("ANTHROPIC_API_KEY")
-    if api_key:
-        print(f"✅ API key detected (sk-ant-...{api_key[-4:]}) - will use real API calls")
-    else:
-        print("⚠️  No API key - will use mock testing")
-        print("💡 Set ANTHROPIC_API_KEY environment variable for real API testing")
-
-    print("\n" + "=" * 60)
-
-    # Run standalone tests (works without pytest)
-    compare_event_lookup_methods()
-
-    standalone_test_real_streaming()
-
-    asyncio.run(standalone_test_async())
-
-    print("\n✅ All tests completed!")
-    print("💡 Run with pytest for more detailed fixture-based testing")