Add integration tests for network retry behavior

haacked · haacked · commit 64adeece6ab3 · 2025-12-16T15:06:42.000-08:00
Add tests that verify actual retry behavior, not just configuration:

- test_retries_on_503_then_succeeds: Spins up a local HTTP server that
  returns 503 twice then 200, verifying 3 requests are made
- test_connection_errors_are_retried: Verifies connection errors trigger
  retries by measuring elapsed time with backoff

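Both integration tests use dynamically allocated ports for CI safety.
The diff also adds TestFlagsSessionNetworkRetries, which checks the configured
retry policy (retry counts, retried status codes, backoff factor).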
diff --git a/posthog/test/test_request.py b/posthog/test/test_request.py
@@ -457,3 +457,210 @@ def test_flags_no_retry_on_quota_limit(self, mock_get_flags_session):
 
         # QuotaLimitError is raised after response is received, not retried
         self.assertEqual(mock_session.post.call_count, 1)
+
+
+class TestFlagsSessionNetworkRetries(unittest.TestCase):
+    """Configuration checks for the retry policy of the flags session."""
+
+    def test_flags_session_retry_config_includes_connection_errors(self):
+        """
+        Verify that the flags session is configured to retry on connection errors.
+
+        The urllib3 Retry policy with connect=2 and read=2 retries network-level
+        failures (DNS failures, connection refused, connection reset, etc.);
+        total=2 caps the combined number of retries at 2.
+        """
+        from posthog.request import _build_flags_session
+
+        session = _build_flags_session()
+
+        # Get the adapter for https://
+        adapter = session.get_adapter("https://test.posthog.com")
+
+        # Verify retry configuration
+        retry = adapter.max_retries
+        self.assertEqual(retry.total, 2, "Should have 2 total retries")
+        self.assertEqual(retry.connect, 2, "Should retry connection errors twice")
+        self.assertEqual(retry.read, 2, "Should retry read errors twice")
+        self.assertIn("POST", retry.allowed_methods, "Should allow POST retries")
+
+    def test_flags_session_retries_on_server_errors(self):
+        """
+        Verify that transient server errors (5xx) trigger retries.
+
+        This tests the status_forcelist configuration which specifies
+        which HTTP status codes should trigger a retry.
+        """
+        from posthog.request import _build_flags_session, RETRY_STATUS_FORCELIST
+
+        session = _build_flags_session()
+        adapter = session.get_adapter("https://test.posthog.com")
+        retry = adapter.max_retries
+
+        # Verify the status codes that trigger retries
+        self.assertEqual(
+            set(retry.status_forcelist),
+            set(RETRY_STATUS_FORCELIST),
+            "Should retry on transient server errors",
+        )
+
+        # Verify specific codes are included
+        self.assertIn(500, retry.status_forcelist)
+        self.assertIn(502, retry.status_forcelist)
+        self.assertIn(503, retry.status_forcelist)
+        self.assertIn(504, retry.status_forcelist)
+
+        # Verify rate limits and quota errors are NOT retried
+        self.assertNotIn(429, retry.status_forcelist)
+        self.assertNotIn(402, retry.status_forcelist)
+
+    def test_flags_session_has_backoff(self):
+        """
+        Verify that the flags session delays retries with a 0.5 backoff factor.
+
+        urllib3 sends the first retry immediately and waits
+        backoff_factor * 2 ** (n - 1) seconds before the n-th one, so the two
+        retries wait 0s and 1s.
+        """
+        from posthog.request import _build_flags_session
+
+        session = _build_flags_session()
+        retry = session.get_adapter("https://test.posthog.com").max_retries
+        self.assertEqual(
+            retry.backoff_factor, 0.5, "Should use 0.5s backoff factor (0s, 1s delays)"
+        )
+
+
+class TestFlagsSessionRetryIntegration(unittest.TestCase):
+    """Integration tests that verify actual retry behavior with a local server."""
+
+    def test_retries_on_503_then_succeeds(self):
+        """
+        Verify that 503 errors trigger retries and eventually succeed.
+
+        Uses a local HTTP server that fails twice with 503, then succeeds.
+        This exercises the real retry loop end to end; timing is not asserted.
+        """
+        import threading
+        from http.server import HTTPServer, BaseHTTPRequestHandler
+        from socketserver import ThreadingMixIn
+        from urllib3.util.retry import Retry
+        from posthog.request import HTTPAdapterWithSocketOptions, RETRY_STATUS_FORCELIST
+
+        request_count = 0
+
+        class RetryTestHandler(BaseHTTPRequestHandler):
+            protocol_version = "HTTP/1.1"
+
+            def do_POST(self):
+                nonlocal request_count
+                request_count += 1
+
+                # Read and discard request body to prevent connection issues
+                content_length = int(self.headers.get("Content-Length", 0))
+                if content_length > 0:
+                    self.rfile.read(content_length)
+
+                if request_count <= 2:
+                    self.send_response(503)
+                    self.send_header("Content-Type", "application/json")
+                    body = b'{"error": "Service unavailable"}'
+                    self.send_header("Content-Length", str(len(body)))
+                    self.end_headers()
+                    self.wfile.write(body)
+                else:
+                    self.send_response(200)
+                    self.send_header("Content-Type", "application/json")
+                    body = (
+                        b'{"featureFlags": {"test": true}, "featureFlagPayloads": {}}'
+                    )
+                    self.send_header("Content-Length", str(len(body)))
+                    self.end_headers()
+                    self.wfile.write(body)
+
+            def log_message(self, format, *args):
+                pass  # Suppress logging
+
+        # Use ThreadingMixIn for cleaner shutdown
+        class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
+            daemon_threads = True
+
+        # Start server on a random available port
+        server = ThreadedHTTPServer(("127.0.0.1", 0), RetryTestHandler)
+        port = server.server_address[1]
+        server_thread = threading.Thread(target=server.serve_forever)
+        server_thread.daemon = True
+        server_thread.start()
+
+        try:
+            # Build session with same retry config as _build_flags_session
+            # but mounted on http:// for local testing
+            adapter = HTTPAdapterWithSocketOptions(
+                max_retries=Retry(
+                    total=2,
+                    connect=2,
+                    read=2,
+                    backoff_factor=0.01,  # Fast backoff for testing
+                    status_forcelist=RETRY_STATUS_FORCELIST,
+                    allowed_methods=["POST"],
+                ),
+            )
+            with requests.Session() as session:
+                session.mount("http://", adapter)
+
+                response = session.post(
+                    f"http://127.0.0.1:{port}/flags/?v=2",
+                    json={"distinct_id": "user123"},
+                    timeout=5,
+                )
+
+                # Should succeed on 3rd attempt
+                self.assertEqual(response.status_code, 200)
+                self.assertEqual(request_count, 3)  # 1 initial + 2 retries
+        finally:
+            server.shutdown()
+            server.server_close()
+
+    def test_connection_errors_are_retried(self):
+        """
+        Verify that connection errors (no server) trigger retries.
+
+        Binds a socket to find a free port, then closes it so nothing is
+        listening and connection attempts fail with ConnectionError.
+        """
+        import socket
+        import time
+        from urllib3.util.retry import Retry
+        from posthog.request import HTTPAdapterWithSocketOptions, RETRY_STATUS_FORCELIST
+
+        # Let the OS pick a free port; leaving the block closes the socket even
+        # if bind() fails, so the port is free but nothing is listening on it.
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.bind(("127.0.0.1", 0))
+            port = sock.getsockname()[1]
+
+        adapter = HTTPAdapterWithSocketOptions(
+            max_retries=Retry(
+                total=2,
+                connect=2,
+                read=2,
+                backoff_factor=0.05,  # Very fast for testing
+                status_forcelist=RETRY_STATUS_FORCELIST,
+                allowed_methods=["POST"],
+            ),
+        )
+        with requests.Session() as session:
+            session.mount("http://", adapter)
+
+            start = time.monotonic()  # monotonic: immune to wall-clock jumps
+            with self.assertRaises(requests.exceptions.ConnectionError):
+                session.post(
+                    f"http://127.0.0.1:{port}/flags/?v=2",
+                    json={"distinct_id": "user123"},
+                    timeout=1,
+                )
+            elapsed = time.monotonic() - start
+
+        # Refused connections normally fail at once, and urllib3 sleeps only before
+        # the second retry (0.05 * 2 = 0.1s), so > 0.05s implies the backoff ran.
+        self.assertGreater(elapsed, 0.05, "Should have some delay from retries")