Skip to content

Commit b558535

Browse files
authored
fix(hybridcloud) Deliver payloads concurrently (#66870)
We're hitting the ceiling of how many messages we can deliver concurrently with single-threaded workers. Because most of our time is spent in IO, we can get more throughput by sacrificing strong ordering and delivering messages in small concurrent batches. Should a batch contain a delivery that needs to be retried, progress will be stopped. This means we could deliver n-1 messages out of order if we hit an intermittent error. Initially I only want to use parallel delivery for large mailboxes, as they would naturally not have strong ordering due to the amount of concurrency they create. Both the usage of threaded IO and the thread pool size can be controlled by options.
1 parent 0ceac29 commit b558535

File tree

3 files changed

+416
-17
lines changed

3 files changed

+416
-17
lines changed

src/sentry/hybridcloud/tasks/deliver_webhooks.py

Lines changed: 122 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import logging
3+
from concurrent.futures import ThreadPoolExecutor, as_completed
34

45
import sentry_sdk
56
from django.db.models import Min, Subquery
@@ -8,6 +9,7 @@
89
from requests.models import HTTPError
910
from rest_framework import status
1011

12+
from sentry import options
1113
from sentry.exceptions import RestrictedIPAddress
1214
from sentry.hybridcloud.models.webhookpayload import BACKOFF_INTERVAL, MAX_ATTEMPTS, WebhookPayload
1315
from sentry.shared_integrations.exceptions import (
@@ -84,6 +86,7 @@ def schedule_webhook_delivery(**kwargs) -> None:
8486
metrics.distribution(
8587
"hybridcloud.schedule_webhook_delivery.mailbox_count", scheduled_mailboxes.count()
8688
)
89+
use_parallel = options.get("hybridcloud.webhookpayload.use_parallel")
8790
for record in scheduled_mailboxes[:BATCH_SIZE]:
8891
# Reschedule the records that we will attempt to deliver next.
8992
# We update schedule_for in an attempt to minimize races for potentially in-flight batches.
@@ -92,11 +95,14 @@ def schedule_webhook_delivery(**kwargs) -> None:
9295
.order_by("id")
9396
.values("id")[:MAX_MAILBOX_DRAIN]
9497
)
95-
WebhookPayload.objects.filter(id__in=Subquery(mailbox_batch)).update(
98+
updated_count = WebhookPayload.objects.filter(id__in=Subquery(mailbox_batch)).update(
9699
schedule_for=timezone.now() + BATCH_SCHEDULE_OFFSET
97100
)
98-
99-
drain_mailbox.delay(record["id"])
101+
# If we have a full batch we should process in parallel as we're likely behind.
102+
if use_parallel and updated_count >= MAX_MAILBOX_DRAIN:
103+
drain_mailbox_parallel.delay(record["id"])
104+
else:
105+
drain_mailbox.delay(record["id"])
100106

101107

102108
@instrumented_task(
@@ -159,6 +165,119 @@ def drain_mailbox(payload_id: int) -> None:
159165
return
160166

161167

168+
@instrumented_task(
    name="sentry.hybridcloud.tasks.deliver_webhooks.drain_mailbox_parallel",
    queue="webhook.control",
    silo_mode=SiloMode.CONTROL,
)
def drain_mailbox_parallel(payload_id: int) -> None:
    """
    Deliver messages from a mailbox in small parallel batches.

    Parallel delivery sacrifices strict ordering for increased throughput.
    Because of the sequential delivery within a mailbox we can't get higher
    throughput by scheduling whole batches in parallel, so instead messages
    are delivered in blocks of `worker_threads` until a delivery fails or
    the batch delay deadline is reached.

    NOTE(review): the original docstring also claimed delivery stops when a
    record's schedule_for is greater than the current time, but no such guard
    exists in the query below — confirm whether that check should be added.

    :param payload_id: id of the head WebhookPayload of the mailbox to drain.
    """
    try:
        payload = WebhookPayload.objects.get(id=payload_id)
    except WebhookPayload.DoesNotExist:
        # We could have hit a race condition. Since we've lost already return
        # and let the other process continue, or a future process.
        metrics.incr("hybridcloud.deliver_webhooks.delivery", tags={"outcome": "race"})
        logger.info(
            "deliver_webhook_parallel.potential_race",
            extra={
                "id": payload_id,
            },
        )
        return

    worker_threads = options.get("hybridcloud.webhookpayload.worker_threads")
    deadline = timezone.now() + BATCH_SCHEDULE_OFFSET
    request_failed = False

    while True:
        # We have run until the end of our batch schedule delay. Break the
        # loop so this worker can take another task.
        if timezone.now() >= deadline:
            logger.info(
                "deliver_webhook_parallel.delivery_deadline",
                extra={
                    "mailbox_name": payload.mailbox_name,
                },
            )
            metrics.incr(
                "hybridcloud.deliver_webhooks.delivery", tags={"outcome": "delivery_deadline"}
            )
            break

        # Fetch records from the mailbox in worker_threads sized blocks. This
        # avoids reading redundant data should we hit an error and helps keep
        # query duration low.
        #
        # Materialize the block once: the previous implementation issued a
        # separate COUNT(*) query per iteration and then sliced the queryset
        # again, doubling query volume and racing with concurrent consumers
        # between the two queries.
        batch = list(
            WebhookPayload.objects.filter(
                id__gte=payload.id, mailbox_name=payload.mailbox_name
            ).order_by("id")[:worker_threads]
        )

        # No more messages to deliver
        if not batch:
            break

        # Use a threadpool to send requests concurrently
        with ThreadPoolExecutor(max_workers=worker_threads) as threadpool:
            futures = {
                threadpool.submit(deliver_message_parallel, record) for record in batch
            }
            for future in as_completed(futures):
                payload_record, err = future.result()

                if err:
                    # Was this the final attempt? Failing on a final attempt
                    # shouldn't stop deliveries as we won't retry.
                    if payload_record.attempts >= MAX_ATTEMPTS:
                        payload_record.delete()

                        metrics.incr(
                            "hybridcloud.deliver_webhooks.delivery",
                            tags={"outcome": "attempts_exceed"},
                        )
                        logger.info(
                            "deliver_webhook_parallel.discard",
                            extra={"id": payload_record.id, "attempts": payload_record.attempts},
                        )
                    else:
                        metrics.incr(
                            "hybridcloud.deliver_webhooks.delivery", tags={"outcome": "retry"}
                        )
                        payload_record.schedule_next_attempt()
                        request_failed = True
                    # Unexpected errors (not DeliveryFailed) should surface to
                    # the task runner rather than be swallowed.
                    if not isinstance(err, DeliveryFailed):
                        raise err
                else:
                    # Delivery was successful
                    payload_record.delete()
                    metrics.incr("hybridcloud.deliver_webhooks.delivery", tags={"outcome": "ok"})
                    metrics.distribution(
                        "hybridcloud.deliver_webhooks.attempts", payload_record.attempts
                    )

        # If a delivery failed we should stop processing this mailbox and try again later.
        if request_failed:
            return
271+
272+
273+
def deliver_message_parallel(payload: WebhookPayload) -> tuple[WebhookPayload, Exception | None]:
    """
    Attempt delivery of a single payload, capturing any exception raised.

    Returns a (payload, error) pair so callers fanning this out over a
    thread pool can inspect failures per-record instead of having
    as_completed() raise into the consuming loop.
    """
    error: Exception | None = None
    try:
        perform_request(payload)
    except Exception as exc:
        error = exc
    return (payload, error)
279+
280+
162281
def deliver_message(payload: WebhookPayload) -> None:
163282
"""Deliver a message if it still has delivery attempts remaining"""
164283
if payload.attempts >= MAX_ATTEMPTS:

src/sentry/options/defaults.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1619,6 +1619,18 @@
16191619

16201620
# Break glass controls
16211621
register("hybrid_cloud.rpc.disabled-service-methods", default=[], flags=FLAG_AUTOMATOR_MODIFIABLE)
1622+
1623+
# Opt-in switch: when enabled, full mailbox batches are drained with the
# parallel delivery task instead of the sequential one.
register(
    "hybridcloud.webhookpayload.use_parallel",
    default=False,
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)
# Thread pool size used by drain_mailbox_parallel for concurrent webhook
# delivery; also bounds how many records are fetched per block.
register(
    "hybridcloud.webhookpayload.worker_threads",
    default=4,
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)
16221634
# == End hybrid cloud subsystem
16231635

16241636
# Decides whether an incoming transaction triggers an update of the clustering rule applied to it.

0 commit comments

Comments
 (0)