Commit 7c1b921

chore(kafka): use core api for data streams monitoring [backport 2.0] (#7163)
Backport 5480ca8 from #6890 to 2.0.

This commit swaps out all remaining DSM code in the kafka integration to use the core API. It also adds two tests and fixes the commit wrapper, both of which were required to make the transition safely.

## Checklist

- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc.).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)).

## Reviewer Checklist

- [x] Title is accurate.
- [x] No unnecessary changes are introduced.
- [x] Description motivates each change.
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [x] Testing strategy adequately addresses listed risk(s).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] Release note makes sense to a user of the library.
- [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting).
- [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`.
- [x] This PR doesn't touch any of that.

Co-authored-by: Teague Bick <[email protected]>
1 parent ad31d75 commit 7c1b921
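
At a high level, the change replaces DSM-specific branches inside the integration with named events on the core API: the integration always dispatches, and the data streams module subscribes only when it is enabled. A condensed sketch of that pattern, simplified from the diffs below (tracing details and the consume/commit events omitted):

```python
# Condensed sketch of the pattern adopted by this commit; simplified from the
# diffs below, not a drop-in copy of either module.
from ddtrace import config
from ddtrace.internal import core


def traced_produce(instance, args, kwargs):
    # The kafka integration always announces the event...
    core.dispatch("kafka.produce.start", [instance, args, kwargs])
    # ...and then carries on with its normal tracing work.


def dsm_kafka_message_produce(instance, args, kwargs):
    # ...while the data streams module does its bookkeeping here
    # (pathway injection, delivery-callback wrapping, produce tracking).
    pass


if config._data_streams_enabled:
    # Handlers are only registered when DSM is on, so the dispatch above is
    # effectively a no-op for everyone else.
    core.on("kafka.produce.start", dsm_kafka_message_produce)
```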

10 files changed: +283 additions, -74 deletions


ddtrace/contrib/kafka/patch.py

Lines changed: 5 additions & 57 deletions
@@ -1,7 +1,4 @@
-import time
-
 import confluent_kafka
-from confluent_kafka import TopicPartition
 
 from ddtrace import config
 from ddtrace.constants import ANALYTICS_SAMPLE_RATE_KEY
@@ -20,7 +17,6 @@
 from ddtrace.internal.schema.span_attribute_schema import SpanDirection
 from ddtrace.internal.utils import ArgumentError
 from ddtrace.internal.utils import get_argument_value
-from ddtrace.internal.utils import set_argument_value
 from ddtrace.internal.utils.formats import asbool
 from ddtrace.pin import Pin
 
@@ -121,37 +117,7 @@ def traced_produce(func, instance, args, kwargs):
     value = None
     message_key = kwargs.get("key", "")
     partition = kwargs.get("partition", -1)
-    if config._data_streams_enabled:
-        # inject data streams context
-        core.dispatch("kafka.produce.start", [instance, args, kwargs])
-
-    on_delivery_kwarg = "on_delivery"
-    on_delivery_arg = 5
-    on_delivery = None
-    try:
-        on_delivery = get_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg)
-    except ArgumentError:
-        on_delivery_kwarg = "callback"
-        on_delivery_arg = 4
-        try:
-            on_delivery = get_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg)
-        except ArgumentError:
-            on_delivery = None
-
-    def wrapped_callback(err, msg):
-        if err is None:
-            if pin.tracer.data_streams_processor:
-                pin.tracer.data_streams_processor.track_kafka_produce(
-                    msg.topic(), msg.partition(), msg.offset() or -1, time.time()
-                )
-        if on_delivery is not None:
-            on_delivery(err, msg)
-
-    try:
-        args, kwargs = set_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg, wrapped_callback)
-    except ArgumentError:
-        # we set the callback even if it's not set by the client, to track produce calls correctly.
-        kwargs[on_delivery_kwarg] = wrapped_callback
+    core.dispatch("kafka.produce.start", [instance, args, kwargs])
 
     with pin.tracer.trace(
         schematize_messaging_operation(kafkax.PRODUCE, provider="kafka", direction=SpanDirection.OUTBOUND),
@@ -191,15 +157,8 @@ def traced_poll(func, instance, args, kwargs):
         span.set_tag_str(kafkax.RECEIVED_MESSAGE, str(message is not None))
         span.set_tag_str(kafkax.GROUP_ID, instance._group_id)
         if message is not None:
-            if config._data_streams_enabled:
-                core.set_item("kafka_topic", message.topic())
-                core.dispatch("kafka.consume.start", [instance, message])
-                if instance._auto_commit:
-                    # it's not exactly true, but if auto commit is enabled, we consider that a message is acknowledged
-                    # when it's read.
-                    pin.tracer.data_streams_processor.track_kafka_commit(
-                        instance._group_id, message.topic(), message.partition(), message.offset() or -1, time.time()
-                    )
+            core.set_item("kafka_topic", message.topic())
+            core.dispatch("kafka.consume.start", [instance, message])
 
             message_key = message.key() or ""
             message_offset = message.offset() or -1
@@ -220,17 +179,6 @@ def traced_commit(func, instance, args, kwargs):
     if not pin or not pin.enabled():
        return func(*args, **kwargs)
 
-    if config._data_streams_enabled:
-        message = get_argument_value(args, kwargs, 0, "message", True)
-        # message and offset are mutually exclusive. Only one parameter can be passed.
-        if message is not None:
-            offsets = [TopicPartition(message.topic(), message.partition(), offset=message.offset())]
-        else:
-            offsets = get_argument_value(args, kwargs, 1, "offsets", True)
-
-        if offsets:
-            for offset in offsets:
-                pin.tracer.data_streams_processor.track_kafka_commit(
-                    instance._group_id, offset.topic, offset.partition, offset.offset or -1, time.time()
-                )
+    core.dispatch("kafka.commit.start", [instance, args, kwargs])
+
     return func(*args, **kwargs)
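
One behavioural detail worth calling out: the delivery-callback handling that used to live here moves into the DSM module (ddtrace/internal/datastreams/kafka.py below), and it wraps rather than replaces any on_delivery/callback the application passed in. A self-contained sketch of that idiom, with invented names:

```python
# Self-contained sketch (invented names) of the callback-wrapping idiom used by
# the DSM produce handler below, so user-supplied delivery callbacks keep firing.
def wrap_delivery_callback(on_delivery, record_produce):
    def wrapped_callback(err, msg):
        if err is None:
            record_produce(msg)      # DSM bookkeeping runs first on success...
        if on_delivery is not None:
            on_delivery(err, msg)    # ...then the user's callback still runs.

    return wrapped_callback


# Tiny demonstration with stand-in values:
events = []
cb = wrap_delivery_callback(
    on_delivery=lambda err, msg: events.append(("user", msg)),
    record_produce=lambda msg: events.append(("dsm", msg)),
)
cb(None, "fake-message")
assert events == [("dsm", "fake-message"), ("user", "fake-message")]
```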
ddtrace/internal/datastreams/__init__.py

Lines changed: 14 additions & 5 deletions
@@ -1,14 +1,23 @@
-from . import kafka  # noqa:F401
+from ddtrace import config
+from ddtrace.internal import agent
 
+from ...internal.utils.importlib import require_modules
 
+
+required_modules = ["confluent_kafka"]
 _processor = None
 
+if config._data_streams_enabled:
+    with require_modules(required_modules) as missing_modules:
+        if not missing_modules:
+            from . import kafka  # noqa:F401
 
-def data_streams_processor():
-    from . import processor
 
+def data_streams_processor():
     global _processor
-    if not _processor:
-        _processor = processor.DataStreamsProcessor()
+    if config._data_streams_enabled and not _processor:
+        from . import processor
+
+        _processor = processor.DataStreamsProcessor(agent.get_trace_url())
 
     return _processor
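
A short usage sketch of the accessor above: the processor is built lazily, shared across calls, and pointed at the agent URL; when data streams monitoring is disabled it is never constructed, so the accessor returns None.

```python
# Usage sketch based on the accessor above (requires ddtrace to be installed).
from ddtrace import config
from ddtrace.internal.datastreams import data_streams_processor

if config._data_streams_enabled:
    processor = data_streams_processor()          # first call builds DataStreamsProcessor(agent.get_trace_url())
    assert processor is data_streams_processor()  # later calls return the same instance
else:
    assert data_streams_processor() is None       # nothing is constructed when DSM is off
```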

ddtrace/internal/datastreams/kafka.py

Lines changed: 65 additions & 0 deletions
@@ -1,6 +1,16 @@
+import time
+
+from confluent_kafka import TopicPartition
+
 from ddtrace import config
 from ddtrace.internal import core
 from ddtrace.internal.datastreams.processor import PROPAGATION_KEY
+from ddtrace.internal.utils import ArgumentError
+from ddtrace.internal.utils import get_argument_value
+from ddtrace.internal.utils import set_argument_value
+
+
+INT_TYPES = (int,)
 
 
 def dsm_kafka_message_produce(instance, args, kwargs):
@@ -13,6 +23,29 @@ def dsm_kafka_message_produce(instance, args, kwargs):
     headers[PROPAGATION_KEY] = encoded_pathway
     kwargs["headers"] = headers
 
+    on_delivery_kwarg = "on_delivery"
+    on_delivery_arg = 5
+    on_delivery = None
+    try:
+        on_delivery = get_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg)
+    except ArgumentError:
+        on_delivery_kwarg = "callback"
+        on_delivery_arg = 4
+        on_delivery = get_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg, optional=True)
+
+    def wrapped_callback(err, msg):
+        if err is None:
+            reported_offset = msg.offset() if isinstance(msg.offset(), INT_TYPES) else -1
+            processor().track_kafka_produce(msg.topic(), msg.partition(), reported_offset, time.time())
+        if on_delivery is not None:
+            on_delivery(err, msg)
+
+    try:
+        args, kwargs = set_argument_value(args, kwargs, on_delivery_arg, on_delivery_kwarg, wrapped_callback)
+    except ArgumentError:
+        # we set the callback even if it's not set by the client, to track produce calls correctly.
+        kwargs[on_delivery_kwarg] = wrapped_callback
+
 
 def dsm_kafka_message_consume(instance, message):
     from . import data_streams_processor as processor
@@ -24,7 +57,39 @@ def dsm_kafka_message_consume(instance, message):
     ctx = processor().decode_pathway(headers.get(PROPAGATION_KEY, None))
     ctx.set_checkpoint(["direction:in", "group:" + group, "topic:" + topic, "type:kafka"])
 
+    if instance._auto_commit:
+        # it's not exactly true, but if auto commit is enabled, we consider that a message is acknowledged
+        # when it's read.
+        reported_offset = message.offset() if isinstance(message.offset(), INT_TYPES) else -1
+        processor().track_kafka_commit(
+            instance._group_id, message.topic(), message.partition(), reported_offset, time.time()
+        )
+
+
+def dsm_kafka_message_commit(instance, args, kwargs):
+    from . import data_streams_processor as processor
+
+    message = get_argument_value(args, kwargs, 0, "message", optional=True)
+
+    offsets = []
+    if message is not None:
+        # We need to add one to message offsets to make them mean the same thing as offsets
+        # passed in by the offsets keyword
+        reported_offset = message.offset() + 1 if isinstance(message.offset(), INT_TYPES) else -1
+        offsets = [TopicPartition(message.topic(), message.partition(), reported_offset)]
+    else:
+        offsets = get_argument_value(args, kwargs, 1, "offsets", True) or []
+
+    for offset in offsets:
+        # When offsets is passed in as an arg, its an exact value for the next expected message.
+        # When message is passed in Kafka reports msg.offset() + 1. We add +1 above to message
+        # offsets to make them mean the same thing as passed in offsets, then subtract 1 universally
+        # here from both
+        reported_offset = offset.offset - 1 if isinstance(offset.offset, INT_TYPES) else -1
+        processor().track_kafka_commit(instance._group_id, offset.topic, offset.partition, reported_offset, time.time())
+
 
 if config._data_streams_enabled:
     core.on("kafka.produce.start", dsm_kafka_message_produce)
     core.on("kafka.consume.start", dsm_kafka_message_consume)
+    core.on("kafka.commit.start", dsm_kafka_message_commit)
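
The +1/-1 arithmetic in dsm_kafka_message_commit is easiest to follow with concrete numbers: commit(message=msg) passes the offset of the record just consumed, while commit(offsets=[...]) passes the next offset the consumer expects, so the handler first converts message offsets to the next-offset convention and then subtracts one from both paths before reporting. A minimal, self-contained sketch with invented offsets:

```python
# Minimal sketch (invented offsets) of the normalization performed above so that
# both commit styles report the same offset to track_kafka_commit().
def offset_from_message(message_offset):
    # commit(message=msg): Kafka stores msg.offset() + 1, so convert to the
    # "next expected offset" convention used by the offsets= parameter.
    return message_offset + 1


def offset_reported_to_dsm(next_expected_offset):
    # Both paths then subtract one before calling track_kafka_commit().
    return next_expected_offset - 1


# Committing the record at offset 41 and committing "next offset 42" are equivalent:
assert offset_reported_to_dsm(offset_from_message(41)) == offset_reported_to_dsm(42) == 41
```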

ddtrace/internal/datastreams/processor.py

Lines changed: 15 additions & 0 deletions
@@ -1,6 +1,7 @@
 # coding: utf-8
 import base64
 from collections import defaultdict
+from functools import partial
 import gzip
 import os
 import struct
@@ -20,6 +21,7 @@
 
 import ddtrace
 from ddtrace import config
+from ddtrace.internal.atexit import register_on_exit_signal
 from ddtrace.internal.utils.retry import fibonacci_backoff_with_jitter
 
 from .._encoding import packb
@@ -68,6 +70,7 @@ def gzip_compress(payload):
 
 PROPAGATION_KEY = "dd-pathway-ctx"
 PROPAGATION_KEY_BASE_64 = "dd-pathway-ctx-base64"
+SHUTDOWN_TIMEOUT = 5
 
 """
 PathwayAggrKey uniquely identifies a pathway to aggregate stats on.
@@ -135,6 +138,7 @@ def __init__(self, agent_url, interval=None, timeout=1.0, retry_attempts=3):
             initial_wait=0.618 * self.interval / (1.618 ** retry_attempts) / 2,
         )(self._flush_stats)
 
+        register_on_exit_signal(partial(_atexit, obj=self))
         self.start()
 
     def on_checkpoint_creation(
@@ -442,3 +446,14 @@ def set_checkpoint(self, tags, now_sec=None, edge_start_sec_override=None, pathw
         self.processor.on_checkpoint_creation(
             hash_value, parent_hash, tags, now_sec, edge_latency_sec, pathway_latency_sec
         )
+
+
+def _atexit(obj=None):
+    try:
+        # Data streams tries to flush data on shutdown.
+        # Adding a try except here to ensure we don't crash the application if the agent is killed before
+        # the application for example.
+        obj.shutdown(SHUTDOWN_TIMEOUT)
+    except Exception as e:
+        if config._data_streams_enabled:
+            log.warning("Failed to shutdown data streams processor: %s", repr(e))
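
The register_on_exit_signal(partial(_atexit, obj=self)) call makes the processor flush itself at process exit, taking over the job that the tracer.py change below removes from Tracer.shutdown(). A rough sketch of the same pattern, with the standard library's atexit standing in for ddtrace's internal exit-signal helper:

```python
# Rough sketch of the exit-flush pattern above; the standard library's atexit is
# a stand-in for ddtrace.internal.atexit.register_on_exit_signal.
import atexit
from functools import partial

SHUTDOWN_TIMEOUT = 5


class DemoProcessor:
    def shutdown(self, timeout):
        print(f"flushing buffered DSM stats (timeout={timeout}s)")


def _atexit(obj=None):
    try:
        # Never let a failed flush (e.g. the agent already stopped) crash the app.
        obj.shutdown(SHUTDOWN_TIMEOUT)
    except Exception as e:
        print("Failed to shutdown data streams processor: %r" % (e,))


processor = DemoProcessor()
atexit.register(partial(_atexit, obj=processor))
```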

ddtrace/tracer.py

Lines changed: 0 additions & 8 deletions
@@ -1007,14 +1007,6 @@ def shutdown(self, timeout=None):
         """
         with self._shutdown_lock:
             # Thread safety: Ensures tracer is shutdown synchronously
-            try:
-                # Data streams tries to flush data on shutdown.
-                # Adding a try except here to ensure we don't crash the application if the agent is killed before
-                # the application for example.
-                self.data_streams_processor.shutdown(timeout)
-            except Exception as e:
-                if config._data_streams_enabled:
-                    log.warning("Failed to shutdown data streams processor: %s", repr(e))
             span_processors = self._span_processors
             deferred_processors = self._deferred_processors
             self._span_processors = []

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -154,6 +154,8 @@ exclude-modules = '''
  | ddtrace.appsec._iast._ast.aspects
  | ddtrace.appsec._iast._taint_utils
  | ddtrace.appsec._iast.taint_sinks.sql_injection
+  # DSM specific contribs
+  | ddtrace.internal.datastreams.kafka
 )
 '''

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+fixes:
+  - |
+    DSM: fix off-by-one metric issue and error where statistics weren't calculated when the core API was used.
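
For users of this backport, data streams monitoring remains opt-in. A hedged enablement sketch (DD_DATA_STREAMS_ENABLED is assumed to be the environment toggle behind config._data_streams_enabled and must be set before ddtrace is imported):

```python
# Hedged enablement sketch: DD_DATA_STREAMS_ENABLED is assumed to be the toggle
# behind config._data_streams_enabled; it must be set before ddtrace is imported.
import os

os.environ.setdefault("DD_DATA_STREAMS_ENABLED", "true")

from ddtrace import config  # noqa: E402

print(config._data_streams_enabled)  # expected: True when the variable is honored
```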
