
Commit a8d0b73

feat: kafka trace consume [backport 2.7] (#8757)
Backported because it contains a fix for #8752

Adds tracing and DSM support for https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.Consumer.consume

## Checklist

- [x] Change(s) are motivated and described in the PR description
- [x] Testing strategy is described if automated tests are not included in the PR
- [x] Risks are described (performance impact, potential for breakage, maintainability)
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed or label `changelog/no-changelog` is set
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/))
- [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [x] If this PR changes the public interface, I've notified `@DataDog/apm-tees`.
- [x] If change touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`.

## Reviewer Checklist

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: Munir Abdinur <[email protected]>
1 parent 94ded6f commit a8d0b73
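From a user's point of view, the effect of this change can be pictured with a minimal sketch (the broker address, topic, group id, and payload below are placeholders, not values from this PR): once the kafka integration is patched, `Consumer.consume()` is traced the same way `Consumer.poll()` already was, and each `consume()` call produces a single `kafka.consume` span even when it returns several messages.

```python
# Minimal sketch; broker address, topic, group id, and payload are placeholders.
import confluent_kafka
from ddtrace import patch

patch(kafka=True)  # wraps Producer.produce, Consumer.poll, and now Consumer.consume

producer = confluent_kafka.Producer({"bootstrap.servers": "localhost:9092"})
producer.produce("example-topic", b"payload", key="example-key")
producer.flush()

consumer = confluent_kafka.Consumer(
    {
        "bootstrap.servers": "localhost:9092",
        "group.id": "example-group",
        "auto.offset.reset": "earliest",
    }
)
consumer.subscribe(["example-topic"])

# consume() returns a list of messages; one kafka.consume span is emitted per call,
# with distributed-tracing context extracted from the first message's headers.
messages = consumer.consume(num_messages=10, timeout=5.0)
for message in messages:
    if message.error() is None:
        print(message.value())

consumer.close()
```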

6 files changed: +383 -70 lines changed

ddtrace/contrib/kafka/patch.py

Lines changed: 77 additions & 57 deletions
@@ -1,4 +1,5 @@
 import os
+import sys
 
 import confluent_kafka
 
@@ -115,8 +116,11 @@ def patch():
     for producer in (TracedProducer, TracedSerializingProducer):
         trace_utils.wrap(producer, "produce", traced_produce)
     for consumer in (TracedConsumer, TracedDeserializingConsumer):
-        trace_utils.wrap(consumer, "poll", traced_poll)
+        trace_utils.wrap(consumer, "poll", traced_poll_or_consume)
         trace_utils.wrap(consumer, "commit", traced_commit)
+
+    # Consume is not implemented in deserializing consumers
+    trace_utils.wrap(TracedConsumer, "consume", traced_poll_or_consume)
     Pin().onto(confluent_kafka.Producer)
     Pin().onto(confluent_kafka.Consumer)
     Pin().onto(confluent_kafka.SerializingProducer)
@@ -136,6 +140,10 @@ def unpatch():
         if trace_utils.iswrapped(consumer.commit):
             trace_utils.unwrap(consumer, "commit")
 
+    # Consume is not implemented in deserializing consumers
+    if trace_utils.iswrapped(TracedConsumer.consume):
+        trace_utils.unwrap(TracedConsumer, "consume")
+
     confluent_kafka.Producer = _Producer
     confluent_kafka.Consumer = _Consumer
     if _SerializingProducer is not None:
@@ -194,7 +202,7 @@ def traced_produce(func, instance, args, kwargs):
         return func(*args, **kwargs)
 
 
-def traced_poll(func, instance, args, kwargs):
+def traced_poll_or_consume(func, instance, args, kwargs):
     pin = Pin.get_from(instance)
     if not pin or not pin.enabled():
         return func(*args, **kwargs)
@@ -204,67 +212,79 @@ def traced_poll(func, instance, args, kwargs):
     start_ns = time_ns()
     # wrap in a try catch and raise exception after span is started
     err = None
+    result = None
     try:
-        message = func(*args, **kwargs)
+        result = func(*args, **kwargs)
+        return result
     except Exception as e:
         err = e
+        raise err
+    finally:
+        if isinstance(result, confluent_kafka.Message):
+            # poll returns a single message
+            _instrument_message([result], pin, start_ns, instance, err)
+        elif isinstance(result, list):
+            # consume returns a list of messages,
+            _instrument_message(result, pin, start_ns, instance, err)
+        elif config.kafka.trace_empty_poll_enabled:
+            _instrument_message([None], pin, start_ns, instance, err)
+
+
+def _instrument_message(messages, pin, start_ns, instance, err):
     ctx = None
-    if message is not None and config.kafka.distributed_tracing_enabled and message.headers():
-        ctx = Propagator.extract(dict(message.headers()))
-    if message is not None or config.kafka.trace_empty_poll_enabled:
-        with pin.tracer.start_span(
-            name=schematize_messaging_operation(kafkax.CONSUME, provider="kafka", direction=SpanDirection.PROCESSING),
-            service=trace_utils.ext_service(pin, config.kafka),
-            span_type=SpanTypes.WORKER,
-            child_of=ctx if ctx is not None else pin.tracer.context_provider.active(),
-            activate=True,
-        ) as span:
-            # reset span start time to before function call
-            span.start_ns = start_ns
-
-            span.set_tag_str(MESSAGING_SYSTEM, kafkax.SERVICE)
-            span.set_tag_str(COMPONENT, config.kafka.integration_name)
-            span.set_tag_str(SPAN_KIND, SpanKind.CONSUMER)
-            span.set_tag_str(kafkax.RECEIVED_MESSAGE, str(message is not None))
-            span.set_tag_str(kafkax.GROUP_ID, instance._group_id)
+    # First message is used to extract context and enrich datadog spans
+    # This approach aligns with the opentelemetry confluent kafka semantics
+    first_message = messages[0]
+    if first_message and config.kafka.distributed_tracing_enabled and first_message.headers():
+        ctx = Propagator.extract(dict(first_message.headers()))
+    with pin.tracer.start_span(
+        name=schematize_messaging_operation(kafkax.CONSUME, provider="kafka", direction=SpanDirection.PROCESSING),
+        service=trace_utils.ext_service(pin, config.kafka),
+        span_type=SpanTypes.WORKER,
+        child_of=ctx if ctx is not None else pin.tracer.context_provider.active(),
+        activate=True,
+    ) as span:
+        # reset span start time to before function call
+        span.start_ns = start_ns
+
+        for message in messages:
             if message is not None:
-                core.set_item("kafka_topic", message.topic())
-                core.dispatch("kafka.consume.start", (instance, message, span))
-
-                message_key = message.key() or ""
-                message_offset = message.offset() or -1
-                span.set_tag_str(kafkax.TOPIC, message.topic())
-
-                # If this is a deserializing consumer, do not set the key as a tag since we
-                # do not have the serialization function
-                if (
-                    (_DeserializingConsumer is not None and not isinstance(instance, _DeserializingConsumer))
-                    or isinstance(message_key, str)
-                    or isinstance(message_key, bytes)
-                ):
-                    span.set_tag_str(kafkax.MESSAGE_KEY, message_key)
-                span.set_tag(kafkax.PARTITION, message.partition())
-                is_tombstone = False
-                try:
-                    is_tombstone = len(message) == 0
-                except TypeError:  # https://github.com/confluentinc/confluent-kafka-python/issues/1192
-                    pass
-                span.set_tag_str(kafkax.TOMBSTONE, str(is_tombstone))
-                span.set_tag(kafkax.MESSAGE_OFFSET, message_offset)
-            span.set_tag(SPAN_MEASURED_KEY)
-            rate = config.kafka.get_analytics_sample_rate()
-            if rate is not None:
-                span.set_tag(ANALYTICS_SAMPLE_RATE_KEY, rate)
-
-            # raise exception if one was encountered
-            if err is not None:
-                raise err
-            return message
-    else:
+                core.set_item("kafka_topic", first_message.topic())
+                core.dispatch("kafka.consume.start", (instance, first_message, span))
+
+        span.set_tag_str(MESSAGING_SYSTEM, kafkax.SERVICE)
+        span.set_tag_str(COMPONENT, config.kafka.integration_name)
+        span.set_tag_str(SPAN_KIND, SpanKind.CONSUMER)
+        span.set_tag_str(kafkax.RECEIVED_MESSAGE, str(first_message is not None))
+        span.set_tag_str(kafkax.GROUP_ID, instance._group_id)
+        if messages[0] is not None:
+            message_key = messages[0].key() or ""
+            message_offset = messages[0].offset() or -1
+            span.set_tag_str(kafkax.TOPIC, messages[0].topic())
+
+            # If this is a deserializing consumer, do not set the key as a tag since we
+            # do not have the serialization function
+            if (
+                (_DeserializingConsumer is not None and not isinstance(instance, _DeserializingConsumer))
+                or isinstance(message_key, str)
+                or isinstance(message_key, bytes)
+            ):
+                span.set_tag_str(kafkax.MESSAGE_KEY, message_key)
+            span.set_tag(kafkax.PARTITION, messages[0].partition())
+            is_tombstone = False
+            try:
+                is_tombstone = len(messages[0]) == 0
+            except TypeError:  # https://github.com/confluentinc/confluent-kafka-python/issues/1192
+                pass
            span.set_tag_str(kafkax.TOMBSTONE, str(is_tombstone))
+            span.set_tag(kafkax.MESSAGE_OFFSET, message_offset)
+            span.set_tag(SPAN_MEASURED_KEY)
+            rate = config.kafka.get_analytics_sample_rate()
+            if rate is not None:
+                span.set_tag(ANALYTICS_SAMPLE_RATE_KEY, rate)
+
         if err is not None:
-            raise err
-        else:
-            return message
+            span.set_exc_info(*sys.exc_info())
 
 
 def traced_commit(func, instance, args, kwargs):
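The renamed `traced_poll_or_consume` wrapper has to branch on the return type because the two client calls differ in shape. A short illustration of those shapes, assuming a consumer already subscribed to a topic (broker address, topic, and group id are placeholders):

```python
# Illustration of the two return shapes the wrapper normalizes;
# broker address, topic, and group id are placeholders.
import confluent_kafka

consumer = confluent_kafka.Consumer(
    {"bootstrap.servers": "localhost:9092", "group.id": "example-group"}
)
consumer.subscribe(["example-topic"])

# poll() returns a single Message, or None on an empty poll; the wrapper
# forwards it as a one-element list, or as [None] when empty polls are
# traced (config.kafka.trace_empty_poll_enabled).
single_message = consumer.poll(timeout=1.0)

# consume() returns a list of Message objects (possibly empty); the wrapper
# hands the whole batch to _instrument_message, which starts one span and
# uses the first message for header extraction and span tags.
batch = consumer.consume(num_messages=5, timeout=1.0)

consumer.close()
```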
Lines changed: 4 additions & 0 deletions (new release note file)
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    kafka: Adds tracing and DSM support for ``confluent_kafka.Consumer.consume()``. Previously only ``confluent_kafka.Consumer.poll`` was instrumented.

tests/contrib/kafka/test_kafka.py

Lines changed: 47 additions & 13 deletions
@@ -90,10 +90,16 @@ def dummy_tracer():
 
 
 @pytest.fixture
-def tracer():
+def should_filter_empty_polls():
+    yield True
+
+
+@pytest.fixture
+def tracer(should_filter_empty_polls):
     patch()
     t = Tracer()
-    t.configure(settings={"FILTERS": [KafkaConsumerPollFilter()]})
+    if should_filter_empty_polls:
+        t.configure(settings={"FILTERS": [KafkaConsumerPollFilter()]})
     # disable backoff because it makes these tests less reliable
     t._writer._send_payload_with_backoff = t._writer._send_payload
     try:
@@ -266,6 +272,42 @@ def test_commit(producer, consumer, kafka_topic):
         consumer.commit(message)
 
 
+@pytest.mark.snapshot(ignores=["metrics.kafka.message_offset"])
+def test_commit_with_consume_single_message(producer, consumer, kafka_topic):
+    with override_config("kafka", dict(trace_empty_poll_enabled=False)):
+        producer.produce(kafka_topic, PAYLOAD, key=KEY)
+        producer.flush()
+        # One message is consumed and one span is generated.
+        messages = consumer.consume(num_messages=1)
+        assert len(messages) == 1
+        consumer.commit(messages[0])
+
+
+@pytest.mark.snapshot(ignores=["metrics.kafka.message_offset"])
+def test_commit_with_consume_with_multiple_messages(producer, consumer, kafka_topic):
+    with override_config("kafka", dict(trace_empty_poll_enabled=False)):
+        producer.produce(kafka_topic, PAYLOAD, key=KEY)
+        producer.produce(kafka_topic, PAYLOAD, key=KEY)
+        producer.flush()
+        # Two messages are consumed but only ONE span is generated
+        messages = consumer.consume(num_messages=2)
+        assert len(messages) == 2
+
+
+@pytest.mark.snapshot(ignores=["metrics.kafka.message_offset", "meta.error.stack"])
+@pytest.mark.parametrize("should_filter_empty_polls", [False])
+def test_commit_with_consume_with_error(producer, consumer, kafka_topic):
+    producer.produce(kafka_topic, PAYLOAD, key=KEY)
+    producer.flush()
+    # Raises an exception by consuming messages after the consumer has been closed
+    with pytest.raises(TypeError):
+        # Empty poll spans are filtered out by the KafkaConsumerPollFilter. We need to disable
+        # it to test error spans.
+        # Allowing empty poll spans could introduce flakiness in the test.
+        with override_config("kafka", dict(trace_empty_poll_enabled=True)):
+            consumer.consume(num_messages=1, invalid_args="invalid_args")
+
+
 @pytest.mark.snapshot(ignores=["metrics.kafka.message_offset"])
 def test_commit_with_offset(producer, consumer, kafka_topic):
     with override_config("kafka", dict(trace_empty_poll_enabled=False)):
@@ -415,20 +457,10 @@ def _generate_in_subprocess(random_topic):
     import ddtrace
     from ddtrace.contrib.kafka.patch import patch
     from ddtrace.contrib.kafka.patch import unpatch
-    from ddtrace.filters import TraceFilter
+    from tests.contrib.kafka.test_kafka import KafkaConsumerPollFilter
 
     PAYLOAD = bytes("hueh hueh hueh", encoding="utf-8")
 
-    class KafkaConsumerPollFilter(TraceFilter):
-        def process_trace(self, trace):
-            # Filter out all poll spans that have no received message
-            return (
-                None
-                if trace[0].name in {"kafka.consume", "kafka.process"}
-                and trace[0].get_tag("kafka.received_message") == "False"
-                else trace
-            )
-
     ddtrace.tracer.configure(settings={"FILTERS": [KafkaConsumerPollFilter()]})
     # disable backoff because it makes these tests less reliable
     ddtrace.tracer._writer._send_payload_with_backoff = ddtrace.tracer._writer._send_payload
@@ -733,6 +765,7 @@ def test_tracing_context_is_propagated_when_enabled(ddtrace_run_python_code_in_subprocess):
 from tests.contrib.kafka.test_kafka import kafka_topic
 from tests.contrib.kafka.test_kafka import producer
 from tests.contrib.kafka.test_kafka import tracer
+from tests.contrib.kafka.test_kafka import should_filter_empty_polls
 from tests.utils import DummyTracer
 
 def test(consumer, producer, kafka_topic):
@@ -923,6 +956,7 @@ def test_does_not_trace_empty_poll_when_disabled(ddtrace_run_python_code_in_subprocess):
 from tests.contrib.kafka.test_kafka import kafka_topic
 from tests.contrib.kafka.test_kafka import producer
 from tests.contrib.kafka.test_kafka import tracer
+from tests.contrib.kafka.test_kafka import should_filter_empty_polls
 from tests.utils import DummyTracer
 
 def test(consumer, producer, kafka_topic):
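For reference, the `KafkaConsumerPollFilter` that the subprocess helper now imports is the same filter shown in the removed lines above; it now lives at module level in `tests/contrib/kafka/test_kafka.py`. A self-contained sketch, reconstructed from this diff, of the filter and how the tests register it:

```python
# Reconstructed from the removed lines above; not new behavior in this PR.
import ddtrace
from ddtrace.filters import TraceFilter


class KafkaConsumerPollFilter(TraceFilter):
    def process_trace(self, trace):
        # Filter out all poll spans that have no received message
        return (
            None
            if trace[0].name in {"kafka.consume", "kafka.process"}
            and trace[0].get_tag("kafka.received_message") == "False"
            else trace
        )


# Registering the filter drops empty-poll traces before they are flushed.
ddtrace.tracer.configure(settings={"FILTERS": [KafkaConsumerPollFilter()]})
```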
Lines changed: 74 additions & 0 deletions (new trace snapshot for test_commit_with_consume_single_message)
@@ -0,0 +1,74 @@
1+
[[
2+
{
3+
"name": "kafka.consume",
4+
"service": "kafka",
5+
"resource": "kafka.consume",
6+
"trace_id": 0,
7+
"span_id": 1,
8+
"parent_id": 0,
9+
"type": "worker",
10+
"error": 0,
11+
"meta": {
12+
"_dd.base_service": "",
13+
"_dd.p.dm": "-0",
14+
"_dd.p.tid": "65dcd1fd00000000",
15+
"component": "kafka",
16+
"kafka.group_id": "test_group",
17+
"kafka.message_key": "test_key",
18+
"kafka.received_message": "True",
19+
"kafka.tombstone": "False",
20+
"kafka.topic": "test_commit_with_consume_single_message",
21+
"language": "python",
22+
"messaging.system": "kafka",
23+
"pathway.hash": "7964333589438960939",
24+
"runtime-id": "ff074b2cc3b34b63bbdabbfb5bafd0a4",
25+
"span.kind": "consumer"
26+
},
27+
"metrics": {
28+
"_dd.measured": 1,
29+
"_dd.top_level": 1,
30+
"_dd.tracer_kr": 1.0,
31+
"_sampling_priority_v1": 1,
32+
"kafka.message_offset": -1,
33+
"kafka.partition": 0,
34+
"process_id": 96733
35+
},
36+
"duration": 3198787000,
37+
"start": 1708970490483150000
38+
}],
39+
[
40+
{
41+
"name": "kafka.produce",
42+
"service": "kafka",
43+
"resource": "kafka.produce",
44+
"trace_id": 1,
45+
"span_id": 1,
46+
"parent_id": 0,
47+
"type": "worker",
48+
"error": 0,
49+
"meta": {
50+
"_dd.base_service": "",
51+
"_dd.p.dm": "-0",
52+
"_dd.p.tid": "65dcd1f900000000",
53+
"component": "kafka",
54+
"kafka.message_key": "test_key",
55+
"kafka.tombstone": "False",
56+
"kafka.topic": "test_commit_with_consume_single_message",
57+
"language": "python",
58+
"messaging.kafka.bootstrap.servers": "localhost:29092",
59+
"messaging.system": "kafka",
60+
"pathway.hash": "8904226842384519559",
61+
"runtime-id": "ff074b2cc3b34b63bbdabbfb5bafd0a4",
62+
"span.kind": "producer"
63+
},
64+
"metrics": {
65+
"_dd.measured": 1,
66+
"_dd.top_level": 1,
67+
"_dd.tracer_kr": 1.0,
68+
"_sampling_priority_v1": 1,
69+
"kafka.partition": -1,
70+
"process_id": 96733
71+
},
72+
"duration": 356000,
73+
"start": 1708970489477615000
74+
}]]
