Commit b964a45

ref(spans): Use provided segment_id for buffering (#97832)
Now that the span buffer will be used in the ingestion pipeline, we can use the `segment_id` provided by Relay on most spans. This guarantees correct output, since Relay takes the `segment_id` from the transaction event. For OTel spans, the buffer still constructs span trees recursively via the `parent_id`.

This particularly makes a difference for transactions/segments where some spans are disconnected from the tree: they will now be correctly assigned to the segment. Metrics on mismatches and the "original_segment_id" attribute are removed, since the IDs will now always match whenever they are available; by definition, the match rate would be 100%.
1 parent 00cbd95 commit b964a45
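As a minimal, self-contained sketch of the new behavior (a hypothetical `SketchSpan` class and made-up IDs, not code from this repo), the grouping key now prefers the Relay-provided `segment_id` and only falls back to `parent_span_id` for spans without one, such as OTel spans:

from typing import NamedTuple


class SketchSpan(NamedTuple):
    # Hypothetical stand-in for the buffer's Span tuple, trimmed to the
    # fields relevant to grouping.
    span_id: str
    parent_span_id: str | None
    segment_id: str | None  # provided by Relay for spans extracted from transactions
    is_segment_span: bool = False

    @property
    def effective_parent_id(self) -> str:
        # Segment spans anchor themselves.
        if self.is_segment_span:
            return self.span_id
        # New fallback order: Relay-provided segment_id first, then the
        # parent span, then the span itself (orphans).
        return self.segment_id or self.parent_span_id or self.span_id


# A span whose parent never arrived ("disconnected from the tree") still lands
# in the right segment because Relay supplied the segment_id.
relay_span = SketchSpan(span_id="c" * 16, parent_span_id="f" * 16, segment_id="a" * 16)
assert relay_span.effective_parent_id == "a" * 16

# An OTel span carries no segment_id, so grouping keeps walking parent_span_id.
otel_span = SketchSpan(span_id="d" * 16, parent_span_id="b" * 16, segment_id=None)
assert otel_span.effective_parent_id == "b" * 16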

File tree

5 files changed: +101 / -30 lines

src/sentry/spans/buffer.py

Lines changed: 14 additions & 24 deletions
@@ -126,6 +126,7 @@ class Span(NamedTuple):
     trace_id: str
     span_id: str
     parent_span_id: str | None
+    segment_id: str | None
     project_id: int
     payload: bytes
     end_timestamp_precise: float
@@ -138,7 +139,7 @@ def effective_parent_id(self):
         if self.is_segment_span:
             return self.span_id
         else:
-            return self.parent_span_id or self.span_id
+            return self.segment_id or self.parent_span_id or self.span_id


 class OutputSpan(NamedTuple):
@@ -308,9 +309,14 @@ def _group_by_parent(self, spans: Sequence[Span]) -> dict[tuple[str, str], list[
         top-most known parent, and the value is a flat list of all its
         transitive children.

+        For spans with a known segment_id, the grouping is done by the
+        segment_id instead of the parent_span_id. This is the case for spans
+        extracted from transaction events, or if in the future SDKs provide
+        segment IDs.
+
         :param spans: List of spans to be grouped.
-        :return: Dictionary of grouped spans. The key is a tuple of
-            the `project_and_trace`, and the `parent_span_id`.
+        :return: Dictionary of grouped spans. The key is a tuple of the
+            `project_and_trace`, and the `parent_span_id`.
         """
         trees: dict[tuple[str, str], list[Span]] = {}
         redirects: dict[str, dict[str, str]] = {}
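A simplified sketch of the grouping contract documented above (the `group_by_parent_sketch` helper is hypothetical and operates on plain dicts; the real `_group_by_parent` additionally merges subtrees through a redirect table and handles segment spans):

from collections import defaultdict


def group_by_parent_sketch(spans, project_and_trace):
    # Group span dicts under (project_and_trace, grouping key), preferring a
    # known segment_id over the parent_span_id.
    trees = defaultdict(list)
    for span in spans:
        key = span.get("segment_id") or span.get("parent_span_id") or span["span_id"]
        trees[(project_and_trace, key)].append(span)
    return dict(trees)


spans = [
    {"span_id": "a" * 16, "parent_span_id": None, "segment_id": "a" * 16},
    # Parent "e" * 16 was never ingested; the Relay-provided segment_id still
    # places this span under "a" * 16 instead of opening a stray group.
    {"span_id": "c" * 16, "parent_span_id": "e" * 16, "segment_id": "a" * 16},
]
grouped = group_by_parent_sketch(spans, "1:" + "f" * 32)
assert set(grouped) == {("1:" + "f" * 32, "a" * 16)}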
@@ -423,31 +429,15 @@ def flush_segments(self, now: int) -> dict[SegmentKey, FlushedSegment]:
             metrics.timing("spans.buffer.flush_segments.num_spans_per_segment", len(segment))
             for payload in segment:
                 val = orjson.loads(payload)
-                old_segment_id = val.get("segment_id")
-                outcome = "same" if old_segment_id == segment_span_id else "different"

-                is_segment = val["is_segment"] = segment_span_id == val["span_id"]
+                if not val.get("segment_id"):
+                    val["segment_id"] = segment_span_id
+
+                is_segment = segment_span_id == val["span_id"]
+                val["is_segment"] = is_segment
                 if is_segment:
                     has_root_span = True

-                val_data = val.setdefault("data", {})
-                if isinstance(val_data, dict):
-                    val_data["sentry._internal.span_buffer_segment_id_outcome"] = outcome
-
-                    if old_segment_id:
-                        val_data["sentry._internal.span_buffer_old_segment_id"] = old_segment_id
-
-                val["segment_id"] = segment_span_id
-
-                metrics.incr(
-                    "spans.buffer.flush_segments.is_same_segment",
-                    tags={
-                        "outcome": outcome,
-                        "is_segment_span": is_segment,
-                        "old_segment_is_null": "true" if old_segment_id is None else "false",
-                    },
-                )
-
                 output_spans.append(OutputSpan(payload=val))

             metrics.incr(
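For the `flush_segments` change, a small sketch of the backfill step on an already-decoded payload dict (the `backfill_segment_id` helper is hypothetical; the real code decodes with orjson and records metrics around this loop):

def backfill_segment_id(val, segment_span_id):
    # Keep a segment_id that Relay already provided; only fill in the gap.
    if not val.get("segment_id"):
        val["segment_id"] = segment_span_id
    # The root span of the segment is the one whose own id equals the segment id.
    val["is_segment"] = val["span_id"] == segment_span_id
    return val


relay_payload = {"span_id": "b" * 16, "segment_id": "a" * 16}
otel_payload = {"span_id": "c" * 16}

assert backfill_segment_id(relay_payload, "a" * 16)["segment_id"] == "a" * 16
assert backfill_segment_id(otel_payload, "a" * 16) == {
    "span_id": "c" * 16,
    "segment_id": "a" * 16,
    "is_segment": False,
}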

src/sentry/spans/consumers/process/factory.py

Lines changed: 1 addition & 0 deletions
@@ -182,6 +182,7 @@ def process_batch(
                 trace_id=val["trace_id"],
                 span_id=val["span_id"],
                 parent_span_id=val.get("parent_span_id"),
+                segment_id=cast(str | None, val.get("segment_id")),
                 project_id=val["project_id"],
                 payload=payload.value,
                 end_timestamp_precise=val["end_timestamp_precise"],
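The `cast` in the hunk above only informs the type checker; it has no runtime effect. A tiny illustration (the `val` dict here is a stand-in, not the consumer's actual message type):

from typing import Any, cast

val: dict[str, Any] = {"segment_id": "a" * 16}
# cast(str | None, ...) returns the value unchanged; it only narrows the type
# for static analysis of the loosely typed payload dict.
segment_id = cast(str | None, val.get("segment_id"))
assert segment_id == "a" * 16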

tests/sentry/spans/consumers/process/test_consumer.py

Lines changed: 0 additions & 3 deletions
@@ -76,9 +76,6 @@ def add_commit(offsets, force=False):
     assert orjson.loads(msg.value) == {
         "spans": [
             {
-                "data": {
-                    "sentry._internal.span_buffer_segment_id_outcome": "different",
-                },
                 "is_segment": True,
                 "project_id": 12,
                 "segment_id": "aaaaaaaaaaaaaaaa",

tests/sentry/spans/consumers/process/test_flusher.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,7 @@ def append(msg):
                 trace_id=trace_id,
                 span_id="a" * 16,
                 parent_span_id="b" * 16,
+                segment_id=None,
                 project_id=1,
                 end_timestamp_precise=now,
             ),
@@ -56,6 +57,7 @@ def append(msg):
                 trace_id=trace_id,
                 span_id="d" * 16,
                 parent_span_id="b" * 16,
+                segment_id=None,
                 project_id=1,
                 end_timestamp_precise=now,
             ),
@@ -64,6 +66,7 @@ def append(msg):
                 trace_id=trace_id,
                 span_id="c" * 16,
                 parent_span_id="b" * 16,
+                segment_id=None,
                 project_id=1,
                 end_timestamp_precise=now,
             ),
@@ -73,6 +76,7 @@ def append(msg):
                 span_id="b" * 16,
                 parent_span_id=None,
                 is_segment_span=True,
+                segment_id=None,
                 project_id=1,
                 end_timestamp_precise=now,
             ),
