feat(segment-enrichment): Record seen segment names (#103913)

mjq · web-flow · commit a1472e4fe64b · 2025-11-26T10:58:50.000-05:00
Continues the work started in #103739. As a first step to adding incremental segment name clustering to segment enrichment, record each seen segment name (like we do for transaction names in Sentry's `event_manager`). To work around the different signatures for transactions and segment spans, I extracted as much of the logic as I could into a second function, with one caller per scenario. As with the previous PR, the changes are behind an org flag (organizations:normalize_segment_names_in_span_enrichment) for testing and easy rollback. Closes ENG-5951.
diff --git a/src/sentry/ingest/transaction_clusterer/datasource/redis.py b/src/sentry/ingest/transaction_clusterer/datasource/redis.py
@@ -1,12 +1,13 @@
 """Write transactions into redis sets"""
 
 import logging
-from collections.abc import Iterator, Mapping
+from collections.abc import Callable, Iterator, Mapping
 from typing import Any
 
 import sentry_sdk
 from django.conf import settings
 from rediscluster import RedisCluster
+from sentry_conventions.attributes import ATTRIBUTE_NAMES
 
 from sentry.ingest.transaction_clusterer import ClustererNamespace
 from sentry.ingest.transaction_clusterer.datasource import (
@@ -16,6 +17,7 @@
 )
 from sentry.models.project import Project
 from sentry.options.rollout import in_random_rollout
+from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
 from sentry.utils import redis
 from sentry.utils.safe import safe_execute
 
@@ -123,41 +125,59 @@ def record_transaction_name(project: Project, event_data: Mapping[str, Any], **k
             safe_execute(_bump_rule_lifetime, project, event_data)
 
 
+def record_segment_name(project: Project, segment_span: CompatibleSpan) -> None:
+    if segment_name := _should_store_segment_name(segment_span):
+        safe_execute(
+            _record_sample,
+            ClustererNamespace.TRANSACTIONS,
+            project,
+            segment_name,
+        )
+
+
 def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None:
-    """Returns whether the given event must be stored as input for the
-    transaction clusterer."""
     transaction_name = event_data.get("transaction")
-    if not transaction_name:
-        return None
-
-    tags = event_data.get("tags") or {}
     transaction_info = event_data.get("transaction_info") or {}
     source = transaction_info.get("source")
 
-    # We also feed back transactions into the clustering algorithm
-    # that have already been sanitized, so we have a chance to discover
-    # more high cardinality segments after partial sanitation.
-    # For example, we may have sanitized `/orgs/*/projects/foo`,
-    # But the clusterer has yet to discover `/orgs/*/projects/*`.
-    #
-    # Disadvantage: the load on redis does not decrease over time.
-    #
+    def is_404() -> bool:
+        tags = event_data.get("tags") or []
+        return HTTP_404_TAG in tags
+
+    return _should_store_segment_name_inner(transaction_name, source, is_404)
+
+
+def _should_store_segment_name(segment_span: CompatibleSpan) -> str | None:
+    segment_name = attribute_value(
+        segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
+    ) or segment_span.get("name")
+    source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
+
+    def is_404() -> bool:
+        status_code = attribute_value(segment_span, ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE)
+        return status_code == 404
+
+    return _should_store_segment_name_inner(segment_name, source, is_404)
+
+
+def _should_store_segment_name_inner(
+    name: str | None, source: str | None, is_404: Callable[[], bool]
+) -> str | None:
+    if not name:
+        return None
     source_matches = source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED) or (
         # Relay leaves source None if it expects it to be high cardinality, (otherwise it sets it to "unknown")
         # (see https://github.com/getsentry/relay/blob/2d07bef86415cc0ae8af01d16baecde10cdb23a6/relay-general/src/store/transactions/processor.rs#L369-L373).
         #
         # Our data shows that a majority of these `None` source transactions contain slashes, so treat them as URL transactions:
         source is None
-        and "/" in transaction_name
+        and "/" in name
     )
-
     if not source_matches:
         return None
-
-    if tags and HTTP_404_TAG in tags:
+    if is_404():
         return None
-
-    return transaction_name
+    return name
 
 
 def _bump_rule_lifetime(project: Project, event_data: Mapping[str, Any]) -> None:
diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py
@@ -4,6 +4,7 @@
 import orjson
 from sentry_conventions.attributes import ATTRIBUTE_NAMES
 
+from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED
 from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
 
 # Ported from Relay:
@@ -106,7 +107,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
     }
     attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
         "type": "string",
-        "value": "sanitized",
+        "value": TRANSACTION_SOURCE_SANITIZED,
     }
     attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {
         "type": "string",
diff --git a/src/sentry/spans/consumers/process_segments/message.py b/src/sentry/spans/consumers/process_segments/message.py
@@ -13,6 +13,8 @@
 from sentry.constants import DataCategory
 from sentry.dynamic_sampling.rules.helpers.latest_releases import record_latest_release
 from sentry.event_manager import INSIGHT_MODULE_TO_PROJECT_FLAG_NAME
+from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_URL
+from sentry.ingest.transaction_clusterer.datasource.redis import record_segment_name
 from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
 from sentry.insights import FilterSpan
 from sentry.insights import modules as insights_modules
@@ -64,7 +66,7 @@ def process_segment(
         # If the project does not exist then it might have been deleted during ingestion.
         return []
 
-    safe_execute(_normalize_segment_name, segment_span, project.organization)
+    safe_execute(_normalize_segment_name, segment_span, project)
     _add_segment_name(segment_span, spans)
     _compute_breakdowns(segment_span, spans, project)
     _create_models(segment_span, project)
@@ -145,8 +147,10 @@ def _enrich_spans(
 
 
 @metrics.wraps("spans.consumers.process_segments.normalize_segment_name")
-def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organization) -> None:
-    if not features.has("organizations:normalize_segment_names_in_span_enrichment", organization):
+def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> None:
+    if not features.has(
+        "organizations:normalize_segment_names_in_span_enrichment", project.organization
+    ):
         return
 
     segment_name = attribute_value(
@@ -157,10 +161,12 @@ def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organiza
 
     source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
     unknown_if_parameterized = not source
-    known_to_be_unparameterized = source == "url"
+    known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL
     if unknown_if_parameterized or known_to_be_unparameterized:
         normalize_segment_name(segment_span)
 
+    record_segment_name(project, segment_span)
+
 
 @metrics.wraps("spans.consumers.process_segments.add_segment_name")
 def _add_segment_name(segment: CompatibleSpan, spans: Sequence[CompatibleSpan]) -> None:
diff --git a/tests/sentry/ingest/test_transaction_clusterer.py b/tests/sentry/ingest/test_transaction_clusterer.py
@@ -1,6 +1,7 @@
 from unittest import mock
 
 import pytest
+from sentry_conventions.attributes import ATTRIBUTE_NAMES
 
 from sentry.ingest.transaction_clusterer import ClustererNamespace
 from sentry.ingest.transaction_clusterer.base import ReplacementRule
@@ -13,6 +14,7 @@
     get_active_projects,
     get_redis_client,
     get_transaction_names,
+    record_segment_name,
     record_transaction_name,
 )
 from sentry.ingest.transaction_clusterer.meta import get_clusterer_meta
@@ -209,6 +211,47 @@ def test_record_transactions(
     assert len(mocked_record.mock_calls) == expected
 
 
+@mock.patch("sentry.ingest.transaction_clusterer.datasource.redis._record_sample")
+@django_db_all
+@pytest.mark.parametrize(
+    "source, segment_name, attributes, expected",
+    [
+        ("url", "/a/b/c", {}, 1),
+        (
+            "url",
+            "/a/b/c",
+            {ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 200}},
+            1,
+        ),
+        ("route", "/", {}, 0),
+        ("url", None, {}, 0),
+        (
+            "url",
+            "/a/b/c",
+            {ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 404}},
+            0,
+        ),
+        (None, "/a/b/c", {}, 1),
+        (None, "foo", {}, 0),
+    ],
+)
+def test_record_segment_name(
+    mocked_record, default_organization, source, segment_name, attributes, expected
+) -> None:
+    project = Project(id=111, name="project", organization_id=default_organization.id)
+    record_segment_name(
+        project,
+        {
+            "name": segment_name,
+            "attributes": {
+                ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE: {"type": "string", "value": source},
+                **attributes,
+            },
+        },  # type: ignore[typeddict-item]
+    )
+    assert len(mocked_record.mock_calls) == expected
+
+
 def test_sort_rules() -> None:
     rules = {
         ReplacementRule("/a/*/**"): 1,
diff --git a/tests/sentry/spans/consumers/process_segments/test_message.py b/tests/sentry/spans/consumers/process_segments/test_message.py
@@ -274,16 +274,23 @@ def test_segment_name_propagation_when_name_missing(self):
         child_attributes = child_span["attributes"] or {}
         assert child_attributes.get("sentry.segment.name") is None
 
-    def test_segment_name_normalization_with_feature(self):
+    @mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
+    def test_segment_name_normalization_with_feature(
+        self, mock_record_segment_name: mock.MagicMock
+    ):
         _, segment_span = self.generate_basic_spans()
         segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
 
         with self.feature("organizations:normalize_segment_names_in_span_enrichment"):
             processed_spans = process_segment([segment_span])
 
         assert processed_spans[0]["name"] == "/foo/*/user/*/0"
+        mock_record_segment_name.assert_called_once()
 
-    def test_segment_name_normalization_without_feature(self):
+    @mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
+    def test_segment_name_normalization_without_feature(
+        self, mock_record_segment_name: mock.MagicMock
+    ):
         _, segment_span = self.generate_basic_spans()
         segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
 
@@ -293,6 +300,7 @@ def test_segment_name_normalization_without_feature(self):
         assert (
             processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
         )
+        mock_record_segment_name.assert_not_called()
 
     def test_segment_name_normalization_checks_source(self):
         _, segment_span = self.generate_basic_spans()

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`	`import orjson`
`5`	`5`	`from sentry_conventions.attributes import ATTRIBUTE_NAMES`
`6`	`6`
	`7`	`+from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED`
`7`	`8`	`from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value`
`8`	`9`
`9`	`10`	`# Ported from Relay:`
`@@ -106,7 +107,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):`
`106`	`107`	`}`
`107`	`108`	`attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {`
`108`	`109`	`"type": "string",`
`109`		`- "value": "sanitized",`
	`110`	`+ "value": TRANSACTION_SOURCE_SANITIZED,`
`110`	`111`	`}`
`111`	`112`	`attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {`
`112`	`113`	`"type": "string",`