Skip to content

Commit a1472e4

Browse files
authored
feat(segment-enrichment): Record seen segment names (#103913)
Continues the work started in #103739. As a first step to adding incremental segment name clustering to segment enrichment, record each seen segment name (like we do for transaction names in Sentry's `event_manager`). To work around the different signatures for transactions and segment spans, I extracted as much of the logic as I could into a second function, with one caller per scenario. As with the previous PR, the changes are behind an org flag (organizations:normalize_segment_names_in_span_enrichment) for testing and easy rollback. Closes ENG-5951.
1 parent 45a2194 commit a1472e4

File tree

5 files changed

+106
-28
lines changed

5 files changed

+106
-28
lines changed

src/sentry/ingest/transaction_clusterer/datasource/redis.py

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
"""Write transactions into redis sets"""
22

33
import logging
4-
from collections.abc import Iterator, Mapping
4+
from collections.abc import Callable, Iterator, Mapping
55
from typing import Any
66

77
import sentry_sdk
88
from django.conf import settings
99
from rediscluster import RedisCluster
10+
from sentry_conventions.attributes import ATTRIBUTE_NAMES
1011

1112
from sentry.ingest.transaction_clusterer import ClustererNamespace
1213
from sentry.ingest.transaction_clusterer.datasource import (
@@ -16,6 +17,7 @@
1617
)
1718
from sentry.models.project import Project
1819
from sentry.options.rollout import in_random_rollout
20+
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
1921
from sentry.utils import redis
2022
from sentry.utils.safe import safe_execute
2123

@@ -123,41 +125,59 @@ def record_transaction_name(project: Project, event_data: Mapping[str, Any], **k
123125
safe_execute(_bump_rule_lifetime, project, event_data)
124126

125127

128+
def record_segment_name(project: Project, segment_span: CompatibleSpan) -> None:
129+
if segment_name := _should_store_segment_name(segment_span):
130+
safe_execute(
131+
_record_sample,
132+
ClustererNamespace.TRANSACTIONS,
133+
project,
134+
segment_name,
135+
)
136+
137+
126138
def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None:
127-
"""Returns whether the given event must be stored as input for the
128-
transaction clusterer."""
129139
transaction_name = event_data.get("transaction")
130-
if not transaction_name:
131-
return None
132-
133-
tags = event_data.get("tags") or {}
134140
transaction_info = event_data.get("transaction_info") or {}
135141
source = transaction_info.get("source")
136142

137-
# We also feed back transactions into the clustering algorithm
138-
# that have already been sanitized, so we have a chance to discover
139-
# more high cardinality segments after partial sanitation.
140-
# For example, we may have sanitized `/orgs/*/projects/foo`,
141-
# But the clusterer has yet to discover `/orgs/*/projects/*`.
142-
#
143-
# Disadvantage: the load on redis does not decrease over time.
144-
#
143+
def is_404() -> bool:
144+
tags = event_data.get("tags") or []
145+
return HTTP_404_TAG in tags
146+
147+
return _should_store_segment_name_inner(transaction_name, source, is_404)
148+
149+
150+
def _should_store_segment_name(segment_span: CompatibleSpan) -> str | None:
151+
segment_name = attribute_value(
152+
segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
153+
) or segment_span.get("name")
154+
source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
155+
156+
def is_404() -> bool:
157+
status_code = attribute_value(segment_span, ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE)
158+
return status_code == 404
159+
160+
return _should_store_segment_name_inner(segment_name, source, is_404)
161+
162+
163+
def _should_store_segment_name_inner(
164+
name: str | None, source: str | None, is_404: Callable[[], bool]
165+
) -> str | None:
166+
if not name:
167+
return None
145168
source_matches = source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED) or (
146169
# Relay leaves source None if it expects it to be high cardinality, (otherwise it sets it to "unknown")
147170
# (see https://github.com/getsentry/relay/blob/2d07bef86415cc0ae8af01d16baecde10cdb23a6/relay-general/src/store/transactions/processor.rs#L369-L373).
148171
#
149172
# Our data shows that a majority of these `None` source transactions contain slashes, so treat them as URL transactions:
150173
source is None
151-
and "/" in transaction_name
174+
and "/" in name
152175
)
153-
154176
if not source_matches:
155177
return None
156-
157-
if tags and HTTP_404_TAG in tags:
178+
if is_404():
158179
return None
159-
160-
return transaction_name
180+
return name
161181

162182

163183
def _bump_rule_lifetime(project: Project, event_data: Mapping[str, Any]) -> None:

src/sentry/ingest/transaction_clusterer/normalization.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import orjson
55
from sentry_conventions.attributes import ATTRIBUTE_NAMES
66

7+
from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED
78
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
89

910
# Ported from Relay:
@@ -106,7 +107,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
106107
}
107108
attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
108109
"type": "string",
109-
"value": "sanitized",
110+
"value": TRANSACTION_SOURCE_SANITIZED,
110111
}
111112
attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {
112113
"type": "string",

src/sentry/spans/consumers/process_segments/message.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from sentry.constants import DataCategory
1414
from sentry.dynamic_sampling.rules.helpers.latest_releases import record_latest_release
1515
from sentry.event_manager import INSIGHT_MODULE_TO_PROJECT_FLAG_NAME
16+
from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_URL
17+
from sentry.ingest.transaction_clusterer.datasource.redis import record_segment_name
1618
from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
1719
from sentry.insights import FilterSpan
1820
from sentry.insights import modules as insights_modules
@@ -64,7 +66,7 @@ def process_segment(
6466
# If the project does not exist then it might have been deleted during ingestion.
6567
return []
6668

67-
safe_execute(_normalize_segment_name, segment_span, project.organization)
69+
safe_execute(_normalize_segment_name, segment_span, project)
6870
_add_segment_name(segment_span, spans)
6971
_compute_breakdowns(segment_span, spans, project)
7072
_create_models(segment_span, project)
@@ -145,8 +147,10 @@ def _enrich_spans(
145147

146148

147149
@metrics.wraps("spans.consumers.process_segments.normalize_segment_name")
148-
def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organization) -> None:
149-
if not features.has("organizations:normalize_segment_names_in_span_enrichment", organization):
150+
def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> None:
151+
if not features.has(
152+
"organizations:normalize_segment_names_in_span_enrichment", project.organization
153+
):
150154
return
151155

152156
segment_name = attribute_value(
@@ -157,10 +161,12 @@ def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organiza
157161

158162
source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
159163
unknown_if_parameterized = not source
160-
known_to_be_unparameterized = source == "url"
164+
known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL
161165
if unknown_if_parameterized or known_to_be_unparameterized:
162166
normalize_segment_name(segment_span)
163167

168+
record_segment_name(project, segment_span)
169+
164170

165171
@metrics.wraps("spans.consumers.process_segments.add_segment_name")
166172
def _add_segment_name(segment: CompatibleSpan, spans: Sequence[CompatibleSpan]) -> None:

tests/sentry/ingest/test_transaction_clusterer.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from unittest import mock
22

33
import pytest
4+
from sentry_conventions.attributes import ATTRIBUTE_NAMES
45

56
from sentry.ingest.transaction_clusterer import ClustererNamespace
67
from sentry.ingest.transaction_clusterer.base import ReplacementRule
@@ -13,6 +14,7 @@
1314
get_active_projects,
1415
get_redis_client,
1516
get_transaction_names,
17+
record_segment_name,
1618
record_transaction_name,
1719
)
1820
from sentry.ingest.transaction_clusterer.meta import get_clusterer_meta
@@ -209,6 +211,47 @@ def test_record_transactions(
209211
assert len(mocked_record.mock_calls) == expected
210212

211213

214+
@mock.patch("sentry.ingest.transaction_clusterer.datasource.redis._record_sample")
215+
@django_db_all
216+
@pytest.mark.parametrize(
217+
"source, segment_name, attributes, expected",
218+
[
219+
("url", "/a/b/c", {}, 1),
220+
(
221+
"url",
222+
"/a/b/c",
223+
{ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 200}},
224+
1,
225+
),
226+
("route", "/", {}, 0),
227+
("url", None, {}, 0),
228+
(
229+
"url",
230+
"/a/b/c",
231+
{ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 404}},
232+
0,
233+
),
234+
(None, "/a/b/c", {}, 1),
235+
(None, "foo", {}, 0),
236+
],
237+
)
238+
def test_record_segment_name(
239+
mocked_record, default_organization, source, segment_name, attributes, expected
240+
) -> None:
241+
project = Project(id=111, name="project", organization_id=default_organization.id)
242+
record_segment_name(
243+
project,
244+
{
245+
"name": segment_name,
246+
"attributes": {
247+
ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE: {"type": "string", "value": source},
248+
**attributes,
249+
},
250+
}, # type: ignore[typeddict-item]
251+
)
252+
assert len(mocked_record.mock_calls) == expected
253+
254+
212255
def test_sort_rules() -> None:
213256
rules = {
214257
ReplacementRule("/a/*/**"): 1,

tests/sentry/spans/consumers/process_segments/test_message.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,16 +274,23 @@ def test_segment_name_propagation_when_name_missing(self):
274274
child_attributes = child_span["attributes"] or {}
275275
assert child_attributes.get("sentry.segment.name") is None
276276

277-
def test_segment_name_normalization_with_feature(self):
277+
@mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
278+
def test_segment_name_normalization_with_feature(
279+
self, mock_record_segment_name: mock.MagicMock
280+
):
278281
_, segment_span = self.generate_basic_spans()
279282
segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
280283

281284
with self.feature("organizations:normalize_segment_names_in_span_enrichment"):
282285
processed_spans = process_segment([segment_span])
283286

284287
assert processed_spans[0]["name"] == "/foo/*/user/*/0"
288+
mock_record_segment_name.assert_called_once()
285289

286-
def test_segment_name_normalization_without_feature(self):
290+
@mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
291+
def test_segment_name_normalization_without_feature(
292+
self, mock_record_segment_name: mock.MagicMock
293+
):
287294
_, segment_span = self.generate_basic_spans()
288295
segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
289296

@@ -293,6 +300,7 @@ def test_segment_name_normalization_without_feature(self):
293300
assert (
294301
processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
295302
)
303+
mock_record_segment_name.assert_not_called()
296304

297305
def test_segment_name_normalization_checks_source(self):
298306
_, segment_span = self.generate_basic_spans()

0 commit comments

Comments
 (0)