Skip to content

Commit 6ed1f7f

Browse files
authored
ref(grouping): Small parameterization refactor (#109690)
This is a small refactor to our message parameterization code, to make it a little easier to understand. Aside from a few renames and some added/tweaked comments, the main changes here are: - Make it so that `ParameterizationRegex.experimental_pattern` is `None` if there's no experimental pattern, rather than having it fall back to using the regular pattern. The fallback is now done in the creation of `EXPERIMENTAL_PARAMETERIZATION_REGEXES_MAP`. - Make the behavior of `Parameterizer.parametrize_w_regex` match its docstring by a) making it take in a regex (as its docstring says it does), and b) removing the (non-existent) callback parameter from the docstring. - Add a few test cases. - Skip the test testing experimental patterns if there are none.
1 parent 3c3133e commit 6ed1f7f

File tree

2 files changed

+56
-34
lines changed

2 files changed

+56
-34
lines changed

src/sentry/grouping/parameterization.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,6 @@
33
from collections import defaultdict
44
from collections.abc import Callable, Iterable, Sequence
55

6-
__all__ = [
7-
"ParameterizationCallable",
8-
"ParameterizationRegex",
9-
"Parameterizer",
10-
]
11-
126

137
@dataclasses.dataclass
148
class ParameterizationRegex:
@@ -24,20 +18,21 @@ class ParameterizationRegex:
2418

2519
@property
2620
def pattern(self) -> str:
27-
return self._pattern(False)
21+
return self._get_pattern(self.raw_pattern)
2822

2923
@property
30-
def experimental_pattern(self) -> str:
31-
return self._pattern(self.raw_pattern_experimental is not None)
24+
def experimental_pattern(self) -> str | None:
25+
if not self.raw_pattern_experimental:
26+
return None
27+
return self._get_pattern(self.raw_pattern_experimental)
3228

33-
def _pattern(self, experimental: bool = False) -> str:
29+
def _get_pattern(self, raw_pattern: str) -> str:
3430
"""
3531
Returns the regex pattern with a named matching group and lookbehind/lookahead if needed.
3632
"""
37-
pattern = self.raw_pattern_experimental if experimental else self.raw_pattern
3833
prefix = rf"(?<={self.lookbehind})" if self.lookbehind else ""
3934
postfix = rf"(?={self.lookahead})" if self.lookahead else ""
40-
return rf"{prefix}(?P<{self.name}>{pattern}){postfix}"
35+
return rf"{prefix}(?P<{self.name}>{raw_pattern}){postfix}"
4136

4237

4338
DEFAULT_PARAMETERIZATION_REGEXES = [
@@ -198,37 +193,46 @@ def _pattern(self, experimental: bool = False) -> str:
198193
ParameterizationRegex(name="int", raw_pattern=r"""-\d+\b | \b\d+\b"""),
199194
ParameterizationRegex(
200195
name="quoted_str",
201-
raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
202-
# rather than all quoted strings
196+
raw_pattern=r"""
203197
'([^']+)' | "([^"]+)"
204198
""",
199+
# Using an `=` lookbehind guarantees we'll only match the value half of key-value pairs,
200+
# rather than all quoted strings
205201
lookbehind="=",
206202
),
207203
ParameterizationRegex(
208204
name="bool",
209-
raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
210-
# rather than all instances of the words 'true' and 'false'.
205+
raw_pattern=r"""
211206
True |
212207
true |
213208
False |
214209
false
215210
""",
211+
# Using an `=` lookbehind guarantees we'll only match the value half of key-value pairs,
212+
# rather than all instances of the words 'true' and 'false'.
216213
lookbehind="=",
217214
),
218215
]
219216

220217

218+
# Patterns to use for each match type when not in experimental mode.
221219
DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES}
220+
221+
# Patterns to use when in experimental mode. If no experimental pattern exists for a given type of
222+
# match, falls back to the default pattern.
222223
EXPERIMENTAL_PARAMETERIZATION_REGEXES_MAP = {
223-
r.name: r.experimental_pattern for r in DEFAULT_PARAMETERIZATION_REGEXES
224+
r.name: r.experimental_pattern if r.experimental_pattern else r.pattern
225+
for r in DEFAULT_PARAMETERIZATION_REGEXES
224226
}
225227

226228

227229
@dataclasses.dataclass
228230
class ParameterizationCallable:
229231
"""
230-
Represents a callable that can be used to modify a string, which can give
231-
us more flexibility than just using regex.
232+
Represents a callable that can be used to modify a string, which can give us more flexibility
233+
than just using regex.
234+
235+
Note: Future-proofing. Not currently in use.
232236
"""
233237

234238
name: str # name of the pattern (also used as group name in combined regex)
@@ -239,7 +243,11 @@ class ParameterizationCallable:
239243
class Parameterizer:
240244
def __init__(
241245
self,
246+
# List of `ParameterizationRegex.name` values, used to selectively enable pattern types. To
247+
# use all available parameterization, omit this argument.
242248
regex_pattern_keys: Sequence[str] | None = None,
249+
# Whether to use experimental patterns, if available. (Pattern types without an experimental
250+
# pattern will fall back to the standard pattern.)
243251
experimental: bool = False,
244252
):
245253
self._experimental = experimental
@@ -268,15 +276,14 @@ def _make_regex_from_patterns(self, pattern_keys: Iterable[str]) -> re.Pattern[s
268276

269277
return re.compile(rf"(?x){'|'.join(regexes_map[k] for k in pattern_keys)}")
270278

271-
def parametrize_w_regex(self, content: str) -> str:
279+
def parametrize_w_regex(self, input_str: str, parameterization_regex: re.Pattern[str]) -> str:
272280
"""
273-
Replace all matches of the given regex in the content with a placeholder string.
281+
Replace all matches of the given regex in the input string with a placeholder.
274282
275-
@param content: The string to replace matches in.
283+
@param input_str: The string to replace matches in.
276284
@param parameterization_regex: The compiled regex pattern to match.
277-
@param match_callback: An optional callback function to call with the key of the matched pattern.
278285
279-
@returns: The content with all matches replaced with placeholders.
286+
@returns: The input string with all matches replaced with placeholders.
280287
"""
281288

282289
def _handle_regex_match(match: re.Match[str]) -> str:
@@ -289,7 +296,7 @@ def _handle_regex_match(match: re.Match[str]) -> str:
289296
return f"<{key}>"
290297
return ""
291298

292-
return self._parameterization_regex.sub(_handle_regex_match, content)
299+
return parameterization_regex.sub(_handle_regex_match, input_str)
293300

294-
def parameterize_all(self, content: str) -> str:
295-
return self.parametrize_w_regex(content)
301+
def parameterize_all(self, input_str: str) -> str:
302+
return self.parametrize_w_regex(input_str, self._parameterization_regex)

tests/sentry/grouping/test_parameterization.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import pytest
22

3-
from sentry.grouping.parameterization import Parameterizer
3+
from sentry.grouping.parameterization import (
4+
DEFAULT_PARAMETERIZATION_REGEXES_MAP,
5+
EXPERIMENTAL_PARAMETERIZATION_REGEXES_MAP,
6+
Parameterizer,
7+
)
48

59

610
@pytest.fixture
@@ -58,13 +62,16 @@ def experimental_parameterizer() -> Parameterizer:
5862
("date - RFC3339Nano without offset", "2006-01-02T15:04:05.999999999Z", "<date>"),
5963
("date - plain", "2006-01-02", "<date>"),
6064
("date - long", "Jan 18, 2019", "<date>"),
61-
("date - datetime", "2006-01-02 15:04:05", "<date>"),
65+
("date - datetime space-separated", "2006-01-02 15:04:05", "<date>"),
66+
("date - datetime T-separated", "2006-01-02T15:04:05", "<date>"),
67+
("date - datetime T-separated UTC", "2006-01-02T15:04:05Z", "<date>"),
68+
("date - datetime T-separated w offset", "2006-01-02T15:04:05+01:00", "<date>"),
6269
("date - kitchen", "3:04PM", "<date>"),
6370
("date - time", "15:04:05", "<date>"),
6471
("date - basic", "Mon Jan 02, 1999", "<date>"),
65-
("date - compressed", "20240220 11:55:33.546593", "<date>"),
72+
("date - datetime compressed date", "20240220 11:55:33.546593", "<date>"),
6673
("date - datestamp", "2024-02-23 02:13:53.418", "<date>"),
67-
("date - datetime", "datetime.datetime(2025, 6, 24, 18, 33, 0, 447640)", "<date>"),
74+
("date - datetime object", "datetime.datetime(2025, 6, 24, 18, 33, 0, 447640)", "<date>"),
6875
("duration - 0ms", "0ms", "<duration>"),
6976
("duration - 1ms", "1ms", "<duration>"),
7077
("duration - 10ms", "10ms", "<duration>"),
@@ -112,6 +119,10 @@ def experimental_parameterizer() -> Parameterizer:
112119
("hex without prefix - uppercase, 128 digits", "B0" * 64, "<hex>"),
113120
("hex without prefix - lowercase, no numbers", "deadbeef", "deadbeef"),
114121
("hex without prefix - uppercase, no numbers", "DEADBEEF", "DEADBEEF"),
122+
("hex without prefix - no letters, < 8 digits", "1234567", "<int>"),
123+
("hex without prefix - no letters, 8+ digits", "12345678", "<hex>"),
124+
("git sha - all letters", "commit deadbeef", "commit deadbeef"),
125+
("git sha - all numbers", "commit 4150908", "commit <int>"),
115126
("float", "0.23", "<float>"),
116127
("int", "23", "<int>"),
117128
("int - separator", "0:17502", "<int>:<int>"),
@@ -145,7 +156,7 @@ def experimental_parameterizer() -> Parameterizer:
145156

146157

147158
@pytest.mark.parametrize(("name", "input", "expected"), standard_cases)
148-
def test_parameterize_standard(
159+
def test_default_parameterization(
149160
name: str, input: str, expected: str, parameterizer: Parameterizer
150161
) -> None:
151162
assert parameterizer.parameterize_all(input) == expected
@@ -155,7 +166,7 @@ def test_parameterize_standard(
155166

156167

157168
@pytest.mark.parametrize(("name", "input", "expected"), experimental_cases)
158-
def test_parameterize_standard_not_experimental(
169+
def test_default_parameterizer_misses_experimental_cases(
159170
name: str, input: str, expected: str, parameterizer: Parameterizer
160171
) -> None:
161172
assert parameterizer.parameterize_all(input) != expected
@@ -164,8 +175,12 @@ def test_parameterize_standard_not_experimental(
164175
assert parameterizer.parameterize_all(f"prefix {input} suffix") != f"prefix {expected} suffix"
165176

166177

178+
@pytest.mark.skipif(
179+
EXPERIMENTAL_PARAMETERIZATION_REGEXES_MAP == DEFAULT_PARAMETERIZATION_REGEXES_MAP,
180+
reason="no experimental regexes to test",
181+
)
167182
@pytest.mark.parametrize(("name", "input", "expected"), standard_cases + experimental_cases)
168-
def test_parameterize_experimental(
183+
def test_experimental_parameterization(
169184
name: str, input: str, expected: str, experimental_parameterizer: Parameterizer
170185
) -> None:
171186
assert experimental_parameterizer.parameterize_all(input) == expected

0 commit comments

Comments
 (0)