Skip to content

Commit a3224b6

Browse files
authored
ref(grouping): Add git SHA parameterization (#109753)
It is standard git practice to abbreviate commit SHAs to 7 characters where there's no ambiguity. However, for message parameterization, our current hex matching only applies to strings of 8 or more characters. This commit therefore adds a new, separate pattern to our collection of parameterizer regexes, specifically to match abbreviated git SHAs. It's similar to our hex regex, but stricter: the string must be exactly 7 characters long, must include at least one letter and at least one digit (this will miss the few SHAs that are all letters or all digits, but such strings are more likely to be words or ints, so we let those interpretations win), and must use only lowercase letters, as git always does.
1 parent 2c6a1c1 commit a3224b6

File tree

2 files changed

+12
-6
lines changed

2 files changed

+12
-6
lines changed

src/sentry/grouping/parameterization.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,17 @@ def _get_pattern(self, raw_pattern: str) -> str:
206206
(\b(?=[A-F]*[0-9])[0-9A-F]{8,128}\b)
207207
""",
208208
),
209+
ParameterizationRegex(
210+
name="git_sha",
211+
raw_pattern=r"""
212+
# This is similar to the hex pattern above, except it has lookaheads for both numbers
213+
# and letters, to guarantee we have at least one of each. (This means it will miss git
214+
# shas which consist of only letters or only numbers, but fortunately > 96% of 7-digit
215+
# hex values are mixed, so that's a tradeoff we're okay with.) Also, it only includes
216+
# lowercase letters, since git shas are always expressed that way.
217+
(\b(?=[a-f]*[0-9])(?=[0-9]*[a-f])[0-9a-f]{7}\b)
218+
""",
219+
),
209220
ParameterizationRegex(name="float", raw_pattern=r"""-\d+\.\d+\b | \b\d+\.\d+\b"""),
210221
ParameterizationRegex(name="int", raw_pattern=r"""-\d+\b | \b\d+\b"""),
211222
ParameterizationRegex(

tests/sentry/grouping/test_parameterization.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ def experimental_parameterizer() -> Parameterizer:
129129
("hex without prefix - uppercase, no numbers until later", "DEADBEEF 123", "DEADBEEF <int>"),
130130
("hex without prefix - no letters, < 8 digits", "1234567", "<int>"),
131131
("hex without prefix - no letters, 8+ digits", "12345678", "<hex>"),
132+
("git sha", "commit a93c7d2", "commit <git_sha>"),
132133
("git sha - all letters", "commit deadbeef", "commit deadbeef"),
133134
("git sha - all numbers", "commit 4150908", "commit <int>"),
134135
("float", "0.23", "<float>"),
@@ -208,12 +209,6 @@ def test_experimental_parameterization(
208209
# parameterization. (Remember to remove the last item in each tuple for the cases you fix.)
209210
incorrect_cases = [
210211
# ("name", "input", "desired", "actual")
211-
(
212-
"git sha",
213-
"commit a93c7d2",
214-
"commit <git_sha>",
215-
"commit a93c7d2",
216-
),
217212
(
218213
"int - number in word",
219214
"Encoding: utf-8",

0 commit comments

Comments
 (0)