Skip to content

Commit a91f6c4

Browse files
committed
feat(data-collection): gate HTTP request data collection
1 parent 98022a4 commit a91f6c4

18 files changed

Lines changed: 350 additions & 85 deletions

sentry_sdk/integrations/_asgi_common.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import urllib
22
from typing import TYPE_CHECKING
33

4+
import sentry_sdk
5+
from sentry_sdk.data_collection import scrub_query_string
46
from sentry_sdk.integrations._wsgi_common import _filter_headers
5-
from sentry_sdk.scope import should_send_default_pii
7+
from sentry_sdk.scope import should_collect_user_info, should_send_default_pii
68

79
if TYPE_CHECKING:
810
from typing import Any, Dict, Optional, Union
@@ -93,14 +95,29 @@ def _get_request_data(asgi_scope: "Any") -> "Dict[str, Any]":
9395
request_data["headers"] = headers = _filter_headers(
9496
_get_headers(asgi_scope),
9597
)
96-
request_data["query_string"] = _get_query(asgi_scope)
98+
99+
# Event request.query_string is set unconditionally in legacy mode. When
100+
# data_collection is set explicitly, the query_params behavior governs
101+
# whether/how it is collected.
102+
dc = sentry_sdk.get_client().data_collection
103+
if dc.explicit:
104+
raw_query = _get_query(asgi_scope)
105+
scrubbed_query = (
106+
scrub_query_string(raw_query, dc.query_params)
107+
if raw_query is not None
108+
else None
109+
)
110+
if scrubbed_query is not None:
111+
request_data["query_string"] = scrubbed_query
112+
else:
113+
request_data["query_string"] = _get_query(asgi_scope)
97114

98115
request_data["url"] = _get_url(
99116
asgi_scope, "http" if ty == "http" else "ws", headers.get("host")
100117
)
101118

102119
client = asgi_scope.get("client")
103-
if client and should_send_default_pii():
120+
if client and should_collect_user_info():
104121
request_data["env"] = {"REMOTE_ADDR": _get_ip(asgi_scope)}
105122

106123
return request_data
@@ -121,7 +138,28 @@ def _get_request_attributes(asgi_scope: "Any") -> "dict[str, Any]":
121138
for header, value in headers.items():
122139
attributes[f"http.request.header.{header.lower()}"] = value
123140

124-
if should_send_default_pii():
141+
dc = sentry_sdk.get_client().data_collection
142+
if dc.explicit:
143+
url_without_query_string = _get_url(
144+
asgi_scope, "http" if ty == "http" else "ws", headers.get("host")
145+
)
146+
raw_query = _get_query(asgi_scope)
147+
scrubbed_query = (
148+
scrub_query_string(raw_query, dc.query_params)
149+
if raw_query is not None
150+
else None
151+
)
152+
if scrubbed_query is not None:
153+
attributes["http.query"] = scrubbed_query
154+
attributes["url.full"] = f"{url_without_query_string}?{scrubbed_query}"
155+
else:
156+
attributes["url.full"] = url_without_query_string
157+
# url.path never contains a query string, so it is unaffected by
158+
# query_params and is collected as technical context.
159+
attributes["url.path"] = asgi_scope.get("root_path", "") + asgi_scope.get(
160+
"path", ""
161+
)
162+
elif should_send_default_pii():
125163
query = _get_query(asgi_scope)
126164
if query:
127165
attributes["http.query"] = query
@@ -140,7 +178,7 @@ def _get_request_attributes(asgi_scope: "Any") -> "dict[str, Any]":
140178
)
141179

142180
client = asgi_scope.get("client")
143-
if client and should_send_default_pii():
181+
if client and should_collect_user_info():
144182
ip = _get_ip(asgi_scope)
145183
attributes["client.address"] = ip
146184

sentry_sdk/integrations/_wsgi_common.py

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@
44

55
import sentry_sdk
66
from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE
7+
from sentry_sdk.data_collection import (
8+
BODY_TYPE_INCOMING_REQUEST,
9+
COLLECTION_OFF,
10+
apply_key_value_collection,
11+
filter_request_headers,
12+
scrub_query_string,
13+
should_collect_body_type,
14+
)
715
from sentry_sdk.scope import should_send_default_pii
816
from sentry_sdk.utils import AnnotatedValue, logger
917

@@ -90,15 +98,34 @@ def extract_into_event(self, event: "Event") -> None:
9098
if not client.is_active():
9199
return
92100

101+
dc = client.data_collection
102+
93103
data: "Optional[Union[AnnotatedValue, Dict[str, Any]]]" = None
94104

95105
content_length = self.content_length()
96106
request_info = event.get("request", {})
97107

98-
if should_send_default_pii():
108+
# Cookies. When data_collection is set explicitly, collect according to
109+
# the cookies behavior (default denyList scrubs sensitive cookie values);
110+
# otherwise fall back to the legacy send_default_pii gate.
111+
if dc.explicit:
112+
if dc.cookies.mode != COLLECTION_OFF:
113+
request_info["cookies"] = apply_key_value_collection(
114+
dict(self.cookies()), dc.cookies
115+
)
116+
elif should_send_default_pii():
99117
request_info["cookies"] = dict(self.cookies())
100118

101-
if not request_body_within_bounds(client, content_length):
119+
# Request body. When data_collection is set explicitly, only collect the
120+
# incoming request body if that body type is enabled; size is still
121+
# bounded by max_request_body_size.
122+
collect_body = True
123+
if dc.explicit:
124+
collect_body = should_collect_body_type(dc, BODY_TYPE_INCOMING_REQUEST)
125+
126+
if not collect_body:
127+
data = None
128+
elif not request_body_within_bounds(client, content_length):
102129
data = AnnotatedValue.removed_because_over_size_limit()
103130
else:
104131
# First read the raw body data
@@ -213,21 +240,68 @@ def _filter_headers(
213240
headers: "Mapping[str, str]",
214241
use_annotated_value: bool = True,
215242
) -> "Mapping[str, Union[AnnotatedValue, str]]":
216-
if should_send_default_pii():
217-
return headers
218-
219243
substitute: "Union[AnnotatedValue, str]" = (
220244
SENSITIVE_DATA_SUBSTITUTE
221245
if not use_annotated_value
222246
else AnnotatedValue.removed_because_over_size_limit()
223247
)
224248

249+
dc = sentry_sdk.get_client().data_collection
250+
if dc.explicit:
251+
# Apply the configured request-header collection behavior (default
252+
# denyList scrubs sensitive header values; the raw Cookie/Set-Cookie
253+
# header is always filtered).
254+
return filter_request_headers(
255+
headers, dc.http_headers.request, substitute=substitute
256+
)
257+
258+
# Legacy behavior (data_collection not set explicitly).
259+
if should_send_default_pii():
260+
return headers
261+
225262
return {
226263
k: (v if k.upper().replace("-", "_") not in SENSITIVE_HEADERS else substitute)
227264
for k, v in headers.items()
228265
}
229266

230267

268+
def collect_query_string(
269+
raw_query_string: "Optional[str]",
270+
) -> "Optional[str]":
271+
"""
272+
Return the (possibly scrubbed) query string to attach to span attributes
273+
(``http.query`` / ``url.query`` / the query portion of ``url.full``), or
274+
``None`` if it is empty/missing or should not be collected.
275+
276+
When ``data_collection`` is set explicitly, the ``query_params`` behavior
277+
governs collection/scrubbing via ``scrub_query_string``. Otherwise the legacy
278+
``send_default_pii`` gate applies: the raw string if enabled, else ``None``.
279+
"""
280+
if not raw_query_string:
281+
return None
282+
283+
dc = sentry_sdk.get_client().data_collection
284+
if dc.explicit:
285+
return scrub_query_string(raw_query_string, dc.query_params)
286+
287+
if should_send_default_pii():
288+
return raw_query_string
289+
return None
290+
291+
292+
def should_collect_url() -> bool:
293+
"""
294+
Return whether to collect non-query URL attributes (``url.full`` base and
295+
``url.path``). These never contain query strings, so they are treated as
296+
technical context: always ``True`` when ``data_collection`` is set
297+
explicitly; otherwise the result of the legacy ``send_default_pii`` gate.
298+
"""
299+
dc = sentry_sdk.get_client().data_collection
300+
if dc.explicit:
301+
return True
302+
return should_send_default_pii()
303+
304+
231305
def _in_http_status_code_range(
232306
code: object, code_ranges: "list[HttpStatusCodeRange]"
233307
) -> bool:

sentry_sdk/integrations/aiohttp.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import sentry_sdk
66
from sentry_sdk.api import continue_trace
77
from sentry_sdk.consts import OP, SPANDATA, SPANSTATUS
8+
from sentry_sdk.data_collection import scrub_query_string
89
from sentry_sdk.integrations import (
910
_DEFAULT_FAILED_REQUEST_STATUS_CODES,
1011
DidNotEnable,
@@ -13,10 +14,12 @@
1314
)
1415
from sentry_sdk.integrations._wsgi_common import (
1516
_filter_headers,
17+
collect_query_string,
1618
request_body_within_bounds,
19+
should_collect_url,
1720
)
1821
from sentry_sdk.integrations.logging import ignore_logger
19-
from sentry_sdk.scope import Scope, should_send_default_pii
22+
from sentry_sdk.scope import Scope, should_collect_user_info
2023
from sentry_sdk.sessions import track_session
2124
from sentry_sdk.traces import (
2225
SOURCE_FOR_STYLE as SEGMENT_SOURCE_FOR_STYLE,
@@ -159,20 +162,21 @@ async def sentry_app_handle(
159162
header_value
160163
)
161164

162-
url_attributes = {}
163-
if should_send_default_pii():
165+
url_attributes: "dict[str, Any]" = {}
166+
if should_collect_url():
164167
url_attributes["url.full"] = "%s://%s%s" % (
165168
request.scheme,
166169
request.host,
167170
request.path,
168171
)
169172
url_attributes["url.path"] = request.path
170173

171-
if request.query_string:
172-
url_attributes["url.query"] = request.query_string
174+
query = collect_query_string(request.query_string)
175+
if query:
176+
url_attributes["url.query"] = query
173177

174178
client_address_attributes = {}
175-
if should_send_default_pii() and request.remote:
179+
if should_collect_user_info() and request.remote:
176180
client_address_attributes["client.address"] = request.remote
177181
scope.set_attribute(
178182
SPANDATA.USER_IP_ADDRESS, request.remote
@@ -358,15 +362,18 @@ async def on_request_start(
358362
"sentry.origin": AioHttpIntegration.origin,
359363
"http.request.method": method,
360364
}
361-
if parsed_url is not None and should_send_default_pii():
365+
if parsed_url is not None and should_collect_url():
362366
attributes["url.full"] = parsed_url.url
363367
attributes["url.path"] = params.url.path
364368

365-
if parsed_url.query:
366-
attributes["url.query"] = parsed_url.query
367369
if parsed_url.fragment:
368370
attributes["url.fragment"] = parsed_url.fragment
369371

372+
if parsed_url is not None:
373+
query = collect_query_string(parsed_url.query)
374+
if query:
375+
attributes["url.query"] = query
376+
370377
span = sentry_sdk.traces.start_span(name=span_name, attributes=attributes)
371378
else:
372379
legacy_span = sentry_sdk.start_span(
@@ -458,7 +465,20 @@ def aiohttp_processor(
458465
request.path,
459466
)
460467

461-
request_info["query_string"] = request.query_string
468+
# Event request.query_string is set unconditionally in legacy mode;
469+
# when data_collection is explicit it is governed by query_params.
470+
query_string = request.query_string
471+
dc = sentry_sdk.get_client().data_collection
472+
if dc.explicit:
473+
scrubbed_qs = (
474+
scrub_query_string(query_string, dc.query_params)
475+
if query_string
476+
else None
477+
)
478+
if scrubbed_qs is not None:
479+
request_info["query_string"] = scrubbed_qs
480+
else:
481+
request_info["query_string"] = query_string
462482
request_info["method"] = request.method
463483
request_info["env"] = {"REMOTE_ADDR": request.remote}
464484
request_info["headers"] = _filter_headers(dict(request.headers))

sentry_sdk/integrations/asgi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
DEFAULT_HTTP_METHODS_TO_CAPTURE,
2525
nullcontext,
2626
)
27-
from sentry_sdk.scope import Scope, should_send_default_pii
27+
from sentry_sdk.scope import Scope, should_collect_user_info
2828
from sentry_sdk.sessions import track_session
2929
from sentry_sdk.traces import (
3030
SOURCE_FOR_STYLE as SEGMENT_SOURCE_FOR_STYLE,
@@ -248,7 +248,7 @@ async def _run_app(
248248
"network.protocol.name": ty,
249249
}
250250

251-
if scope.get("client") and should_send_default_pii():
251+
if scope.get("client") and should_collect_user_info():
252252
sentry_scope.set_attribute(
253253
SPANDATA.USER_IP_ADDRESS, _get_ip(scope)
254254
)

sentry_sdk/integrations/aws_lambda.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,18 @@
1111
import sentry_sdk
1212
from sentry_sdk.api import continue_trace
1313
from sentry_sdk.consts import OP
14+
from sentry_sdk.data_collection import COLLECTION_OFF, apply_key_value_collection
1415
from sentry_sdk.integrations import Integration
15-
from sentry_sdk.integrations._wsgi_common import _filter_headers
16+
from sentry_sdk.integrations._wsgi_common import _filter_headers, collect_query_string
1617
from sentry_sdk.integrations.cloud_resource_context import (
1718
CLOUD_PLATFORM,
1819
CLOUD_PROVIDER,
1920
)
20-
from sentry_sdk.scope import Scope, should_send_default_pii
21+
from sentry_sdk.scope import (
22+
Scope,
23+
should_collect_user_info,
24+
should_send_default_pii,
25+
)
2126
from sentry_sdk.traces import SegmentSource
2227
from sentry_sdk.tracing import TransactionSource
2328
from sentry_sdk.tracing_utils import has_span_streaming_enabled
@@ -164,10 +169,12 @@ def sentry_handler(
164169
"httpMethod"
165170
]
166171

167-
if should_send_default_pii() and "queryStringParameters" in request_data:
172+
if "queryStringParameters" in request_data:
168173
qs = request_data["queryStringParameters"]
169174
if qs:
170-
additional_attributes["url.query"] = urlencode(qs)
175+
query_string = collect_query_string(urlencode(qs))
176+
if query_string:
177+
additional_attributes["url.query"] = query_string
171178

172179
sampling_context = {
173180
"aws_event": aws_event,
@@ -409,12 +416,22 @@ def event_processor(
409416
request["url"] = _get_url(aws_event, aws_context)
410417

411418
if "queryStringParameters" in aws_event:
412-
request["query_string"] = aws_event["queryStringParameters"]
419+
# Event request.query_string is set unconditionally in legacy mode;
420+
# when data_collection is explicit it is governed by query_params.
421+
qs = aws_event["queryStringParameters"]
422+
dc = sentry_sdk.get_client().data_collection
423+
if dc.explicit:
424+
if qs and dc.query_params.mode != COLLECTION_OFF:
425+
request["query_string"] = apply_key_value_collection(
426+
qs, dc.query_params
427+
)
428+
else:
429+
request["query_string"] = qs
413430

414431
if "headers" in aws_event:
415432
request["headers"] = _filter_headers(aws_event["headers"])
416433

417-
if should_send_default_pii():
434+
if should_collect_user_info():
418435
user_info = sentry_event.setdefault("user", {})
419436

420437
identity = aws_event.get("identity")
@@ -429,6 +446,7 @@ def event_processor(
429446
if ip is not None:
430447
user_info.setdefault("ip_address", ip)
431448

449+
if should_send_default_pii():
432450
if "body" in aws_event:
433451
request["data"] = aws_event.get("body", "")
434452
else:

0 commit comments

Comments
 (0)