Skip to content

Commit ba1286e

Browse files
authored
feat(pii): Sanitize URLs in Span description and breadcrumbs (#1876)
When recording spans for outgoing HTTP requests, split the target URL into three parts: base URL, query params and fragment. The base URL is always stripped of the authority (username and password) and then set as the span's description. The query params and fragment go into data fields of the span. This is also done when creating breadcrumbs for HTTP requests and in the HTTPX and Boto3 integrations.
1 parent 0b489c6 commit ba1286e

File tree

10 files changed

+331
-17
lines changed

10 files changed

+331
-17
lines changed

sentry_sdk/consts.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@
4444
DEFAULT_QUEUE_SIZE = 100
4545
DEFAULT_MAX_BREADCRUMBS = 100
4646

47-
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"
48-
4947

5048
class INSTRUMENTER:
5149
SENTRY = "sentry"

sentry_sdk/integrations/boto3.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from sentry_sdk._functools import partial
99
from sentry_sdk._types import MYPY
10+
from sentry_sdk.utils import parse_url
1011

1112
if MYPY:
1213
from typing import Any
@@ -66,9 +67,14 @@ def _sentry_request_created(service_id, request, operation_name, **kwargs):
6667
op=OP.HTTP_CLIENT,
6768
description=description,
6869
)
70+
71+
parsed_url = parse_url(request.url, sanitize=False)
72+
6973
span.set_tag("aws.service_id", service_id)
7074
span.set_tag("aws.operation_name", operation_name)
71-
span.set_data("aws.request.url", request.url)
75+
span.set_data("aws.request.url", parsed_url.url)
76+
span.set_data("http.query", parsed_url.query)
77+
span.set_data("http.fragment", parsed_url.fragment)
7278

7379
# We do it in order for subsequent http calls/retries to be
7480
# attached to this span.

sentry_sdk/integrations/django/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import weakref
77

88
from sentry_sdk._types import MYPY
9-
from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
9+
from sentry_sdk.consts import OP
1010
from sentry_sdk.hub import Hub, _should_send_default_pii
1111
from sentry_sdk.scope import add_global_event_processor
1212
from sentry_sdk.serializer import add_global_repr_processor
@@ -16,6 +16,7 @@
1616
AnnotatedValue,
1717
HAS_REAL_CONTEXTVARS,
1818
CONTEXTVARS_ERROR_MESSAGE,
19+
SENSITIVE_DATA_SUBSTITUTE,
1920
logger,
2021
capture_internal_exceptions,
2122
event_from_exception,

sentry_sdk/integrations/httpx.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from sentry_sdk import Hub
22
from sentry_sdk.consts import OP
33
from sentry_sdk.integrations import Integration, DidNotEnable
4-
from sentry_sdk.utils import logger
4+
from sentry_sdk.utils import logger, parse_url
55

66
from sentry_sdk._types import MYPY
77

@@ -41,11 +41,17 @@ def send(self, request, **kwargs):
4141
if hub.get_integration(HttpxIntegration) is None:
4242
return real_send(self, request, **kwargs)
4343

44+
parsed_url = parse_url(str(request.url), sanitize=False)
45+
4446
with hub.start_span(
45-
op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
47+
op=OP.HTTP_CLIENT,
48+
description="%s %s" % (request.method, parsed_url.url),
4649
) as span:
4750
span.set_data("method", request.method)
48-
span.set_data("url", str(request.url))
51+
span.set_data("url", parsed_url.url)
52+
span.set_data("http.query", parsed_url.query)
53+
span.set_data("http.fragment", parsed_url.fragment)
54+
4955
for key, value in hub.iter_trace_propagation_headers():
5056
logger.debug(
5157
"[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
@@ -58,6 +64,7 @@ def send(self, request, **kwargs):
5864
span.set_data("status_code", rv.status_code)
5965
span.set_http_status(rv.status_code)
6066
span.set_data("reason", rv.reason_phrase)
67+
6168
return rv
6269

6370
Client.send = send
@@ -73,11 +80,17 @@ async def send(self, request, **kwargs):
7380
if hub.get_integration(HttpxIntegration) is None:
7481
return await real_send(self, request, **kwargs)
7582

83+
parsed_url = parse_url(str(request.url), sanitize=False)
84+
7685
with hub.start_span(
77-
op=OP.HTTP_CLIENT, description="%s %s" % (request.method, request.url)
86+
op=OP.HTTP_CLIENT,
87+
description="%s %s" % (request.method, parsed_url.url),
7888
) as span:
7989
span.set_data("method", request.method)
80-
span.set_data("url", str(request.url))
90+
span.set_data("url", parsed_url.url)
91+
span.set_data("http.query", parsed_url.query)
92+
span.set_data("http.fragment", parsed_url.fragment)
93+
8194
for key, value in hub.iter_trace_propagation_headers():
8295
logger.debug(
8396
"[Tracing] Adding `{key}` header {value} to outgoing request to {url}.".format(
@@ -90,6 +103,7 @@ async def send(self, request, **kwargs):
90103
span.set_data("status_code", rv.status_code)
91104
span.set_http_status(rv.status_code)
92105
span.set_data("reason", rv.reason_phrase)
106+
93107
return rv
94108

95109
AsyncClient.send = send

sentry_sdk/integrations/huey.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
from sentry_sdk._compat import reraise
77
from sentry_sdk._types import MYPY
88
from sentry_sdk import Hub
9-
from sentry_sdk.consts import OP, SENSITIVE_DATA_SUBSTITUTE
9+
from sentry_sdk.consts import OP
1010
from sentry_sdk.hub import _should_send_default_pii
1111
from sentry_sdk.integrations import DidNotEnable, Integration
1212
from sentry_sdk.tracing import Transaction, TRANSACTION_SOURCE_TASK
13-
from sentry_sdk.utils import capture_internal_exceptions, event_from_exception
13+
from sentry_sdk.utils import (
14+
capture_internal_exceptions,
15+
event_from_exception,
16+
SENSITIVE_DATA_SUBSTITUTE,
17+
)
1418

1519
if MYPY:
1620
from typing import Any, Callable, Optional, Union, TypeVar

sentry_sdk/integrations/stdlib.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@
88
from sentry_sdk.integrations import Integration
99
from sentry_sdk.scope import add_global_event_processor
1010
from sentry_sdk.tracing_utils import EnvironHeaders
11-
from sentry_sdk.utils import capture_internal_exceptions, logger, safe_repr
11+
from sentry_sdk.utils import (
12+
capture_internal_exceptions,
13+
logger,
14+
safe_repr,
15+
parse_url,
16+
)
1217

1318
from sentry_sdk._types import MYPY
1419

@@ -79,12 +84,17 @@ def putrequest(self, method, url, *args, **kwargs):
7984
url,
8085
)
8186

87+
parsed_url = parse_url(real_url, sanitize=False)
88+
8289
span = hub.start_span(
83-
op=OP.HTTP_CLIENT, description="%s %s" % (method, real_url)
90+
op=OP.HTTP_CLIENT,
91+
description="%s %s" % (method, parsed_url.url),
8492
)
8593

8694
span.set_data("method", method)
87-
span.set_data("url", real_url)
95+
span.set_data("url", parsed_url.url)
96+
span.set_data("http.query", parsed_url.query)
97+
span.set_data("http.fragment", parsed_url.fragment)
8898

8999
rv = real_putrequest(self, method, url, *args, **kwargs)
90100

sentry_sdk/utils.py

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,25 @@
88
import sys
99
import threading
1010
import time
11+
from collections import namedtuple
12+
13+
try:
14+
# Python 3
15+
from urllib.parse import parse_qs
16+
from urllib.parse import unquote
17+
from urllib.parse import urlencode
18+
from urllib.parse import urlsplit
19+
from urllib.parse import urlunsplit
20+
21+
except ImportError:
22+
# Python 2
23+
from cgi import parse_qs # type: ignore
24+
from urllib import unquote # type: ignore
25+
from urllib import urlencode # type: ignore
26+
from urlparse import urlsplit # type: ignore
27+
from urlparse import urlunsplit # type: ignore
28+
29+
1130
from datetime import datetime
1231
from functools import partial
1332

@@ -43,13 +62,14 @@
4362

4463
epoch = datetime(1970, 1, 1)
4564

46-
4765
# The logger is created here but initialized in the debug support module
4866
logger = logging.getLogger("sentry_sdk.errors")
4967

5068
MAX_STRING_LENGTH = 1024
5169
BASE64_ALPHABET = re.compile(r"^[a-zA-Z0-9/+=]*$")
5270

71+
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"
72+
5373

5474
def json_dumps(data):
5575
# type: (Any) -> bytes
@@ -374,8 +394,6 @@ def removed_because_over_size_limit(cls):
374394
def substituted_because_contains_sensitive_data(cls):
375395
# type: () -> AnnotatedValue
376396
"""The actual value was removed because it contained sensitive information."""
377-
from sentry_sdk.consts import SENSITIVE_DATA_SUBSTITUTE
378-
379397
return AnnotatedValue(
380398
value=SENSITIVE_DATA_SUBSTITUTE,
381399
metadata={
@@ -1163,6 +1181,79 @@ def from_base64(base64_string):
11631181
return utf8_string
11641182

11651183

1184+
Components = namedtuple("Components", ["scheme", "netloc", "path", "query", "fragment"])
1185+
1186+
1187+
def sanitize_url(url, remove_authority=True, remove_query_values=True):
1188+
# type: (str, bool, bool) -> str
1189+
"""
1190+
Removes the authority and query parameter values from a given URL.
1191+
"""
1192+
parsed_url = urlsplit(url)
1193+
query_params = parse_qs(parsed_url.query, keep_blank_values=True)
1194+
1195+
# strip username:password (netloc can be usr:pwd@example.com)
1196+
if remove_authority:
1197+
netloc_parts = parsed_url.netloc.split("@")
1198+
if len(netloc_parts) > 1:
1199+
netloc = "%s:%s@%s" % (
1200+
SENSITIVE_DATA_SUBSTITUTE,
1201+
SENSITIVE_DATA_SUBSTITUTE,
1202+
netloc_parts[-1],
1203+
)
1204+
else:
1205+
netloc = parsed_url.netloc
1206+
else:
1207+
netloc = parsed_url.netloc
1208+
1209+
# strip values from query string
1210+
if remove_query_values:
1211+
query_string = unquote(
1212+
urlencode({key: SENSITIVE_DATA_SUBSTITUTE for key in query_params})
1213+
)
1214+
else:
1215+
query_string = parsed_url.query
1216+
1217+
safe_url = urlunsplit(
1218+
Components(
1219+
scheme=parsed_url.scheme,
1220+
netloc=netloc,
1221+
query=query_string,
1222+
path=parsed_url.path,
1223+
fragment=parsed_url.fragment,
1224+
)
1225+
)
1226+
1227+
return safe_url
1228+
1229+
1230+
ParsedUrl = namedtuple("ParsedUrl", ["url", "query", "fragment"])
1231+
1232+
1233+
def parse_url(url, sanitize=True):
1234+
1235+
# type: (str, bool) -> ParsedUrl
1236+
"""
1237+
Splits a URL into a url (including path), query and fragment. If sanitize is True, the query
1238+
parameters will be sanitized to remove sensitive data. The authority (username and password)
1239+
in the URL will always be removed.
1240+
"""
1241+
url = sanitize_url(url, remove_authority=True, remove_query_values=sanitize)
1242+
1243+
parsed_url = urlsplit(url)
1244+
base_url = urlunsplit(
1245+
Components(
1246+
scheme=parsed_url.scheme,
1247+
netloc=parsed_url.netloc,
1248+
query="",
1249+
path=parsed_url.path,
1250+
fragment="",
1251+
)
1252+
)
1253+
1254+
return ParsedUrl(url=base_url, query=parsed_url.query, fragment=parsed_url.fragment)
1255+
1256+
11661257
if PY37:
11671258

11681259
def nanosecond_time():

tests/integrations/httpx/test_httpx.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def before_breadcrumb(crumb, hint):
3434
assert crumb["data"] == {
3535
"url": url,
3636
"method": "GET",
37+
"http.fragment": "",
38+
"http.query": "",
3739
"status_code": 200,
3840
"reason": "OK",
3941
"extra": "foo",

tests/integrations/requests/test_requests.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ def test_crumb_capture(sentry_init, capture_events):
2020
assert crumb["data"] == {
2121
"url": "https://httpbin.org/status/418",
2222
"method": "GET",
23+
"http.fragment": "",
24+
"http.query": "",
2325
"status_code": response.status_code,
2426
"reason": response.reason,
2527
}

0 commit comments

Comments
 (0)