Skip to content

Conversation

misrasaurabh1
Copy link
Contributor

📄 45% (0.45x) speedup for EventScrubber.scrub_dict in sentry_sdk/scrubber.py

⏱️ Runtime : 1.17 milliseconds → 808 microseconds (best of 285 runs)

📝 Explanation and details

The optimization achieves a 44% speedup by converting the denylist from a list to a set for lookups while preserving the original list for compatibility.

Key optimization:

  • Added self._denylist_set = set(self.denylist) in __init__()
  • Changed k.lower() in self.denylist to k.lower() in self._denylist_set in scrub_dict()

Why this works:

  • List membership checking (in operator) is O(n) - it must scan through each element until found
  • Set membership checking is O(1) average case - uses hash table for instant lookup
  • The line profiler shows the lookup line went from 466.1ns per hit to 336.2ns per hit (about 28% less time per lookup, i.e. roughly a 39% speedup by the measure used elsewhere in this report)

Performance impact by test case:

  • Most effective on dictionaries with many non-sensitive keys (141% speedup on 1000-key dict)
  • Significant gains (25-37%) on nested structures and mixed sensitive/non-sensitive data
  • Minimal overhead on simple cases (empty dicts, single keys)

The optimization is particularly beneficial for large dictionaries or applications that frequently scrub data with extensive denylists, as each key check becomes dramatically faster while maintaining identical functionality.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 95 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests and Runtime
import pytest
from sentry_sdk.scrubber import EventScrubber

# Constant presumably mirrored from the module under test; the function being
# tested (EventScrubber.scrub_dict) is imported above rather than copied here.
# NOTE(review): not referenced by the tests visible in this file -- confirm
# before relying on or removing it.
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"


# unit tests
# 1. Basic Test Cases


def test_scrub_dict_basic_non_sensitive_key():
    """A key that is not on the denylist must be left untouched."""
    scrubber = EventScrubber()
    d = {"username": "alice"}
    scrubber.scrub_dict(d) # 1.16μs -> 900ns (29.3% faster)
    # Without this check the test would pass even if the value were redacted.
    assert d == {"username": "alice"}




def test_scrub_dict_empty_dict():
    """An empty dictionary is handled gracefully and stays empty."""
    scrubber = EventScrubber()
    d = {}
    scrubber.scrub_dict(d) # 608ns -> 572ns (6.29% faster)
    assert d == {}

def test_scrub_dict_no_dict_input():
    """Non-dict inputs are ignored: the list passed in must not be modified."""
    scrubber = EventScrubber()
    before = [1, 2, 3]
    # Snapshot taken before the call so any in-place mutation is detectable.
    after = before.copy()
    scrubber.scrub_dict(before) # 412ns -> 392ns (5.10% faster)
    assert before == after



def test_scrub_dict_sensitive_key_in_nested_dict_non_recursive():
    """With recursive=False, a sensitive key inside a nested dict is not scrubbed."""
    scrubber = EventScrubber(recursive=False)
    d = {"outer": {"password": "pw"}}
    scrubber.scrub_dict(d) # 1.21μs -> 888ns (36.6% faster)
    # Only top-level keys are inspected, so the nested value must survive.
    assert d == {"outer": {"password": "pw"}}



def test_scrub_dict_sensitive_key_in_list_of_dicts_non_recursive():
    """With recursive=False, dicts nested inside a list value are not scrubbed."""
    scrubber = EventScrubber(recursive=False)
    d = {"data": [{"password": "pw"}, {"user": "bob"}]}
    scrubber.scrub_dict(d) # 1.15μs -> 895ns (28.5% faster)
    assert d == {"data": [{"password": "pw"}, {"user": "bob"}]}


def test_scrub_dict_sensitive_key_with_leading_trailing_spaces():
    """A key padded with spaces does not match the denylist and is kept as-is."""
    scrubber = EventScrubber()
    d = {" password ": "pw"}
    scrubber.scrub_dict(d) # 1.19μs -> 913ns (30.8% faster)
    # Matching is on the lower-cased key as a whole, so padding defeats it.
    assert d == {" password ": "pw"}

def test_scrub_dict_sensitive_key_in_tuple_key():
    """Tuple keys are never scrubbed, even when they contain a sensitive name."""
    scrubber = EventScrubber()
    d = {("password",): "pw"}
    scrubber.scrub_dict(d) # 899ns -> 847ns (6.14% faster)
    assert d == {("password",): "pw"}






def test_scrub_dict_sensitive_key_with_bytes_key():
    """Bytes keys are not matched against the denylist and are left untouched."""
    scrubber = EventScrubber()
    d = {b"password": "pw"}
    scrubber.scrub_dict(d) # 762ns -> 764ns (0.262% slower)
    assert d == {b"password": "pw"}

# 3. Large Scale Test Cases



def test_scrub_dict_large_nested_dict_non_recursive():
    """With recursive=False, a large list of nested dicts is left unscrubbed."""
    scrubber = EventScrubber(recursive=False)
    d = {}
    d["a"] = [{"password": f"pw{i}", "other": i} for i in range(100)]
    scrubber.scrub_dict(d) # 1.19μs -> 945ns (25.4% faster)
    # The original loop body was a bare `pass`; check every nested entry instead.
    assert len(d["a"]) == 100
    for i, entry in enumerate(d["a"]):
        assert entry == {"password": f"pw{i}", "other": i}



#------------------------------------------------
import pytest
from sentry_sdk.scrubber import EventScrubber

# Constants presumably mirrored from the module under test; the function being
# tested is reached through the EventScrubber import above, not copied here.
# NOTE(review): neither constant is referenced by the tests visible in this
# file -- confirm before relying on or removing them.
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"

# Key names that appear to be treated as sensitive by the scrubber.
# NOTE(review): "PHPSESSID" and "XSRF-TOKEN" contain uppercase letters, while
# the PR description says the lookup is `k.lower() in self.denylist`; confirm
# that the real scrubber normalizes its denylist, otherwise these two entries
# could never match.
DEFAULT_DENYLIST = [
    "password",
    "passwd",
    "secret",
    "api_key",
    "apikey",
    "auth",
    "credentials",
    "mysql_pwd",
    "privatekey",
    "private_key",
    "token",
    "session",
    "csrftoken",
    "sessionid",
    "x_csrftoken",
    "x_forwarded_for",
    "set_cookie",
    "cookie",
    "authorization",
    "x_api_key",
    "aiohttp_session",
    "connect.sid",
    "csrf_token",
    "csrf",
    "_csrf",
    "_csrf_token",
    "PHPSESSID",
    "_session",
    "symfony",
    "user_session",
    "_xsrf",
    "XSRF-TOKEN",
]

# Alias for test clarity: the bound method of a single EventScrubber built with
# default arguments, used by the tests that follow.
scrub_dict = EventScrubber().scrub_dict

# ---------------------------
# UNIT TESTS
# ---------------------------

# 1. BASIC TEST CASES

def test_no_sensitive_keys():
    """A dict with no sensitive keys must remain unchanged."""
    d = {"foo": 1, "bar": "baz"}
    # Snapshot taken before the call; it was previously computed but never compared.
    orig = d.copy()
    scrub_dict(d) # 1.94μs -> 1.44μs (34.1% faster)
    assert d == orig





def test_empty_dict():
    """An empty dict must remain unchanged."""
    d = {}
    scrub_dict(d) # 674ns -> 589ns (14.4% faster)
    assert d == {}


def test_dict_with_nested_dict_nonrecursive():
    """Nested dicts are not scrubbed by a default-constructed scrubber."""
    d = {"outer": {"password": "1234"}}
    scrub_dict(d) # 1.39μs -> 1.01μs (37.4% faster)
    # "outer" is not sensitive and nested values are not visited here.
    assert d == {"outer": {"password": "1234"}}



def test_dict_with_list_of_dicts_nonrecursive():
    """Dicts inside a list value are not scrubbed when scrubbing is non-recursive."""
    d = {"list": [{"password": "abc"}, {"foo": "bar"}]}
    scrub_dict(d) # 1.34μs -> 1.01μs (32.8% faster)
    assert d == {"list": [{"password": "abc"}, {"foo": "bar"}]}












def test_dict_with_sensitive_key_and_whitespace():
    """A key padded with whitespace does not match the denylist."""
    d = {" password ": "should not be scrubbed"}
    scrub_dict(d) # 1.37μs -> 1.05μs (31.2% faster)
    assert d == {" password ": "should not be scrubbed"}

def test_non_dict_input():
    """Non-dict inputs must neither raise nor be modified."""
    # int and str are immutable, so for them the check is simply "does not raise".
    x = 123
    scrub_dict(x) # 481ns -> 417ns (15.3% faster)
    x = "not a dict"
    scrub_dict(x) # 225ns -> 225ns (0.000% faster)
    # A list is mutable, so also verify it was left untouched.
    x = ["not", "a", "dict"]
    scrub_dict(x) # 151ns -> 150ns (0.667% faster)
    assert x == ["not", "a", "dict"]

# 3. LARGE SCALE TEST CASES




def test_large_list_of_dicts_nonrecursive():
    """Large list of dicts, non-recursive: nothing inside the list is scrubbed."""
    d = {"list": [{"password": f"pw{i}", "foo": i} for i in range(100)]}
    scrub_dict(d) # 1.36μs -> 1.08μs (25.0% faster)
    # The original loop body was a bare `pass`; check every nested entry instead.
    assert len(d["list"]) == 100
    for i, entry in enumerate(d["list"]):
        assert entry == {"password": f"pw{i}", "foo": i}


def test_large_dict_with_many_non_sensitive_keys():
    """A dict with many non-sensitive keys must come back unchanged."""
    d = {f"nonsensitive{i}": i for i in range(1000)}
    # Snapshot taken before the call; it was previously computed but never compared.
    orig = d.copy()
    scrub_dict(d) # 196μs -> 81.7μs (141% faster)
    assert d == orig


def test_large_deeply_nested_structure_nonrecursive():
    """Deeply nested dicts and lists, non-recursive: nothing is scrubbed."""
    d = {"a": [{"b": {"c": [{"d": {"password": "pw"}}]}}]}
    scrub_dict(d) # 1.31μs -> 1.02μs (28.1% faster)
    # The only sensitive key sits several levels down, out of reach of a
    # non-recursive scrub.
    assert d == {"a": [{"b": {"c": [{"d": {"password": "pw"}}]}}]}
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes, run `git checkout codeflash/optimize-EventScrubber.scrub_dict-mg91kjfw`, commit your edits, and push.

Codeflash

The optimization achieves a **44% speedup** by converting the denylist from a list to a set for lookups while preserving the original list for compatibility.

**Key optimization:**
- Added `self._denylist_set = set(self.denylist)` in `__init__()` 
- Changed `k.lower() in self.denylist` to `k.lower() in self._denylist_set` in `scrub_dict()`

**Why this works:**
- List membership checking (`in` operator) is O(n) - it must scan through each element until found
- Set membership checking is O(1) average case - uses hash table for instant lookup
- The line profiler shows the lookup line went from 466.1ns per hit to 336.2ns per hit (about 28% less time per lookup, i.e. roughly a 39% speedup by the measure used elsewhere in this report)

**Performance impact by test case:**
- Most effective on dictionaries with many non-sensitive keys (141% speedup on 1000-key dict)
- Significant gains (25-37%) on nested structures and mixed sensitive/non-sensitive data
- Minimal overhead on simple cases (empty dicts, single keys)

The optimization is particularly beneficial for large dictionaries or applications that frequently scrub data with extensive denylists, as each key check becomes dramatically faster while maintaining identical functionality.
@misrasaurabh1 misrasaurabh1 requested a review from a team as a code owner October 16, 2025 02:03
@misrasaurabh1
Copy link
Contributor Author

btw i also think that the original self.denylist is not necessary, it should just be a set

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant