-
Notifications
You must be signed in to change notification settings - Fork 5
feat: redact SQL literals in query logging (WBC-139) #70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+267
−2
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
f298672
feat: redact SQL literals in query logging
ClayMav d2e736d
refactor(db): use sqlparse for SQL redaction
ClayMav ea518f5
fix: fail closed on unterminated-quote redaction to prevent multi-sta…
ClayMav 6b62c0f
fix(db): lazy DEBUG redaction, cap redacted INFO log, cover prefixed …
ClayMav File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,151 @@ | ||
| """Tests for SQL literal redaction used in query logging.""" | ||
|
|
||
| import pytest | ||
|
|
||
| from wherobots.db.redaction import get_statement_type, redact_sql | ||
|
|
||
|
|
||
| def test_redacts_single_string_literal() -> None: | ||
| assert redact_sql("SELECT * FROM t WHERE name = 'alice'") == ( | ||
| "SELECT * FROM t WHERE name = ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_redacts_string_with_escaped_quote() -> None: | ||
| # Doubled single-quote is an escaped quote inside the literal; the whole | ||
| # literal must collapse to one placeholder (no leakage of the second half). | ||
| assert redact_sql("SELECT * FROM t WHERE name = 'O''Brien'") == ( | ||
| "SELECT * FROM t WHERE name = ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_redacts_string_with_backslash_escaped_quote() -> None: | ||
| assert redact_sql(r"SELECT * FROM t WHERE name = 'a\'b'") == ( | ||
| "SELECT * FROM t WHERE name = ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_redacts_numeric_literals() -> None: | ||
| assert redact_sql("SELECT * FROM t WHERE age = 42 AND score > 3.14") == ( | ||
| "SELECT * FROM t WHERE age = ? AND score > ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_redacts_scientific_notation() -> None: | ||
| assert redact_sql("SELECT * FROM t WHERE x < 1.5e10") == ( | ||
| "SELECT * FROM t WHERE x < ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_double_quoted_identifier_left_intact() -> None: | ||
| # Double-quoted identifiers are column/table names, not value literals. | ||
| assert redact_sql('SELECT "user id", count(*) FROM "my table" WHERE x = 1') == ( | ||
| 'SELECT "user id", count(*) FROM "my table" WHERE x = ?' | ||
| ) | ||
|
|
||
|
|
||
| def test_backtick_identifier_left_intact() -> None: | ||
| assert redact_sql("SELECT `col` FROM `db`.`tbl` WHERE n = 5") == ( | ||
| "SELECT `col` FROM `db`.`tbl` WHERE n = ?" | ||
| ) | ||
|
|
||
|
|
||
| def test_show_tblproperties_statement() -> None: | ||
| # SHOW TBLPROPERTIES with a quoted property key: the single-quoted literal | ||
| # still redacts, while the table name (an identifier) stays intact. | ||
| assert redact_sql("SHOW TBLPROPERTIES my_db.my_table ('comment')") == ( | ||
| "SHOW TBLPROPERTIES my_db.my_table (?)" | ||
| ) | ||
|
|
||
|
|
||
| def test_select_where_secret_redacted() -> None: | ||
| statement = "SELECT id FROM users WHERE ssn = '123-45-6789'" | ||
| redacted = redact_sql(statement) | ||
| assert "123-45-6789" not in redacted | ||
| assert redacted == "SELECT id FROM users WHERE ssn = ?" | ||
|
|
||
|
|
||
| def test_identifier_with_digits_not_redacted() -> None: | ||
| # Column names containing digits must not be treated as numeric literals. | ||
| assert redact_sql("SELECT col1, t2.col3 FROM tbl4") == ( | ||
| "SELECT col1, t2.col3 FROM tbl4" | ||
| ) | ||
|
|
||
|
|
||
| def test_unterminated_string_does_not_leak() -> None: | ||
| redacted = redact_sql("SELECT * FROM t WHERE x = 'unterminated secret") | ||
| assert "secret" not in redacted | ||
| assert redacted == "SELECT * FROM t WHERE x = ?" | ||
|
|
||
|
|
||
| def test_unterminated_quote_does_not_leak_later_statements() -> None: | ||
| # Multi-statement input where the FIRST statement contains an unterminated | ||
| # quote (a lone ``"`` opener, which sqlparse emits as an Error token) and a | ||
| # LATER statement contains a secret literal. Redaction must fail closed and | ||
| # bail on the entire input, never appending a later statement verbatim -- | ||
| # otherwise the secret would leak into the logs. (sqlparse splits this into | ||
| # two statements, so this exercises the cross-statement path specifically.) | ||
| statement = "SELECT \" FROM t; SELECT v FROM creds WHERE token = 'topsecret123'" | ||
| redacted = redact_sql(statement) | ||
| assert "topsecret123" not in redacted | ||
| assert redacted == "SELECT ?" | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| ("statement", "expected", "secret"), | ||
| [ | ||
| # Hex/blob literal: sqlparse classifies the quoted body as String.Single, | ||
| # so the value collapses to ? (the bare X prefix is a Name and is kept). | ||
| ( | ||
| "SELECT * FROM t WHERE b = X'deadbeef'", | ||
| "SELECT * FROM t WHERE b = X?", | ||
| "deadbeef", | ||
| ), | ||
| # Unicode string literal (U&'...'): the quoted body is String.Single. | ||
| ( | ||
| r"SELECT * FROM t WHERE s = U&'\0041'", | ||
| "SELECT * FROM t WHERE s = U&?", | ||
| "0041", | ||
| ), | ||
| # National-character / symbol-prefixed literal (N'...'): String.Single. | ||
| ("SELECT * FROM t WHERE s = N'abc'", "SELECT * FROM t WHERE s = N?", "abc"), | ||
| ], | ||
| ) | ||
| def test_prefixed_string_literals_redacted( | ||
| statement: str, expected: str, secret: str | ||
| ) -> None: | ||
| # Hex (X'..'), unicode (U&'..') and symbol-prefixed (N'..') string literals | ||
| # all tokenize their value-bearing quoted body as String.Single, so the | ||
| # existing guard already redacts them -- no broadening to ``T.String`` is | ||
| # needed (and broadening would be harmful: double-quoted identifiers are | ||
| # String.Symbol, see test_double_quoted_identifier_left_intact). | ||
| redacted = redact_sql(statement) | ||
| assert secret not in redacted | ||
| assert redacted == expected | ||
|
|
||
|
|
||
| def test_multiple_literals_mixed() -> None: | ||
| statement = "INSERT INTO t (a, b) VALUES ('x', 10), ('y', 20)" | ||
| assert redact_sql(statement) == "INSERT INTO t (a, b) VALUES (?, ?), (?, ?)" | ||
|
|
||
|
|
||
| def test_line_comment_preserved_literal_in_comment_kept() -> None: | ||
| # We only redact value positions; a literal inside a comment is left as-is | ||
| # because comments are preserved verbatim (structure-preserving). | ||
| assert redact_sql("SELECT 1 -- note: keep this\n") == ( | ||
| "SELECT ? -- note: keep this\n" | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| ("statement", "expected"), | ||
| [ | ||
| ("SELECT 1", "SELECT"), | ||
| (" show tables", "SHOW"), | ||
| ("describe t", "DESCRIBE"), | ||
| ("/* c */ SELECT 1", "UNKNOWN"), | ||
| ("", "UNKNOWN"), | ||
| ], | ||
| ) | ||
| def test_get_statement_type(statement: str, expected: str) -> None: | ||
| assert get_statement_type(statement) == expected | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| """Redaction helpers for SQL statements logged by this driver. | ||
|
|
||
| The driver logs SQL statements for observability (e.g. ``Executing SQL query | ||
| <id>: ...``). Raw statements can embed literal PII (for example ``WHERE ssn = | ||
| '123-45-6789'``), and because this library is embedded by other services its log | ||
| output ends up in their log streams. This module replaces literal *values* | ||
| (single-quoted string literals and numeric literals) with a ``?`` placeholder | ||
| while preserving the statement structure: keywords, function names, and | ||
| identifiers (including double-quoted/back-quoted identifiers) are kept intact so | ||
| the redacted form is still useful for debugging and aggregation. | ||
|
|
||
| The implementation tokenizes with ``sqlparse``, a lenient, pure-Python tokenizer | ||
| with zero transitive dependencies. It is dialect-agnostic, never raises on | ||
| malformed input, and classifies literals by token type, which lets us redact | ||
| precisely the value-bearing tokens while copying everything else (keywords, | ||
| identifiers, operators, comments, whitespace) through verbatim. | ||
|
|
||
| This logic is duplicated from the ``sql-session`` service (PR #197); the two | ||
| repositories do not share a package, so the implementation is intentionally | ||
| replicated here rather than imported. | ||
|
|
||
| This is a best-effort redaction for *logging*. It is not a security boundary and | ||
| must not be relied on to sanitize untrusted input for any other purpose. | ||
| """ | ||
|
|
||
| import re | ||
|
|
||
| import sqlparse | ||
| from sqlparse import tokens as T | ||
|
|
||
| REDACTED_PLACEHOLDER = "?" | ||
|
|
||
| # Leading SQL keyword -> statement type. Used purely for observability tagging. | ||
| # Intentionally a regex (not sqlparse's ``Statement.get_type()``) because | ||
| # ``get_type()`` returns "UNKNOWN" for SHOW/DESCRIBE/SET, which would regress the | ||
| # observability tagging this module exists to support. | ||
| _STATEMENT_TYPE_RE = re.compile(r"\s*([A-Za-z]+)") | ||
|
|
||
|
|
||
| def get_statement_type(statement: str) -> str: | ||
| """Return the upper-cased leading keyword of a statement (e.g. ``SELECT``). | ||
|
|
||
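| Leading whitespace is skipped. Returns ``"UNKNOWN"`` when the first | ||
| non-whitespace character is not an ASCII letter -- for example an empty | ||
| string or a statement that begins with a comment. | ||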
| """ | ||
| match = _STATEMENT_TYPE_RE.match(statement) | ||
| if match is None: | ||
| return "UNKNOWN" | ||
| return match.group(1).upper() | ||
|
|
||
|
|
||
| def redact_sql(statement: str) -> str: | ||
| """Return ``statement`` with string and numeric literals replaced by ``?``. | ||
|
|
||
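| Single-quoted string literals (``String.Single``) and numeric literals | ||
| (``Number.*``) are each collapsed to a single ``?`` placeholder. Everything | ||
| else -- keywords, function names, plain and quoted identifiers (``"col"`` / | ||
| `` `col` ``), comments, whitespace, and operators -- is preserved verbatim. | ||
| The one exception is an unterminated quote (a lone ``'``, ``"`` or backtick | ||
| that sqlparse reports as an ``Error`` token): the opener is replaced by ``?`` | ||
| and redaction stops there, so the rest of the input -- including any later | ||
| statements -- is dropped from the result rather than copied through. | ||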
| """ | ||
| out: list[str] = [] | ||
| for stmt in sqlparse.parse(statement): | ||
| # sqlparse ships no type stubs, so ``flatten`` is untyped under strict. | ||
| for token in stmt.flatten(): # type: ignore[no-untyped-call] | ||
| ttype = token.ttype | ||
| if ttype in T.String.Single or ttype in T.Number: | ||
|
ClayMav marked this conversation as resolved.
|
||
| out.append(REDACTED_PLACEHOLDER) | ||
| elif ttype in T.Error and token.value in ("'", '"', "`"): | ||
| # An unterminated quote: sqlparse emits the lone opener as an | ||
|
salty-hambot[bot] marked this conversation as resolved.
|
||
| # Error token and tokenizes the trailing characters as ordinary | ||
| # text. Redact the opener and fail closed by bailing on the | ||
| # entire input -- returning here (rather than ``break``, which | ||
| # only exits the inner loop) ensures that for multi-statement | ||
| # input no later statement is appended verbatim, which would | ||
| # leak the literals this function exists to hide. | ||
| out.append(REDACTED_PLACEHOLDER) | ||
| return "".join(out) | ||
| else: | ||
| out.append(token.value) | ||
| return "".join(out) | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.