Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"pandas",
"types-requests>=2.31.0",
"pandas-stubs>=2.0.3.230814",
"sqlparse>=0.5.0",
]

[project.optional-dependencies]
Expand Down
151 changes: 151 additions & 0 deletions tests/test_redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Tests for SQL literal redaction used in query logging."""

import pytest

from wherobots.db.redaction import get_statement_type, redact_sql


def test_redacts_single_string_literal() -> None:
assert redact_sql("SELECT * FROM t WHERE name = 'alice'") == (
"SELECT * FROM t WHERE name = ?"
)


def test_redacts_string_with_escaped_quote() -> None:
# Doubled single-quote is an escaped quote inside the literal; the whole
# literal must collapse to one placeholder (no leakage of the second half).
assert redact_sql("SELECT * FROM t WHERE name = 'O''Brien'") == (
"SELECT * FROM t WHERE name = ?"
)


def test_redacts_string_with_backslash_escaped_quote() -> None:
assert redact_sql(r"SELECT * FROM t WHERE name = 'a\'b'") == (
"SELECT * FROM t WHERE name = ?"
)


def test_redacts_numeric_literals() -> None:
assert redact_sql("SELECT * FROM t WHERE age = 42 AND score > 3.14") == (
"SELECT * FROM t WHERE age = ? AND score > ?"
)


def test_redacts_scientific_notation() -> None:
assert redact_sql("SELECT * FROM t WHERE x < 1.5e10") == (
"SELECT * FROM t WHERE x < ?"
)


def test_double_quoted_identifier_left_intact() -> None:
# Double-quoted identifiers are column/table names, not value literals.
assert redact_sql('SELECT "user id", count(*) FROM "my table" WHERE x = 1') == (
'SELECT "user id", count(*) FROM "my table" WHERE x = ?'
)


def test_backtick_identifier_left_intact() -> None:
assert redact_sql("SELECT `col` FROM `db`.`tbl` WHERE n = 5") == (
"SELECT `col` FROM `db`.`tbl` WHERE n = ?"
)


def test_show_tblproperties_statement() -> None:
# SHOW TBLPROPERTIES with a quoted property key: the single-quoted literal
# still redacts, while the table name (an identifier) stays intact.
assert redact_sql("SHOW TBLPROPERTIES my_db.my_table ('comment')") == (
"SHOW TBLPROPERTIES my_db.my_table (?)"
)


def test_select_where_secret_redacted() -> None:
statement = "SELECT id FROM users WHERE ssn = '123-45-6789'"
redacted = redact_sql(statement)
assert "123-45-6789" not in redacted
assert redacted == "SELECT id FROM users WHERE ssn = ?"


def test_identifier_with_digits_not_redacted() -> None:
# Column names containing digits must not be treated as numeric literals.
assert redact_sql("SELECT col1, t2.col3 FROM tbl4") == (
"SELECT col1, t2.col3 FROM tbl4"
)


def test_unterminated_string_does_not_leak() -> None:
redacted = redact_sql("SELECT * FROM t WHERE x = 'unterminated secret")
Comment thread
salty-hambot[bot] marked this conversation as resolved.
assert "secret" not in redacted
assert redacted == "SELECT * FROM t WHERE x = ?"


def test_unterminated_quote_does_not_leak_later_statements() -> None:
# Multi-statement input where the FIRST statement contains an unterminated
# quote (a lone ``"`` opener, which sqlparse emits as an Error token) and a
# LATER statement contains a secret literal. Redaction must fail closed and
# bail on the entire input, never appending a later statement verbatim --
# otherwise the secret would leak into the logs. (sqlparse splits this into
# two statements, so this exercises the cross-statement path specifically.)
statement = "SELECT \" FROM t; SELECT v FROM creds WHERE token = 'topsecret123'"
redacted = redact_sql(statement)
assert "topsecret123" not in redacted
assert redacted == "SELECT ?"


@pytest.mark.parametrize(
("statement", "expected", "secret"),
[
# Hex/blob literal: sqlparse classifies the quoted body as String.Single,
# so the value collapses to ? (the bare X prefix is a Name and is kept).
(
"SELECT * FROM t WHERE b = X'deadbeef'",
"SELECT * FROM t WHERE b = X?",
"deadbeef",
),
# Unicode string literal (U&'...'): the quoted body is String.Single.
(
r"SELECT * FROM t WHERE s = U&'\0041'",
"SELECT * FROM t WHERE s = U&?",
"0041",
),
# National-character / symbol-prefixed literal (N'...'): String.Single.
("SELECT * FROM t WHERE s = N'abc'", "SELECT * FROM t WHERE s = N?", "abc"),
],
)
def test_prefixed_string_literals_redacted(
statement: str, expected: str, secret: str
) -> None:
# Hex (X'..'), unicode (U&'..') and symbol-prefixed (N'..') string literals
# all tokenize their value-bearing quoted body as String.Single, so the
# existing guard already redacts them -- no broadening to ``T.String`` is
# needed (and broadening would be harmful: double-quoted identifiers are
# String.Symbol, see test_double_quoted_identifier_left_intact).
redacted = redact_sql(statement)
assert secret not in redacted
assert redacted == expected


def test_multiple_literals_mixed() -> None:
statement = "INSERT INTO t (a, b) VALUES ('x', 10), ('y', 20)"
assert redact_sql(statement) == "INSERT INTO t (a, b) VALUES (?, ?), (?, ?)"


def test_line_comment_preserved_literal_in_comment_kept() -> None:
# We only redact value positions; a literal inside a comment is left as-is
# because comments are preserved verbatim (structure-preserving).
assert redact_sql("SELECT 1 -- note: keep this\n") == (
"SELECT ? -- note: keep this\n"
)


@pytest.mark.parametrize(
("statement", "expected"),
[
("SELECT 1", "SELECT"),
(" show tables", "SHOW"),
("describe t", "DESCRIBE"),
("/* c */ SELECT 1", "UNKNOWN"),
("", "UNKNOWN"),
],
)
def test_get_statement_type(statement: str, expected: str) -> None:
assert get_statement_type(statement) == expected
11 changes: 11 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 26 additions & 2 deletions wherobots/db/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from dataclasses import dataclass
from typing import Any, Callable, Dict

from wherobots.db.redaction import get_statement_type, redact_sql

import pandas
import pyarrow
import cbor2
Expand Down Expand Up @@ -260,9 +262,26 @@ def _handle_results(self, execution_id: str, results: Dict[str, Any]) -> Any:

def __send(self, message: Dict[str, Any]) -> None:
request = json.dumps(message)
logging.debug("Request: %s", request)
# Only compute the redacted request (json.dumps + sqlparse parse) when
# DEBUG is actually enabled; the log argument is evaluated eagerly, so an
# unguarded call would redact on every request even with DEBUG off.
if logging.getLogger().isEnabledFor(logging.DEBUG):
logging.debug("Request: %s", self.__redacted_request(message))
self.__ws.send(request)
Comment thread
salty-hambot[bot] marked this conversation as resolved.

@staticmethod
def __redacted_request(message: Dict[str, Any]) -> str:
"""Serialize a request for logging with any SQL statement redacted.

The wire payload (sent verbatim by ``__send``) carries the raw
``statement``; this driver is embedded by other services, so logging it
-- even at DEBUG -- would leak raw SQL into their log streams (WBC-139).
"""
statement = message.get("statement")
if isinstance(statement, str):
message = {**message, "statement": redact_sql(statement)}
return json.dumps(message)

def __recv(self) -> Dict[str, Any]:
frame = self.__ws.recv(timeout=self.__read_timeout)
if isinstance(frame, str):
Expand Down Expand Up @@ -301,8 +320,13 @@ def __execute_sql(
store=store,
)

# Redact literal values before logging: this driver is embedded by other
# services, so raw SQL here would leak into their log streams (WBC-139).
logging.info(
"Executing SQL query %s: %s", execution_id, textwrap.shorten(sql, width=60)
"Executing SQL query %s (%s): %s",
Comment thread
ClayMav marked this conversation as resolved.
execution_id,
get_statement_type(sql),
textwrap.shorten(redact_sql(sql), width=200),
)
self.__send(request)
return execution_id
Expand Down
78 changes: 78 additions & 0 deletions wherobots/db/redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Redaction helpers for SQL statements logged by this driver.

The driver logs SQL statements for observability (e.g. ``Executing SQL query
<id>: ...``). Raw statements can embed literal PII (for example ``WHERE ssn =
'123-45-6789'``), and because this library is embedded by other services its log
output ends up in their log streams. This module replaces literal *values*
(single-quoted string literals and numeric literals) with a ``?`` placeholder
while preserving the statement structure: keywords, function names, and
identifiers (including double-quoted/back-quoted identifiers) are kept intact so
the redacted form is still useful for debugging and aggregation.

The implementation tokenizes with ``sqlparse``, a lenient, pure-Python tokenizer
with zero transitive dependencies. It is dialect-agnostic, never raises on
malformed input, and classifies literals by token type, which lets us redact
precisely the value-bearing tokens while copying everything else (keywords,
identifiers, operators, comments, whitespace) through verbatim.

This logic is duplicated from the ``sql-session`` service (PR #197); the two
repositories do not share a package, so the implementation is intentionally
replicated here rather than imported.

This is a best-effort redaction for *logging*. It is not a security boundary and
must not be relied on to sanitize untrusted input for any other purpose.
"""

import re

import sqlparse
from sqlparse import tokens as T

REDACTED_PLACEHOLDER = "?"

# Leading SQL keyword -> statement type. Used purely for observability tagging.
# Intentionally a regex (not sqlparse's ``Statement.get_type()``) because
# ``get_type()`` returns "UNKNOWN" for SHOW/DESCRIBE/SET, which would regress the
# observability tagging this module exists to support.
_STATEMENT_TYPE_RE = re.compile(r"\s*([A-Za-z]+)")


def get_statement_type(statement: str) -> str:
"""Return the upper-cased leading keyword of a statement (e.g. ``SELECT``).

Returns ``"UNKNOWN"`` when the statement does not begin with a word.
"""
match = _STATEMENT_TYPE_RE.match(statement)
if match is None:
return "UNKNOWN"
return match.group(1).upper()


def redact_sql(statement: str) -> str:
"""Return ``statement`` with string and numeric literals replaced by ``?``.

Single-quoted string literals (``String.Single``) and numeric literals
(``Number.*``) are each collapsed to a single ``?`` placeholder. Everything
else -- keywords, function names, plain and quoted identifiers (``"col"`` /
`` `col` ``), comments, whitespace, and operators -- is preserved verbatim.
"""
out: list[str] = []
for stmt in sqlparse.parse(statement):
# sqlparse ships no type stubs, so ``flatten`` is untyped under strict.
for token in stmt.flatten(): # type: ignore[no-untyped-call]
ttype = token.ttype
if ttype in T.String.Single or ttype in T.Number:
Comment thread
ClayMav marked this conversation as resolved.
out.append(REDACTED_PLACEHOLDER)
elif ttype in T.Error and token.value in ("'", '"', "`"):
# An unterminated quote: sqlparse emits the lone opener as an
Comment thread
salty-hambot[bot] marked this conversation as resolved.
# Error token and tokenizes the trailing characters as ordinary
# text. Redact the opener and fail closed by bailing on the
# entire input -- returning here (rather than ``break``, which
# only exits the inner loop) ensures that for multi-statement
# input no later statement is appended verbatim, which would
# leak the literals this function exists to hide.
out.append(REDACTED_PLACEHOLDER)
return "".join(out)
else:
out.append(token.value)
return "".join(out)
Loading