Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/guardrails/checks/text/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,11 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
Returns:
re.Pattern[str]: Compiled regex pattern to match any given keyword.
"""
escaped = (re.escape(k) for k in keywords)
pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
escaped_keywords = tuple(re.escape(keyword) for keyword in keywords)
# (?<!\w)/(?!\w) emulate Unicode-aware word boundaries (letters, digits, underscore).
pattern_text = r"(?<!\w)(?:" + "|".join(escaped_keywords) + r")(?!\w)"

return re.compile(pattern_text, re.IGNORECASE)
return re.compile(pattern_text, re.IGNORECASE | re.UNICODE)


def match_keywords(
Expand Down
77 changes: 77 additions & 0 deletions tests/unit/checks/test_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,80 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
result = await keywords(ctx=None, data="Safe content", config=config)

assert result.tripwire_triggered is False # noqa: S101


def test_match_keywords_does_not_match_partial_words() -> None:
    """A keyword that only occurs inside a longer word must not trip the guardrail."""
    text = "Hello, world!"
    cfg = KeywordCfg(keywords=["orld"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    # "orld" is only present as the tail of "world", never as a standalone token.
    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_numeric_tokens() -> None:
    """A keyword ending in digits is reported when it appears as a whole token."""
    text = "Hello, world123"
    cfg = KeywordCfg(keywords=["world123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    expected_matches = ["world123"]
    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == expected_matches  # noqa: S101


def test_match_keywords_rejects_partial_numeric_tokens() -> None:
    """A digit-suffixed keyword must not match when more digits follow it in the text."""
    text = "Hello, world12345"
    cfg = KeywordCfg(keywords=["world123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    # "world123" is only a prefix of the token "world12345".
    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_underscored_tokens() -> None:
    """A keyword containing underscores is reported exactly once for a single occurrence."""
    text = "Hello, w_o_r_l_d"
    cfg = KeywordCfg(keywords=["w_o_r_l_d"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    expected_matches = ["w_o_r_l_d"]
    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == expected_matches  # noqa: S101


def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
    """A keyword flanked by underscores inside a larger identifier must not match."""
    text = "Hello, test_world_test"
    cfg = KeywordCfg(keywords=["world"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    # "world" is glued to its neighbours by underscores, so it is not a separate token.
    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_chinese_characters() -> None:
    """A keyword made of Chinese characters is reported when the text is that keyword."""
    text = "你好"
    cfg = KeywordCfg(keywords=["你好"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    expected_matches = ["你好"]
    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == expected_matches  # noqa: S101


def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
    """A keyword mixing Chinese characters and digits is reported as a whole token."""
    text = "你好123"
    cfg = KeywordCfg(keywords=["你好123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    expected_matches = ["你好123"]
    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == expected_matches  # noqa: S101


def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
    """A Chinese keyword with digits must not match when extra digits follow it."""
    text = "你好12345"
    cfg = KeywordCfg(keywords=["你好123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    # "你好123" is only a prefix of the longer token "你好12345".
    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
    """With several keywords configured, each one is held to whole-token matching."""
    text = "testing hello world"
    cfg = KeywordCfg(keywords=["test", "hello", "world"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    # "test" occurs only as the start of "testing", so it must be absent from the matches.
    expected_matches = ["hello", "world"]
    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == expected_matches  # noqa: S101
Loading