Skip to content

Commit fca08ae

Browse files
committed
Add unicode handling to keyword match
1 parent 98fa6d0 commit fca08ae

File tree

2 files changed: +81 −3 lines changed

src/guardrails/checks/text/keywords.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -73,10 +73,11 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
73 73
Returns:
74 74
re.Pattern[str]: Compiled regex pattern to match any given keyword.
75 75
"""
76-
escaped = (re.escape(k) for k in keywords)
77-
pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
76+
escaped_keywords = tuple(re.escape(keyword) for keyword in keywords)
77+
# (?<!\w)/(?!\w) emulate Unicode-aware word boundaries (letters, digits, underscore).
78+
pattern_text = r"(?<!\w)(?:" + "|".join(escaped_keywords) + r")(?!\w)"
78 79

79-
return re.compile(pattern_text, re.IGNORECASE)
80+
return re.compile(pattern_text, re.IGNORECASE | re.UNICODE)
80 81

81 82

82 83
def match_keywords(

tests/unit/checks/test_keywords.py

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,3 +65,80 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
65 65
result = await keywords(ctx=None, data="Safe content", config=config)
66 66

67 67
assert result.tripwire_triggered is False # noqa: S101
68+
69+
70+
def test_match_keywords_does_not_match_partial_words() -> None:
71+
"""Ensure substrings embedded in larger words are ignored."""
72+
config = KeywordCfg(keywords=["orld"])
73+
result = match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
74+
75+
assert result.tripwire_triggered is False # noqa: S101
76+
77+
78+
def test_match_keywords_handles_numeric_tokens() -> None:
79+
"""Keywords containing digits should match exact tokens."""
80+
config = KeywordCfg(keywords=["world123"])
81+
result = match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
82+
83+
assert result.tripwire_triggered is True # noqa: S101
84+
assert result.info["matched"] == ["world123"] # noqa: S101
85+
86+
87+
def test_match_keywords_rejects_partial_numeric_tokens() -> None:
88+
"""Numeric keywords should not match when extra digits follow."""
89+
config = KeywordCfg(keywords=["world123"])
90+
result = match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
91+
92+
assert result.tripwire_triggered is False # noqa: S101
93+
94+
95+
def test_match_keywords_handles_underscored_tokens() -> None:
96+
"""Underscored keywords should be detected exactly once."""
97+
config = KeywordCfg(keywords=["w_o_r_l_d"])
98+
result = match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
99+
100+
assert result.tripwire_triggered is True # noqa: S101
101+
assert result.info["matched"] == ["w_o_r_l_d"] # noqa: S101
102+
103+
104+
def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
105+
"""Words surrounded by underscores should not trigger partial matches."""
106+
config = KeywordCfg(keywords=["world"])
107+
result = match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
108+
109+
assert result.tripwire_triggered is False # noqa: S101
110+
111+
112+
def test_match_keywords_handles_chinese_characters() -> None:
113+
"""Unicode keywords such as Chinese characters should match."""
114+
config = KeywordCfg(keywords=["你好"])
115+
result = match_keywords("你好", config, guardrail_name="Test Guardrail")
116+
117+
assert result.tripwire_triggered is True # noqa: S101
118+
assert result.info["matched"] == ["你好"] # noqa: S101
119+
120+
121+
def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
122+
"""Unicode keywords that include digits should match whole tokens."""
123+
config = KeywordCfg(keywords=["你好123"])
124+
result = match_keywords("你好123", config, guardrail_name="Test Guardrail")
125+
126+
assert result.tripwire_triggered is True # noqa: S101
127+
assert result.info["matched"] == ["你好123"] # noqa: S101
128+
129+
130+
def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
131+
"""Unicode keywords with trailing digits should not match supersets."""
132+
config = KeywordCfg(keywords=["你好123"])
133+
result = match_keywords("你好12345", config, guardrail_name="Test Guardrail")
134+
135+
assert result.tripwire_triggered is False # noqa: S101
136+
137+
138+
def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
139+
"""Every keyword in a multi-token pattern should respect Unicode boundaries."""
140+
config = KeywordCfg(keywords=["test", "hello", "world"])
141+
result = match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
142+
143+
assert result.tripwire_triggered is True # noqa: S101
144+
assert result.info["matched"] == ["hello", "world"] # noqa: S101

0 commit comments

Comments
 (0)