Skip to content

Commit 9792753

Browse files
committed
support Unicode characters
1 parent df73f41 commit 9792753

File tree

2 files changed

+56
-5
lines changed

2 files changed

+56
-5
lines changed

src/__tests__/unit/checks/keywords-urls.test.ts

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,64 @@ describe('keywords guardrail', () => {
2222
expect(result.info?.totalKeywords).toBe(1);
2323
});
2424

25-
it('ignores text without the configured keywords', () => {
26-
const result = keywordsCheck(
25+
it('ignores text without the configured keywords', async () => {
26+
const result = await keywordsCheck(
2727
{},
2828
'All clear content',
2929
KeywordsConfig.parse({ keywords: ['secret'] })
30-
) as GuardrailResult;
30+
);
3131

3232
expect(result.tripwireTriggered).toBe(false);
3333
expect(result.info?.matchedKeywords).toEqual([]);
3434
});
35+
36+
it('should return the correct result', async () => {
37+
const result = await keywordsCheck({}, 'Hello, world!', KeywordsConfig.parse({ keywords: ['hello', 'world'] }));
38+
expect(result.tripwireTriggered).toEqual(true);
39+
});
40+
41+
it('should not match partial words', async () => {
42+
const result = await keywordsCheck({}, 'Hello, world!', KeywordsConfig.parse({ keywords: ['orld'] }));
43+
expect(result.tripwireTriggered).toEqual(false);
44+
});
45+
46+
it('should match numbers', async () => {
47+
const result = await keywordsCheck({}, 'Hello, world123', KeywordsConfig.parse({ keywords: ['world123'] }));
48+
expect(result.tripwireTriggered).toEqual(true);
49+
expect(result.info.matchedKeywords).toEqual(['world123']);
50+
});
51+
52+
it('should not match partial numbers', async () => {
53+
const result = await keywordsCheck({}, 'Hello, world12345', KeywordsConfig.parse({ keywords: ['world123'] }));
54+
expect(result.tripwireTriggered).toEqual(false);
55+
});
56+
57+
it('should match underscore', async () => {
58+
const result = await keywordsCheck({}, 'Hello, w_o_r_l_d', KeywordsConfig.parse({ keywords: ['w_o_r_l_d'] }));
59+
expect(result.tripwireTriggered).toEqual(true);
60+
expect(result.info.matchedKeywords).toEqual(['w_o_r_l_d']);
61+
});
62+
63+
it('should not match in between underscore', async () => {
64+
const result = await keywordsCheck({}, 'Hello, test_world_test', KeywordsConfig.parse({ keywords: ['world'] }));
65+
expect(result.tripwireTriggered).toEqual(false);
66+
});
67+
68+
it('should work with chinese characters', async () => {
69+
const result = await keywordsCheck({}, '你好', KeywordsConfig.parse({ keywords: ['你好'] }));
70+
expect(result.tripwireTriggered).toEqual(true);
71+
});
72+
73+
it('should work with chinese characters with numbers', async () => {
74+
const result = await keywordsCheck({}, '你好123', KeywordsConfig.parse({ keywords: ['你好123'] }));
75+
expect(result.tripwireTriggered).toEqual(true);
76+
expect(result.info.matchedKeywords).toEqual(['你好123']);
77+
});
78+
79+
it('should not match partial chinese characters with numbers', async () => {
80+
const result = await keywordsCheck({}, '你好12345', KeywordsConfig.parse({ keywords: ['你好123'] }));
81+
expect(result.tripwireTriggered).toEqual(false);
82+
});
3583
});
3684

3785
describe('urls guardrail', () => {

src/checks/keywords.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,11 @@ export const keywordsCheck: CheckFn<KeywordsContext, string, KeywordsConfig> = (
5757
const escapedKeywords = sanitizedKeywords.map((k: string) =>
5858
k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
5959
);
60-
const patternText = `\\b(?:${escapedKeywords.join('|')})\\b`;
61-
const pattern = new RegExp(patternText, 'gi'); // case-insensitive, global
60+
// \p{L}|\p{N}|_ - any unicode letter, number, or underscore. Alternative to \b
61+
// (?<!\p{L}|\p{N}|_) - not preceded by a Unicode letter, number, or underscore
62+
// (?!\p{L}|\p{N}|_) - not followed by a Unicode letter, number, or underscore
63+
// The keyword alternation must be wrapped in a non-capturing group: `|` has the
// lowest precedence in a regex, so without the group the lookbehind would guard
// only the first keyword and the lookahead only the last one.
const patternText = `(?<!\\p{L}|\\p{N}|_)(?:${escapedKeywords.join('|')})(?!\\p{L}|\\p{N}|_)`;
64+
const pattern = new RegExp(patternText, 'giu'); // case-insensitive, global, unicode
6265

6366
const matches: string[] = [];
6467
let match;

0 commit comments

Comments
 (0)