Skip to content

Commit ba2a942

Browse files
tliumozilla authored and Mardak committed
add chat whitelist for query intent detection along with the unit tests (#85)
1 parent e5f4ecf commit ba2a942

File tree

2 files changed

+337
-1
lines changed

2 files changed

+337
-1
lines changed

browser/components/genai/SmartAssistEngine.sys.mjs

Lines changed: 180 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,181 @@ const search_open_tabs = ({ type }) => {
5858
};
5959
};
6060

61+
/**
 * Whitelist of short greetings and chat starters.
 *
 * The list is handed to makeIsolatedPhraseChecker() to build
 * _isForcedChatIsolated. Entries are tokenized before matching, so case,
 * punctuation and apostrophe style do not matter here.
 *
 * Keep the entries in alphabetical order. The array is frozen because it is
 * shared module state that nothing should mutate at runtime.
 */
const FORCED_CHAT_PHRASES = Object.freeze([
  "amuse me",
  "are we alone",
  "are you alive",
  "are you gpt",
  "are you human",
  "are you real",
  "bark like dog",
  "cheer me up",
  "comfort me",
  "count numbers",
  "curse me",
  "do aliens exist",
  "do we matter",
  "do you dream",
  "do you think",
  "does fate exist",
  "dream meaning",
  "drop wisdom",
  "encourage me",
  "entertain me",
  "explain yourself",
  "flip coin",
  "give blessing",
  "give wisdom",
  "good morning",
  "good night",
  "goodbye",
  "guess number",
  "hallo",
  "hello",
  "hey",
  "hi",
  "hola",
  "how are you",
  "inspire me",
  "invent a word",
  "invent holiday",
  "invent joke",
  "is god real",
  "life advice",
  "life purpose",
  "list animals",
  "list capitals",
  "list colors",
  "list countries",
  "list elements",
  "list fruits",
  "list metals",
  "list oceans",
  "list planets",
  "list shapes",
  "meaning of life",
  "meow like cat",
  "motivate me",
  "now you are",
  "play a game",
  "pretend alien",
  "pretend child",
  "pretend detective",
  "pretend ghost",
  "pretend pirate",
  "pretend robot",
  "pretend superhero",
  "pretend teacher",
  "pretend wizard",
  "random fact",
  "random number",
  "roll dice",
  "simulate chat",
  "simulate future",
  "simulate past",
  "sing like robot",
  "sing lullaby",
  "sing rap",
  "sup",
  "surprise me",
  "teach me",
  "tell bedtime story",
  "tell fortune",
  "tell joke",
  "tell prophecy",
  "tell riddle",
  "tell story",
  "what is art",
  "what is beauty",
  "what is death",
  "what is freedom",
  "what is justice",
  "what is love",
  "what is mind",
  "what is reality",
  "what is right",
  "what is self",
  "what is soul",
  "what is time",
  "what is truth",
  "what is wrong",
  "what model are you",
  "what version",
  "what’s up",
  "which model are you",
  "who am i",
  "who are you",
  "who made you",
  "why are we",
  "write a poem",
  "write a song",
  "write haiku",
  "write quote",
  "your model is",
]);
177+
178+
// ------------------------
179+
// Normalization & tokenization (Unicode-aware) for chat whitelist
180+
// ------------------------
181+
/**
 * Canonicalize text before whitelist matching.
 *
 * Applies, in order: NFKC compatibility normalization, lowercasing,
 * collapsing of every whitespace run into a single ASCII space, and trimming.
 *
 * NFKC must run before lowercasing: some compatibility characters have no
 * lowercase mapping of their own but expand to uppercase ASCII (for example
 * U+210D expands to "H" and U+3392 to "MHz"). Lowercasing first would let
 * those capitals leak into the result.
 *
 * @param {string} s - Raw text. Must be a string; other values throw.
 * @returns {string} Normalized, lowercased, single-spaced, trimmed text.
 */
export function normalizeTextForWhitelist(s) {
  return s
    .normalize("NFKC")
    .toLowerCase()
    .replace(/\s+/g, " ")
    .trim();
}
188+
189+
// Split on non-word chars; letters/numbers/_ are "word" characters
190+
/**
 * Break text into word tokens.
 *
 * The input is first passed through normalizeTextForWhitelist(), then split
 * on every run of characters that are not Unicode letters, Unicode numbers or
 * underscores. Empty pieces (produced by leading/trailing separators) are
 * dropped.
 *
 * @param {string} s - Raw text.
 * @returns {string[]} Tokens in their original order.
 */
export function tokenize(s) {
  const normalized = normalizeTextForWhitelist(s);
  const pieces = normalized.split(/[^\p{L}\p{N}_]+/u);
  return pieces.filter(piece => piece.length > 0);
}
195+
196+
// ------------------------
197+
// Build phrase sets by token length (exact token sequence match)
198+
// ------------------------
199+
/**
 * Index phrases by their token count.
 *
 * Each phrase is tokenized and stored as its tokens joined by single spaces,
 * inside the Set that belongs to its token count. Phrases that produce no
 * tokens are ignored.
 *
 * @param {Iterable<string>} phrases - Phrases to index.
 * @returns {Map<number, Set<string>>} Token count -> set of "tok tok ..." keys.
 */
export function buildPhraseSets(phrases) {
  const setsByTokenCount = new Map();
  for (const phrase of phrases) {
    const tokens = tokenize(phrase);
    if (tokens.length === 0) {
      continue;
    }
    let bucket = setsByTokenCount.get(tokens.length);
    if (bucket === undefined) {
      bucket = new Set();
      setsByTokenCount.set(tokens.length, bucket);
    }
    bucket.add(tokens.join(" "));
  }
  return setsByTokenCount;
}
210+
211+
// Factory: returns a fast checker for “does query contain any isolated phrase?”
212+
/**
 * Build a checker answering "does this query contain any of the phrases as an
 * isolated token sequence?".
 *
 * "Isolated" means the phrase's tokens appear as whole, consecutive tokens of
 * the query; a phrase never matches inside a longer word.
 *
 * Results are memoized per normalized query. The memo is bounded: once it
 * holds `maxCacheSize` entries the oldest one is evicted, so a long-lived
 * checker cannot accumulate every query it has ever seen.
 *
 * @param {Iterable<string>} phrases - Phrases to look for.
 * @param {object} [options]
 * @param {number} [options.maxCacheSize=1000] - Maximum number of memoized
 *   queries. A value of 0 or less disables memoization.
 * @returns {function(string): boolean} The checker function.
 */
export function makeIsolatedPhraseChecker(phrases, { maxCacheSize = 1000 } = {}) {
  const byLen = buildPhraseSets(phrases);
  const cache = new Map();

  // Store a result, evicting the oldest entry first when the memo is full.
  // Map iterates in insertion order, so the first key is the oldest one.
  const remember = (key, result) => {
    if (maxCacheSize > 0) {
      if (cache.size >= maxCacheSize) {
        cache.delete(cache.keys().next().value);
      }
      cache.set(key, result);
    }
    return result;
  };

  return function containsIsolatedPhrase(query) {
    const qNorm = normalizeTextForWhitelist(query);
    const cached = cache.get(qNorm);
    if (cached !== undefined) {
      return cached;
    }

    // Same token definition as tokenize(); qNorm is already normalized.
    const toks = qNorm.split(/[^\p{L}\p{N}_]+/u).filter(Boolean);
    for (const [k, set] of byLen) {
      // Slide a k-token window over the query and look it up in the k-set.
      for (let i = 0; i + k <= toks.length; i++) {
        if (set.has(toks.slice(i, i + k).join(" "))) {
          return remember(qNorm, true);
        }
      }
    }
    return remember(qNorm, false);
  };
}
233+
234+
// Checker built once from FORCED_CHAT_PHRASES. getPromptIntent() calls it on
// the preprocessed query and returns "chat" when it matches.
export const _isForcedChatIsolated = makeIsolatedPhraseChecker(FORCED_CHAT_PHRASES);
235+
61236
/**
62237
* Smart Assist Engine
63238
*/
@@ -220,14 +395,17 @@ export const SmartAssistEngine = {
220395

221396
async getPromptIntent(query) {
222397
try {
398+
const cleanedQuery = this._preprocessQuery(query);
399+
if (_isForcedChatIsolated(cleanedQuery)) {
400+
return "chat";
401+
}
223402
const engine = await this._createEngine({
224403
featureId: "smart-intent",
225404
modelId: "mozilla/mobilebert-query-intent-detection",
226405
modelRevision: "v0.2.0",
227406
taskName: "text-classification",
228407
});
229408
const threshold = 0.6;
230-
const cleanedQuery = this._preprocessQuery(query);
231409
const resp = await engine.run({ args: [[cleanedQuery]] });
232410
// resp example: [{ label: "chat", score: 0.95 }, { label: "search", score: 0.04 }]
233411
if (
@@ -252,4 +430,5 @@ export const SmartAssistEngine = {
252430
}
253431
return query.replace(/\?/g, "").trim();
254432
},
433+
255434
};

browser/components/genai/tests/xpcshell/test_smart_assist_engine.js

Lines changed: 157 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,16 @@ const { sinon } = ChromeUtils.importESModule(
99
"resource://testing-common/Sinon.sys.mjs"
1010
);
1111

12+
const {
13+
normalizeTextForWhitelist,
14+
tokenize,
15+
buildPhraseSets,
16+
makeIsolatedPhraseChecker,
17+
_isForcedChatIsolated,
18+
} = ChromeUtils.importESModule(
19+
"moz-src:///browser/components/genai/SmartAssistEngine.sys.mjs"
20+
);
21+
1222
// Prefs
1323
const PREF_API_KEY = "browser.ml.smartAssist.apiKey";
1424
const PREF_ENDPOINT = "browser.ml.smartAssist.endpoint";
@@ -259,3 +269,150 @@ add_task(async function test_preprocessQuery_removes_question_marks() {
259269

260270
sb.restore();
261271
});
272+
273+
/**
 * normalizeTextForWhitelist: lowercasing, trimming, whitespace collapsing and
 * NFKC compatibility normalization.
 */
add_task(function test_normalizeTextForWhitelist_basic() {
  // lowercasing + trimming + collapsing internal spaces
  // (the input deliberately has leading, trailing and repeated spaces)
  Assert.equal(
    normalizeTextForWhitelist("  HeLLo   There  "),
    "hello there",
    "Should lowercase, trim, and collapse spaces"
  );

  // NFKC normalization: compatibility forms -> canonical.
  // The fullwidth "TEST 123" is spelled with escapes so that editors and
  // tooling cannot silently fold it to plain ASCII.
  Assert.equal(
    normalizeTextForWhitelist("\uFF34\uFF25\uFF33\uFF34 \uFF11\uFF12\uFF13"),
    "test 123",
    "Should NFKC-normalize fullwidth letters/digits"
  );

  // Multiple whitespace kinds (NBSP, tabs, newlines) collapse
  Assert.equal(
    normalizeTextForWhitelist("a\u00A0b\tc\nd"),
    "a b c d",
    "Should collapse all whitespace kinds to single spaces"
  );
});
296+
297+
/**
 * tokenize: splitting on non-word characters while keeping Unicode letters,
 * digits and underscores.
 */
add_task(function test_tokenize_unicode_and_boundaries() {
  const cases = [
    {
      // Splits on non-word chars, keeps letters/digits/underscore
      input: "hello, world! 42_times",
      expected: ["hello", "world", "42_times"],
      message: "Should split on punctuation and keep underscores",
    },
    {
      // Unicode letters are word characters (\p{L})
      input: "mañana—café!",
      expected: ["mañana", "café"],
      message:
        "Should keep Unicode letters and split on punctuation (em dash, bang)",
    },
    {
      // Apostrophes are separators, as intended
      input: "what's up",
      expected: ["what", "s", "up"],
      message: "Apostrophes are separators, so tokens split around them",
    },
  ];

  for (const { input, expected, message } of cases) {
    Assert.deepEqual(tokenize(input), expected, message);
  }
});
319+
320+
/**
 * buildPhraseSets: phrases are normalized, grouped by token count, and empty
 * phrases are skipped.
 */
add_task(function test_buildPhraseSets_grouping_and_normalization() {
  const phrases = [
    "sup",
    "hi there", // 2 tokens
    "what's up", // becomes "what s up" (3 tokens)
    "  foo   bar  ", // leading/trailing + multiple internal spaces
    "", // empty should be skipped
    "___", // token of underscores counts as 1 token
  ];
  const sets = buildPhraseSets(phrases);

  // Expect keys for lengths 1, 2, 3 and nothing else
  Assert.equal(sets.size, 3, "Should have exactly three length buckets");
  Assert.ok(sets.has(1), "Should have set for single-token phrases");
  Assert.ok(sets.has(2), "Should have set for two-token phrases");
  Assert.ok(sets.has(3), "Should have set for three-token phrases");

  // 1-token set contains exactly: "sup", "___"
  Assert.equal(sets.get(1).size, 2, "Single-token set should have two entries");
  Assert.ok(sets.get(1).has("sup"), "Single-token set should contain 'sup'");
  Assert.ok(sets.get(1).has("___"), "Single-token set should contain '___'");

  // 2-token set contains exactly: normalized "hi there" and "foo bar"
  Assert.equal(sets.get(2).size, 2, "Two-token set should have two entries");
  Assert.ok(sets.get(2).has("hi there"), "Two-token set should contain 'hi there'");
  Assert.ok(sets.get(2).has("foo bar"), "Two-token set should contain normalized 'foo bar'");

  // 3-token set contains exactly: "what s up" (note apostrophe split)
  Assert.equal(sets.get(3).size, 1, "Three-token set should have one entry");
  Assert.ok(sets.get(3).has("what s up"), "Three-token set should contain 'what s up'");

  // Empty phrase skipped: nothing added for length 0
  for (const [k, set] of sets) {
    Assert.ok(k > 0 && set.size >= 1, "No empty keys, each set has at least one entry");
  }
});
352+
353+
/**
 * Single-token phrases match only whole tokens, never substrings of longer
 * words.
 */
add_task(function test_isolated_phrase_checker_single_word_boundaries() {
  const isForced = makeIsolatedPhraseChecker(["sup", "hello", "___"]);

  // Positive: exact token present
  const shouldMatch = [
    ["sup bro", "Should match 'sup' as an isolated token at start"],
    ["hey, hello there", "Should match 'hello' surrounded by punctuation"],
    ["foo ___ bar", "Should match token with underscores"],
  ];
  for (const [query, message] of shouldMatch) {
    Assert.ok(isForced(query), message);
  }

  // Negative: partial-word should NOT match
  const shouldNotMatch = [
    ["supposingly, this should not match", "No partial-word match for 'sup'"],
    ["supper time", "No partial-word match inside 'supper'"],
    ["shelloworld", "No partial-word match for 'hello'"],
  ];
  for (const [query, message] of shouldNotMatch) {
    Assert.ok(!isForced(query), message);
  }
});
367+
368+
/**
 * Multi-token phrases match across punctuation separators but not when the
 * words are glued together.
 */
add_task(function test_isolated_phrase_checker_multiword_and_punctuation() {
  // Apostrophes become token splits -> "what's up" => "what s up"
  const isForced = makeIsolatedPhraseChecker(["hi there", "what's up"]);

  // Positive: punctuation between words should still match (token split)
  const shouldMatch = [
    ["hi—there!", "Em dash between words should match 'hi there'"],
    ["well, hi there!!", "Punctuation around phrase should match"],
    [
      "so, what’s up today?",
      "Curly apostrophe splits to tokens; should match 'what s up'",
    ],
  ];
  for (const [query, message] of shouldMatch) {
    Assert.ok(isForced(query), message);
  }

  // Negative: glued words should not match
  const shouldNotMatch = [
    ["hithere", "Concatenated words should not match 'hi there'"],
    ["whatssup", "Should not match 'what s up' without separators"],
  ];
  for (const [query, message] of shouldNotMatch) {
    Assert.ok(!isForced(query), message);
  }
});
382+
383+
/**
 * The checker normalizes whitespace, case and compatibility forms before
 * matching.
 */
add_task(function test_isolated_phrase_checker_spacing_and_unicode_norm() {
  const phrases = ["good morning", "hello"];
  const isForced = makeIsolatedPhraseChecker(phrases);

  // Multiple spaces collapse (the input deliberately repeats spaces)
  Assert.ok(isForced("good   morning   everyone"), "Multiple spaces between tokens should still match");

  // Case and surrounding spaces
  Assert.ok(isForced("  HELLO  "), "Case and surrounding spaces should normalize and match 'hello'");

  // Fullwidth "HELLO", written with escapes so it cannot be folded to ASCII
  // by tooling before the test runs
  Assert.ok(
    isForced("\uFF28\uFF25\uFF2C\uFF2C\uFF2F"),
    "Fullwidth letters should NFKC-normalize and match 'hello'"
  );

  // Non-breaking spaces and tabs
  Assert.ok(isForced("good\u00A0morning\tteam"), "NBSP and tabs normalize and match");
});
396+
397+
/**
 * Queries that contain none of the phrases as isolated tokens are rejected.
 */
add_task(function test_isolated_phrase_checker_no_match_cases() {
  const isForced = makeIsolatedPhraseChecker(["hi there", "sup"]);

  const shouldNotMatch = [
    ["", "Empty string should not match"],
    ["nothing to see here", "Unrelated text should not match"],
    ["support", "Partial token with 'sup' prefix should not match"],
  ];
  for (const [query, message] of shouldNotMatch) {
    Assert.ok(!isForced(query), message);
  }
});
405+
406+
/**
 * Memoized answers must equal freshly computed ones, for both matching and
 * non-matching queries.
 */
add_task(function test_isolated_phrase_checker_caching_stability() {
  const phrases = ["hello", "hi there"];
  const isForced = makeIsolatedPhraseChecker(phrases);

  // Repeated calls with the same input should return identical results (cache sanity).
  // The expected value is pinned too, so two equally wrong answers cannot pass.
  const q1 = "Hello there!";
  const first = isForced(q1);
  const second = isForced(q1);
  Assert.ok(first, "Query containing 'hello' should match on the first call");
  Assert.equal(first, second, "Same query should yield identical result across calls (cache-stable)");

  // A cached negative must stay negative
  const q2 = "nothing to see here";
  Assert.ok(!isForced(q2), "Unrelated query should not match on the first call");
  Assert.ok(!isForced(q2), "Unrelated query should not match on the repeated call");

  // Different whitespace should normalize to the same outcome
  Assert.equal(isForced("  hello   there  "), isForced("hello there"), "Whitespace variations should not affect result");
  Assert.ok(isForced("  hello   there  "), "Whitespace-padded query should still match 'hello'");
});

0 commit comments

Comments
 (0)