Skip to content

Commit 6d2d281

Browse files
Tighten semantic search normalization edge cases
Co-authored-by: Kent C. Dodds <me+github@kentcdodds.com>
1 parent 2c7a9d0 commit 6d2d281

File tree

2 files changed

+23
-8
lines changed

2 files changed

+23
-8
lines changed

app/utils/__tests__/semantic-search.server.test.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,9 @@ describe('semantic search result normalization', () => {
166166
expect(ids).toContain('blog:react-hooks-pitfalls')
167167

168168
const blogResult = results.find((r) => r.id === 'blog:react-hooks-pitfalls')
169-
expect(blogResult?.snippet).toBe('snippet-0')
169+
expect(blogResult).toBeDefined()
170+
expect(blogResult!.snippet).toBe('snippet-0')
171+
expect(blogResult!.score).toBe(0.99)
170172

171173
// Credits share the same URL, but should not be collapsed (slug differentiates them).
172174
expect(ids).toContain('credit:alice')

app/utils/semantic-search.server.ts

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -31,13 +31,14 @@ function normalizeUrlForKey(url: string): string {
3131
try {
3232
if (/^https?:\/\//i.test(url)) {
3333
const u = new URL(url)
34-
return u.pathname && u.pathname !== '/' ? u.pathname.replace(/\/+$/, '') : u.pathname
34+
return u.pathname !== '/' ? u.pathname.replace(/\/+$/, '') : u.pathname
3535
}
3636
} catch {
3737
// ignore
3838
}
39-
const cleaned = (url.split(/[?#]/)[0] ?? url).trim()
40-
return cleaned && cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
39+
const cleaned = (url.split(/[?#]/)[0] ?? '').trim()
40+
if (!cleaned) return '/'
41+
return cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
4142
}
4243

4344
/**
@@ -48,6 +49,11 @@ function normalizeTitleForKey(title: string) {
4849
return title.toLowerCase()
4950
}
5051

52+
function normalizeSlugForKey(slug: string) {
53+
// Normalize for case-insensitive dedupe parity with titles.
54+
return slug.toLowerCase()
55+
}
56+
5157
/**
5258
* Compute a doc-level identifier for semantic search results.
5359
*
@@ -69,9 +75,10 @@ function getCanonicalResultId({
6975
}) {
7076
// The Vectorize index stores multiple chunk vectors per doc, so we need a
7177
// canonical, doc-level identifier to collapse duplicates in query results.
72-
if (type && slug) return `${type}:${slug}`
73-
if (type && url) return `${type}:${normalizeUrlForKey(url)}`
74-
if (url) return normalizeUrlForKey(url)
78+
if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
79+
const normalizedUrl = url ? normalizeUrlForKey(url) : undefined
80+
if (type && normalizedUrl) return `${type}:${normalizedUrl}`
81+
if (normalizedUrl) return normalizedUrl
7582
if (type && title) return `${type}:${normalizeTitleForKey(title)}`
7683
return vectorId
7784
}
@@ -222,6 +229,10 @@ export async function semanticSearchKCD({
222229
topK = 15,
223230
}: {
224231
query: string
232+
/**
233+
* Requested number of unique docs to return.
234+
* Clamped to 20 because Vectorize metadata queries cap `topK` at 20.
235+
*/
225236
topK?: number
226237
}): Promise<Array<SemanticSearchResult>> {
227238
const { accountId, apiToken, indexName, embeddingModel } =
@@ -234,7 +245,9 @@ export async function semanticSearchKCD({
234245
}
235246

236247
const safeTopK =
237-
typeof topK === 'number' && Number.isFinite(topK) ? Math.max(1, Math.floor(topK)) : 15
248+
typeof topK === 'number' && Number.isFinite(topK)
249+
? Math.max(1, Math.min(20, Math.floor(topK)))
250+
: 15
238251
// Vectorize returns chunk-level matches and overlapping chunks commonly score
239252
// highly together. Overfetch and then de-dupe down to unique docs.
240253
// When requesting metadata, Vectorize caps topK at 20.

0 commit comments

Comments
 (0)