Skip to content

Commit 951b068

Browse files
Duplicate content normalization (#602)
* Normalize semantic search duplicate results
* Clamp vectorize overfetch limit
* Fix fetch body typing for Workers AI transcription
* Harden semantic search canonicalization
* Add docs for semantic search normalization helpers
* Tighten semantic search normalization edge cases
* Use MSW Cloudflare mocks for semantic search tests
* Fix import order in semantic search test

Co-authored-by: Kent C. Dodds <me+github@kentcdodds.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
1 parent 0ce48c0 commit 951b068

File tree

3 files changed

+325
-13
lines changed

3 files changed

+325
-13
lines changed

app/utils/__tests__/semantic-search.server.test.ts

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,25 @@
1-
import { describe, expect, test } from 'vitest'
1+
import { setupServer } from 'msw/node'
2+
import { afterAll, beforeAll, beforeEach, describe, expect, test } from 'vitest'
3+
import { cloudflareHandlers, resetCloudflareMockState } from '../../../mocks/cloudflare.ts'
24
import {
35
isSemanticSearchConfigured,
46
semanticSearchKCD,
57
} from '../semantic-search.server.ts'
68

9+
const server = setupServer(...cloudflareHandlers)
10+
11+
beforeAll(() => {
12+
server.listen({ onUnhandledRequest: 'error' })
13+
})
14+
15+
beforeEach(() => {
16+
resetCloudflareMockState()
17+
})
18+
19+
afterAll(() => {
20+
server.close()
21+
})
22+
723
describe('semantic search env gating', () => {
824
test('isSemanticSearchConfigured is false without env vars', () => {
925
const original = {
@@ -46,3 +62,141 @@ describe('semantic search env gating', () => {
4662
})
4763
})
4864

65+
describe('semantic search result normalization', () => {
66+
test('dedupes chunk-level matches into unique docs', async () => {
67+
const originalEnv = {
68+
CLOUDFLARE_ACCOUNT_ID: process.env.CLOUDFLARE_ACCOUNT_ID,
69+
CLOUDFLARE_API_TOKEN: process.env.CLOUDFLARE_API_TOKEN,
70+
CLOUDFLARE_VECTORIZE_INDEX: process.env.CLOUDFLARE_VECTORIZE_INDEX,
71+
CLOUDFLARE_AI_EMBEDDING_MODEL: process.env.CLOUDFLARE_AI_EMBEDDING_MODEL,
72+
}
73+
try {
74+
const accountId = 'acc123'
75+
const apiToken = 'test-token'
76+
const indexName = 'semantic-index'
77+
78+
process.env.CLOUDFLARE_ACCOUNT_ID = accountId
79+
process.env.CLOUDFLARE_API_TOKEN = apiToken
80+
process.env.CLOUDFLARE_VECTORIZE_INDEX = indexName
81+
delete process.env.CLOUDFLARE_AI_EMBEDDING_MODEL
82+
83+
// Use a query that's unlikely to match any seeded doc titles/snippets,
84+
// so the Cloudflare Vectorize mock falls back to cosine similarity rather
85+
// than match-sorter ranking.
86+
const query = 'zz_semantic_dedupe_test_02157475'
87+
88+
const embedRes = await fetch(
89+
`https://api.cloudflare.com/client/v4/accounts/${accountId}/ai/run/@cf/google/embeddinggemma-300m`,
90+
{
91+
method: 'POST',
92+
headers: {
93+
Authorization: `Bearer ${apiToken}`,
94+
'Content-Type': 'application/json',
95+
},
96+
body: JSON.stringify({ text: [query] }),
97+
},
98+
)
99+
expect(embedRes.ok).toBe(true)
100+
const embedJson = (await embedRes.json()) as any
101+
const vector = embedJson?.result?.data?.[0] as unknown
102+
expect(Array.isArray(vector)).toBe(true)
103+
104+
const vectorsToUpsert = [
105+
{
106+
id: 'blog:cursor-dup:chunk:0',
107+
values: vector as number[],
108+
metadata: {
109+
type: 'blog',
110+
slug: 'cursor-dup',
111+
url: '/blog/cursor-dup',
112+
title: 'Cursor Dup',
113+
snippet: 'snippet-best',
114+
},
115+
},
116+
{
117+
id: 'blog:cursor-dup:chunk:1',
118+
values: vector as number[],
119+
metadata: {
120+
type: 'blog',
121+
slug: 'cursor-dup',
122+
url: '/blog/cursor-dup',
123+
title: 'Cursor Dup (chunk 2)',
124+
snippet: 'snippet-worse',
125+
},
126+
},
127+
{
128+
id: 'blog:cursor-one:chunk:0',
129+
values: vector as number[],
130+
metadata: {
131+
type: 'blog',
132+
slug: 'cursor-one',
133+
url: '/blog/cursor-one',
134+
title: 'Cursor One',
135+
snippet: 'one-snippet',
136+
},
137+
},
138+
{
139+
id: 'credit:alice:chunk:0',
140+
values: vector as number[],
141+
metadata: {
142+
type: 'credit',
143+
slug: 'alice',
144+
url: '/credits',
145+
title: 'Alice',
146+
snippet: 'alice-snippet',
147+
},
148+
},
149+
{
150+
id: 'credit:bob:chunk:0',
151+
values: vector as number[],
152+
metadata: {
153+
type: 'credit',
154+
slug: 'bob',
155+
url: '/credits',
156+
title: 'Bob',
157+
snippet: 'bob-snippet',
158+
},
159+
},
160+
]
161+
162+
const ndjson =
163+
vectorsToUpsert.map((v) => JSON.stringify(v)).join('\n') + '\n'
164+
const upsertRes = await fetch(
165+
`https://api.cloudflare.com/client/v4/accounts/${accountId}/vectorize/v2/indexes/${indexName}/upsert`,
166+
{
167+
method: 'POST',
168+
headers: {
169+
Authorization: `Bearer ${apiToken}`,
170+
'Content-Type': 'application/x-ndjson',
171+
},
172+
body: ndjson,
173+
},
174+
)
175+
expect(upsertRes.ok).toBe(true)
176+
177+
const results = await semanticSearchKCD({ query, topK: 4 })
178+
expect(results).toHaveLength(4)
179+
180+
// Chunk-level duplicates collapse into a single doc-level result.
181+
const ids = results.map((r) => r.id)
182+
const urls = results.map((r) => r.url)
183+
expect(new Set(ids).size).toBe(ids.length)
184+
expect(urls.filter((u) => u === '/blog/cursor-dup')).toHaveLength(1)
185+
186+
const blogResult = results.find((r) => r.url === '/blog/cursor-dup')
187+
expect(blogResult).toBeDefined()
188+
expect(blogResult!.snippet).toBe('snippet-best')
189+
190+
// Credits share the same URL, but should not be collapsed (slug differentiates them).
191+
expect(ids).toContain('credit:alice')
192+
expect(ids).toContain('credit:bob')
193+
expect(urls.filter((u) => u === '/credits')).toHaveLength(2)
194+
} finally {
195+
for (const [key, value] of Object.entries(originalEnv)) {
196+
if (typeof value === 'string') process.env[key] = value
197+
else delete process.env[key]
198+
}
199+
}
200+
})
201+
})
202+

app/utils/cloudflare-ai-transcription.server.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ export async function transcribeMp3WithWorkersAi({
4141
model,
4242
)}`
4343

44+
// Some TS `fetch` typings only accept `ArrayBufferView` backed by `ArrayBuffer`
45+
// (not `ArrayBufferLike`). Convert to an `ArrayBuffer`-backed view without
46+
// copying when possible.
47+
const mp3Body =
48+
mp3.buffer instanceof ArrayBuffer
49+
? new Uint8Array(mp3.buffer, mp3.byteOffset, mp3.byteLength)
50+
: Uint8Array.from(mp3)
51+
4452
// For `@cf/openai/whisper`, Cloudflare supports raw binary audio as the body.
4553
// Docs: https://developers.cloudflare.com/workers-ai/models/whisper/
4654
const res = await fetch(url, {
@@ -50,9 +58,7 @@ export async function transcribeMp3WithWorkersAi({
5058
// Best-effort content-type; CF can infer in many cases, but be explicit.
5159
'Content-Type': 'audio/mpeg',
5260
},
53-
// Some fetch/undici TS typings are stricter than runtime and require
54-
// `Uint8Array<ArrayBuffer>` rather than `Uint8Array<ArrayBufferLike>`.
55-
body: mp3 as unknown as Uint8Array<ArrayBuffer>,
61+
body: mp3Body,
5662
})
5763

5864
if (!res.ok) {

app/utils/semantic-search.server.ts

Lines changed: 161 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,95 @@ type VectorizeQueryResponse = {
1212
}>
1313
}
1414

15+
/**
16+
* Parse a value that may be a string, returning a trimmed non-empty string.
17+
*/
18+
function asNonEmptyString(value: unknown): string | undefined {
19+
if (typeof value !== 'string') return undefined
20+
const trimmed = value.trim()
21+
return trimmed ? trimmed : undefined
22+
}
23+
24+
/**
25+
* Normalize a URL/path into a stable key:
26+
* - absolute URLs -> pathname
27+
* - relative paths -> strip query/fragment and trailing slashes
28+
*/
29+
function normalizeUrlForKey(url: string): string {
30+
// Prefer treating absolute URLs and relative paths as the same canonical key.
31+
try {
32+
if (/^https?:\/\//i.test(url)) {
33+
const u = new URL(url)
34+
return u.pathname !== '/' ? u.pathname.replace(/\/+$/, '') : u.pathname
35+
}
36+
} catch {
37+
// ignore
38+
}
39+
const cleaned = (url.split(/[?#]/)[0] ?? '').trim()
40+
if (!cleaned) return '/'
41+
return cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
42+
}
43+
44+
/**
45+
* Normalize a title for canonicalization (case-insensitive).
46+
*/
47+
function normalizeTitleForKey(title: string) {
48+
// asNonEmptyString already trims; use lowercase to avoid casing-only duplicates.
49+
return title.toLowerCase()
50+
}
51+
52+
function normalizeSlugForKey(slug: string) {
53+
// Normalize for case-insensitive dedupe parity with titles.
54+
return slug.toLowerCase()
55+
}
56+
57+
function parseDocRefFromVectorId(
58+
vectorId: string,
59+
): { type: string; slug: string } | null {
60+
// Indexers generally use `<type>:<slug>:chunk:<n>` for chunk-level vectors.
61+
// When metadata is missing/incomplete, we can still collapse chunk hits into a
62+
// doc-level hit using the stable vector id structure.
63+
const match =
64+
/^(?<type>[^:]+):(?<slug>[^:]+):chunk:(?<chunkIndex>\d+)$/u.exec(vectorId)
65+
const type = match?.groups?.type
66+
const slug = match?.groups?.slug
67+
if (!type || !slug) return null
68+
return { type, slug }
69+
}
70+
71+
/**
72+
* Compute a doc-level identifier for semantic search results.
73+
*
74+
* Vectorize stores one vector per chunk; the canonical ID collapses chunk hits
75+
* into a single doc hit so search results don't contain duplicates.
76+
*/
77+
function getCanonicalResultId({
78+
vectorId,
79+
type,
80+
slug,
81+
url,
82+
title,
83+
}: {
84+
vectorId: string
85+
type: string | undefined
86+
slug: string | undefined
87+
url: string | undefined
88+
title: string | undefined
89+
}) {
90+
// The Vectorize index stores multiple chunk vectors per doc, so we need a
91+
// canonical, doc-level identifier to collapse duplicates in query results.
92+
if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
93+
const fromVectorId = parseDocRefFromVectorId(vectorId)
94+
if (fromVectorId) {
95+
return `${fromVectorId.type}:${normalizeSlugForKey(fromVectorId.slug)}`
96+
}
97+
const normalizedUrl = url ? normalizeUrlForKey(url) : undefined
98+
if (type && normalizedUrl) return `${type}:${normalizedUrl}`
99+
if (normalizedUrl) return normalizedUrl
100+
if (type && title) return `${type}:${normalizeTitleForKey(title)}`
101+
return vectorId
102+
}
103+
15104
function getRequiredSemanticSearchEnv() {
16105
const accountId = process.env.CLOUDFLARE_ACCOUNT_ID
17106
const apiToken = process.env.CLOUDFLARE_API_TOKEN
@@ -158,6 +247,10 @@ export async function semanticSearchKCD({
158247
topK = 15,
159248
}: {
160249
query: string
250+
/**
251+
* Requested number of unique docs to return.
252+
* Clamped to 20 because Vectorize metadata queries cap `topK` at 20.
253+
*/
161254
topK?: number
162255
}): Promise<Array<SemanticSearchResult>> {
163256
const { accountId, apiToken, indexName, embeddingModel } =
@@ -169,6 +262,15 @@ export async function semanticSearchKCD({
169262
)
170263
}
171264

265+
const safeTopK =
266+
typeof topK === 'number' && Number.isFinite(topK)
267+
? Math.max(1, Math.min(20, Math.floor(topK)))
268+
: 15
269+
// Vectorize returns chunk-level matches and overlapping chunks commonly score
270+
// highly together. Overfetch and then de-dupe down to unique docs.
271+
// When requesting metadata, Vectorize caps topK at 20.
272+
const rawTopK = Math.min(20, safeTopK * 5)
273+
172274
const vector = await getEmbedding({
173275
accountId,
174276
apiToken,
@@ -181,21 +283,71 @@ export async function semanticSearchKCD({
181283
apiToken,
182284
indexName,
183285
vector,
184-
topK,
286+
topK: rawTopK,
185287
})
186288
const result = (responseJson as any).result ?? responseJson
187289
const matches = (result?.matches ?? []) as VectorizeQueryResponse['matches']
188290

189-
return matches.map((m) => {
291+
type RankedResult = { rank: number; result: SemanticSearchResult }
292+
const byCanonicalId = new Map<string, RankedResult>()
293+
294+
for (let i = 0; i < matches.length; i++) {
295+
const m = matches[i]
296+
if (!m) continue
190297
const md = (m.metadata ?? {}) as Record<string, unknown>
191-
return {
192-
id: m.id,
298+
const type = asNonEmptyString(md.type)
299+
const slug = asNonEmptyString(md.slug)
300+
const title = asNonEmptyString(md.title)
301+
const url = asNonEmptyString(md.url)
302+
const snippet = asNonEmptyString(md.snippet)
303+
304+
const canonicalId = getCanonicalResultId({
305+
vectorId: m.id,
306+
type,
307+
slug,
308+
url,
309+
title,
310+
})
311+
312+
const next: SemanticSearchResult = {
313+
id: canonicalId,
193314
score: m.score,
194-
type: typeof md.type === 'string' ? md.type : undefined,
195-
title: typeof md.title === 'string' ? md.title : undefined,
196-
url: typeof md.url === 'string' ? md.url : undefined,
197-
snippet: typeof md.snippet === 'string' ? md.snippet : undefined,
315+
type,
316+
title,
317+
url,
318+
snippet,
198319
}
199-
})
320+
321+
const existing = byCanonicalId.get(canonicalId)
322+
if (!existing) {
323+
byCanonicalId.set(canonicalId, { rank: i, result: next })
324+
continue
325+
}
326+
327+
const prev = existing.result
328+
const prevScore = typeof prev.score === 'number' && Number.isFinite(prev.score) ? prev.score : -Infinity
329+
const nextScore = typeof next.score === 'number' && Number.isFinite(next.score) ? next.score : -Infinity
330+
const bestScore = Math.max(prevScore, nextScore)
331+
const nextIsBetter = nextScore > prevScore
332+
333+
existing.result = {
334+
id: canonicalId,
335+
score: bestScore,
336+
type: prev.type ?? next.type,
337+
title: prev.title ?? next.title,
338+
url: prev.url ?? next.url,
339+
// Prefer the snippet from the highest-scoring chunk, but fall back to any snippet.
340+
snippet: nextIsBetter ? next.snippet ?? prev.snippet : prev.snippet ?? next.snippet,
341+
}
342+
}
343+
344+
return [...byCanonicalId.values()]
345+
.sort((a, b) => {
346+
const scoreDiff = (b.result.score ?? 0) - (a.result.score ?? 0)
347+
if (scoreDiff) return scoreDiff
348+
return a.rank - b.rank
349+
})
350+
.slice(0, safeTopK)
351+
.map((x) => x.result)
200352
}
201353

0 commit comments

Comments
 (0)