Skip to content

Commit 951b068

Browse files
Duplicate content normalization (#602)
* Normalize semantic search duplicate results
* Clamp vectorize overfetch limit
* Fix fetch body typing for Workers AI transcription
* Harden semantic search canonicalization
* Add docs for semantic search normalization helpers
* Tighten semantic search normalization edge cases
* Use MSW Cloudflare mocks for semantic search tests
* Fix import order in semantic search test

Co-authored-by: Kent C. Dodds <me+github@kentcdodds.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
1 parent 0ce48c0 commit 951b068

File tree

3 files changed

+325
-13
lines changed

3 files changed

+325
-13
lines changed

app/utils/__tests__/semantic-search.server.test.ts

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,25 @@
1-
import { describe, expect, test } from 'vitest'
1+
import { setupServer } from 'msw/node'
2+
import { afterAll, beforeAll, beforeEach, describe, expect, test } from 'vitest'
3+
import { cloudflareHandlers, resetCloudflareMockState } from '../../../mocks/cloudflare.ts'
24
import {
35
isSemanticSearchConfigured,
46
semanticSearchKCD,
57
} from '../semantic-search.server.ts'
68

9+
const server = setupServer(...cloudflareHandlers)
10+
11+
beforeAll(() => {
12+
server.listen({ onUnhandledRequest: 'error' })
13+
})
14+
15+
beforeEach(() => {
16+
resetCloudflareMockState()
17+
})
18+
19+
afterAll(() => {
20+
server.close()
21+
})
22+
723
describe('semantic search env gating', () => {
824
test('isSemanticSearchConfigured is false without env vars', () => {
925
const original = {
@@ -46,3 +62,141 @@ describe('semantic search env gating', () => {
4662
})
4763
})
4864

65+
describe('semantic search result normalization', () => {
66+
test('dedupes chunk-level matches into unique docs', async () => {
67+
const originalEnv = {
68+
CLOUDFLARE_ACCOUNT_ID: process.env.CLOUDFLARE_ACCOUNT_ID,
69+
CLOUDFLARE_API_TOKEN: process.env.CLOUDFLARE_API_TOKEN,
70+
CLOUDFLARE_VECTORIZE_INDEX: process.env.CLOUDFLARE_VECTORIZE_INDEX,
71+
CLOUDFLARE_AI_EMBEDDING_MODEL: process.env.CLOUDFLARE_AI_EMBEDDING_MODEL,
72+
}
73+
try {
74+
const accountId = 'acc123'
75+
const apiToken = 'test-token'
76+
const indexName = 'semantic-index'
77+
78+
process.env.CLOUDFLARE_ACCOUNT_ID = accountId
79+
process.env.CLOUDFLARE_API_TOKEN = apiToken
80+
process.env.CLOUDFLARE_VECTORIZE_INDEX = indexName
81+
delete process.env.CLOUDFLARE_AI_EMBEDDING_MODEL
82+
83+
// Use a query that's unlikely to match any seeded doc titles/snippets,
84+
// so the Cloudflare Vectorize mock falls back to cosine similarity rather
85+
// than match-sorter ranking.
86+
const query = 'zz_semantic_dedupe_test_02157475'
87+
88+
const embedRes = await fetch(
89+
`https://api.cloudflare.com/client/v4/accounts/${accountId}/ai/run/@cf/google/embeddinggemma-300m`,
90+
{
91+
method: 'POST',
92+
headers: {
93+
Authorization: `Bearer ${apiToken}`,
94+
'Content-Type': 'application/json',
95+
},
96+
body: JSON.stringify({ text: [query] }),
97+
},
98+
)
99+
expect(embedRes.ok).toBe(true)
100+
const embedJson = (await embedRes.json()) as any
101+
const vector = embedJson?.result?.data?.[0] as unknown
102+
expect(Array.isArray(vector)).toBe(true)
103+
104+
const vectorsToUpsert = [
105+
{
106+
id: 'blog:cursor-dup:chunk:0',
107+
values: vector as number[],
108+
metadata: {
109+
type: 'blog',
110+
slug: 'cursor-dup',
111+
url: '/blog/cursor-dup',
112+
title: 'Cursor Dup',
113+
snippet: 'snippet-best',
114+
},
115+
},
116+
{
117+
id: 'blog:cursor-dup:chunk:1',
118+
values: vector as number[],
119+
metadata: {
120+
type: 'blog',
121+
slug: 'cursor-dup',
122+
url: '/blog/cursor-dup',
123+
title: 'Cursor Dup (chunk 2)',
124+
snippet: 'snippet-worse',
125+
},
126+
},
127+
{
128+
id: 'blog:cursor-one:chunk:0',
129+
values: vector as number[],
130+
metadata: {
131+
type: 'blog',
132+
slug: 'cursor-one',
133+
url: '/blog/cursor-one',
134+
title: 'Cursor One',
135+
snippet: 'one-snippet',
136+
},
137+
},
138+
{
139+
id: 'credit:alice:chunk:0',
140+
values: vector as number[],
141+
metadata: {
142+
type: 'credit',
143+
slug: 'alice',
144+
url: '/credits',
145+
title: 'Alice',
146+
snippet: 'alice-snippet',
147+
},
148+
},
149+
{
150+
id: 'credit:bob:chunk:0',
151+
values: vector as number[],
152+
metadata: {
153+
type: 'credit',
154+
slug: 'bob',
155+
url: '/credits',
156+
title: 'Bob',
157+
snippet: 'bob-snippet',
158+
},
159+
},
160+
]
161+
162+
const ndjson =
163+
vectorsToUpsert.map((v) => JSON.stringify(v)).join('\n') + '\n'
164+
const upsertRes = await fetch(
165+
`https://api.cloudflare.com/client/v4/accounts/${accountId}/vectorize/v2/indexes/${indexName}/upsert`,
166+
{
167+
method: 'POST',
168+
headers: {
169+
Authorization: `Bearer ${apiToken}`,
170+
'Content-Type': 'application/x-ndjson',
171+
},
172+
body: ndjson,
173+
},
174+
)
175+
expect(upsertRes.ok).toBe(true)
176+
177+
const results = await semanticSearchKCD({ query, topK: 4 })
178+
expect(results).toHaveLength(4)
179+
180+
// Chunk-level duplicates collapse into a single doc-level result.
181+
const ids = results.map((r) => r.id)
182+
const urls = results.map((r) => r.url)
183+
expect(new Set(ids).size).toBe(ids.length)
184+
expect(urls.filter((u) => u === '/blog/cursor-dup')).toHaveLength(1)
185+
186+
const blogResult = results.find((r) => r.url === '/blog/cursor-dup')
187+
expect(blogResult).toBeDefined()
188+
expect(blogResult!.snippet).toBe('snippet-best')
189+
190+
// Credits share the same URL, but should not be collapsed (slug differentiates them).
191+
expect(ids).toContain('credit:alice')
192+
expect(ids).toContain('credit:bob')
193+
expect(urls.filter((u) => u === '/credits')).toHaveLength(2)
194+
} finally {
195+
for (const [key, value] of Object.entries(originalEnv)) {
196+
if (typeof value === 'string') process.env[key] = value
197+
else delete process.env[key]
198+
}
199+
}
200+
})
201+
})
202+

app/utils/cloudflare-ai-transcription.server.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ export async function transcribeMp3WithWorkersAi({
4141
model,
4242
)}`
4343

44+
// Some TS `fetch` typings only accept `ArrayBufferView` backed by `ArrayBuffer`
45+
// (not `ArrayBufferLike`). Convert to an `ArrayBuffer`-backed view without
46+
// copying when possible.
47+
const mp3Body =
48+
mp3.buffer instanceof ArrayBuffer
49+
? new Uint8Array(mp3.buffer, mp3.byteOffset, mp3.byteLength)
50+
: Uint8Array.from(mp3)
51+
4452
// For `@cf/openai/whisper`, Cloudflare supports raw binary audio as the body.
4553
// Docs: https://developers.cloudflare.com/workers-ai/models/whisper/
4654
const res = await fetch(url, {
@@ -50,9 +58,7 @@ export async function transcribeMp3WithWorkersAi({
5058
// Best-effort content-type; CF can infer in many cases, but be explicit.
5159
'Content-Type': 'audio/mpeg',
5260
},
53-
// Some fetch/undici TS typings are stricter than runtime and require
54-
// `Uint8Array<ArrayBuffer>` rather than `Uint8Array<ArrayBufferLike>`.
55-
body: mp3 as unknown as Uint8Array<ArrayBuffer>,
61+
body: mp3Body,
5662
})
5763

5864
if (!res.ok) {

app/utils/semantic-search.server.ts

Lines changed: 161 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,95 @@ type VectorizeQueryResponse = {
1212
}>
1313
}
1414

15+
/**
16+
* Parse a value that may be a string, returning a trimmed non-empty string.
17+
*/
18+
function asNonEmptyString(value: unknown): string | undefined {
19+
if (typeof value !== 'string') return undefined
20+
const trimmed = value.trim()
21+
return trimmed ? trimmed : undefined
22+
}
23+
24+
/**
25+
* Normalize a URL/path into a stable key:
26+
* - absolute URLs -> pathname
27+
* - relative paths -> strip query/fragment and trailing slashes
28+
*/
29+
function normalizeUrlForKey(url: string): string {
30+
// Prefer treating absolute URLs and relative paths as the same canonical key.
31+
try {
32+
if (/^https?:\/\//i.test(url)) {
33+
const u = new URL(url)
34+
return u.pathname !== '/' ? u.pathname.replace(/\/+$/, '') : u.pathname
35+
}
36+
} catch {
37+
// ignore
38+
}
39+
const cleaned = (url.split(/[?#]/)[0] ?? '').trim()
40+
if (!cleaned) return '/'
41+
return cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
42+
}
43+
44+
/**
45+
* Normalize a title for canonicalization (case-insensitive).
46+
*/
47+
function normalizeTitleForKey(title: string) {
48+
// asNonEmptyString already trims; use lowercase to avoid casing-only duplicates.
49+
return title.toLowerCase()
50+
}
51+
52+
function normalizeSlugForKey(slug: string) {
53+
// Normalize for case-insensitive dedupe parity with titles.
54+
return slug.toLowerCase()
55+
}
56+
57+
function parseDocRefFromVectorId(
58+
vectorId: string,
59+
): { type: string; slug: string } | null {
60+
// Indexers generally use `<type>:<slug>:chunk:<n>` for chunk-level vectors.
61+
// When metadata is missing/incomplete, we can still collapse chunk hits into a
62+
// doc-level hit using the stable vector id structure.
63+
const match =
64+
/^(?<type>[^:]+):(?<slug>[^:]+):chunk:(?<chunkIndex>\d+)$/u.exec(vectorId)
65+
const type = match?.groups?.type
66+
const slug = match?.groups?.slug
67+
if (!type || !slug) return null
68+
return { type, slug }
69+
}
70+
71+
/**
72+
* Compute a doc-level identifier for semantic search results.
73+
*
74+
* Vectorize stores one vector per chunk; the canonical ID collapses chunk hits
75+
* into a single doc hit so search results don't contain duplicates.
76+
*/
77+
function getCanonicalResultId({
78+
vectorId,
79+
type,
80+
slug,
81+
url,
82+
title,
83+
}: {
84+
vectorId: string
85+
type: string | undefined
86+
slug: string | undefined
87+
url: string | undefined
88+
title: string | undefined
89+
}) {
90+
// The Vectorize index stores multiple chunk vectors per doc, so we need a
91+
// canonical, doc-level identifier to collapse duplicates in query results.
92+
if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
93+
const fromVectorId = parseDocRefFromVectorId(vectorId)
94+
if (fromVectorId) {
95+
return `${fromVectorId.type}:${normalizeSlugForKey(fromVectorId.slug)}`
96+
}
97+
const normalizedUrl = url ? normalizeUrlForKey(url) : undefined
98+
if (type && normalizedUrl) return `${type}:${normalizedUrl}`
99+
if (normalizedUrl) return normalizedUrl
100+
if (type && title) return `${type}:${normalizeTitleForKey(title)}`
101+
return vectorId
102+
}
103+
15104
function getRequiredSemanticSearchEnv() {
16105
const accountId = process.env.CLOUDFLARE_ACCOUNT_ID
17106
const apiToken = process.env.CLOUDFLARE_API_TOKEN
@@ -158,6 +247,10 @@ export async function semanticSearchKCD({
158247
topK = 15,
159248
}: {
160249
query: string
250+
/**
251+
* Requested number of unique docs to return.
252+
* Clamped to 20 because Vectorize metadata queries cap `topK` at 20.
253+
*/
161254
topK?: number
162255
}): Promise<Array<SemanticSearchResult>> {
163256
const { accountId, apiToken, indexName, embeddingModel } =
@@ -169,6 +262,15 @@ export async function semanticSearchKCD({
169262
)
170263
}
171264

265+
const safeTopK =
266+
typeof topK === 'number' && Number.isFinite(topK)
267+
? Math.max(1, Math.min(20, Math.floor(topK)))
268+
: 15
269+
// Vectorize returns chunk-level matches and overlapping chunks commonly score
270+
// highly together. Overfetch and then de-dupe down to unique docs.
271+
// When requesting metadata, Vectorize caps topK at 20.
272+
const rawTopK = Math.min(20, safeTopK * 5)
273+
172274
const vector = await getEmbedding({
173275
accountId,
174276
apiToken,
@@ -181,21 +283,71 @@ export async function semanticSearchKCD({
181283
apiToken,
182284
indexName,
183285
vector,
184-
topK,
286+
topK: rawTopK,
185287
})
186288
const result = (responseJson as any).result ?? responseJson
187289
const matches = (result?.matches ?? []) as VectorizeQueryResponse['matches']
188290

189-
return matches.map((m) => {
291+
type RankedResult = { rank: number; result: SemanticSearchResult }
292+
const byCanonicalId = new Map<string, RankedResult>()
293+
294+
for (let i = 0; i < matches.length; i++) {
295+
const m = matches[i]
296+
if (!m) continue
190297
const md = (m.metadata ?? {}) as Record<string, unknown>
191-
return {
192-
id: m.id,
298+
const type = asNonEmptyString(md.type)
299+
const slug = asNonEmptyString(md.slug)
300+
const title = asNonEmptyString(md.title)
301+
const url = asNonEmptyString(md.url)
302+
const snippet = asNonEmptyString(md.snippet)
303+
304+
const canonicalId = getCanonicalResultId({
305+
vectorId: m.id,
306+
type,
307+
slug,
308+
url,
309+
title,
310+
})
311+
312+
const next: SemanticSearchResult = {
313+
id: canonicalId,
193314
score: m.score,
194-
type: typeof md.type === 'string' ? md.type : undefined,
195-
title: typeof md.title === 'string' ? md.title : undefined,
196-
url: typeof md.url === 'string' ? md.url : undefined,
197-
snippet: typeof md.snippet === 'string' ? md.snippet : undefined,
315+
type,
316+
title,
317+
url,
318+
snippet,
198319
}
199-
})
320+
321+
const existing = byCanonicalId.get(canonicalId)
322+
if (!existing) {
323+
byCanonicalId.set(canonicalId, { rank: i, result: next })
324+
continue
325+
}
326+
327+
const prev = existing.result
328+
const prevScore = typeof prev.score === 'number' && Number.isFinite(prev.score) ? prev.score : -Infinity
329+
const nextScore = typeof next.score === 'number' && Number.isFinite(next.score) ? next.score : -Infinity
330+
const bestScore = Math.max(prevScore, nextScore)
331+
const nextIsBetter = nextScore > prevScore
332+
333+
existing.result = {
334+
id: canonicalId,
335+
score: bestScore,
336+
type: prev.type ?? next.type,
337+
title: prev.title ?? next.title,
338+
url: prev.url ?? next.url,
339+
// Prefer the snippet from the highest-scoring chunk, but fall back to any snippet.
340+
snippet: nextIsBetter ? next.snippet ?? prev.snippet : prev.snippet ?? next.snippet,
341+
}
342+
}
343+
344+
return [...byCanonicalId.values()]
345+
.sort((a, b) => {
346+
const scoreDiff = (b.result.score ?? 0) - (a.result.score ?? 0)
347+
if (scoreDiff) return scoreDiff
348+
return a.rank - b.rank
349+
})
350+
.slice(0, safeTopK)
351+
.map((x) => x.result)
200352
}
201353

0 commit comments

Comments
 (0)