156 changes: 155 additions & 1 deletion app/utils/__tests__/semantic-search.server.test.ts
@@ -1,9 +1,25 @@
import { describe, expect, test } from 'vitest'
import { setupServer } from 'msw/node'
import { afterAll, beforeAll, beforeEach, describe, expect, test } from 'vitest'
import { cloudflareHandlers, resetCloudflareMockState } from '../../../mocks/cloudflare.ts'
import {
isSemanticSearchConfigured,
semanticSearchKCD,
} from '../semantic-search.server.ts'

const server = setupServer(...cloudflareHandlers)

beforeAll(() => {
server.listen({ onUnhandledRequest: 'error' })
})

beforeEach(() => {
resetCloudflareMockState()
})

afterAll(() => {
server.close()
})

describe('semantic search env gating', () => {
test('isSemanticSearchConfigured is false without env vars', () => {
const original = {
@@ -46,3 +62,141 @@ describe('semantic search env gating', () => {
})
})

describe('semantic search result normalization', () => {
test('dedupes chunk-level matches into unique docs', async () => {
const originalEnv = {
CLOUDFLARE_ACCOUNT_ID: process.env.CLOUDFLARE_ACCOUNT_ID,
CLOUDFLARE_API_TOKEN: process.env.CLOUDFLARE_API_TOKEN,
CLOUDFLARE_VECTORIZE_INDEX: process.env.CLOUDFLARE_VECTORIZE_INDEX,
CLOUDFLARE_AI_EMBEDDING_MODEL: process.env.CLOUDFLARE_AI_EMBEDDING_MODEL,
}
try {
const accountId = 'acc123'
const apiToken = 'test-token'
const indexName = 'semantic-index'

process.env.CLOUDFLARE_ACCOUNT_ID = accountId
process.env.CLOUDFLARE_API_TOKEN = apiToken
process.env.CLOUDFLARE_VECTORIZE_INDEX = indexName
delete process.env.CLOUDFLARE_AI_EMBEDDING_MODEL

// Use a query that's unlikely to match any seeded doc titles/snippets,
// so the Cloudflare Vectorize mock falls back to cosine similarity rather
// than match-sorter ranking.
const query = 'zz_semantic_dedupe_test_02157475'

const embedRes = await fetch(
`https://api.cloudflare.com/client/v4/accounts/${accountId}/ai/run/@cf/google/embeddinggemma-300m`,
{
method: 'POST',
headers: {
Authorization: `Bearer ${apiToken}`,
'Content-Type': 'application/json',
},
body: JSON.stringify({ text: [query] }),
},
)
expect(embedRes.ok).toBe(true)
const embedJson = (await embedRes.json()) as any
const vector = embedJson?.result?.data?.[0] as unknown
expect(Array.isArray(vector)).toBe(true)

const vectorsToUpsert = [
{
id: 'blog:cursor-dup:chunk:0',
values: vector as number[],
metadata: {
type: 'blog',
slug: 'cursor-dup',
url: '/blog/cursor-dup',
title: 'Cursor Dup',
snippet: 'snippet-best',
},
},
{
id: 'blog:cursor-dup:chunk:1',
values: vector as number[],
metadata: {
type: 'blog',
slug: 'cursor-dup',
url: '/blog/cursor-dup',
title: 'Cursor Dup (chunk 2)',
snippet: 'snippet-worse',
},
},
{
id: 'blog:cursor-one:chunk:0',
values: vector as number[],
metadata: {
type: 'blog',
slug: 'cursor-one',
url: '/blog/cursor-one',
title: 'Cursor One',
snippet: 'one-snippet',
},
},
{
id: 'credit:alice:chunk:0',
values: vector as number[],
metadata: {
type: 'credit',
slug: 'alice',
url: '/credits',
title: 'Alice',
snippet: 'alice-snippet',
},
},
{
id: 'credit:bob:chunk:0',
values: vector as number[],
metadata: {
type: 'credit',
slug: 'bob',
url: '/credits',
title: 'Bob',
snippet: 'bob-snippet',
},
},
]

const ndjson =
vectorsToUpsert.map((v) => JSON.stringify(v)).join('\n') + '\n'
const upsertRes = await fetch(
`https://api.cloudflare.com/client/v4/accounts/${accountId}/vectorize/v2/indexes/${indexName}/upsert`,
{
method: 'POST',
headers: {
Authorization: `Bearer ${apiToken}`,
'Content-Type': 'application/x-ndjson',
},
body: ndjson,
},
)
expect(upsertRes.ok).toBe(true)

const results = await semanticSearchKCD({ query, topK: 4 })
expect(results).toHaveLength(4)

// Chunk-level duplicates collapse into a single doc-level result.
const ids = results.map((r) => r.id)
const urls = results.map((r) => r.url)
expect(new Set(ids).size).toBe(ids.length)
expect(urls.filter((u) => u === '/blog/cursor-dup')).toHaveLength(1)

const blogResult = results.find((r) => r.url === '/blog/cursor-dup')
expect(blogResult).toBeDefined()
expect(blogResult!.snippet).toBe('snippet-best')

// Credits share the same URL, but should not be collapsed (slug differentiates them).
expect(ids).toContain('credit:alice')
expect(ids).toContain('credit:bob')
expect(urls.filter((u) => u === '/credits')).toHaveLength(2)
} finally {
for (const [key, value] of Object.entries(originalEnv)) {
if (typeof value === 'string') process.env[key] = value
else delete process.env[key]
}
}
})
})
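For context on the cosine-similarity fallback mentioned in the test comment above: the mock ranks stored vectors by cosine similarity to the query embedding when no lexical match-sorter hit exists. A minimal sketch of that ranking function (assumed behavior; the real mock lives in mocks/cloudflare.ts and may differ):

function cosineSimilarity(a: Array<number>, b: Array<number>): number {
	let dot = 0
	let normA = 0
	let normB = 0
	for (let i = 0; i < a.length; i++) {
		dot += (a[i] ?? 0) * (b[i] ?? 0)
		normA += (a[i] ?? 0) ** 2
		normB += (b[i] ?? 0) ** 2
	}
	const denominator = Math.sqrt(normA) * Math.sqrt(normB)
	return denominator === 0 ? 0 : dot / denominator
}

Because the test upserts the exact same vector for every chunk, all five chunks tie on similarity, so the assertions exercise dedupe behavior rather than ranking.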

12 changes: 9 additions & 3 deletions app/utils/cloudflare-ai-transcription.server.ts
@@ -41,6 +41,14 @@ export async function transcribeMp3WithWorkersAi({
model,
)}`

// Some TS `fetch` typings only accept `ArrayBufferView` backed by `ArrayBuffer`
// (not `ArrayBufferLike`). Convert to an `ArrayBuffer`-backed view without
// copying when possible.
const mp3Body =
mp3.buffer instanceof ArrayBuffer
? new Uint8Array(mp3.buffer, mp3.byteOffset, mp3.byteLength)
: Uint8Array.from(mp3)

// For `@cf/openai/whisper`, Cloudflare supports raw binary audio as the body.
// Docs: https://developers.cloudflare.com/workers-ai/models/whisper/
const res = await fetch(url, {
@@ -50,9 +58,7 @@
// Best-effort content-type; CF can infer in many cases, but be explicit.
'Content-Type': 'audio/mpeg',
},
// Some fetch/undici TS typings are stricter than runtime and require
// `Uint8Array<ArrayBuffer>` rather than `Uint8Array<ArrayBufferLike>`.
body: mp3 as unknown as Uint8Array<ArrayBuffer>,
body: mp3Body,
})

if (!res.ok) {
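To see why the mp3Body conversion above exists, here is a standalone sketch of the two cases (SharedArrayBuffer is used only to illustrate an ArrayBufferLike that is not an ArrayBuffer; this PR only ever passes ordinary Uint8Arrays):

const plain = new Uint8Array([1, 2, 3]) // backed by an ArrayBuffer
const shared = new Uint8Array(new SharedArrayBuffer(3)) // backed by a SharedArrayBuffer

// `plain` can be re-wrapped with zero copying; `shared` must be copied into
// a fresh ArrayBuffer-backed view before strict fetch typings accept it as `body`.
const plainBody = new Uint8Array(plain.buffer, plain.byteOffset, plain.byteLength)
const sharedBody = Uint8Array.from(shared)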
170 changes: 161 additions & 9 deletions app/utils/semantic-search.server.ts
@@ -12,6 +12,95 @@ type VectorizeQueryResponse = {
}>
}

/**
* Parse a value that may be a string, returning a trimmed non-empty string.
*/
function asNonEmptyString(value: unknown): string | undefined {
if (typeof value !== 'string') return undefined
const trimmed = value.trim()
return trimmed ? trimmed : undefined
}

/**
* Normalize a URL/path into a stable key:
* - absolute URLs -> pathname
* - relative paths -> strip query/fragment and trailing slashes
*/
function normalizeUrlForKey(url: string): string {
// Treat an absolute URL and its equivalent relative path as the same canonical key.
try {
if (/^https?:\/\//i.test(url)) {
const u = new URL(url)
return u.pathname !== '/' ? u.pathname.replace(/\/+$/, '') : u.pathname
}
} catch {
// ignore
}
const cleaned = (url.split(/[?#]/)[0] ?? '').trim()
if (!cleaned) return '/'
return cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
}
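// Illustrative expectations (inputs are assumptions, not from this diff):
// normalizeUrlForKey('https://kentcdodds.com/blog/foo/') -> '/blog/foo'
// normalizeUrlForKey('/blog/foo?utm_source=x#heading') -> '/blog/foo'
// normalizeUrlForKey('/') -> '/'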

/**
* Normalize a title for canonicalization (case-insensitive).
*/
function normalizeTitleForKey(title: string) {
// asNonEmptyString already trims; use lowercase to avoid casing-only duplicates.
return title.toLowerCase()
}

function normalizeSlugForKey(slug: string) {
// Normalize for case-insensitive dedupe parity with titles.
return slug.toLowerCase()
}

function parseDocRefFromVectorId(
vectorId: string,
): { type: string; slug: string } | null {
// Indexers generally use `<type>:<slug>:chunk:<n>` for chunk-level vectors.
// When metadata is missing/incomplete, we can still collapse chunk hits into a
// doc-level hit using the stable vector id structure.
const match =
/^(?<type>[^:]+):(?<slug>[^:]+):chunk:(?<chunkIndex>\d+)$/u.exec(vectorId)
const type = match?.groups?.type
const slug = match?.groups?.slug
if (!type || !slug) return null
return { type, slug }
}
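// Illustrative expectations (ids are assumptions):
// parseDocRefFromVectorId('blog:cursor-dup:chunk:1') -> { type: 'blog', slug: 'cursor-dup' }
// parseDocRefFromVectorId('blog:cursor-dup') -> null (no chunk suffix)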

/**
* Compute a doc-level identifier for semantic search results.
*
* Vectorize stores one vector per chunk; the canonical ID collapses chunk hits
* into a single doc hit so search results don't contain duplicates.
*/
function getCanonicalResultId({
vectorId,
type,
slug,
url,
title,
}: {
vectorId: string
type: string | undefined
slug: string | undefined
url: string | undefined
title: string | undefined
}) {
// The Vectorize index stores multiple chunk vectors per doc, so we need a
// canonical, doc-level identifier to collapse duplicates in query results.
if (type && slug) return `${type}:${normalizeSlugForKey(slug)}`
const fromVectorId = parseDocRefFromVectorId(vectorId)
if (fromVectorId) {
return `${fromVectorId.type}:${normalizeSlugForKey(fromVectorId.slug)}`
}
const normalizedUrl = url ? normalizeUrlForKey(url) : undefined
if (type && normalizedUrl) return `${type}:${normalizedUrl}`
if (normalizedUrl) return normalizedUrl
if (type && title) return `${type}:${normalizeTitleForKey(title)}`
return vectorId
}
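// Illustrative fallback chain (inputs are assumptions):
// { type: 'blog', slug: 'Cursor-Dup' } -> 'blog:cursor-dup'
// metadata missing, vectorId 'blog:cursor-dup:chunk:3' -> 'blog:cursor-dup'
// only { type: 'blog', url: '/blog/foo/' } -> 'blog:/blog/foo'
// nothing usable -> the raw vectorId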

function getRequiredSemanticSearchEnv() {
const accountId = process.env.CLOUDFLARE_ACCOUNT_ID
const apiToken = process.env.CLOUDFLARE_API_TOKEN
@@ -158,6 +247,10 @@ export async function semanticSearchKCD({
topK = 15,
}: {
query: string
/**
* Requested number of unique docs to return.
* Clamped to 20 because Vectorize metadata queries cap `topK` at 20.
*/
topK?: number
}): Promise<Array<SemanticSearchResult>> {
const { accountId, apiToken, indexName, embeddingModel } =
@@ -169,6 +262,15 @@
)
}

const safeTopK =
typeof topK === 'number' && Number.isFinite(topK)
? Math.max(1, Math.min(20, Math.floor(topK)))
: 15
// Vectorize returns chunk-level matches and overlapping chunks commonly score
// highly together. Overfetch and then de-dupe down to unique docs.
// When requesting metadata, Vectorize caps topK at 20.
const rawTopK = Math.min(20, safeTopK * 5)
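// e.g. topK 15 -> safeTopK 15, rawTopK 20 (capped); topK 3 -> rawTopK 15.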

const vector = await getEmbedding({
accountId,
apiToken,
@@ -181,21 +283,71 @@
apiToken,
indexName,
vector,
topK,
topK: rawTopK,
})
const result = (responseJson as any).result ?? responseJson
const matches = (result?.matches ?? []) as VectorizeQueryResponse['matches']

return matches.map((m) => {
type RankedResult = { rank: number; result: SemanticSearchResult }
const byCanonicalId = new Map<string, RankedResult>()

for (let i = 0; i < matches.length; i++) {
const m = matches[i]
if (!m) continue
const md = (m.metadata ?? {}) as Record<string, unknown>
return {
id: m.id,
const type = asNonEmptyString(md.type)
const slug = asNonEmptyString(md.slug)
const title = asNonEmptyString(md.title)
const url = asNonEmptyString(md.url)
const snippet = asNonEmptyString(md.snippet)

const canonicalId = getCanonicalResultId({
vectorId: m.id,
type,
slug,
url,
title,
})

const next: SemanticSearchResult = {
id: canonicalId,
score: m.score,
type: typeof md.type === 'string' ? md.type : undefined,
title: typeof md.title === 'string' ? md.title : undefined,
url: typeof md.url === 'string' ? md.url : undefined,
snippet: typeof md.snippet === 'string' ? md.snippet : undefined,
type,
title,
url,
snippet,
}
})

const existing = byCanonicalId.get(canonicalId)
if (!existing) {
byCanonicalId.set(canonicalId, { rank: i, result: next })
continue
}

const prev = existing.result
const prevScore = typeof prev.score === 'number' && Number.isFinite(prev.score) ? prev.score : -Infinity
const nextScore = typeof next.score === 'number' && Number.isFinite(next.score) ? next.score : -Infinity
const bestScore = Math.max(prevScore, nextScore)
const nextIsBetter = nextScore > prevScore

existing.result = {
id: canonicalId,
score: bestScore,
type: prev.type ?? next.type,
title: prev.title ?? next.title,
url: prev.url ?? next.url,
// Prefer the snippet from the highest-scoring chunk, but fall back to any snippet.
snippet: nextIsBetter ? (next.snippet ?? prev.snippet) : (prev.snippet ?? next.snippet),
}
}

return [...byCanonicalId.values()]
.sort((a, b) => {
const scoreDiff = (b.result.score ?? 0) - (a.result.score ?? 0)
if (scoreDiff) return scoreDiff
return a.rank - b.rank
})
.slice(0, safeTopK)
.map((x) => x.result)
}
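Hypothetical usage of the deduped search (the query and result shape shown are illustrative assumptions; the Cloudflare env vars must be configured):

async function demo() {
	const results = await semanticSearchKCD({ query: 'react testing', topK: 5 })
	// Up to 5 unique docs, each represented by its best-scoring chunk, e.g.:
	// [{ id: 'blog:how-to-test', score: 0.87, type: 'blog', url: '/blog/how-to-test', ... }]
	return results
}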
