@@ -31,13 +31,14 @@ function normalizeUrlForKey(url: string): string {
3131 try {
3232 if ( / ^ h t t p s ? : \/ \/ / i. test ( url ) ) {
3333 const u = new URL ( url )
34- return u . pathname && u . pathname !== '/' ? u . pathname . replace ( / \/ + $ / , '' ) : u . pathname
34+ return u . pathname !== '/' ? u . pathname . replace ( / \/ + $ / , '' ) : u . pathname
3535 }
3636 } catch {
3737 // ignore
3838 }
39- const cleaned = ( url . split ( / [ ? # ] / ) [ 0 ] ?? url ) . trim ( )
40- return cleaned && cleaned !== '/' ? cleaned . replace ( / \/ + $ / , '' ) : cleaned
39+ const cleaned = ( url . split ( / [ ? # ] / ) [ 0 ] ?? '' ) . trim ( )
40+ if ( ! cleaned ) return '/'
41+ return cleaned !== '/' ? cleaned . replace ( / \/ + $ / , '' ) : cleaned
4142}
4243
4344/**
@@ -48,6 +49,11 @@ function normalizeTitleForKey(title: string) {
4849 return title . toLowerCase ( )
4950}
5051
52+ function normalizeSlugForKey ( slug : string ) {
53+ // Lowercase the slug so keys built from it dedupe case-insensitively, the same way normalizeTitleForKey lowercases titles.
54+ return slug . toLowerCase ( )
55+ }
56+
5157/**
5258 * Compute a doc-level identifier for semantic search results.
5359 *
@@ -69,9 +75,10 @@ function getCanonicalResultId({
6975} ) {
7076 // The Vectorize index stores multiple chunk vectors per doc, so we need a
7177 // canonical, doc-level identifier to collapse duplicates in query results.
72- if ( type && slug ) return `${ type } :${ slug } `
73- if ( type && url ) return `${ type } :${ normalizeUrlForKey ( url ) } `
74- if ( url ) return normalizeUrlForKey ( url )
78+ if ( type && slug ) return `${ type } :${ normalizeSlugForKey ( slug ) } `
79+ const normalizedUrl = url ? normalizeUrlForKey ( url ) : undefined
80+ if ( type && normalizedUrl ) return `${ type } :${ normalizedUrl } `
81+ if ( normalizedUrl ) return normalizedUrl
7582 if ( type && title ) return `${ type } :${ normalizeTitleForKey ( title ) } `
7683 return vectorId
7784}
@@ -222,6 +229,10 @@ export async function semanticSearchKCD({
222229 topK = 15 ,
223230} : {
224231 query : string
232+ /**
233+ * Requested number of unique docs to return.
234+ * Clamped to 20 because Vectorize metadata queries cap `topK` at 20.
235+ */
225236 topK ?: number
226237} ) : Promise < Array < SemanticSearchResult > > {
227238 const { accountId, apiToken, indexName, embeddingModel } =
@@ -234,7 +245,9 @@ export async function semanticSearchKCD({
234245 }
235246
236247 const safeTopK =
237- typeof topK === 'number' && Number . isFinite ( topK ) ? Math . max ( 1 , Math . floor ( topK ) ) : 15
248+ typeof topK === 'number' && Number . isFinite ( topK )
249+ ? Math . max ( 1 , Math . min ( 20 , Math . floor ( topK ) ) )
250+ : 15
238251 // Vectorize returns chunk-level matches and overlapping chunks commonly score
239252 // highly together. Overfetch and then de-dupe down to unique docs.
240253 // When requesting metadata, Vectorize caps topK at 20.
0 commit comments