Skip to content

Commit 2c7a9d0

Browse files
Add docs for semantic search normalization helpers
Co-authored-by: Kent C. Dodds <me+github@kentcdodds.com>
1 parent 97ad812 commit 2c7a9d0

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

app/utils/semantic-search.server.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,20 @@ type VectorizeQueryResponse = {
1212
}>
1313
}
1414

15+
/**
 * Coerce an unknown value into a usable string.
 *
 * @param value - Any value; only values whose `typeof` is `'string'` are accepted.
 * @returns The input with surrounding whitespace trimmed, or `undefined` when
 * the input is not a string or is empty after trimming.
 */
function asNonEmptyString(value: unknown): string | undefined {
	if (typeof value === 'string') {
		const text = value.trim()
		// Whitespace-only strings collapse to '' and are treated as missing.
		if (text.length > 0) return text
	}
	return undefined
}
2023

24+
/**
25+
* Normalize a URL/path into a stable key:
26+
* - absolute URLs -> pathname
27+
* - relative paths -> strip query/fragment and trailing slashes
28+
*/
2129
function normalizeUrlForKey(url: string): string {
2230
// Prefer treating absolute URLs and relative paths as the same canonical key.
2331
try {
@@ -32,11 +40,20 @@ function normalizeUrlForKey(url: string): string {
3240
return cleaned && cleaned !== '/' ? cleaned.replace(/\/+$/, '') : cleaned
3341
}
3442

43+
/**
 * Produce the case-insensitive form of a title for use in canonical keys.
 *
 * @param title - Title text. Whitespace is left untouched by this function.
 * @returns The title converted to lowercase.
 */
function normalizeTitleForKey(title: string) {
	// No trimming here: the title is expected to arrive already trimmed
	// (asNonEmptyString trims). Lowercasing keeps titles that differ only in
	// casing from being treated as distinct.
	const lowered = title.toLowerCase()
	return lowered
}
3950

51+
/**
52+
* Compute a doc-level identifier for semantic search results.
53+
*
54+
* Vectorize stores one vector per chunk; the canonical ID collapses chunk hits
55+
* into a single doc hit so search results don't contain duplicates.
56+
*/
4057
function getCanonicalResultId({
4158
vectorId,
4259
type,

0 commit comments

Comments
 (0)