Skip to content

Commit cb3a876

Browse files
authored
feat(knowledge): include sourceUrl in KB search results (#4533)
* feat(knowledge): include sourceUrl in KB search results
* improvement(kb-search): mark sourceUrl nullable and cover non-null happy path in tests
1 parent d0b0ede commit cb3a876

11 files changed

Lines changed: 105 additions & 30 deletions

File tree

apps/docs/content/docs/en/tools/knowledge.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ Search for similar content in a knowledge base using vector similarity
6060
| `results` | array | Array of search results from the knowledge base |
6161
|`documentId` | string | Document ID |
6262
|`documentName` | string | Document name |
63+
|`sourceUrl` | string | URL to the original source document \(e.g., Confluence page, Google Doc, Notion page\). Null for documents without an external source. |
6364
|`content` | string | Content of the result |
6465
|`chunkIndex` | number | Index of the chunk within the document |
6566
|`similarity` | number | Similarity score of the result |

apps/docs/openapi.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5030,6 +5030,7 @@
50305030
{
50315031
"documentId": "doc_abc123",
50325032
"documentName": "Getting Started.pdf",
5033+
"sourceUrl": "https://example.atlassian.net/wiki/spaces/DOCS/pages/12345",
50335034
"content": "To reset your password, go to Settings > Security.",
50345035
"chunkIndex": 3,
50355036
"similarity": 0.95,
@@ -6264,6 +6265,11 @@
62646265
"type": "string",
62656266
"description": "Filename of the source document."
62666267
},
6268+
"sourceUrl": {
6269+
"type": "string",
6270+
"nullable": true,
6271+
"description": "URL to the original source document for connector-synced documents (e.g., a Confluence page, Google Doc, or Notion page). Null for documents without an external source."
6272+
},
62676273
"content": {
62686274
"type": "string",
62696275
"description": "The matched chunk content."

apps/sim/app/api/knowledge/search/route.test.ts

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ const {
2424
mockHandleTagAndVectorSearch,
2525
mockGetQueryStrategy,
2626
mockGenerateSearchEmbedding,
27-
mockGetDocumentNamesByIds,
27+
mockGetDocumentMetadataByIds,
2828
} = vi.hoisted(() => ({
2929
mockDbChain: {
3030
select: vi.fn().mockReturnThis(),
@@ -43,7 +43,7 @@ const {
4343
mockHandleTagAndVectorSearch: vi.fn(),
4444
mockGetQueryStrategy: vi.fn(),
4545
mockGenerateSearchEmbedding: vi.fn(),
46-
mockGetDocumentNamesByIds: vi.fn(),
46+
mockGetDocumentMetadataByIds: vi.fn(),
4747
}))
4848

4949
const mockCheckKnowledgeBaseAccess = knowledgeApiUtilsMockFns.mockCheckKnowledgeBaseAccess
@@ -101,7 +101,7 @@ vi.mock('./utils', () => ({
101101
handleTagAndVectorSearch: mockHandleTagAndVectorSearch,
102102
getQueryStrategy: mockGetQueryStrategy,
103103
generateSearchEmbedding: mockGenerateSearchEmbedding,
104-
getDocumentNamesByIds: mockGetDocumentNamesByIds,
104+
getDocumentMetadataByIds: mockGetDocumentMetadataByIds,
105105
APIError: class APIError extends Error {
106106
public status: number
107107
constructor(message: string, status: number) {
@@ -159,9 +159,9 @@ describe('Knowledge Search API Route', () => {
159159
singleQueryOptimized: true,
160160
})
161161
mockGenerateSearchEmbedding.mockClear().mockResolvedValue([0.1, 0.2, 0.3, 0.4, 0.5])
162-
mockGetDocumentNamesByIds.mockClear().mockResolvedValue({
163-
doc1: 'Document 1',
164-
doc2: 'Document 2',
162+
mockGetDocumentMetadataByIds.mockClear().mockResolvedValue({
163+
doc1: { filename: 'Document 1', sourceUrl: null },
164+
doc2: { filename: 'Document 2', sourceUrl: null },
165165
})
166166
mockGetDocumentTagDefinitions.mockClear()
167167
hybridAuthMockFns.mockCheckSessionOrInternalAuth.mockClear().mockResolvedValue({
@@ -998,8 +998,11 @@ describe('Knowledge Search API Route', () => {
998998
})
999999

10001000
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1001-
mockGetDocumentNamesByIds.mockResolvedValue({
1002-
'doc-active': 'Active Document.pdf',
1001+
mockGetDocumentMetadataByIds.mockResolvedValue({
1002+
'doc-active': {
1003+
filename: 'Active Document.pdf',
1004+
sourceUrl: 'https://example.atlassian.net/wiki/spaces/DOCS/pages/12345',
1005+
},
10031006
})
10041007

10051008
const mockTagDefs = {
@@ -1023,6 +1026,9 @@ describe('Knowledge Search API Route', () => {
10231026
expect(data.data.results).toHaveLength(1)
10241027
expect(data.data.results[0].documentId).toBe('doc-active')
10251028
expect(data.data.results[0].documentName).toBe('Active Document.pdf')
1029+
expect(data.data.results[0].sourceUrl).toBe(
1030+
'https://example.atlassian.net/wiki/spaces/DOCS/pages/12345'
1031+
)
10261032
})
10271033

10281034
it('should exclude results from deleted documents in tag search', async () => {
@@ -1067,8 +1073,8 @@ describe('Knowledge Search API Route', () => {
10671073
singleQueryOptimized: true,
10681074
})
10691075

1070-
mockGetDocumentNamesByIds.mockResolvedValue({
1071-
'doc-active-tagged': 'Active Tagged Document.pdf',
1076+
mockGetDocumentMetadataByIds.mockResolvedValue({
1077+
'doc-active-tagged': { filename: 'Active Tagged Document.pdf', sourceUrl: null },
10721078
})
10731079

10741080
const mockTagDefs = {
@@ -1140,8 +1146,8 @@ describe('Knowledge Search API Route', () => {
11401146
})
11411147

11421148
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1143-
mockGetDocumentNamesByIds.mockResolvedValue({
1144-
'doc-active-combined': 'Active Combined Search.pdf',
1149+
mockGetDocumentMetadataByIds.mockResolvedValue({
1150+
'doc-active-combined': { filename: 'Active Combined Search.pdf', sourceUrl: null },
11451151
})
11461152

11471153
const mockTagDefs = {

apps/sim/app/api/knowledge/search/route.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import type { StructuredFilter } from '@/lib/knowledge/types'
1616
import { estimateTokenCount } from '@/lib/tokenization/estimators'
1717
import {
1818
generateSearchEmbedding,
19-
getDocumentNamesByIds,
19+
getDocumentMetadataByIds,
2020
getQueryStrategy,
2121
handleTagAndVectorSearch,
2222
handleTagOnlySearch,
@@ -413,7 +413,7 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
413413
})
414414

415415
const documentIds = results.map((result) => result.documentId)
416-
const documentNameMap = await getDocumentNamesByIds(documentIds)
416+
const documentMetadataMap = await getDocumentMetadataByIds(documentIds)
417417

418418
try {
419419
PlatformEvents.knowledgeBaseSearched({
@@ -449,9 +449,11 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
449449
})
450450

451451
const rerankerScore = rerankedScores.get(result.id)
452+
const docMeta = documentMetadataMap[result.documentId]
452453
return {
453454
documentId: result.documentId,
454-
documentName: documentNameMap[result.documentId] || undefined,
455+
documentName: docMeta?.filename || undefined,
456+
sourceUrl: docMeta?.sourceUrl ?? null,
455457
content: result.content,
456458
chunkIndex: result.chunkIndex,
457459
metadata: tags,

apps/sim/app/api/knowledge/search/utils.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -396,11 +396,11 @@ describe('Knowledge Search Utils', () => {
396396
})
397397
})
398398

399-
describe('getDocumentNamesByIds', () => {
399+
describe('getDocumentMetadataByIds', () => {
400400
it('should handle empty input gracefully', async () => {
401-
const { getDocumentNamesByIds } = await import('./utils')
401+
const { getDocumentMetadataByIds } = await import('./utils')
402402

403-
const result = await getDocumentNamesByIds([])
403+
const result = await getDocumentMetadataByIds([])
404404

405405
expect(result).toEqual({})
406406
})

apps/sim/app/api/knowledge/search/utils.ts

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,22 @@ import { document, embedding } from '@sim/db/schema'
33
import { and, eq, inArray, isNull, sql } from 'drizzle-orm'
44
import type { StructuredFilter } from '@/lib/knowledge/types'
55

6-
export async function getDocumentNamesByIds(
6+
export interface DocumentMetadata {
7+
filename: string
8+
sourceUrl: string | null
9+
}
10+
11+
/**
12+
* Batch-fetch display metadata for documents referenced by search results.
13+
* Excludes documents that are user-excluded, archived, or soft-deleted —
14+
* mirrors the visibility filters applied inside the search SQL itself, so
15+
* the lookup will never surface metadata for a row a caller could not have
16+
* legitimately matched. Returns a map keyed by document id; missing ids
17+
* indicate the document is no longer visible; the search routes then omit documentName and return a null sourceUrl for that result.
18+
*/
19+
export async function getDocumentMetadataByIds(
720
documentIds: string[]
8-
): Promise<Record<string, string>> {
21+
): Promise<Record<string, DocumentMetadata>> {
922
if (documentIds.length === 0) {
1023
return {}
1124
}
@@ -15,6 +28,7 @@ export async function getDocumentNamesByIds(
1528
.select({
1629
id: document.id,
1730
filename: document.filename,
31+
sourceUrl: document.sourceUrl,
1832
})
1933
.from(document)
2034
.where(
@@ -26,12 +40,12 @@ export async function getDocumentNamesByIds(
2640
)
2741
)
2842

29-
const documentNameMap: Record<string, string> = {}
43+
const map: Record<string, DocumentMetadata> = {}
3044
documents.forEach((doc) => {
31-
documentNameMap[doc.id] = doc.filename
45+
map[doc.id] = { filename: doc.filename, sourceUrl: doc.sourceUrl ?? null }
3246
})
3347

34-
return documentNameMap
48+
return map
3549
}
3650

3751
export interface SearchResult {

apps/sim/app/api/v1/knowledge/search/route.test.ts

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const {
1515
mockHandleTagAndVectorSearch,
1616
mockGetQueryStrategy,
1717
mockGenerateSearchEmbedding,
18-
mockGetDocumentNamesByIds,
18+
mockGetDocumentMetadataByIds,
1919
mockAuthenticateRequest,
2020
mockValidateWorkspaceAccess,
2121
} = vi.hoisted(() => ({
@@ -24,7 +24,7 @@ const {
2424
mockHandleTagAndVectorSearch: vi.fn(),
2525
mockGetQueryStrategy: vi.fn(),
2626
mockGenerateSearchEmbedding: vi.fn(),
27-
mockGetDocumentNamesByIds: vi.fn(),
27+
mockGetDocumentMetadataByIds: vi.fn(),
2828
mockAuthenticateRequest: vi.fn(),
2929
mockValidateWorkspaceAccess: vi.fn(),
3030
}))
@@ -35,7 +35,7 @@ vi.mock('@/app/api/knowledge/search/utils', () => ({
3535
handleTagAndVectorSearch: mockHandleTagAndVectorSearch,
3636
getQueryStrategy: mockGetQueryStrategy,
3737
generateSearchEmbedding: mockGenerateSearchEmbedding,
38-
getDocumentNamesByIds: mockGetDocumentNamesByIds,
38+
getDocumentMetadataByIds: mockGetDocumentMetadataByIds,
3939
}))
4040

4141
vi.mock('@/app/api/knowledge/utils', () => knowledgeApiUtilsMock)
@@ -81,7 +81,7 @@ describe('v1 knowledge search route — per-KB embedding model', () => {
8181
mockGetQueryStrategy.mockReturnValue({ distanceThreshold: 0.5 })
8282
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
8383
mockHandleVectorOnlySearch.mockResolvedValue([])
84-
mockGetDocumentNamesByIds.mockResolvedValue({})
84+
mockGetDocumentMetadataByIds.mockResolvedValue({})
8585
})
8686

8787
it('passes the KB embedding model into generateSearchEmbedding', async () => {
@@ -127,6 +127,42 @@ describe('v1 knowledge search route — per-KB embedding model', () => {
127127
expect(mockGenerateSearchEmbedding).not.toHaveBeenCalled()
128128
})
129129

130+
it('surfaces sourceUrl from document metadata in search results', async () => {
131+
mockCheckKnowledgeBaseAccess.mockResolvedValueOnce({
132+
hasAccess: true,
133+
knowledgeBase: baseKb('kb-confluence', 'text-embedding-3-small'),
134+
})
135+
mockHandleVectorOnlySearch.mockResolvedValue([
136+
{
137+
documentId: 'doc-confluence',
138+
knowledgeBaseId: 'kb-confluence',
139+
content: 'page content',
140+
chunkIndex: 0,
141+
distance: 0.1,
142+
},
143+
])
144+
mockGetDocumentMetadataByIds.mockResolvedValue({
145+
'doc-confluence': {
146+
filename: 'Runbook.md',
147+
sourceUrl: 'https://example.atlassian.net/wiki/spaces/DOCS/pages/12345',
148+
},
149+
})
150+
151+
const req = createMockRequest('POST', {
152+
workspaceId: 'ws-1',
153+
knowledgeBaseIds: 'kb-confluence',
154+
query: 'runbook',
155+
})
156+
const res = await POST(req)
157+
const body = await res.json()
158+
159+
expect(res.status).toBe(200)
160+
expect(body.data.results[0].sourceUrl).toBe(
161+
'https://example.atlassian.net/wiki/spaces/DOCS/pages/12345'
162+
)
163+
expect(body.data.results[0].documentName).toBe('Runbook.md')
164+
})
165+
130166
it('allows tag-only search across mixed embedding models', async () => {
131167
mockHandleTagOnlySearch.mockResolvedValue([])
132168
mockCheckKnowledgeBaseAccess.mockResolvedValueOnce({

apps/sim/app/api/v1/knowledge/search/route.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { buildUndefinedTagsError, validateTagValue } from '@/lib/knowledge/tags/
88
import type { StructuredFilter } from '@/lib/knowledge/types'
99
import {
1010
generateSearchEmbedding,
11-
getDocumentNamesByIds,
11+
getDocumentMetadataByIds,
1212
getQueryStrategy,
1313
handleTagAndVectorSearch,
1414
handleTagOnlySearch,
@@ -205,7 +205,7 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
205205
})
206206

207207
const documentIds = results.map((r) => r.documentId)
208-
const documentNameMap = await getDocumentNamesByIds(documentIds)
208+
const documentMetadataMap = await getDocumentMetadataByIds(documentIds)
209209

210210
return NextResponse.json({
211211
success: true,
@@ -222,9 +222,11 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
222222
}
223223
})
224224

225+
const docMeta = documentMetadataMap[result.documentId]
225226
return {
226227
documentId: result.documentId,
227-
documentName: documentNameMap[result.documentId] || undefined,
228+
documentName: docMeta?.filename || undefined,
229+
sourceUrl: docMeta?.sourceUrl ?? null,
228230
content: result.content,
229231
chunkIndex: result.chunkIndex,
230232
metadata: tags,

apps/sim/tools/knowledge/search.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,12 @@ export const knowledgeSearchTool: ToolConfig<any, KnowledgeSearchResponse> = {
177177
properties: {
178178
documentId: { type: 'string', description: 'Document ID' },
179179
documentName: { type: 'string', description: 'Document name' },
180+
sourceUrl: {
181+
type: 'string',
182+
nullable: true,
183+
description:
184+
'URL to the original source document (e.g., Confluence page, Google Doc, Notion page). Null for documents without an external source.',
185+
},
180186
content: { type: 'string', description: 'Content of the result' },
181187
chunkIndex: { type: 'number', description: 'Index of the chunk within the document' },
182188
similarity: { type: 'number', description: 'Similarity score of the result' },

apps/sim/tools/knowledge/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export function inferDocumentFileInfo(documentName: string): {
3636
export interface KnowledgeSearchResult {
3737
documentId: string
3838
documentName: string
39+
sourceUrl: string | null
3940
content: string
4041
chunkIndex: number
4142
metadata: Record<string, any>

0 commit comments

Comments
 (0)