Skip to content

Commit c291a6d

Browse files
committed
feat(knowledge): include sourceUrl in KB search results
1 parent d0b0ede commit c291a6d

10 files changed

Lines changed: 61 additions & 30 deletions

File tree

apps/docs/content/docs/en/tools/knowledge.mdx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ Search for similar content in a knowledge base using vector similarity
6060
| `results` | array | Array of search results from the knowledge base |
6161
|`documentId` | string | Document ID |
6262
|`documentName` | string | Document name |
63+
|`sourceUrl` | string | URL to the original source document \(e.g., Confluence page, Google Doc, Notion page\). Null for documents without an external source. |
6364
|`content` | string | Content of the result |
6465
|`chunkIndex` | number | Index of the chunk within the document |
6566
|`similarity` | number | Similarity score of the result |

apps/docs/openapi.json

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5030,6 +5030,7 @@
50305030
{
50315031
"documentId": "doc_abc123",
50325032
"documentName": "Getting Started.pdf",
5033+
"sourceUrl": "https://example.atlassian.net/wiki/spaces/DOCS/pages/12345",
50335034
"content": "To reset your password, go to Settings > Security.",
50345035
"chunkIndex": 3,
50355036
"similarity": 0.95,
@@ -6264,6 +6265,11 @@
62646265
"type": "string",
62656266
"description": "Filename of the source document."
62666267
},
6268+
"sourceUrl": {
6269+
"type": "string",
6270+
"nullable": true,
6271+
"description": "URL to the original source document for connector-synced documents (e.g., a Confluence page, Google Doc, or Notion page). Null for documents without an external source."
6272+
},
62676273
"content": {
62686274
"type": "string",
62696275
"description": "The matched chunk content."

apps/sim/app/api/knowledge/search/route.test.ts

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ const {
2424
mockHandleTagAndVectorSearch,
2525
mockGetQueryStrategy,
2626
mockGenerateSearchEmbedding,
27-
mockGetDocumentNamesByIds,
27+
mockGetDocumentMetadataByIds,
2828
} = vi.hoisted(() => ({
2929
mockDbChain: {
3030
select: vi.fn().mockReturnThis(),
@@ -43,7 +43,7 @@ const {
4343
mockHandleTagAndVectorSearch: vi.fn(),
4444
mockGetQueryStrategy: vi.fn(),
4545
mockGenerateSearchEmbedding: vi.fn(),
46-
mockGetDocumentNamesByIds: vi.fn(),
46+
mockGetDocumentMetadataByIds: vi.fn(),
4747
}))
4848

4949
const mockCheckKnowledgeBaseAccess = knowledgeApiUtilsMockFns.mockCheckKnowledgeBaseAccess
@@ -101,7 +101,7 @@ vi.mock('./utils', () => ({
101101
handleTagAndVectorSearch: mockHandleTagAndVectorSearch,
102102
getQueryStrategy: mockGetQueryStrategy,
103103
generateSearchEmbedding: mockGenerateSearchEmbedding,
104-
getDocumentNamesByIds: mockGetDocumentNamesByIds,
104+
getDocumentMetadataByIds: mockGetDocumentMetadataByIds,
105105
APIError: class APIError extends Error {
106106
public status: number
107107
constructor(message: string, status: number) {
@@ -159,9 +159,9 @@ describe('Knowledge Search API Route', () => {
159159
singleQueryOptimized: true,
160160
})
161161
mockGenerateSearchEmbedding.mockClear().mockResolvedValue([0.1, 0.2, 0.3, 0.4, 0.5])
162-
mockGetDocumentNamesByIds.mockClear().mockResolvedValue({
163-
doc1: 'Document 1',
164-
doc2: 'Document 2',
162+
mockGetDocumentMetadataByIds.mockClear().mockResolvedValue({
163+
doc1: { filename: 'Document 1', sourceUrl: null },
164+
doc2: { filename: 'Document 2', sourceUrl: null },
165165
})
166166
mockGetDocumentTagDefinitions.mockClear()
167167
hybridAuthMockFns.mockCheckSessionOrInternalAuth.mockClear().mockResolvedValue({
@@ -998,8 +998,8 @@ describe('Knowledge Search API Route', () => {
998998
})
999999

10001000
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1001-
mockGetDocumentNamesByIds.mockResolvedValue({
1002-
'doc-active': 'Active Document.pdf',
1001+
mockGetDocumentMetadataByIds.mockResolvedValue({
1002+
'doc-active': { filename: 'Active Document.pdf', sourceUrl: null },
10031003
})
10041004

10051005
const mockTagDefs = {
@@ -1067,8 +1067,8 @@ describe('Knowledge Search API Route', () => {
10671067
singleQueryOptimized: true,
10681068
})
10691069

1070-
mockGetDocumentNamesByIds.mockResolvedValue({
1071-
'doc-active-tagged': 'Active Tagged Document.pdf',
1070+
mockGetDocumentMetadataByIds.mockResolvedValue({
1071+
'doc-active-tagged': { filename: 'Active Tagged Document.pdf', sourceUrl: null },
10721072
})
10731073

10741074
const mockTagDefs = {
@@ -1140,8 +1140,8 @@ describe('Knowledge Search API Route', () => {
11401140
})
11411141

11421142
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1143-
mockGetDocumentNamesByIds.mockResolvedValue({
1144-
'doc-active-combined': 'Active Combined Search.pdf',
1143+
mockGetDocumentMetadataByIds.mockResolvedValue({
1144+
'doc-active-combined': { filename: 'Active Combined Search.pdf', sourceUrl: null },
11451145
})
11461146

11471147
const mockTagDefs = {

apps/sim/app/api/knowledge/search/route.ts

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ import type { StructuredFilter } from '@/lib/knowledge/types'
1616
import { estimateTokenCount } from '@/lib/tokenization/estimators'
1717
import {
1818
generateSearchEmbedding,
19-
getDocumentNamesByIds,
19+
getDocumentMetadataByIds,
2020
getQueryStrategy,
2121
handleTagAndVectorSearch,
2222
handleTagOnlySearch,
@@ -413,7 +413,7 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
413413
})
414414

415415
const documentIds = results.map((result) => result.documentId)
416-
const documentNameMap = await getDocumentNamesByIds(documentIds)
416+
const documentMetadataMap = await getDocumentMetadataByIds(documentIds)
417417

418418
try {
419419
PlatformEvents.knowledgeBaseSearched({
@@ -449,9 +449,11 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
449449
})
450450

451451
const rerankerScore = rerankedScores.get(result.id)
452+
const docMeta = documentMetadataMap[result.documentId]
452453
return {
453454
documentId: result.documentId,
454-
documentName: documentNameMap[result.documentId] || undefined,
455+
documentName: docMeta?.filename || undefined,
456+
sourceUrl: docMeta?.sourceUrl ?? null,
455457
content: result.content,
456458
chunkIndex: result.chunkIndex,
457459
metadata: tags,

apps/sim/app/api/knowledge/search/utils.test.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -396,11 +396,11 @@ describe('Knowledge Search Utils', () => {
396396
})
397397
})
398398

399-
describe('getDocumentNamesByIds', () => {
399+
describe('getDocumentMetadataByIds', () => {
400400
it('should handle empty input gracefully', async () => {
401-
const { getDocumentNamesByIds } = await import('./utils')
401+
const { getDocumentMetadataByIds } = await import('./utils')
402402

403-
const result = await getDocumentNamesByIds([])
403+
const result = await getDocumentMetadataByIds([])
404404

405405
expect(result).toEqual({})
406406
})

apps/sim/app/api/knowledge/search/utils.ts

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,22 @@ import { document, embedding } from '@sim/db/schema'
33
import { and, eq, inArray, isNull, sql } from 'drizzle-orm'
44
import type { StructuredFilter } from '@/lib/knowledge/types'
55

6-
export async function getDocumentNamesByIds(
6+
export interface DocumentMetadata {
7+
filename: string
8+
sourceUrl: string | null
9+
}
10+
11+
/**
12+
* Batch-fetch display metadata for documents referenced by search results.
13+
* Excludes documents that are user-excluded, archived, or soft-deleted —
14+
* mirrors the visibility filters applied inside the search SQL itself, so
15+
* the lookup will never surface metadata for a row a caller could not have
16+
* legitimately matched. Returns a map keyed by document id; missing ids
17+
* indicate the document is no longer visible and should be skipped.
18+
*/
19+
export async function getDocumentMetadataByIds(
720
documentIds: string[]
8-
): Promise<Record<string, string>> {
21+
): Promise<Record<string, DocumentMetadata>> {
922
if (documentIds.length === 0) {
1023
return {}
1124
}
@@ -15,6 +28,7 @@ export async function getDocumentNamesByIds(
1528
.select({
1629
id: document.id,
1730
filename: document.filename,
31+
sourceUrl: document.sourceUrl,
1832
})
1933
.from(document)
2034
.where(
@@ -26,12 +40,12 @@ export async function getDocumentNamesByIds(
2640
)
2741
)
2842

29-
const documentNameMap: Record<string, string> = {}
43+
const map: Record<string, DocumentMetadata> = {}
3044
documents.forEach((doc) => {
31-
documentNameMap[doc.id] = doc.filename
45+
map[doc.id] = { filename: doc.filename, sourceUrl: doc.sourceUrl ?? null }
3246
})
3347

34-
return documentNameMap
48+
return map
3549
}
3650

3751
export interface SearchResult {

apps/sim/app/api/v1/knowledge/search/route.test.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ const {
1515
mockHandleTagAndVectorSearch,
1616
mockGetQueryStrategy,
1717
mockGenerateSearchEmbedding,
18-
mockGetDocumentNamesByIds,
18+
mockGetDocumentMetadataByIds,
1919
mockAuthenticateRequest,
2020
mockValidateWorkspaceAccess,
2121
} = vi.hoisted(() => ({
@@ -24,7 +24,7 @@ const {
2424
mockHandleTagAndVectorSearch: vi.fn(),
2525
mockGetQueryStrategy: vi.fn(),
2626
mockGenerateSearchEmbedding: vi.fn(),
27-
mockGetDocumentNamesByIds: vi.fn(),
27+
mockGetDocumentMetadataByIds: vi.fn(),
2828
mockAuthenticateRequest: vi.fn(),
2929
mockValidateWorkspaceAccess: vi.fn(),
3030
}))
@@ -35,7 +35,7 @@ vi.mock('@/app/api/knowledge/search/utils', () => ({
3535
handleTagAndVectorSearch: mockHandleTagAndVectorSearch,
3636
getQueryStrategy: mockGetQueryStrategy,
3737
generateSearchEmbedding: mockGenerateSearchEmbedding,
38-
getDocumentNamesByIds: mockGetDocumentNamesByIds,
38+
getDocumentMetadataByIds: mockGetDocumentMetadataByIds,
3939
}))
4040

4141
vi.mock('@/app/api/knowledge/utils', () => knowledgeApiUtilsMock)
@@ -81,7 +81,7 @@ describe('v1 knowledge search route — per-KB embedding model', () => {
8181
mockGetQueryStrategy.mockReturnValue({ distanceThreshold: 0.5 })
8282
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
8383
mockHandleVectorOnlySearch.mockResolvedValue([])
84-
mockGetDocumentNamesByIds.mockResolvedValue({})
84+
mockGetDocumentMetadataByIds.mockResolvedValue({})
8585
})
8686

8787
it('passes the KB embedding model into generateSearchEmbedding', async () => {

apps/sim/app/api/v1/knowledge/search/route.ts

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ import { buildUndefinedTagsError, validateTagValue } from '@/lib/knowledge/tags/
88
import type { StructuredFilter } from '@/lib/knowledge/types'
99
import {
1010
generateSearchEmbedding,
11-
getDocumentNamesByIds,
11+
getDocumentMetadataByIds,
1212
getQueryStrategy,
1313
handleTagAndVectorSearch,
1414
handleTagOnlySearch,
@@ -205,7 +205,7 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
205205
})
206206

207207
const documentIds = results.map((r) => r.documentId)
208-
const documentNameMap = await getDocumentNamesByIds(documentIds)
208+
const documentMetadataMap = await getDocumentMetadataByIds(documentIds)
209209

210210
return NextResponse.json({
211211
success: true,
@@ -222,9 +222,11 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
222222
}
223223
})
224224

225+
const docMeta = documentMetadataMap[result.documentId]
225226
return {
226227
documentId: result.documentId,
227-
documentName: documentNameMap[result.documentId] || undefined,
228+
documentName: docMeta?.filename || undefined,
229+
sourceUrl: docMeta?.sourceUrl ?? null,
228230
content: result.content,
229231
chunkIndex: result.chunkIndex,
230232
metadata: tags,

apps/sim/tools/knowledge/search.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,11 @@ export const knowledgeSearchTool: ToolConfig<any, KnowledgeSearchResponse> = {
177177
properties: {
178178
documentId: { type: 'string', description: 'Document ID' },
179179
documentName: { type: 'string', description: 'Document name' },
180+
sourceUrl: {
181+
type: 'string',
182+
description:
183+
'URL to the original source document (e.g., Confluence page, Google Doc, Notion page). Null for documents without an external source.',
184+
},
180185
content: { type: 'string', description: 'Content of the result' },
181186
chunkIndex: { type: 'number', description: 'Index of the chunk within the document' },
182187
similarity: { type: 'number', description: 'Similarity score of the result' },

apps/sim/tools/knowledge/types.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ export function inferDocumentFileInfo(documentName: string): {
3636
export interface KnowledgeSearchResult {
3737
documentId: string
3838
documentName: string
39+
sourceUrl: string | null
3940
content: string
4041
chunkIndex: number
4142
metadata: Record<string, any>

0 commit comments

Comments
 (0)