Skip to content

Commit 6cf02b9

Browse files
authored
fix(kb): exclude deleted docs from embeddings/vector search (#1319)
* update infra and remove railway
* fix(kb): exclude deleted docs from queries
* Revert "update infra and remove railway" (this reverts commit b23258a)
1 parent 2dc75b1 commit 6cf02b9

File tree

3 files changed

+232
-2
lines changed

3 files changed

+232
-2
lines changed

apps/sim/app/api/knowledge/search/route.test.ts

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,4 +1006,210 @@ describe('Knowledge Search API Route', () => {
10061006
expect(mockGenerateSearchEmbedding).not.toHaveBeenCalled() // No embedding for tag-only
10071007
})
10081008
})
1009+
1010+
describe('Deleted document filtering', () => {
1011+
// Vector-only case: the request carries a `query` and no `filters`. The test
// expects HTTP 200 and exactly one result, the chunk from 'doc-active', with
// its document name resolved to 'Active Document.pdf'.
// NOTE(review): mockHandleVectorOnlySearch is configured to resolve with only
// an active-document chunk, so this presumably verifies pass-through of the
// handler output rather than the deletedAt filter itself — confirm against
// the mock setup at the top of the file.
it('should exclude results from deleted documents in vector search', async () => {
1012+
mockGetUserId.mockResolvedValue('user-123')
1013+
1014+
mockCheckKnowledgeBaseAccess.mockResolvedValue({
1015+
hasAccess: true,
1016+
knowledgeBase: {
1017+
id: 'kb-123',
1018+
userId: 'user-123',
1019+
name: 'Test KB',
1020+
deletedAt: null,
1021+
},
1022+
})
1023+
1024+
// The stubbed handler resolves with a single chunk belonging to 'doc-active'.
mockHandleVectorOnlySearch.mockResolvedValue([
1025+
{
1026+
id: 'chunk-1',
1027+
content: 'Content from active document',
1028+
documentId: 'doc-active',
1029+
chunkIndex: 0,
1030+
tag1: null,
1031+
tag2: null,
1032+
tag3: null,
1033+
tag4: null,
1034+
tag5: null,
1035+
tag6: null,
1036+
tag7: null,
1037+
distance: 0.2,
1038+
knowledgeBaseId: 'kb-123',
1039+
},
1040+
])
1041+
1042+
mockGetQueryStrategy.mockReturnValue({
1043+
useParallel: false,
1044+
distanceThreshold: 1.0,
1045+
parallelLimit: 15,
1046+
singleQueryOptimized: true,
1047+
})
1048+
1049+
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1050+
// The name lookup is stubbed to know only the active document.
mockGetDocumentNamesByIds.mockResolvedValue({
1051+
'doc-active': 'Active Document.pdf',
1052+
})
1053+
1054+
// Chainable stub returned by the next db select() call; where() resolves to
// an empty list (presumably the tag-definitions lookup — confirm in route).
const mockTagDefs = {
1055+
select: vi.fn().mockReturnThis(),
1056+
from: vi.fn().mockReturnThis(),
1057+
where: vi.fn().mockResolvedValue([]),
1058+
}
1059+
mockDbChain.select.mockReturnValueOnce(mockTagDefs)
1060+
1061+
const req = createMockRequest('POST', {
1062+
knowledgeBaseIds: ['kb-123'],
1063+
query: 'test query',
1064+
topK: 10,
1065+
})
1066+
1067+
const { POST } = await import('@/app/api/knowledge/search/route')
1068+
const response = await POST(req)
1069+
const data = await response.json()
1070+
1071+
expect(response.status).toBe(200)
1072+
expect(data.success).toBe(true)
1073+
expect(data.data.results).toHaveLength(1)
1074+
expect(data.data.results[0].documentId).toBe('doc-active')
1075+
expect(data.data.results[0].documentName).toBe('Active Document.pdf')
1076+
})
1077+
1078+
// Tag-only case: the request carries `filters` and no `query`, and no
// embedding mock is configured. The test expects HTTP 200 and exactly one
// result, the chunk from 'doc-active-tagged', with metadata { tag1: 'api' }.
// NOTE(review): mockHandleTagOnlySearch is configured to resolve with only an
// active-document chunk, so this presumably verifies pass-through of the
// handler output rather than the deletedAt filter itself — confirm against
// the mock setup at the top of the file.
it('should exclude results from deleted documents in tag search', async () => {
1079+
mockGetUserId.mockResolvedValue('user-123')
1080+
1081+
mockCheckKnowledgeBaseAccess.mockResolvedValue({
1082+
hasAccess: true,
1083+
knowledgeBase: {
1084+
id: 'kb-123',
1085+
userId: 'user-123',
1086+
name: 'Test KB',
1087+
deletedAt: null,
1088+
},
1089+
})
1090+
1091+
// The stubbed handler resolves with a single tagged chunk; distance is 0.
mockHandleTagOnlySearch.mockResolvedValue([
1092+
{
1093+
id: 'chunk-2',
1094+
content: 'Content from active document with tag',
1095+
documentId: 'doc-active-tagged',
1096+
chunkIndex: 0,
1097+
tag1: 'api',
1098+
tag2: null,
1099+
tag3: null,
1100+
tag4: null,
1101+
tag5: null,
1102+
tag6: null,
1103+
tag7: null,
1104+
distance: 0,
1105+
knowledgeBaseId: 'kb-123',
1106+
},
1107+
])
1108+
1109+
mockGetQueryStrategy.mockReturnValue({
1110+
useParallel: false,
1111+
distanceThreshold: 1.0,
1112+
parallelLimit: 15,
1113+
singleQueryOptimized: true,
1114+
})
1115+
1116+
// The name lookup is stubbed to know only the active tagged document.
mockGetDocumentNamesByIds.mockResolvedValue({
1117+
'doc-active-tagged': 'Active Tagged Document.pdf',
1118+
})
1119+
1120+
// Chainable stub returned by the next db select() call; where() resolves to
// an empty list (presumably the tag-definitions lookup — confirm in route).
const mockTagDefs = {
1121+
select: vi.fn().mockReturnThis(),
1122+
from: vi.fn().mockReturnThis(),
1123+
where: vi.fn().mockResolvedValue([]),
1124+
}
1125+
mockDbChain.select.mockReturnValueOnce(mockTagDefs)
1126+
1127+
const req = createMockRequest('POST', {
1128+
knowledgeBaseIds: ['kb-123'],
1129+
filters: { tag1: 'api' },
1130+
topK: 10,
1131+
})
1132+
1133+
const { POST } = await import('@/app/api/knowledge/search/route')
1134+
const response = await POST(req)
1135+
const data = await response.json()
1136+
1137+
expect(response.status).toBe(200)
1138+
expect(data.success).toBe(true)
1139+
expect(data.data.results).toHaveLength(1)
1140+
expect(data.data.results[0].documentId).toBe('doc-active-tagged')
1141+
expect(data.data.results[0].documentName).toBe('Active Tagged Document.pdf')
1142+
expect(data.data.results[0].metadata).toEqual({ tag1: 'api' })
1143+
})
1144+
1145+
// Combined case: the request carries both `query` and `filters`. The test
// expects HTTP 200 and exactly one result, the chunk from
// 'doc-active-combined', with metadata { tag1: 'guide' } and similarity 0.85.
// NOTE(review): mockHandleTagAndVectorSearch is configured to resolve with
// only an active-document chunk, so this presumably verifies pass-through of
// the handler output rather than the deletedAt filter itself — confirm
// against the mock setup at the top of the file.
it('should exclude results from deleted documents in combined tag+vector search', async () => {
1146+
mockGetUserId.mockResolvedValue('user-123')
1147+
1148+
mockCheckKnowledgeBaseAccess.mockResolvedValue({
1149+
hasAccess: true,
1150+
knowledgeBase: {
1151+
id: 'kb-123',
1152+
userId: 'user-123',
1153+
name: 'Test KB',
1154+
deletedAt: null,
1155+
},
1156+
})
1157+
1158+
// The stubbed handler resolves with a single tagged chunk at distance 0.15.
mockHandleTagAndVectorSearch.mockResolvedValue([
1159+
{
1160+
id: 'chunk-3',
1161+
content: 'Relevant content from active document',
1162+
documentId: 'doc-active-combined',
1163+
chunkIndex: 0,
1164+
tag1: 'guide',
1165+
tag2: null,
1166+
tag3: null,
1167+
tag4: null,
1168+
tag5: null,
1169+
tag6: null,
1170+
tag7: null,
1171+
distance: 0.15,
1172+
knowledgeBaseId: 'kb-123',
1173+
},
1174+
])
1175+
1176+
mockGetQueryStrategy.mockReturnValue({
1177+
useParallel: false,
1178+
distanceThreshold: 1.0,
1179+
parallelLimit: 15,
1180+
singleQueryOptimized: true,
1181+
})
1182+
1183+
mockGenerateSearchEmbedding.mockResolvedValue([0.1, 0.2, 0.3])
1184+
// The name lookup is stubbed to know only the active combined-search document.
mockGetDocumentNamesByIds.mockResolvedValue({
1185+
'doc-active-combined': 'Active Combined Search.pdf',
1186+
})
1187+
1188+
// Chainable stub returned by the next db select() call; where() resolves to
// an empty list (presumably the tag-definitions lookup — confirm in route).
const mockTagDefs = {
1189+
select: vi.fn().mockReturnThis(),
1190+
from: vi.fn().mockReturnThis(),
1191+
where: vi.fn().mockResolvedValue([]),
1192+
}
1193+
mockDbChain.select.mockReturnValueOnce(mockTagDefs)
1194+
1195+
const req = createMockRequest('POST', {
1196+
knowledgeBaseIds: ['kb-123'],
1197+
query: 'relevant content',
1198+
filters: { tag1: 'guide' },
1199+
topK: 10,
1200+
})
1201+
1202+
const { POST } = await import('@/app/api/knowledge/search/route')
1203+
const response = await POST(req)
1204+
const data = await response.json()
1205+
1206+
expect(response.status).toBe(200)
1207+
expect(data.success).toBe(true)
1208+
expect(data.data.results).toHaveLength(1)
1209+
expect(data.data.results[0].documentId).toBe('doc-active-combined')
1210+
expect(data.data.results[0].documentName).toBe('Active Combined Search.pdf')
1211+
expect(data.data.results[0].metadata).toEqual({ tag1: 'guide' })
1212+
// NOTE(review): exact float equality; toBeCloseTo would be more tolerant if
// the similarity computation ever changes form.
expect(data.data.results[0].similarity).toBe(0.85) // 1 - 0.15 distance
1213+
})
1214+
})
10091215
})

apps/sim/app/api/knowledge/search/utils.test.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,4 +422,14 @@ describe('Knowledge Search Utils', () => {
422422
Object.keys(env).forEach((key) => delete (env as any)[key])
423423
})
424424
})
425+
426+
// Covers getDocumentNamesByIds from ./utils for the empty-input case.
describe('getDocumentNamesByIds', () => {
427+
// An empty id list must resolve to an empty name map.
// NOTE(review): whether the function returns before querying the database
// for empty input is not visible here — confirm in utils.ts.
it('should handle empty input gracefully', async () => {
428+
const { getDocumentNamesByIds } = await import('./utils')
429+
430+
const result = await getDocumentNamesByIds([])
431+
432+
expect(result).toEqual({})
433+
})
434+
})
425435
})

apps/sim/app/api/knowledge/search/utils.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { and, eq, inArray, sql } from 'drizzle-orm'
1+
import { and, eq, inArray, isNull, sql } from 'drizzle-orm'
22
import { createLogger } from '@/lib/logs/console/logger'
33
import { db } from '@/db'
44
import { document, embedding } from '@/db/schema'
@@ -19,7 +19,7 @@ export async function getDocumentNamesByIds(
1919
filename: document.filename,
2020
})
2121
.from(document)
22-
.where(inArray(document.id, uniqueIds))
22+
.where(and(inArray(document.id, uniqueIds), isNull(document.deletedAt)))
2323

2424
const documentNameMap: Record<string, string> = {}
2525
documents.forEach((doc) => {
@@ -119,21 +119,25 @@ async function executeTagFilterQuery(
119119
return await db
120120
.select({ id: embedding.id })
121121
.from(embedding)
122+
.innerJoin(document, eq(embedding.documentId, document.id))
122123
.where(
123124
and(
124125
eq(embedding.knowledgeBaseId, knowledgeBaseIds[0]),
125126
eq(embedding.enabled, true),
127+
isNull(document.deletedAt),
126128
...getTagFilters(filters, embedding)
127129
)
128130
)
129131
}
130132
return await db
131133
.select({ id: embedding.id })
132134
.from(embedding)
135+
.innerJoin(document, eq(embedding.documentId, document.id))
133136
.where(
134137
and(
135138
inArray(embedding.knowledgeBaseId, knowledgeBaseIds),
136139
eq(embedding.enabled, true),
140+
isNull(document.deletedAt),
137141
...getTagFilters(filters, embedding)
138142
)
139143
)
@@ -166,9 +170,11 @@ async function executeVectorSearchOnIds(
166170
knowledgeBaseId: embedding.knowledgeBaseId,
167171
})
168172
.from(embedding)
173+
.innerJoin(document, eq(embedding.documentId, document.id))
169174
.where(
170175
and(
171176
inArray(embedding.id, embeddingIds),
177+
isNull(document.deletedAt),
172178
sql`${embedding.embedding} <=> ${queryVector}::vector < ${distanceThreshold}`
173179
)
174180
)
@@ -209,10 +215,12 @@ export async function handleTagOnlySearch(params: SearchParams): Promise<SearchR
209215
knowledgeBaseId: embedding.knowledgeBaseId,
210216
})
211217
.from(embedding)
218+
.innerJoin(document, eq(embedding.documentId, document.id))
212219
.where(
213220
and(
214221
eq(embedding.knowledgeBaseId, kbId),
215222
eq(embedding.enabled, true),
223+
isNull(document.deletedAt),
216224
...getTagFilters(filters, embedding)
217225
)
218226
)
@@ -240,10 +248,12 @@ export async function handleTagOnlySearch(params: SearchParams): Promise<SearchR
240248
knowledgeBaseId: embedding.knowledgeBaseId,
241249
})
242250
.from(embedding)
251+
.innerJoin(document, eq(embedding.documentId, document.id))
243252
.where(
244253
and(
245254
inArray(embedding.knowledgeBaseId, knowledgeBaseIds),
246255
eq(embedding.enabled, true),
256+
isNull(document.deletedAt),
247257
...getTagFilters(filters, embedding)
248258
)
249259
)
@@ -283,10 +293,12 @@ export async function handleVectorOnlySearch(params: SearchParams): Promise<Sear
283293
knowledgeBaseId: embedding.knowledgeBaseId,
284294
})
285295
.from(embedding)
296+
.innerJoin(document, eq(embedding.documentId, document.id))
286297
.where(
287298
and(
288299
eq(embedding.knowledgeBaseId, kbId),
289300
eq(embedding.enabled, true),
301+
isNull(document.deletedAt),
290302
sql`${embedding.embedding} <=> ${queryVector}::vector < ${distanceThreshold}`
291303
)
292304
)
@@ -316,10 +328,12 @@ export async function handleVectorOnlySearch(params: SearchParams): Promise<Sear
316328
knowledgeBaseId: embedding.knowledgeBaseId,
317329
})
318330
.from(embedding)
331+
.innerJoin(document, eq(embedding.documentId, document.id))
319332
.where(
320333
and(
321334
inArray(embedding.knowledgeBaseId, knowledgeBaseIds),
322335
eq(embedding.enabled, true),
336+
isNull(document.deletedAt),
323337
sql`${embedding.embedding} <=> ${queryVector}::vector < ${distanceThreshold}`
324338
)
325339
)

0 commit comments

Comments
 (0)