Skip to content

Commit 2ddcdd3

Browse files
authored
fix(search): escape unsafe FTS terms (#741)
1 parent 4041c97 commit 2ddcdd3

File tree

2 files changed

+186
-3
lines changed

2 files changed

+186
-3
lines changed
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import { expect, test, vi } from 'vitest'
2+
import { queryLexicalSearch } from './search-db'
3+
4+
/**
 * Builds a stand-in for a D1 database whose `prepare().bind().all()` chain
 * records each bound `(candidateQuery, topK)` pair and then delegates to
 * the supplied handler.
 *
 * @param onAll - Called on every `all()` invocation with the bound values;
 *   whatever it resolves to (or throws) becomes the outcome of `all()`.
 * @returns The fake `db` and the `boundQueries` log that `all()` appends to.
 */
function createDb({
  onAll,
}: {
  onAll: (
    candidateQuery: string,
    topK: number,
  ) => Promise<{ results?: Array<Record<string, unknown>> }>
}) {
  const boundQueries: Array<{ candidateQuery: string; topK: number }> = []

  const prepare = vi.fn(() => {
    const bind = vi.fn((candidateQuery: string, topK: number) => {
      const all = vi.fn(async () => {
        // Log before delegating so the handler can inspect
        // `boundQueries.length` to tell which attempt it is serving.
        boundQueries.push({ candidateQuery, topK })
        return await onAll(candidateQuery, topK)
      })
      return { all }
    })
    return { bind }
  })

  const db = { prepare } as unknown as D1Database

  return { db, boundQueries }
}
27+
28+
// A hyphenated token (here an ISO-style date) must be bound as a quoted
// string, while plain words stay bare and are joined with OR.
test('queryLexicalSearch quotes hyphenated date tokens for FTS', async () => {
  const storedRow = {
    id: 'youtube:office-hours:chunk:0',
    type: 'youtube',
    slug: 'office-hours',
    title: 'KCD Office Hours',
    url: '/youtube?video=office-hours',
    snippet: 'Office hours recording',
    startSeconds: 17,
  }
  const { db, boundQueries } = createDb({
    onAll: async () => ({ results: [storedRow] }),
  })

  const results = await queryLexicalSearch({
    db,
    query: 'KCD Office Hours 2025-04-17',
    topK: 5,
  })

  // Exactly one statement was bound, with the date wrapped in double quotes.
  const expectedBinding = {
    candidateQuery: 'KCD OR Office OR Hours OR "2025-04-17"',
    topK: 5,
  }
  expect(boundQueries).toEqual([expectedBinding])

  // Fields absent from the stored row are expected to come back as undefined.
  const expectedResult = {
    id: 'youtube:office-hours:chunk:0',
    type: 'youtube',
    slug: 'office-hours',
    title: 'KCD Office Hours',
    url: '/youtube?video=office-hours',
    snippet: 'Office hours recording',
    chunkIndex: undefined,
    chunkCount: undefined,
    startSeconds: 17,
    endSeconds: undefined,
    imageUrl: undefined,
    imageAlt: undefined,
  }
  expect(results).toEqual([expectedResult])
})
74+
75+
// When the first statement fails with a "no such column" error, a second
// statement is expected to be bound and its (empty) result returned.
test('queryLexicalSearch retries when D1 reports no such column', async () => {
  const { db, boundQueries } = createDb({
    onAll: async () => {
      // createDb logs the binding before calling this handler, so a length
      // of 1 identifies the very first attempt.
      const isFirstAttempt = boundQueries.length === 1
      if (isFirstAttempt) {
        throw new Error('D1_ERROR: no such column: 04: SQLITE_ERROR')
      }
      return { results: [] }
    },
  })

  const results = await queryLexicalSearch({
    db,
    query: 'KCD Office Hours 2025-04-17',
    topK: 5,
  })

  expect(results).toEqual([])

  // Both the failed attempt and the retry bind the same quoted query.
  const expectedBinding = {
    candidateQuery: 'KCD OR Office OR Hours OR "2025-04-17"',
    topK: 5,
  }
  expect(boundQueries).toEqual([expectedBinding, expectedBinding])
})
103+
104+
// A query consisting solely of the upper-case word OR must be bound as the
// quoted string "OR" rather than as a bare keyword.
test('queryLexicalSearch quotes uppercase OR tokens', async () => {
  const storedRow = {
    id: 'blog:or-token:chunk:0',
    type: 'blog',
    slug: 'or-token',
    title: 'Literal OR token',
    url: '/blog/or-token',
    snippet: 'Contains the word OR as content',
  }
  const { db, boundQueries } = createDb({
    onAll: async () => ({ results: [storedRow] }),
  })

  const results = await queryLexicalSearch({
    db,
    query: 'OR',
    topK: 5,
  })

  const expectedBinding = { candidateQuery: '"OR"', topK: 5 }
  expect(boundQueries).toEqual([expectedBinding])
  expect(results[0]?.id).toBe('blog:or-token:chunk:0')
})
134+
135+
// A query consisting solely of the upper-case word NEAR must be bound as the
// quoted string "NEAR" rather than as a bare keyword.
test('queryLexicalSearch quotes uppercase NEAR tokens', async () => {
  const storedRow = {
    id: 'blog:near-token:chunk:0',
    type: 'blog',
    slug: 'near-token',
    title: 'Literal NEAR token',
    url: '/blog/near-token',
    snippet: 'Contains the word NEAR as content',
  }
  const { db, boundQueries } = createDb({
    onAll: async () => ({ results: [storedRow] }),
  })

  const results = await queryLexicalSearch({
    db,
    query: 'NEAR',
    topK: 5,
  })

  const expectedBinding = { candidateQuery: '"NEAR"', topK: 5 }
  expect(boundQueries).toEqual([expectedBinding])
  expect(results[0]?.id).toBe('blog:near-token:chunk:0')
})

services/search-worker/src/search-db.ts

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,17 +130,33 @@ function asNumber(value: unknown): number | undefined {
130130
return typeof value === 'number' && Number.isFinite(value) ? value : undefined
131131
}
132132

133+
/**
 * Reports whether `term` is exactly one of the keywords `AND`, `OR`, `NOT`,
 * or `NEAR`.
 *
 * The comparison is case-sensitive and covers the whole string, so `or`,
 * `ORDER`, and ` OR` are all rejected.
 *
 * @param term - The candidate term, compared as-is (no trimming).
 * @returns `true` only for an exact keyword match.
 */
function isReservedFtsOperator(term: string) {
  switch (term) {
    case 'AND':
    case 'OR':
    case 'NOT':
    case 'NEAR':
      return true
    default:
      return false
  }
}
136+
137+
/**
 * Normalizes one raw search term for embedding in an FTS query string.
 *
 * - Input that is empty after trimming yields `null`.
 * - A term made only of Unicode letters, Unicode numbers, and underscores is
 *   returned unchanged (trimmed), unless `isReservedFtsOperator` flags it.
 * - Anything else is wrapped in double quotes, with each embedded `"`
 *   doubled.
 *
 * @param term - Raw term; surrounding whitespace is ignored.
 * @returns The bare or quoted term, or `null` when nothing remains.
 */
function toSafeFtsTerm(term: string) {
  const candidate = term.trim()
  if (candidate.length === 0) return null

  const isPlainWord = /^[\p{L}\p{N}_]+$/u.test(candidate)
  // The reserved-operator check only runs for plain words; every other term
  // is quoted regardless.
  if (!isPlainWord || isReservedFtsOperator(candidate)) {
    const escaped = candidate.split('"').join('""')
    return `"${escaped}"`
  }

  return candidate
}
145+
133146
function buildLexicalSearchMatchQuery(query: string) {
134147
const terms: string[] = []
135148
for (const match of query.matchAll(/"([^"]+)"|([\p{L}\p{N}_-]+)/gu)) {
136149
const phrase = match[1]?.trim()
137150
if (phrase) {
138-
terms.push(`"${phrase.replace(/"/g, '""')}"`)
151+
const safePhrase = toSafeFtsTerm(phrase)
152+
if (safePhrase) terms.push(safePhrase)
139153
continue
140154
}
141155

142156
const token = match[2]?.trim()
143-
if (token) terms.push(token)
157+
if (!token) continue
158+
const safeToken = toSafeFtsTerm(token)
159+
if (safeToken) terms.push(safeToken)
144160
}
145161

146162
if (!terms.length) return null
@@ -196,7 +212,9 @@ function buildDocRecords({
196212

197213
/**
 * Reports whether `error` is an `Error` whose message contains one of the
 * substrings `fts5`, `syntax error`, `malformed match`, `unterminated`, or
 * `no such column` (case-insensitive).
 *
 * @param error - Any thrown value; non-`Error` values are never a match.
 * @returns `true` when the message matches one of the listed patterns.
 */
function isFtsQuerySyntaxError(error: unknown) {
  const message = error instanceof Error ? error.message : null
  if (message === null) return false

  const ftsSyntaxErrorPattern =
    /fts5|syntax error|malformed match|unterminated|no such column/i
  return ftsSyntaxErrorPattern.test(message)
}
201219

202220
async function runStatementsInTransaction({
@@ -452,6 +470,7 @@ export async function queryLexicalSearch({
452470
const fallbackQuery = query
453471
.split(/\s+/u)
454472
.map((token) => token.replace(/[^\p{L}\p{N}_-]+/gu, '').trim())
473+
.map((token) => toSafeFtsTerm(token))
455474
.filter(Boolean)
456475
.join(' OR ')
457476
if (!fallbackQuery) return []

0 commit comments

Comments
 (0)