Skip to content

Commit c197e2f

Browse files
committed
Don't run unicode61 tokenizer categories tests before SQLite 3.25.0
1 parent 13b3f1d commit c197e2f

File tree

1 file changed

+69
-27
lines changed

1 file changed

+69
-27
lines changed

Tests/GRDBTests/FTS5TokenizerTests.swift

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,15 @@ class FTS5TokenizerTests: GRDBTestCase {
198198
#endif
199199

200200
func testUnicode61TokenizerCategories() throws {
201+
// Prevent SQLCipher failures.
202+
// Categories are not mentioned in the SQLite release notes.
203+
// They were introduced on 2018-07-13 in https://sqlite.org/src/info/80d2b9e635e3100f
204+
// Next version is 3.25.0.
205+
// So we assume support for categories was introduced in SQLite 3.25.0.
206+
guard sqlite3_libversion_number() >= 3025000 else {
207+
throw XCTSkip("FTS5 unicode61 tokenizer categories are not available")
208+
}
209+
201210
// Default categories
202211
try makeDatabaseQueue().inDatabase { db in
203212
try db.create(virtualTable: "documents", using: FTS5()) { t in
@@ -273,106 +282,139 @@ class FTS5TokenizerTests: GRDBTestCase {
273282
let ascii = try db.makeTokenizer(.ascii())
274283
let porter = try db.makeTokenizer(.porter())
275284
let unicode61 = try db.makeTokenizer(.unicode61())
276-
let unicode61OnlyLowercaseLetters = try db.makeTokenizer(.unicode61(categories: "Ll"))
277-
let unicode61WithSymbols = try db.makeTokenizer(.unicode61(categories: "L* N* Co S*"))
278285
let unicode61WithDiacritics = try db.makeTokenizer(.unicode61(diacritics: .keep))
279286

280287
// Empty query
281288
try XCTAssertEqual(ascii.tokenize(query: "").map(\.token), [])
282289
try XCTAssertEqual(porter.tokenize(query: "").map(\.token), [])
283290
try XCTAssertEqual(unicode61.tokenize(query: "").map(\.token), [])
284-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "").map(\.token), [])
285-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "").map(\.token), [])
286291
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "").map(\.token), [])
287292

288293
try XCTAssertEqual(ascii.tokenize(query: "?!").map(\.token), [])
289294
try XCTAssertEqual(porter.tokenize(query: "?!").map(\.token), [])
290295
try XCTAssertEqual(unicode61.tokenize(query: "?!").map(\.token), [])
291-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "?!").map(\.token), [])
292-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "?!").map(\.token), [])
293296
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "?!").map(\.token), [])
294297

295298
// Token queries
296299
try XCTAssertEqual(ascii.tokenize(query: "Moby").map(\.token), ["moby"])
297300
try XCTAssertEqual(porter.tokenize(query: "Moby").map(\.token), ["mobi"])
298301
try XCTAssertEqual(unicode61.tokenize(query: "Moby").map(\.token), ["moby"])
299-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Moby").map(\.token), ["oby"])
300-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Moby").map(\.token), ["moby"])
301302
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "Moby").map(\.token), ["moby"])
302303

303304
try XCTAssertEqual(ascii.tokenize(query: "écarlates").map(\.token), ["écarlates"])
304305
try XCTAssertEqual(porter.tokenize(query: "écarlates").map(\.token), ["ecarl"])
305306
try XCTAssertEqual(unicode61.tokenize(query: "écarlates").map(\.token), ["ecarlates"])
306-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "écarlates").map(\.token), ["ecarlates"])
307-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "écarlates").map(\.token), ["ecarlates"])
308307
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "écarlates").map(\.token), ["écarlates"])
309308

310309
try XCTAssertEqual(ascii.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooéı👨👨🏿🇫🇷🇨🇮"])
311310
try XCTAssertEqual(porter.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı", "🏿"])
312311
try XCTAssertEqual(unicode61.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı", "🏿"])
313-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı", "🏿"])
314-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı👨👨🏿🇫🇷🇨🇮"])
315312
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooéı", "🏿"])
316313

317314
try XCTAssertEqual(ascii.tokenize(query: "SQLite database").map(\.token), ["sqlite", "database"])
318315
try XCTAssertEqual(porter.tokenize(query: "SQLite database").map(\.token), ["sqlite", "databas"])
319316
try XCTAssertEqual(unicode61.tokenize(query: "SQLite database").map(\.token), ["sqlite", "database"])
320-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "SQLite database").map(\.token), ["ite", "database"])
321-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "SQLite database").map(\.token), ["sqlite", "database"])
322317
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "SQLite database").map(\.token), ["sqlite", "database"])
323318

324319
try XCTAssertEqual(ascii.tokenize(query: "Édouard Manet").map(\.token), ["Édouard", "manet"])
325320
try XCTAssertEqual(porter.tokenize(query: "Édouard Manet").map(\.token), ["edouard", "manet"])
326321
try XCTAssertEqual(unicode61.tokenize(query: "Édouard Manet").map(\.token), ["edouard", "manet"])
327-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Édouard Manet").map(\.token), ["douard", "anet"])
328-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Édouard Manet").map(\.token), ["edouard", "manet"])
329322
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "Édouard Manet").map(\.token), ["édouard", "manet"])
330323

331324
// Prefix queries
332325
try XCTAssertEqual(ascii.tokenize(query: "*").map(\.token), [])
333326
try XCTAssertEqual(porter.tokenize(query: "*").map(\.token), [])
334327
try XCTAssertEqual(unicode61.tokenize(query: "*").map(\.token), [])
335-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "*").map(\.token), [])
336-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "*").map(\.token), [])
337328
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "*").map(\.token), [])
338329

339330
try XCTAssertEqual(ascii.tokenize(query: "Robin*").map(\.token), ["robin"])
340331
try XCTAssertEqual(porter.tokenize(query: "Robin*").map(\.token), ["robin"])
341332
try XCTAssertEqual(unicode61.tokenize(query: "Robin*").map(\.token), ["robin"])
342-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Robin*").map(\.token), ["obin"])
343-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Robin*").map(\.token), ["robin"])
344333
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "Robin*").map(\.token), ["robin"])
345334

346335
// Phrase queries
347336
try XCTAssertEqual(ascii.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
348337
try XCTAssertEqual(porter.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscl"])
349338
try XCTAssertEqual(unicode61.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
350-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
351-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
352339
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
353340

354341
try XCTAssertEqual(ascii.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
355342
try XCTAssertEqual(porter.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
356343
try XCTAssertEqual(unicode61.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
357-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["im", "tan", "obin"])
358-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
359344
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
360345

361346
// Logical queries
362347
try XCTAssertEqual(ascii.tokenize(query: "years AND months").map(\.token), ["years", "and", "months"])
363348
try XCTAssertEqual(porter.tokenize(query: "years AND months").map(\.token), ["year", "and", "month"])
364349
try XCTAssertEqual(unicode61.tokenize(query: "years AND months").map(\.token), ["years", "and", "months"])
365-
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "years AND months").map(\.token), ["years", "months"])
366-
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "years AND months").map(\.token), ["years", "and", "months"])
367350
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "years AND months").map(\.token), ["years", "and", "months"])
368351

369352
// column queries
370353
try XCTAssertEqual(ascii.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
371354
try XCTAssertEqual(porter.tokenize(query: "title:brest").map(\.token), ["titl", "brest"])
372355
try XCTAssertEqual(unicode61.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
356+
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
357+
}
358+
}
359+
360+
func testTokenize_Unicode61TokenizerCategories() throws {
361+
// Prevent SQLCipher failures.
362+
// Categories are not mentioned in the SQLite release notes.
363+
// They were introduced on 2018-07-13 in https://sqlite.org/src/info/80d2b9e635e3100f
364+
// Next version is 3.25.0.
365+
// So we assume support for categories was introduced in SQLite 3.25.0.
366+
guard sqlite3_libversion_number() >= 3025000 else {
367+
throw XCTSkip("FTS5 unicode61 tokenizer categories are not available")
368+
}
369+
370+
try makeDatabaseQueue().inDatabase { db in
371+
let unicode61OnlyLowercaseLetters = try db.makeTokenizer(.unicode61(categories: "Ll"))
372+
let unicode61WithSymbols = try db.makeTokenizer(.unicode61(categories: "L* N* Co S*"))
373+
374+
// Empty query
375+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "").map(\.token), [])
376+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "").map(\.token), [])
377+
378+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "?!").map(\.token), [])
379+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "?!").map(\.token), [])
380+
381+
// Token queries
382+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Moby").map(\.token), ["oby"])
383+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Moby").map(\.token), ["moby"])
384+
385+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "écarlates").map(\.token), ["ecarlates"])
386+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "écarlates").map(\.token), ["ecarlates"])
387+
388+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı", "🏿"])
389+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["fooeı👨👨🏿🇫🇷🇨🇮"])
390+
391+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "SQLite database").map(\.token), ["ite", "database"])
392+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "SQLite database").map(\.token), ["sqlite", "database"])
393+
394+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Édouard Manet").map(\.token), ["douard", "anet"])
395+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Édouard Manet").map(\.token), ["edouard", "manet"])
396+
397+
// Prefix queries
398+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "*").map(\.token), [])
399+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "*").map(\.token), [])
400+
401+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "Robin*").map(\.token), ["obin"])
402+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "Robin*").map(\.token), ["robin"])
403+
404+
// Phrase queries
405+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
406+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "\"foulent muscles\"").map(\.token), ["foulent", "muscles"])
407+
408+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["im", "tan", "obin"])
409+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "\"Kim Stan* Robin*\"").map(\.token), ["kim", "stan", "robin"])
410+
411+
// Logical queries
412+
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "years AND months").map(\.token), ["years", "months"])
413+
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "years AND months").map(\.token), ["years", "and", "months"])
414+
415+
// column queries
373416
try XCTAssertEqual(unicode61OnlyLowercaseLetters.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
374417
try XCTAssertEqual(unicode61WithSymbols.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
375-
try XCTAssertEqual(unicode61WithDiacritics.tokenize(query: "title:brest").map(\.token), ["title", "brest"])
376418
}
377419
}
378420
}

0 commit comments

Comments (0)