@@ -198,6 +198,15 @@ class FTS5TokenizerTests: GRDBTestCase {
198
198
#endif
199
199
200
200
func testUnicode61TokenizerCategories( ) throws {
201
+ // Prevent SQLCipher failures.
202
+ // Categories are not mentioned in the SQLite release notes.
203
+ // They were introduced on 2018-07-13 in https://sqlite.org/src/info/80d2b9e635e3100f
204
+ // The next SQLite version released after that change is 3.25.0.
205
+ // So we assume support for categories was introduced in SQLite 3.25.0.
206
+ guard sqlite3_libversion_number ( ) >= 3025000 else {
207
+ throw XCTSkip ( " FTS5 unicode61 tokenizer categories are not available " )
208
+ }
209
+
201
210
// Default categories
202
211
try makeDatabaseQueue ( ) . inDatabase { db in
203
212
try db. create ( virtualTable: " documents " , using: FTS5 ( ) ) { t in
@@ -273,106 +282,139 @@ class FTS5TokenizerTests: GRDBTestCase {
273
282
let ascii = try db. makeTokenizer ( . ascii( ) )
274
283
let porter = try db. makeTokenizer ( . porter( ) )
275
284
let unicode61 = try db. makeTokenizer ( . unicode61( ) )
276
- let unicode61OnlyLowercaseLetters = try db. makeTokenizer ( . unicode61( categories: " Ll " ) )
277
- let unicode61WithSymbols = try db. makeTokenizer ( . unicode61( categories: " L* N* Co S* " ) )
278
285
let unicode61WithDiacritics = try db. makeTokenizer ( . unicode61( diacritics: . keep) )
279
286
280
287
// Empty query
281
288
try XCTAssertEqual ( ascii. tokenize ( query: " " ) . map ( \. token) , [ ] )
282
289
try XCTAssertEqual ( porter. tokenize ( query: " " ) . map ( \. token) , [ ] )
283
290
try XCTAssertEqual ( unicode61. tokenize ( query: " " ) . map ( \. token) , [ ] )
284
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " " ) . map ( \. token) , [ ] )
285
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " " ) . map ( \. token) , [ ] )
286
291
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " " ) . map ( \. token) , [ ] )
287
292
288
293
try XCTAssertEqual ( ascii. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
289
294
try XCTAssertEqual ( porter. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
290
295
try XCTAssertEqual ( unicode61. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
291
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
292
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
293
296
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
294
297
295
298
// Token queries
296
299
try XCTAssertEqual ( ascii. tokenize ( query: " Moby " ) . map ( \. token) , [ " moby " ] )
297
300
try XCTAssertEqual ( porter. tokenize ( query: " Moby " ) . map ( \. token) , [ " mobi " ] )
298
301
try XCTAssertEqual ( unicode61. tokenize ( query: " Moby " ) . map ( \. token) , [ " moby " ] )
299
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Moby " ) . map ( \. token) , [ " oby " ] )
300
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Moby " ) . map ( \. token) , [ " moby " ] )
301
302
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " Moby " ) . map ( \. token) , [ " moby " ] )
302
303
303
304
try XCTAssertEqual ( ascii. tokenize ( query: " écarlates " ) . map ( \. token) , [ " écarlates " ] )
304
305
try XCTAssertEqual ( porter. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarl " ] )
305
306
try XCTAssertEqual ( unicode61. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarlates " ] )
306
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarlates " ] )
307
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarlates " ] )
308
307
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " écarlates " ) . map ( \. token) , [ " écarlates " ] )
309
308
310
309
try XCTAssertEqual ( ascii. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooéı👨👨🏿🇫🇷🇨🇮 " ] )
311
310
try XCTAssertEqual ( porter. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı " , " 🏿 " ] )
312
311
try XCTAssertEqual ( unicode61. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı " , " 🏿 " ] )
313
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı " , " 🏿 " ] )
314
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı👨👨🏿🇫🇷🇨🇮 " ] )
315
312
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooéı " , " 🏿 " ] )
316
313
317
314
try XCTAssertEqual ( ascii. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " database " ] )
318
315
try XCTAssertEqual ( porter. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " databas " ] )
319
316
try XCTAssertEqual ( unicode61. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " database " ] )
320
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " ite " , " database " ] )
321
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " database " ] )
322
317
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " database " ] )
323
318
324
319
try XCTAssertEqual ( ascii. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " Édouard " , " manet " ] )
325
320
try XCTAssertEqual ( porter. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " edouard " , " manet " ] )
326
321
try XCTAssertEqual ( unicode61. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " edouard " , " manet " ] )
327
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " douard " , " anet " ] )
328
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " edouard " , " manet " ] )
329
322
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " édouard " , " manet " ] )
330
323
331
324
// Prefix queries
332
325
try XCTAssertEqual ( ascii. tokenize ( query: " * " ) . map ( \. token) , [ ] )
333
326
try XCTAssertEqual ( porter. tokenize ( query: " * " ) . map ( \. token) , [ ] )
334
327
try XCTAssertEqual ( unicode61. tokenize ( query: " * " ) . map ( \. token) , [ ] )
335
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " * " ) . map ( \. token) , [ ] )
336
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " * " ) . map ( \. token) , [ ] )
337
328
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " * " ) . map ( \. token) , [ ] )
338
329
339
330
try XCTAssertEqual ( ascii. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
340
331
try XCTAssertEqual ( porter. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
341
332
try XCTAssertEqual ( unicode61. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
342
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Robin* " ) . map ( \. token) , [ " obin " ] )
343
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
344
333
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
345
334
346
335
// Phrase queries
347
336
try XCTAssertEqual ( ascii. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
348
337
try XCTAssertEqual ( porter. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscl " ] )
349
338
try XCTAssertEqual ( unicode61. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
350
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
351
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
352
339
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
353
340
354
341
try XCTAssertEqual ( ascii. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
355
342
try XCTAssertEqual ( porter. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
356
343
try XCTAssertEqual ( unicode61. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
357
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " im " , " tan " , " obin " ] )
358
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
359
344
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
360
345
361
346
// Logical queries
362
347
try XCTAssertEqual ( ascii. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " and " , " months " ] )
363
348
try XCTAssertEqual ( porter. tokenize ( query: " years AND months " ) . map ( \. token) , [ " year " , " and " , " month " ] )
364
349
try XCTAssertEqual ( unicode61. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " and " , " months " ] )
365
- try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " months " ] )
366
- try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " and " , " months " ] )
367
350
try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " and " , " months " ] )
368
351
369
352
// Column queries
370
353
try XCTAssertEqual ( ascii. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
371
354
try XCTAssertEqual ( porter. tokenize ( query: " title:brest " ) . map ( \. token) , [ " titl " , " brest " ] )
372
355
try XCTAssertEqual ( unicode61. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
356
+ try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
357
+ }
358
+ }
359
+
360
+ func testTokenize_Unicode61TokenizerCategories( ) throws {
361
+ // Skip when the linked SQLite predates unicode61 `categories` support; this prevents SQLCipher failures.
362
+ // The `categories` option is not mentioned in the SQLite release notes.
363
+ // It was introduced on 2018-07-13 in https://sqlite.org/src/info/80d2b9e635e3100f
364
+ // The next SQLite version released after that change is 3.25.0,
365
+ // so we assume support for categories was introduced in SQLite 3.25.0 (version number 3025000).
366
+ guard sqlite3_libversion_number ( ) >= 3025000 else {
367
+ throw XCTSkip ( " FTS5 unicode61 tokenizer categories are not available " )
368
+ }
369
+
370
+ try makeDatabaseQueue ( ) . inDatabase { db in
371
+ let unicode61OnlyLowercaseLetters = try db. makeTokenizer ( . unicode61( categories: " Ll " ) )
372
+ let unicode61WithSymbols = try db. makeTokenizer ( . unicode61( categories: " L* N* Co S* " ) )
373
+
374
+ // Empty queries, and queries made only of punctuation, are expected to yield no token
375
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " " ) . map ( \. token) , [ ] )
376
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " " ) . map ( \. token) , [ ] )
377
+
378
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
379
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " ?! " ) . map ( \. token) , [ ] )
380
+
381
+ // Token queries: the `Ll` tokenizer is expected to drop uppercase letters (Moby -> oby), the symbols tokenizer keeps them lowercased
382
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Moby " ) . map ( \. token) , [ " oby " ] )
383
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Moby " ) . map ( \. token) , [ " moby " ] )
384
+
385
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarlates " ] )
386
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " écarlates " ) . map ( \. token) , [ " ecarlates " ] )
387
+
388
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı " , " 🏿 " ] )
389
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " fooéı👨👨🏿🇫🇷🇨🇮 " ) . map ( \. token) , [ " fooeı👨👨🏿🇫🇷🇨🇮 " ] )
390
+
391
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " ite " , " database " ] )
392
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " SQLite database " ) . map ( \. token) , [ " sqlite " , " database " ] )
393
+
394
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " douard " , " anet " ] )
395
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Édouard Manet " ) . map ( \. token) , [ " edouard " , " manet " ] )
396
+
397
+ // Prefix queries: the trailing `*` is expected not to be part of any token
398
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " * " ) . map ( \. token) , [ ] )
399
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " * " ) . map ( \. token) , [ ] )
400
+
401
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " Robin* " ) . map ( \. token) , [ " obin " ] )
402
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " Robin* " ) . map ( \. token) , [ " robin " ] )
403
+
404
+ // Phrase queries: the surrounding double quotes are expected not to be part of any token
405
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
406
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " \" foulent muscles \" " ) . map ( \. token) , [ " foulent " , " muscles " ] )
407
+
408
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " im " , " tan " , " obin " ] )
409
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " \" Kim Stan* Robin* \" " ) . map ( \. token) , [ " kim " , " stan " , " robin " ] )
410
+
411
+ // Logical queries: the uppercase `AND` yields no token with the `Ll` tokenizer, and an `and` token with the symbols tokenizer
412
+ try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " months " ] )
413
+ try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " years AND months " ) . map ( \. token) , [ " years " , " and " , " months " ] )
414
+
415
+ // Column queries: the column name and the searched term are expected as two separate tokens
373
416
try XCTAssertEqual ( unicode61OnlyLowercaseLetters. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
374
417
try XCTAssertEqual ( unicode61WithSymbols. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
375
- try XCTAssertEqual ( unicode61WithDiacritics. tokenize ( query: " title:brest " ) . map ( \. token) , [ " title " , " brest " ] )
376
418
}
377
419
}
378
420
}
0 commit comments