Skip to content

Commit 2edcf4a

Browse files
authored
Merge branch 'main' into basic-cyrillic-normalization
2 parents 1872257 + dd260b9 commit 2edcf4a

File tree

14 files changed

+602
-126
lines changed

14 files changed

+602
-126
lines changed

charabia/Cargo.toml

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "charabia"
3-
version = "0.8.11"
3+
version = "0.9.0"
44
license = "MIT"
55
authors = ["Many <many@meilisearch.com>"]
66
edition = "2021"
@@ -12,30 +12,26 @@ categories = ["text-processing"]
1212
exclude = ["dictionaries/txt/thai/words.txt"]
1313

1414
[dependencies]
15-
aho-corasick = "1.1.2"
16-
cow-utils = "0.1"
15+
aho-corasick = "1.1.3"
1716
csv = "1.3.0"
18-
deunicode = "1.4.2"
19-
either = "1.9.0"
17+
either = "1.13.0"
2018
finl_unicode = { version= "1.2.0", optional = true }
2119
fst = "0.4"
22-
jieba-rs = { version = "0.6", optional = true }
20+
jieba-rs = { version = "0.7", optional = true }
2321
once_cell = "1.19.0"
24-
serde = "1.0"
22+
serde = "1.0.192"
2523
slice-group-by = "0.3.1"
2624
whatlang = "0.16.4"
27-
lindera = { version = "=0.31.0", default-features = false, optional = true }
25+
lindera = { version = "=0.32.2", default-features = false, optional = true }
2826
pinyin = { version = "0.10", default-features = false, features = [
2927
"with_tone",
3028
], optional = true }
3129
wana_kana = { version = "3.0.0", optional = true }
32-
unicode-normalization = "0.1.22"
33-
irg-kvariants = "0.1.0"
34-
litemap = "0.7.2"
35-
zerovec = "0.10.1"
30+
unicode-normalization = "0.1.23"
31+
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
3632

3733
[features]
38-
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "russian"]
34+
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]
3935

4036
# allow chinese specialized tokenization
4137
chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -78,6 +74,9 @@ latin-snakecase = ["dep:finl_unicode"]
7874
# force Charabia to recompose Swedish characters
7975
swedish-recomposition = []
8076

77+
# allow turkish specialized tokenization
78+
turkish = []
79+
8180
[dev-dependencies]
8281
criterion = "0.5"
8382
jemallocator = "0.5.4"

charabia/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
1616

1717
| Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
1818
|---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
19-
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
19+
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
2020
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
2121
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
2222
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |

charabia/src/detection/mod.rs

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
use std::collections::HashMap;
2-
31
pub use script_language::{Language, Script};
42
use whatlang::Detector;
53

@@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
1210
inner: &'o str,
1311
pub script: Option<Script>,
1412
pub language: Option<Language>,
15-
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
13+
allow_list: Option<&'al [Language]>,
1614
}
1715

1816
impl<'o, 'al> StrDetection<'o, 'al> {
19-
pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
17+
pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
2018
Self { inner, script: None, language: None, allow_list }
2119
}
2220

@@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
2523
*self.script.get_or_insert_with(|| Self::detect_script(inner))
2624
}
2725

28-
pub fn language(&mut self) -> Language {
26+
pub fn language(&mut self) -> Option<Language> {
2927
let inner = self.inner;
30-
let script = self.script();
31-
*self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
28+
self.language = match self.language.take() {
29+
Some(lang) => Some(lang),
30+
None => Self::detect_lang(inner, self.allow_list),
31+
};
32+
33+
self.language
3234
}
3335

3436
/// detect script with whatlang,
@@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {
3941

4042
/// detect lang with whatlang
4143
/// if no language is detected, return Language::Other
42-
fn detect_lang(
43-
text: &str,
44-
script: Script,
45-
allow_list: Option<&HashMap<Script, Vec<Language>>>,
46-
) -> Language {
44+
fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
4745
let detector = allow_list
48-
.and_then(|allow_list| allow_list.get(&script))
4946
.map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
5047
.map(Detector::with_allowlist)
5148
.unwrap_or_default();
5249

53-
detector.detect_lang(text).map(Language::from).unwrap_or_default()
50+
detector.detect_lang(text).map(Language::from)
5451
}
5552
}
5653

5754
pub trait Detect<'o, 'al> {
58-
fn detect(
59-
&'o self,
60-
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
61-
) -> StrDetection<'o, 'al>;
55+
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
6256
}
6357

6458
impl<'o, 'al> Detect<'o, 'al> for &str {
65-
fn detect(
66-
&'o self,
67-
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
68-
) -> StrDetection<'o, 'al> {
59+
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
6960
StrDetection::new(self, allow_list)
7061
}
7162
}

charabia/src/detection/script_language.rs

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@ use core::str::FromStr;
22

33
#[cfg(test)]
44
use quickcheck::{Arbitrary, Gen};
5+
use serde::{Deserialize, Serialize};
56

67
use super::chars;
78

89
macro_rules! make_language {
910
($($language:tt), +) => {
10-
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
11+
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
1112
pub enum Language {
1213
$($language),+,
13-
Other,
1414
}
1515
impl From<whatlang::Lang> for Language {
1616
fn from(other: whatlang::Lang) -> Language {
@@ -24,27 +24,19 @@ macro_rules! make_language {
2424
fn from(other: Language) -> whatlang::Lang {
2525
match other {
2626
$(Language::$language => whatlang::Lang::$language), +,
27-
_other => whatlang::Lang::Eng,
2827
}
2928
}
3029
}
3130

32-
impl Default for Language {
33-
fn default() -> Self {
34-
Self::Other
35-
}
36-
}
37-
3831
impl Language {
39-
pub fn name(&self) -> &'static str {
32+
pub fn code(&self) -> &'static str {
4033
match self {
4134
$(Language::$language => whatlang::Lang::$language.code()), +,
42-
_other => "other",
4335
}
4436
}
4537

46-
pub fn from_name<S: AsRef<str>>(code: S) -> Language {
47-
whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
38+
pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
39+
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
4840
}
4941
}
5042
};
@@ -124,7 +116,7 @@ make_language! {
124116

125117
macro_rules! make_script {
126118
($($script:tt), +) => {
127-
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
119+
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
128120
pub enum Script {
129121
$($script),+,
130122
Cj,
@@ -361,12 +353,12 @@ mod test {
361353

362354
#[test]
363355
fn from_into_language() {
364-
assert_eq!(Language::Eng.name(), "eng");
365-
assert_eq!(Language::from_name("eng"), Language::Eng);
366-
assert_eq!(Language::Jpn.name(), "jpn");
367-
assert_eq!(Language::from_name("jpn"), Language::Jpn);
368-
assert_eq!(Language::Cmn.name(), "cmn");
369-
assert_eq!(Language::from_name("cmn"), Language::Cmn);
356+
assert_eq!(Language::Eng.code(), "eng");
357+
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
358+
assert_eq!(Language::Jpn.code(), "jpn");
359+
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
360+
assert_eq!(Language::Cmn.code(), "cmn");
361+
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
370362
}
371363

372364
#[test]

charabia/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ mod detection;
5656
mod token;
5757
mod tokenizer;
5858

59-
pub use detection::{Language, Script};
59+
pub use detection::{Language, Script, StrDetection};
6060
pub use normalizer::Normalize;
6161
pub use segmenter::Segment;
6262
pub use token::{SeparatorKind, Token, TokenKind};

charabia/src/normalizer/arabic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use crate::{Script, Token};
88
/// - normalizing the arabic Alef 'أ','إ','آ','ٱ' to 'ا'
99
/// - normalizing the arabic Yeh 'ى' to 'ي'
1010
/// - Normalizing the arabic Taa Marbuta 'ة' to 'ه'
11-
/// https://en.wikipedia.org/wiki/Arabic_alphabet
12-
/// https://en.wikipedia.org/wiki/Kashida
11+
/// https://en.wikipedia.org/wiki/Arabic_alphabet
12+
/// https://en.wikipedia.org/wiki/Kashida
1313
1414
pub struct ArabicNormalizer;
1515

charabia/src/normalizer/mod.rs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ use self::quote::QuoteNormalizer;
1919
pub use self::russian::RussianNormalizer;
2020
#[cfg(feature = "swedish-recomposition")]
2121
use self::swedish_recomposition::SwedishRecompositionNormalizer;
22+
#[cfg(feature = "turkish")]
23+
pub use self::turkish::TurkishNormalizer;
2224
#[cfg(feature = "vietnamese")]
2325
pub use self::vietnamese::VietnameseNormalizer;
2426
use crate::segmenter::SegmentedTokenIter;
@@ -43,6 +45,8 @@ mod quote;
4345
mod russian;
4446
#[cfg(feature = "swedish-recomposition")]
4547
mod swedish_recomposition;
48+
#[cfg(feature = "turkish")]
49+
mod turkish;
4650
#[cfg(feature = "vietnamese")]
4751
mod vietnamese;
4852

@@ -77,6 +81,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
7781
Box::new(VietnameseNormalizer),
7882
#[cfg(feature = "russian")]
7983
Box::new(RussianNormalizer),
84+
#[cfg(feature = "turkish")]
85+
Box::new(TurkishNormalizer),
8086
]
8187
});
8288

@@ -87,12 +93,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
8793
};
8894

8995
/// Iterator over Normalized [`Token`]s.
90-
pub struct NormalizedTokenIter<'o, 'tb> {
91-
token_iter: SegmentedTokenIter<'o, 'tb>,
96+
pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
97+
token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
9298
options: &'tb NormalizerOption<'tb>,
9399
}
94100

95-
impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
101+
impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
96102
type Item = Token<'o>;
97103

98104
fn next(&mut self) -> Option<Self::Item> {
@@ -238,11 +244,14 @@ impl From<String> for CharOrStr {
238244
}
239245
}
240246

241-
impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
247+
impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
242248
/// Normalize [`Token`]s using all the compatible Normalizers.
243249
///
244250
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
245-
pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
251+
pub fn normalize<'tb>(
252+
self,
253+
options: &'tb NormalizerOption<'tb>,
254+
) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
246255
NormalizedTokenIter { token_iter: self, options }
247256
}
248257
}

charabia/src/normalizer/swedish_recomposition.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ use once_cell::sync::Lazy;
55

66
use super::Normalizer;
77
use crate::normalizer::NormalizerOption;
8-
use crate::{Script, Token};
8+
use crate::{Language, Token};
99

1010
static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
11-
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
11+
AhoCorasick::new(["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
1212
.unwrap()
1313
});
1414

@@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {
7777

7878
// Returns `true` if the Normalizer should be used.
7979
fn should_normalize(&self, token: &Token) -> bool {
80-
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
80+
token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
8181
}
8282
}
8383

@@ -101,6 +101,7 @@ mod test {
101101
use crate::normalizer::test::test_normalizer;
102102
use crate::normalizer::Normalizer;
103103
use crate::token::TokenKind;
104+
use crate::Script;
104105

105106
// base tokens to normalize.
106107
fn tokens() -> Vec<Token<'static>> {
@@ -109,6 +110,7 @@ mod test {
109110
char_end: 13,
110111
byte_end: 19,
111112
script: Script::Latin,
113+
language: Some(Language::Swe),
112114
..Default::default()
113115
}]
114116
}
@@ -121,6 +123,7 @@ mod test {
121123
char_end: 13,
122124
byte_end: 19,
123125
script: Script::Latin,
126+
language: Some(Language::Swe),
124127
..Default::default()
125128
}]
126129
}
@@ -148,6 +151,7 @@ mod test {
148151
]),
149152
script: Script::Latin,
150153
kind: TokenKind::Word,
154+
language: Some(Language::Swe),
151155
..Default::default()
152156
}]
153157
}

0 commit comments

Comments (0)