|
1 | 1 | use hashbrown::HashMap; |
2 | 2 |
|
| 3 | +use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE}; |
| 4 | +use info::Info; |
3 | 5 | use lang::*; |
| 6 | +use options::{List, Options}; |
4 | 7 | use script::*; |
5 | 8 | use trigrams::*; |
6 | | -use info::Info; |
7 | | -use options::{Options, List}; |
8 | | -use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE}; |
9 | 9 |
|
10 | 10 | /// Detect a language and a script by a given text. |
11 | 11 | /// |
@@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> { |
39 | 39 |
|
40 | 40 | pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> { |
41 | 41 | detect_script(text).and_then(|script| { |
42 | | - detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| { |
43 | | - Info { lang, script, confidence } |
| 42 | + detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info { |
| 43 | + lang, |
| 44 | + script, |
| 45 | + confidence, |
44 | 46 | }) |
45 | 47 | }) |
46 | 48 | } |
47 | 49 |
|
48 | | -fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> { |
| 50 | +fn detect_lang_based_on_script( |
| 51 | + text: &str, |
| 52 | + options: &Options, |
| 53 | + script: Script, |
| 54 | +) -> Option<(Lang, f64)> { |
49 | 55 | match script { |
50 | | - Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), |
51 | | - Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), |
| 56 | + Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS), |
| 57 | + Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS), |
52 | 58 | Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS), |
53 | | - Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), |
54 | | - Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), |
55 | | - Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), |
56 | | - Script::Mandarin => Some((Lang::Cmn, 1.0)), |
57 | | - Script::Bengali => Some((Lang::Ben, 1.0)), |
58 | | - Script::Hangul => Some((Lang::Kor, 1.0)), |
59 | | - Script::Georgian => Some((Lang::Kat, 1.0)), |
60 | | - Script::Greek => Some((Lang::Ell, 1.0)), |
61 | | - Script::Kannada => Some((Lang::Kan, 1.0)), |
62 | | - Script::Tamil => Some((Lang::Tam, 1.0)), |
63 | | - Script::Thai => Some((Lang::Tha, 1.0)), |
64 | | - Script::Gujarati => Some((Lang::Guj, 1.0)), |
65 | | - Script::Gurmukhi => Some((Lang::Pan, 1.0)), |
66 | | - Script::Telugu => Some((Lang::Tel, 1.0)), |
| 59 | + Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS), |
| 60 | + Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS), |
| 61 | + Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS), |
| 62 | + Script::Mandarin => Some((Lang::Cmn, 1.0)), |
| 63 | + Script::Bengali => Some((Lang::Ben, 1.0)), |
| 64 | + Script::Hangul => Some((Lang::Kor, 1.0)), |
| 65 | + Script::Georgian => Some((Lang::Kat, 1.0)), |
| 66 | + Script::Greek => Some((Lang::Ell, 1.0)), |
| 67 | + Script::Kannada => Some((Lang::Kan, 1.0)), |
| 68 | + Script::Tamil => Some((Lang::Tam, 1.0)), |
| 69 | + Script::Thai => Some((Lang::Tha, 1.0)), |
| 70 | + Script::Gujarati => Some((Lang::Guj, 1.0)), |
| 71 | + Script::Gurmukhi => Some((Lang::Pan, 1.0)), |
| 72 | + Script::Telugu => Some((Lang::Tel, 1.0)), |
67 | 73 | Script::Malayalam => Some((Lang::Mal, 1.0)), |
68 | | - Script::Oriya => Some((Lang::Ori, 1.0)), |
69 | | - Script::Myanmar => Some((Lang::Mya, 1.0)), |
70 | | - Script::Sinhala => Some((Lang::Sin, 1.0)), |
71 | | - Script::Khmer => Some((Lang::Khm, 1.0)), |
72 | | - Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)) |
| 74 | + Script::Oriya => Some((Lang::Ori, 1.0)), |
| 75 | + Script::Myanmar => Some((Lang::Mya, 1.0)), |
| 76 | + Script::Sinhala => Some((Lang::Sin, 1.0)), |
| 77 | + Script::Khmer => Some((Lang::Khm, 1.0)), |
| 78 | + Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)), |
73 | 79 | } |
74 | 80 | } |
75 | 81 |
|
76 | | -fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> { |
77 | | - let mut lang_distances : Vec<(Lang, u32)> = vec![]; |
| 82 | +fn detect_lang_in_profiles( |
| 83 | + text: &str, |
| 84 | + options: &Options, |
| 85 | + lang_profile_list: LangProfileList, |
| 86 | +) -> Option<(Lang, f64)> { |
| 87 | + let mut lang_distances: Vec<(Lang, u32)> = vec![]; |
78 | 88 | let trigrams = get_trigrams_with_positions(text); |
79 | 89 |
|
80 | 90 | for &(ref lang, lang_trigrams) in lang_profile_list { |
81 | 91 | match options.list { |
82 | 92 | Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue, |
83 | 93 | Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue, |
84 | | - _ => {}, |
| 94 | + _ => {} |
85 | 95 | } |
86 | 96 | let dist = calculate_distance(lang_trigrams, &trigrams); |
87 | 97 | lang_distances.push(((*lang), dist)); |
88 | 98 | } |
89 | 99 |
|
90 | 100 | // Sort languages by distance |
91 | | - lang_distances.sort_by_key(|key| key.1 ); |
| 101 | + lang_distances.sort_by_key(|key| key.1); |
92 | 102 |
|
93 | 103 | // Return None if lang_distances is empty |
94 | 104 | // Return the only language with is_reliable=true if there is only 1 item |
@@ -117,37 +127,36 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La |
117 | 127 | // * Text really matches one of the languages. |
118 | 128 | // |
119 | 129 | // Number 500.0 is based on experiments and common sense expectations. |
120 | | - let mut confidence = (score1 as f64) / 500.0; |
| 130 | + let mut confidence = f64::from(score1) / 500.0; |
121 | 131 | if confidence > 1.0 { |
122 | 132 | confidence = 1.0; |
123 | 133 | } |
124 | 134 | return Some((lang_dist1.0, confidence)); |
125 | 135 | } |
126 | 136 |
|
127 | | - let rate = (score1 - score2) as f64 / (score2 as f64); |
| 137 | + let rate = f64::from(score1 - score2) / f64::from(score2); |
128 | 138 |
|
129 | 139 | // Hyperbola function. Everything that is above the function has confidence = 1.0 |
130 | 140 | // If rate is below, confidence is calculated proportionally. |
131 | 141 | // Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense. |
132 | 142 | // |
133 | 143 | let confident_rate = (12.0 / trigrams.len() as f64) + 0.05; |
134 | | - let confidence = |
135 | | - if rate > confident_rate { |
136 | | - 1.0 |
137 | | - } else { |
138 | | - rate / confident_rate |
139 | | - }; |
| 144 | + let confidence = if rate > confident_rate { |
| 145 | + 1.0 |
| 146 | + } else { |
| 147 | + rate / confident_rate |
| 148 | + }; |
140 | 149 |
|
141 | 150 | Some((lang_dist1.0, confidence)) |
142 | 151 | } |
143 | 152 |
|
144 | | -fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 { |
| 153 | +fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 { |
145 | 154 | let mut total_dist = 0u32; |
146 | 155 |
|
147 | 156 | for (i, &trigram) in lang_trigrams.iter().enumerate() { |
148 | 157 | let dist = match text_trigrams.get(trigram) { |
149 | 158 | Some(&n) => (n as i32 - i as i32).abs() as u32, |
150 | | - None => MAX_TRIGRAM_DISTANCE |
| 159 | + None => MAX_TRIGRAM_DISTANCE, |
151 | 160 | }; |
152 | 161 | total_dist += dist; |
153 | 162 | } |
@@ -186,7 +195,16 @@ mod tests { |
186 | 195 | assert_eq!(info.lang, Lang::Tgl); |
187 | 196 |
|
188 | 197 | // with blacklist |
189 | | - let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo]; |
| 198 | + let blacklist = vec![ |
| 199 | + Lang::Tgl, |
| 200 | + Lang::Jav, |
| 201 | + Lang::Nld, |
| 202 | + Lang::Uzb, |
| 203 | + Lang::Swe, |
| 204 | + Lang::Nob, |
| 205 | + Lang::Ceb, |
| 206 | + Lang::Ilo, |
| 207 | + ]; |
190 | 208 | let options = Options::new().set_blacklist(blacklist); |
191 | 209 | let output = detect_with_options(text, &options); |
192 | 210 | assert_eq!(output.is_some(), true); |
@@ -224,7 +242,9 @@ mod tests { |
224 | 242 | let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap(); |
225 | 243 | assert!(!info.is_reliable()); |
226 | 244 |
|
227 | | - let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap(); |
| 245 | + let info = |
| 246 | + detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm") |
| 247 | + .unwrap(); |
228 | 248 | assert!(!info.is_reliable()); |
229 | 249 |
|
230 | 250 | // 1000 chars of randomly generated Cyrillic text |
|
0 commit comments