Skip to content

Commit bfafa21

Browse files
committed
feat: autolearn new phrases both with and without break words
1 parent 9847abe commit bfafa21

File tree

3 files changed

+199
-46
lines changed

3 files changed

+199
-46
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ What's New in libchewing (unreleased)
3434
- Dictionary data are built directly from the libchewing-data repository
3535
- cli: init-database will no longer normalize the frequency of single word
3636
phrase to zero.
37+
- autolearn: new phrases with break words inside will be learned both with
38+
and without the break word.
3739

3840
* Dictionary
3941
- Default symbols.dat now includes commonly used emojis.

src/editor/mod.rs

Lines changed: 194 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ use std::{
1111
cmp::{max, min},
1212
error::Error,
1313
fmt::{Debug, Display},
14+
mem,
1415
};
1516

1617
pub use self::{abbrev::AbbrevTable, selection::symbol::SymbolSelector};
1718
pub use estimate::{LaxUserFreqEstimate, UserFreqEstimate};
18-
use log::{debug, info, trace, warn};
19+
use log::{debug, error, info, trace, warn};
1920

2021
use crate::{
2122
conversion::{
@@ -700,40 +701,11 @@ impl SharedState {
700701
self.last_key_behavior = EditorKeyBehavior::Commit;
701702
}
702703
fn auto_learn(&mut self, intervals: &[Interval]) {
703-
debug!("intervals {:?}", intervals);
704-
let mut pending = String::new();
705-
let mut syllables = Vec::new();
706-
for interval in intervals {
707-
if interval.is_phrase && interval.len() == 1 && !is_break_word(&interval.str) {
708-
pending.push_str(&interval.str);
709-
syllables.extend_from_slice(&self.com.symbols()[interval.start..interval.end]);
710-
} else {
711-
if !pending.is_empty() {
712-
debug!("autolearn-2 {:?} as {}", &syllables, &pending);
713-
let _ = self.learn_phrase(&syllables, &pending);
714-
pending.clear();
715-
syllables.clear();
716-
}
717-
if interval.is_phrase {
718-
debug!(
719-
"autolearn-3 {:?} as {}",
720-
&self.com.symbols()[interval.start..interval.end],
721-
&interval.str
722-
);
723-
// FIXME avoid copy
724-
let _ = self.learn_phrase(
725-
&self.com.symbols()[interval.start..interval.end].to_vec(),
726-
&interval.str,
727-
);
728-
}
704+
for (symbols, phrase) in collect_new_phrases(intervals, self.com.symbols()) {
705+
if let Err(error) = self.learn_phrase(&symbols, &phrase) {
706+
error!("Failed to learn phrase {phrase} from {symbols:?}: {error:#}");
729707
}
730708
}
731-
if !pending.is_empty() {
732-
debug!("autolearn-1 {:?} as {}", &syllables, &pending);
733-
let _ = self.learn_phrase(&syllables, &pending);
734-
pending.clear();
735-
syllables.clear();
736-
}
737709
}
738710
}
739711

@@ -751,6 +723,50 @@ fn is_break_word(word: &str) -> bool {
751723
"路", "村", "在"].contains(&word)
752724
}
753725

726+
fn collect_new_phrases(intervals: &[Interval], symbols: &[Symbol]) -> Vec<(Vec<Symbol>, String)> {
727+
debug!("intervals {:?}", intervals);
728+
let mut pending = String::new();
729+
let mut syllables = Vec::new();
730+
let mut phrases = vec![];
731+
let mut collect = |syllables, pending| {
732+
if phrases.iter().find(|(_, p)| p == &pending).is_none() {
733+
debug!("autolearn {:?} as {}", &syllables, &pending);
734+
phrases.push((syllables, pending))
735+
}
736+
};
737+
// Step 1. collect all intervals
738+
for interval in intervals.iter().filter(|it| it.is_phrase) {
739+
let syllables = symbols[interval.start..interval.end].to_vec();
740+
let pending = interval.str.clone().into_string();
741+
collect(syllables, pending);
742+
}
743+
// Step 2. collect all intervals with length one with break words removed
744+
for interval in intervals.iter() {
745+
if interval.is_phrase && interval.len() == 1 && !is_break_word(&interval.str) {
746+
pending.push_str(&interval.str);
747+
syllables.extend_from_slice(&symbols[interval.start..interval.end]);
748+
} else if !pending.is_empty() {
749+
collect(mem::take(&mut syllables), mem::take(&mut pending));
750+
}
751+
}
752+
if !pending.is_empty() {
753+
collect(mem::take(&mut syllables), mem::take(&mut pending));
754+
}
755+
// Step 3. collect all intervals with length one including break words
756+
for interval in intervals {
757+
if interval.is_phrase && interval.len() == 1 {
758+
pending.push_str(&interval.str);
759+
syllables.extend_from_slice(&symbols[interval.start..interval.end]);
760+
} else if !pending.is_empty() {
761+
collect(mem::take(&mut syllables), mem::take(&mut pending));
762+
}
763+
}
764+
if !pending.is_empty() {
765+
collect(syllables, pending);
766+
}
767+
phrases
768+
}
769+
754770
impl BasicEditor for Editor {
755771
fn process_keyevent(&mut self, key_event: KeyboardEvent) -> EditorKeyBehavior {
756772
debug!("process_keyevent: {:?}", key_event);
@@ -1557,7 +1573,7 @@ mod tests {
15571573
use estimate::LaxUserFreqEstimate;
15581574

15591575
use crate::{
1560-
conversion::ChewingEngine,
1576+
conversion::{ChewingEngine, Interval, Symbol},
15611577
dictionary::{Layered, TrieBuf},
15621578
editor::{EditorKeyBehavior, EditorOptions, SymbolSelector, abbrev::AbbrevTable, estimate},
15631579
input::{
@@ -1566,9 +1582,10 @@ mod tests {
15661582
keysym,
15671583
},
15681584
syl,
1569-
zhuyin::Bopomofo,
1585+
zhuyin::Bopomofo as bpmf,
15701586
};
15711587

1588+
use super::collect_new_phrases;
15721589
use super::{BasicEditor, Editor};
15731590

15741591
const CAPSLOCK_EVENT: KeyboardEvent = KeyboardEvent::builder()
@@ -1597,7 +1614,7 @@ mod tests {
15971614
let key_behavior = editor.process_keyevent(ev);
15981615

15991616
assert_eq!(EditorKeyBehavior::Absorb, key_behavior);
1600-
assert_eq!(syl![Bopomofo::C], editor.syllable_buffer());
1617+
assert_eq!(syl![bpmf::C], editor.syllable_buffer());
16011618

16021619
let ev = KeyboardEvent {
16031620
code: keycode::KEY_K,
@@ -1607,13 +1624,13 @@ mod tests {
16071624
let key_behavior = editor.process_keyevent(ev);
16081625

16091626
assert_eq!(EditorKeyBehavior::Absorb, key_behavior);
1610-
assert_eq!(syl![Bopomofo::C, Bopomofo::E], editor.syllable_buffer());
1627+
assert_eq!(syl![bpmf::C, bpmf::E], editor.syllable_buffer());
16111628
}
16121629

16131630
#[test]
16141631
fn editing_mode_input_bopomofo_commit() {
16151632
let dict = TrieBuf::from([(
1616-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1633+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
16171634
vec![("冊", 100)],
16181635
)]);
16191636
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1645,7 +1662,7 @@ mod tests {
16451662
#[test]
16461663
fn editing_mode_input_bopomofo_select() {
16471664
let dict = TrieBuf::from([(
1648-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1665+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
16491666
vec![("冊", 100), ("測", 200)],
16501667
)]);
16511668
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1693,7 +1710,7 @@ mod tests {
16931710
#[test]
16941711
fn editing_mode_input_bopomofo_select_sorted() {
16951712
let dict = TrieBuf::from([(
1696-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1713+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
16971714
vec![("冊", 100), ("測", 200)],
16981715
)]);
16991716
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1741,7 +1758,7 @@ mod tests {
17411758
#[test]
17421759
fn editing_mode_input_chinese_to_english_mode() {
17431760
let dict = TrieBuf::from([(
1744-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1761+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
17451762
vec![("冊", 100)],
17461763
)]);
17471764
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1782,7 +1799,7 @@ mod tests {
17821799
#[test]
17831800
fn editing_mode_input_english_to_chinese_mode() {
17841801
let dict = TrieBuf::from([(
1785-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1802+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
17861803
vec![("冊", 100)],
17871804
)]);
17881805
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1839,7 +1856,7 @@ mod tests {
18391856
#[test]
18401857
fn editing_chinese_mode_input_special_symbol() {
18411858
let dict = TrieBuf::from([(
1842-
vec![crate::syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
1859+
vec![crate::syl![bpmf::C, bpmf::E, bpmf::TONE4]],
18431860
vec![("冊", 100)],
18441861
)]);
18451862
let dict = Layered::new(vec![Box::new(dict)], Box::new(TrieBuf::new_in_memory()));
@@ -1934,4 +1951,138 @@ mod tests {
19341951
assert_eq!(EditorKeyBehavior::Bell, key_behavior);
19351952
assert_eq!(syl![], editor.syllable_buffer());
19361953
}
1954+
1955+
#[test]
fn collect_new_phrases_with_no_break_word() {
    // Six symbols committed as three adjacent two-symbol phrases; the expected
    // result is exactly those three phrases, in commit order.
    let jin = Symbol::Syllable(syl![bpmf::J, bpmf::I, bpmf::EN]);
    let tian = Symbol::Syllable(syl![bpmf::T, bpmf::I, bpmf::AN]);
    let qi = Symbol::Syllable(syl![bpmf::Q, bpmf::I, bpmf::TONE4]);
    let zhen = Symbol::Syllable(syl![bpmf::ZH, bpmf::EN]);
    let hao = Symbol::Syllable(syl![bpmf::H, bpmf::AU, bpmf::TONE3]);

    let symbols = [
        jin.clone(),
        tian.clone(),
        tian.clone(),
        qi.clone(),
        zhen.clone(),
        hao.clone(),
    ];
    let phrase = |start: usize, end: usize, text: &str| Interval {
        start,
        end,
        is_phrase: true,
        str: text.into(),
    };
    let intervals = [
        phrase(0, 2, "今天"),
        phrase(2, 4, "天氣"),
        phrase(4, 6, "真好"),
    ];

    let expected = vec![
        (vec![jin, tian.clone()], "今天".to_string()),
        (vec![tian, qi], "天氣".to_string()),
        (vec![zhen, hao], "真好".to_string()),
    ];
    assert_eq!(expected, collect_new_phrases(&intervals, &symbols));
}
2013+
2014+
#[test]
fn collect_new_phrases_with_break_word() {
    // Two single-symbol intervals sit between a two-symbol and a three-symbol
    // phrase. The four committed intervals are expected first, followed by the
    // two single-symbol intervals joined into one phrase.
    let jin = Symbol::Syllable(syl![bpmf::J, bpmf::I, bpmf::EN]);
    let tian = Symbol::Syllable(syl![bpmf::T, bpmf::I, bpmf::AN]);
    let ye = Symbol::Syllable(syl![bpmf::I, bpmf::EH, bpmf::TONE3]);
    let shi = Symbol::Syllable(syl![bpmf::SH, bpmf::TONE4]);
    let hao = Symbol::Syllable(syl![bpmf::H, bpmf::AU, bpmf::TONE3]);
    let qi = Symbol::Syllable(syl![bpmf::Q, bpmf::I, bpmf::TONE4]);

    let symbols = [
        jin.clone(),
        tian.clone(),
        ye.clone(),
        shi.clone(),
        hao.clone(),
        tian.clone(),
        qi.clone(),
    ];
    let phrase = |start: usize, end: usize, text: &str| Interval {
        start,
        end,
        is_phrase: true,
        str: text.into(),
    };
    let intervals = [
        phrase(0, 2, "今天"),
        phrase(2, 3, "也"),
        phrase(3, 4, "是"),
        phrase(4, 7, "好天氣"),
    ];

    let expected = vec![
        (vec![jin, tian.clone()], "今天".to_string()),
        (vec![ye.clone()], "也".to_string()),
        (vec![shi.clone()], "是".to_string()),
        (vec![hao, tian, qi], "好天氣".to_string()),
        (vec![ye, shi], "也是".to_string()),
    ];
    assert_eq!(expected, collect_new_phrases(&intervals, &symbols));
}
19372088
}

tests/test-userphrase.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ void test_CtrlNum_add_phrase_right_start_with_symbol()
264264
ok(strcmp(const_buf, phrase) == 0, "first candidate `%s' shall be `%s'", const_buf, phrase);
265265

266266
chewing_delete(ctx);
267-
}
267+
}
268268

269269
void test_CtrlNum_add_phrase_left_start_with_symbol()
270270
{
@@ -383,7 +383,7 @@ void test_userphrase_auto_learn_hardcode_break()
383383
ok(has_userphrase(ctx, bopomofo, phrase) == 0, "`%s' shall not be in userphrase", phrase);
384384

385385
type_keystroke_by_string(ctx, "2k72k7<E>");
386-
ok(has_userphrase(ctx, bopomofo, phrase) == 0, "`%s' shall not be in userphrase", phrase);
386+
ok(has_userphrase(ctx, bopomofo, phrase) == 1, "`%s' shall be in userphrase", phrase);
387387

388388
chewing_delete(ctx);
389389
}
@@ -415,7 +415,7 @@ void test_userphrase_auto_learn_only_after_commit()
415415
/* user selects another candidate and commits: auto learn phrase(s), but not the selected candidate. */
416416
type_keystroke_by_string(ctx, "<L><D>2<E>");
417417
ok(has_userphrase(ctx, bopomofo_1, NULL) == 1, "`%s' shall be in userphrase", bopomofo_1);
418-
ok(has_userphrase(ctx, bopomofo_2, NULL) == 0, "`%s' shall not be in userphrase", bopomofo_2);
418+
ok(has_userphrase(ctx, bopomofo_2, NULL) == 1, "`%s' shall be in userphrase", bopomofo_2);
419419

420420
chewing_delete(ctx);
421421
}

0 commit comments

Comments
 (0)