Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions jieba/src/keywords/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use crate::Jieba;

use std::collections::BTreeSet;
use std::sync::LazyLock;

#[cfg(feature = "textrank")]
pub mod textrank;
#[cfg(feature = "tfidf")]
pub mod tfidf;

pub static DEFAULT_STOP_WORDS: LazyLock<BTreeSet<String>> = LazyLock::new(|| {
pub fn default_stop_words() -> BTreeSet<String> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we always clone this value, so we probably should not expose it as global static state.

BTreeSet::from_iter(
[
"the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with",
Expand All @@ -17,7 +16,7 @@ pub static DEFAULT_STOP_WORDS: LazyLock<BTreeSet<String>> = LazyLock::new(|| {
.into_iter()
.map(ToString::to_string),
)
});
}

/// Keyword with weight.
#[derive(Debug, Clone, PartialEq)]
Expand Down Expand Up @@ -107,7 +106,7 @@ pub struct KeywordExtractConfigBuilder {
impl Default for KeywordExtractConfigBuilder {
fn default() -> Self {
KeywordExtractConfigBuilder {
stop_words: DEFAULT_STOP_WORDS.clone(),
stop_words: default_stop_words(),
min_keyword_length: 2,
use_hmm: false,
}
Expand Down
154 changes: 76 additions & 78 deletions jieba/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,13 @@
//! ```
//!

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fmt;
use std::io::BufRead;

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};
use std::sync::LazyLock;

pub(crate) type FxHashMap<K, V> = HashMap<K, V, rustc_hash::FxBuildHasher>;

Expand All @@ -87,7 +87,7 @@ pub use crate::keywords::textrank::TextRank;
#[cfg(feature = "tfidf")]
pub use crate::keywords::tfidf::TfIdf;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{DEFAULT_STOP_WORDS, Keyword, KeywordExtract, KeywordExtractConfig};
pub use crate::keywords::{Keyword, KeywordExtract, KeywordExtractConfig, default_stop_words};

mod errors;
mod hmm;
Expand All @@ -100,11 +100,16 @@ include_flate::flate!(static DEFAULT_DICT: str from "src/data/dict.txt");

use sparse_dag::StaticSparseDAG;

// Lazily-compiled default segmentation pattern: matches runs of CJK ideographs
// (the BMP Han blocks plus the Extension-B..F and Compatibility Supplement
// ranges U+20000..U+2FA1F) together with ASCII alphanumerics and the in-word
// symbols +#&._%- so mixed Chinese/Latin/numeric tokens form a single block.
static RE_HAN_DEFAULT: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&._%\-]+)").unwrap()
});
// Separator pattern for default mode: CRLF pairs or any single whitespace char.
static RE_SKIP_DEFAULT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(\r\n|\s)").unwrap());
// cut_all variant of the Han pattern: the same CJK ideograph ranges only,
// without the ASCII alphanumerics/symbols accepted by RE_HAN_DEFAULT.
static RE_HAN_CUT_ALL: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap()
});
// cut_all skip set: any char that is not ASCII alphanumeric, '+', '#', or '\n'.
static RE_SKIP_CUT_ALL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap());

// NOTE(review): this thread_local! block is the pre-change version shown by the
// diff rendering — the regex entries duplicate the LazyLock statics declared
// above, and in the merged source only one of the two sets should exist.
thread_local! {
// Per-thread compiled copies of the segmentation regexes (see the comments on
// the pattern semantics where the LazyLock equivalents are declared).
static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
// Per-thread reusable HMM scratch state, borrowed mutably during cuts so each
// call avoids re-allocating the context; RefCell gives single-thread interior
// mutability (each thread owns its own copy).
static HMM_CONTEXT: std::cell::RefCell<hmm::HmmContext> = std::cell::RefCell::new(hmm::HmmContext::default());
}

Expand Down Expand Up @@ -639,61 +644,57 @@ impl Jieba {
let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };

re_han.with(|re_han| {
re_skip.with(|re_skip| {
let heuristic_capacity = sentence.len() / 2;
let mut words = Vec::with_capacity(heuristic_capacity);

let splitter = SplitMatches::new(re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.as_str();
assert!(!block.is_empty());

if cut_all {
self.cut_all_internal(block, &mut words);
} else if hmm {
HMM_CONTEXT.with(|ctx| {
let mut hmm_context = ctx.borrow_mut();
self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
});
} else {
self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
}
let heuristic_capacity = sentence.len() / 2;
let mut words = Vec::with_capacity(heuristic_capacity);

let splitter = SplitMatches::new(re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.as_str();
assert!(!block.is_empty());

if cut_all {
self.cut_all_internal(block, &mut words);
} else if hmm {
HMM_CONTEXT.with(|ctx| {
let mut hmm_context = ctx.borrow_mut();
self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
});
} else {
self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
}
}
SplitState::Unmatched(_) => {
let block = state.as_str();
assert!(!block.is_empty());

let skip_splitter = SplitMatches::new(re_skip, block);
for skip_state in skip_splitter {
let word = skip_state.as_str();
if word.is_empty() {
continue;
}
SplitState::Unmatched(_) => {
let block = state.as_str();
assert!(!block.is_empty());

let skip_splitter = SplitMatches::new(re_skip, block);
for skip_state in skip_splitter {
let word = skip_state.as_str();
if word.is_empty() {
continue;
}
if cut_all || skip_state.is_matched() {
words.push(word);
if cut_all || skip_state.is_matched() {
words.push(word);
} else {
let mut word_indices = word.char_indices().map(|x| x.0).peekable();
while let Some(byte_start) = word_indices.next() {
if let Some(byte_end) = word_indices.peek() {
words.push(&word[byte_start..*byte_end]);
} else {
let mut word_indices = word.char_indices().map(|x| x.0).peekable();
while let Some(byte_start) = word_indices.next() {
if let Some(byte_end) = word_indices.peek() {
words.push(&word[byte_start..*byte_end]);
} else {
words.push(&word[byte_start..]);
}
}
words.push(&word[byte_start..]);
}
}
}
}
}
words
})
})
}
}
words
}

/// Cut the input text
Expand Down Expand Up @@ -894,34 +895,31 @@ mod tests {

#[test]
fn test_split_matches() {
RE_HAN_DEFAULT.with(|re_han| {
let splitter = SplitMatches::new(
re_han,
"👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
);
for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.as_str();
assert!(!block.is_empty());
}
SplitState::Unmatched(_) => {
let block = state.as_str();
assert!(!block.is_empty());
}
let re_han = &RE_HAN_DEFAULT;
let splitter = SplitMatches::new(
re_han,
"👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
);
for state in splitter {
match state {
SplitState::Matched(_) => {
let block = state.as_str();
assert!(!block.is_empty());
}
SplitState::Unmatched(_) => {
let block = state.as_str();
assert!(!block.is_empty());
}
}
});
}
}

#[test]
fn test_split_matches_against_unicode_sip() {
RE_HAN_DEFAULT.with(|re_han| {
let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
});
let re_han = &RE_HAN_DEFAULT;
let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");
let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
}

#[test]
Expand Down