Skip to content

Commit 9910e6b

Browse files
author
lukingcathy
authored
Upgrade to Rust 2024 edition, replace lazy_static and jemallocator (#129)
* Update edition to use workspace setting and set resolver * Replace `lazy_static` with `std::sync::LazyLock` * Add `unsafe` blocks to ensure proper handling of raw pointers and memory * Replace jemallocator with tikv-jemallocator * Run `cargo fmt`
1 parent 279869d commit 9910e6b

File tree

10 files changed

+97
-86
lines changed

10 files changed

+97
-86
lines changed

Cargo.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ keywords = ["nlp", "chinese", "segmenation"]
88
license = "MIT"
99
readme = "README.md"
1010
repository = "https://github.com/messense/jieba-rs"
11-
edition = '2021'
11+
edition.workspace = true
1212

1313
[package.metadata.docs.rs]
1414
all-features = true
@@ -19,7 +19,7 @@ wasm-bindgen-test = { workspace = true }
1919
rayon = { workspace = true }
2020

2121
[target.'cfg(unix)'.dev-dependencies]
22-
jemallocator = "0.5.0"
22+
tikv-jemallocator = "0.6.0"
2323

2424
[[bench]]
2525
name = "jieba_benchmark"
@@ -32,7 +32,6 @@ cedarwood = { workspace = true }
3232
derive_builder = { workspace = true, optional = true }
3333
fxhash = { workspace = true }
3434
include-flate = { workspace = true }
35-
lazy_static = { workspace = true }
3635
ordered-float = { workspace = true, optional = true }
3736
phf = { workspace = true }
3837
regex = { workspace = true }
@@ -44,8 +43,12 @@ tfidf = ["dep:ordered-float", "dep:derive_builder"]
4443
textrank = ["dep:ordered-float", "dep:derive_builder"]
4544

4645
[workspace]
46+
resolver = "3"
4747
members = [".", "capi", "jieba-macros", "examples/weicheng"]
4848

49+
[workspace.package]
50+
edition = "2024"
51+
4952
[workspace.dependencies]
5053
c_fixed_string = "0.2.0"
5154
cedarwood = "0.4"
@@ -54,7 +57,6 @@ derive_builder = "0.20.0"
5457
fxhash = "0.2.1"
5558
include-flate = "0.3.0"
5659
jieba-macros = { version = "0.7.1", path = "jieba-macros" }
57-
lazy_static = "1.0"
5860
ordered-float = "5.0"
5961
phf = "0.12.1"
6062
phf_codegen = "0.12.1"

benches/jieba_benchmark.rs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
1-
use codspeed_criterion_compat::{black_box, criterion_group, criterion_main, Criterion, Throughput};
1+
use codspeed_criterion_compat::{Criterion, Throughput, black_box, criterion_group, criterion_main};
22
use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode};
3-
use lazy_static::lazy_static;
43
use rayon::iter::{IntoParallelIterator, ParallelIterator};
4+
use std::sync::LazyLock;
55

66
#[cfg(unix)]
77
#[global_allocator]
8-
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
8+
static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
99

10-
lazy_static! {
11-
static ref JIEBA: Jieba = Jieba::new();
12-
static ref TFIDF_EXTRACTOR: TfIdf = TfIdf::default();
13-
static ref TEXTRANK_EXTRACTOR: TextRank = TextRank::default();
14-
}
10+
static JIEBA: LazyLock<Jieba> = LazyLock::new(Jieba::new);
11+
static TFIDF_EXTRACTOR: LazyLock<TfIdf> = LazyLock::new(TfIdf::default);
12+
static TEXTRANK_EXTRACTOR: LazyLock<TextRank> = LazyLock::new(TextRank::default);
1513
static SENTENCE: &str = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
1614

1715
fn criterion_benchmark(c: &mut Criterion) {

capi/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
name = "jieba-capi"
33
version = "0.1.0"
44
authors = ["messense <messense@icloud.com>"]
5-
edition = "2021"
5+
edition.workspace = true
66

77
[dependencies]
88
jieba-rs = { version = "0.7.0", path = "../", features = ["textrank", "tfidf"] }

capi/src/lib.rs

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,12 @@ impl FfiStr {
100100
/// Frees the underlying data. After this call, the internal pointer is invalid.
101101
pub unsafe fn free(&mut self) {
102102
if self.owned && !self.data.is_null() {
103-
String::from_raw_parts(self.data as *mut _, self.len, self.len);
104-
self.data = ptr::null_mut();
105-
self.len = 0;
106-
self.owned = false;
103+
unsafe {
104+
String::from_raw_parts(self.data as *mut _, self.len, self.len);
105+
self.data = ptr::null_mut();
106+
self.len = 0;
107+
self.owned = false;
108+
}
107109
}
108110
}
109111
}
@@ -123,28 +125,28 @@ impl Drop for FfiStr {
123125
///
124126
/// # Safety
125127
/// Used to release strings returned as results of function calls.
126-
#[no_mangle]
128+
#[unsafe(no_mangle)]
127129
pub unsafe extern "C" fn jieba_str_free(s: *mut FfiStr) {
128130
if !s.is_null() {
129-
(*s).free()
131+
unsafe { (*s).free() }
130132
}
131133
}
132134

133135
unsafe fn params_unwrap(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&Jieba, &CFixedStr) {
134-
let jieba = &(*(*cjieba_ref)).jieba;
135-
let c_str = CFixedStr::from_ptr(s, len);
136+
let jieba = unsafe { &(*(*cjieba_ref)).jieba };
137+
let c_str = unsafe { CFixedStr::from_ptr(s, len) };
136138
(jieba, c_str)
137139
}
138140

139141
unsafe fn params_unwrap_mut(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&mut Jieba, &CFixedStr) {
140-
let jieba = &mut (*(*cjieba_ref)).jieba;
141-
let c_str = CFixedStr::from_ptr(s, len);
142+
let jieba = unsafe { &mut (*(*cjieba_ref)).jieba };
143+
let c_str = unsafe { CFixedStr::from_ptr(s, len) };
142144
(jieba, c_str)
143145
}
144146

145147
/// # Safety
146148
/// Returned value must be freed by `jieba_free()`.
147-
#[no_mangle]
149+
#[unsafe(no_mangle)]
148150
pub extern "C" fn jieba_new() -> *mut CJieba {
149151
let cjieba = CJieba {
150152
jieba: Jieba::new(),
@@ -157,7 +159,7 @@ pub extern "C" fn jieba_new() -> *mut CJieba {
157159
///
158160
/// # Safety
159161
/// Returned value must be freed by `jieba_free()`.
160-
#[no_mangle]
162+
#[unsafe(no_mangle)]
161163
pub extern "C" fn jieba_empty() -> *mut CJieba {
162164
let cjieba = CJieba {
163165
jieba: Jieba::empty(),
@@ -168,23 +170,25 @@ pub extern "C" fn jieba_empty() -> *mut CJieba {
168170

169171
/// # Safety
170172
/// cjieba must be the result of a `jieba_new()` or `jieba_empty()` call.
171-
#[no_mangle]
173+
#[unsafe(no_mangle)]
172174
pub unsafe extern "C" fn jieba_free(cjieba: *mut CJieba) {
173175
if !cjieba.is_null() {
174-
drop(Box::from_raw(cjieba));
176+
unsafe {
177+
drop(Box::from_raw(cjieba));
178+
}
175179
}
176180
}
177181

178182
/// # Safety
179183
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
180-
#[no_mangle]
184+
#[unsafe(no_mangle)]
181185
pub unsafe extern "C" fn jieba_cut(
182186
cjieba: *mut CJieba,
183187
sentence: *const c_char,
184188
len: usize,
185189
hmm: bool,
186190
) -> *mut CJiebaWords {
187-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
191+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
188192
// FIXME: remove allocation
189193
let s = String::from_utf8_lossy(c_str.as_bytes_full());
190194
let words = jieba.cut(&s, hmm);
@@ -200,9 +204,9 @@ pub unsafe extern "C" fn jieba_cut(
200204

201205
/// # Safety
202206
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
203-
#[no_mangle]
207+
#[unsafe(no_mangle)]
204208
pub unsafe extern "C" fn jieba_cut_all(cjieba: *mut CJieba, sentence: *const c_char, len: usize) -> *mut CJiebaWords {
205-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
209+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
206210
// FIXME: remove allocation
207211
let s = String::from_utf8_lossy(c_str.as_bytes_full());
208212
let words = (*jieba).cut_all(&s);
@@ -218,14 +222,14 @@ pub unsafe extern "C" fn jieba_cut_all(cjieba: *mut CJieba, sentence: *const c_c
218222

219223
/// # Safety
220224
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
221-
#[no_mangle]
225+
#[unsafe(no_mangle)]
222226
pub unsafe extern "C" fn jieba_cut_for_search(
223227
cjieba: *mut CJieba,
224228
sentence: *const c_char,
225229
len: usize,
226230
hmm: bool,
227231
) -> *mut CJiebaWords {
228-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
232+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
229233
// FIXME: remove allocation
230234
let s = String::from_utf8_lossy(c_str.as_bytes_full());
231235
let words = (*jieba).cut_for_search(&s, hmm);
@@ -243,7 +247,7 @@ pub unsafe extern "C" fn jieba_cut_for_search(
243247
/// cjieba must be valid object from `jieba_new()` and must outlive the returned CJiebaTFIDF instance.
244248
///
245249
/// Returned value must be freed by `jieba_tfidf_free()`.
246-
#[no_mangle]
250+
#[unsafe(no_mangle)]
247251
pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF {
248252
let cjieba_tfidf = CJiebaTFIDF {
249253
cjieba,
@@ -255,18 +259,20 @@ pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF {
255259

256260
/// # Safety
257261
/// cjieba_tfidf must be the result of a `jieba_tfidf_new()` call.
258-
#[no_mangle]
262+
#[unsafe(no_mangle)]
259263
pub unsafe extern "C" fn jieba_tfidf_free(cjieba_tfidf: *mut CJiebaTFIDF) {
260264
if !cjieba_tfidf.is_null() {
261-
drop(Box::from_raw(cjieba_tfidf));
265+
unsafe {
266+
drop(Box::from_raw(cjieba_tfidf));
267+
}
262268
}
263269
}
264270

265271
/// # Safety
266272
/// cjieba_tfidf must be a valid object from `jieba_tfidf_new()`. `sentence` must point to a buffer of at least `len` bytes.
267273
///
268274
/// Returned value must be freed by `jieba_words_free()`.
269-
#[no_mangle]
275+
#[unsafe(no_mangle)]
270276
pub unsafe extern "C" fn jieba_tfidf_extract(
271277
cjieba_tfidf: *mut CJiebaTFIDF,
272278
sentence: *const c_char,
@@ -275,9 +281,9 @@ pub unsafe extern "C" fn jieba_tfidf_extract(
275281
allowed_pos: *const *mut c_char,
276282
allowed_pos_len: usize,
277283
) -> *mut CJiebaWords {
278-
let cjieba_tfidf_ref = &(*cjieba_tfidf);
284+
let cjieba_tfidf_ref = unsafe { &(*cjieba_tfidf) };
279285
let tfidf = &cjieba_tfidf_ref.tfidf;
280-
let (jieba, c_str) = params_unwrap(&cjieba_tfidf_ref.cjieba, sentence, len);
286+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba_tfidf_ref.cjieba, sentence, len) };
281287
// FIXME: remove allocation
282288
let s = String::from_utf8_lossy(c_str.as_bytes_full());
283289

@@ -286,9 +292,9 @@ pub unsafe extern "C" fn jieba_tfidf_extract(
286292
} else {
287293
let mut v = Vec::with_capacity(allowed_pos_len);
288294

289-
let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len);
295+
let slice: &[*mut c_char] = unsafe { std::slice::from_raw_parts(allowed_pos, allowed_pos_len) };
290296
for ptr in slice.iter() {
291-
let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr);
297+
let cstring_allowed_pos = unsafe { std::ffi::CString::from_raw(*ptr) };
292298
let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed");
293299
v.push(string_allowed_pos);
294300
}
@@ -311,7 +317,7 @@ pub unsafe extern "C" fn jieba_tfidf_extract(
311317
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
312318
///
313319
/// Returned value must be freed by `jieba_words_free()`.
314-
#[no_mangle]
320+
#[unsafe(no_mangle)]
315321
pub unsafe extern "C" fn jieba_textrank_extract(
316322
cjieba: *mut CJieba,
317323
sentence: *const c_char,
@@ -320,7 +326,7 @@ pub unsafe extern "C" fn jieba_textrank_extract(
320326
allowed_pos: *const *mut c_char,
321327
allowed_pos_len: usize,
322328
) -> *mut CJiebaWords {
323-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
329+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
324330
// FIXME: remove allocation
325331
let s = String::from_utf8_lossy(c_str.as_bytes_full());
326332

@@ -329,9 +335,9 @@ pub unsafe extern "C" fn jieba_textrank_extract(
329335
} else {
330336
let mut v = Vec::with_capacity(allowed_pos_len);
331337

332-
let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len);
338+
let slice: &[*mut c_char] = unsafe { std::slice::from_raw_parts(allowed_pos, allowed_pos_len) };
333339
for ptr in slice.iter() {
334-
let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr);
340+
let cstring_allowed_pos = unsafe { std::ffi::CString::from_raw(*ptr) };
335341
let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed");
336342
v.push(string_allowed_pos);
337343
}
@@ -353,27 +359,29 @@ pub unsafe extern "C" fn jieba_textrank_extract(
353359

354360
/// # Safety
355361
/// c_words must be the result of a `jieba_textrank_extract()` or `jieba_tfidf_extract()` call.
356-
#[no_mangle]
362+
#[unsafe(no_mangle)]
357363
pub unsafe extern "C" fn jieba_words_free(c_words: *mut CJiebaWords) {
358364
if !c_words.is_null() {
359-
Vec::from_raw_parts((*c_words).words, (*c_words).len, (*c_words).len);
360-
drop(Box::from_raw(c_words));
365+
unsafe {
366+
Vec::from_raw_parts((*c_words).words, (*c_words).len, (*c_words).len);
367+
drop(Box::from_raw(c_words));
368+
}
361369
}
362370
}
363371

364372
/// # Safety
365373
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
366374
///
367375
/// Returned value must be freed by `jieba_tokens_free()`.
368-
#[no_mangle]
376+
#[unsafe(no_mangle)]
369377
pub unsafe extern "C" fn jieba_tokenize(
370378
cjieba: *mut CJieba,
371379
sentence: *const c_char,
372380
len: usize,
373381
mode: TokenizeMode,
374382
hmm: bool,
375383
) -> *mut CJiebaTokens {
376-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
384+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
377385
// FIXME: remove allocation
378386
let s = String::from_utf8_lossy(c_str.as_bytes_full());
379387
let tokens = (*jieba).tokenize(&s, mode.into(), hmm);
@@ -396,26 +404,28 @@ pub unsafe extern "C" fn jieba_tokenize(
396404

397405
/// # Safety
398406
/// c_tokens must be the result of a `jieba_tokenize()` call.
399-
#[no_mangle]
407+
#[unsafe(no_mangle)]
400408
pub unsafe extern "C" fn jieba_tokens_free(c_tokens: *mut CJiebaTokens) {
401409
if !c_tokens.is_null() {
402-
Vec::from_raw_parts((*c_tokens).tokens, (*c_tokens).len, (*c_tokens).len);
403-
drop(Box::from_raw(c_tokens));
410+
unsafe {
411+
Vec::from_raw_parts((*c_tokens).tokens, (*c_tokens).len, (*c_tokens).len);
412+
drop(Box::from_raw(c_tokens));
413+
}
404414
}
405415
}
406416

407417
/// # Safety
408418
/// cjieba must be a valid object from `jieba_new()`. `sentence` must point to a buffer of at least `len` bytes.
409419
///
410420
/// Returned value must be freed by `jieba_tags_free()`.
411-
#[no_mangle]
421+
#[unsafe(no_mangle)]
412422
pub unsafe extern "C" fn jieba_tag(
413423
cjieba: *mut CJieba,
414424
sentence: *const c_char,
415425
len: usize,
416426
hmm: bool,
417427
) -> *mut CJiebaTags {
418-
let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
428+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, sentence, len) };
419429
// FIXME: remove allocation
420430
let s = String::from_utf8_lossy(c_str.as_bytes_full());
421431
let tags = (*jieba).tag(&s, hmm);
@@ -437,29 +447,31 @@ pub unsafe extern "C" fn jieba_tag(
437447

438448
/// # Safety
439449
/// c_tags must be the result of a `jieba_tag()` call.
440-
#[no_mangle]
450+
#[unsafe(no_mangle)]
441451
pub unsafe extern "C" fn jieba_tags_free(c_tags: *mut CJiebaTags) {
442452
if !c_tags.is_null() {
443-
Vec::from_raw_parts((*c_tags).tags, (*c_tags).len, (*c_tags).len);
444-
drop(Box::from_raw(c_tags));
453+
unsafe {
454+
Vec::from_raw_parts((*c_tags).tags, (*c_tags).len, (*c_tags).len);
455+
drop(Box::from_raw(c_tags));
456+
}
445457
}
446458
}
447459

448460
/// # Safety
449461
/// cjieba must be a valid object from `jieba_new()`. `word` must point to a buffer of at least `len` bytes.
450-
#[no_mangle]
462+
#[unsafe(no_mangle)]
451463
pub unsafe extern "C" fn jieba_add_word(cjieba: *mut CJieba, word: *const c_char, len: usize) -> usize {
452-
let (jieba, c_str) = params_unwrap_mut(&cjieba, word, len);
464+
let (jieba, c_str) = unsafe { params_unwrap_mut(&cjieba, word, len) };
453465
// FIXME: remove allocation
454466
let s = String::from_utf8_lossy(c_str.as_bytes_full());
455467
jieba.add_word(&s, None, None)
456468
}
457469

458470
/// # Safety
459471
/// cjieba must be a valid object from `jieba_new()`. `segment` must point to a buffer of at least `len` bytes.
460-
#[no_mangle]
472+
#[unsafe(no_mangle)]
461473
pub unsafe extern "C" fn jieba_suggest_freq(cjieba: *mut CJieba, segment: *const c_char, len: usize) -> usize {
462-
let (jieba, c_str) = params_unwrap(&cjieba, segment, len);
474+
let (jieba, c_str) = unsafe { params_unwrap(&cjieba, segment, len) };
463475
// FIXME: remove allocation
464476
let s = String::from_utf8_lossy(c_str.as_bytes_full());
465477

0 commit comments

Comments
 (0)