Skip to content

Commit 3821114

Browse files
committed
feat!(jieba): upgrade to jieba-rs@0.7
Also provide customizable API for Jieba and TfIdf
1 parent ec14da0 commit 3821114

File tree

49 files changed

+619828
-913
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+619828
-913
lines changed

.github/workflows/ci.yaml

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,13 @@ jobs:
248248
settings:
249249
- host: macos-latest
250250
target: 'x86_64-apple-darwin'
251+
architecture: x64
252+
- host: macos-latest
253+
target: 'aarch64-apple-darwin'
254+
architecture: arm64
251255
- host: windows-latest
252256
target: 'x86_64-pc-windows-msvc'
257+
architecture: x64
253258
node: ['18', '20']
254259
runs-on: ${{ matrix.settings.host }}
255260

@@ -263,7 +268,7 @@ jobs:
263268
with:
264269
node-version: ${{ matrix.node }}
265270
cache: yarn
266-
architecture: x64
271+
architecture: ${{ matrix.settings.architecture }}
267272

268273
- name: Install dependencies
269274
run: yarn install --immutable --mode=skip-build
@@ -312,6 +317,9 @@ jobs:
312317
name: bindings-x86_64-unknown-linux-gnu
313318
path: artifacts
314319

320+
- name: create-npm-dirs
321+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
322+
315323
- name: Move artifacts
316324
run: yarn artifacts
317325
shell: bash
@@ -357,6 +365,9 @@ jobs:
357365
- name: Build TypeScript
358366
run: yarn build:ts
359367

368+
- name: create-npm-dirs
369+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
370+
360371
- name: Download artifacts
361372
uses: actions/download-artifact@v4
362373
with:
@@ -404,6 +415,9 @@ jobs:
404415
yarn config set supportedArchitectures.libc "glibc"
405416
yarn install --immutable --mode=skip-build
406417
418+
- name: create-npm-dirs
419+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
420+
407421
- name: Move artifacts
408422
run: yarn artifacts
409423
shell: bash
@@ -455,6 +469,9 @@ jobs:
455469
name: bindings-aarch64-unknown-linux-musl
456470
path: artifacts
457471

472+
- name: create-npm-dirs
473+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
474+
458475
- name: Move artifacts
459476
run: yarn artifacts
460477
shell: bash
@@ -503,6 +520,9 @@ jobs:
503520
name: bindings-armv7-unknown-linux-gnueabihf
504521
path: artifacts
505522

523+
- name: create-npm-dirs
524+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
525+
506526
- name: Move artifacts
507527
run: yarn artifacts
508528
shell: bash
@@ -550,6 +570,8 @@ jobs:
550570
with:
551571
name: bindings-wasm32-wasip1-threads
552572
path: artifacts
573+
- name: create-npm-dirs
574+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
553575
- name: Move artifacts
554576
run: yarn artifacts
555577
shell: bash
@@ -596,6 +618,9 @@ jobs:
596618
run: ls -R artifacts
597619
shell: bash
598620

621+
- name: create-npm-dirs
622+
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
623+
599624
- name: Move artifacts
600625
run: yarn artifacts
601626

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ resolver = "2"
1212
getrandom = "0.2"
1313
global_alloc = { path = "./crates/alloc" }
1414
indexmap = { version = "2", features = ["serde"] }
15-
jieba-rs = { version = "0.6", features = ["default-dict", "tfidf", "textrank"] }
15+
jieba-rs = { version = "0.7", default-features = false, features = ["tfidf", "textrank"] }
1616
jsonwebtoken = { version = "9" }
1717
mimalloc = "0.1"
1818
napi = { version = "3.0.0-alpha", default-features = false, features = ["napi3"] }

packages/jieba/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ global_alloc = { workspace = true }
1212
jieba-rs = { workspace = true }
1313
napi = { workspace = true, default-features = false, features = ["napi3"] }
1414
napi-derive = { workspace = true }
15-
once_cell = { workspace = true }
1615

1716
[build-dependencies]
1817
napi-build = { workspace = true }

packages/jieba/README.md

Lines changed: 42 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -14,61 +14,59 @@
1414
Because [jieba-rs is 33% faster than cppjieba](https://blog.paulme.ng/posts/2019-06-30-optimizing-jieba-rs-to-be-33percents-faster-than-cppjieba.html) and N-API is faster than the `v8` C++ API, `@node-rs/jieba` is faster than `nodejieba`.
1515

1616
```bash
17-
@node-rs/jieba x 3,763 ops/sec ±1.18% (92 runs sampled)
18-
nodejieba x 2,783 ops/sec ±0.67% (91 runs sampled)
19-
Cut 1184 words bench suite: Fastest is @node-rs/jieba
20-
21-
@node-rs/jieba x 16.10 ops/sec ±1.58% (44 runs sampled)
22-
nodejieba x 9.81 ops/sec ±2.39% (29 runs sampled)
23-
Cut 246568 words bench suite: Fastest is @node-rs/jieba
24-
25-
@node-rs/jieba x 1,739 ops/sec ±0.87% (92 runs sampled)
26-
nodejieba x 931 ops/sec ±1.31% (89 runs sampled)
27-
Tag 1184 words bench suite: Fastest is @node-rs/jieba
28-
29-
@node-rs/jieba x 6.19 ops/sec ±2.01% (20 runs sampled)
30-
nodejieba x 3.06 ops/sec ±5.39% (12 runs sampled)
31-
Tag 246568 words bench suite: Fastest is @node-rs/jieba
17+
Benchmark Cut 1184 words result
18+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
19+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
20+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
21+
│ 0 │ '@node-rs/jieba' │ '8,246' │ 121266.9342871014 │ '±0.17%' │ 4124 │
22+
│ 1 │ 'nodejieba' │ '6,392' │ 156439.52799499547 │ '±0.20%' │ 3197 │
23+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
24+
Benchmark Cut 246568 words result
25+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
26+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
27+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
28+
│ 0 │ '@node-rs/jieba' │ '32' │ 30760703.470588237 │ '±3.01%' │ 17 │
29+
│ 1 │ 'nodejieba' │ '19' │ 51275112.699999996 │ '±2.68%' │ 10 │
30+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
31+
Benchmark Tag 1184 words result
32+
┌─────────┬──────────────────┬─────────┬───────────────────┬──────────┬─────────┐
33+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
34+
├─────────┼──────────────────┼─────────┼───────────────────┼──────────┼─────────┤
35+
│ 0 │ '@node-rs/jieba' │ '3,174' │ 315048.8916876547 │ '±0.20%' │ 1588 │
36+
│ 1 │ 'nodejieba' │ '2,672' │ 374213.8870605615 │ '±0.23%' │ 1337 │
37+
└─────────┴──────────────────┴─────────┴───────────────────┴──────────┴─────────┘
38+
Benchmark Tag 246568 words result
39+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
40+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
41+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
42+
│ 0 │ '@node-rs/jieba' │ '11' │ 84886341.7999999 │ '±5.74%' │ 10 │
43+
│ 1 │ 'nodejieba' │ '7' │ 125781083.30000004 │ '±4.75%' │ 10 │
44+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
3245
```
3346

34-
## Support matrix
35-
36-
| | node12 | node14 | node16 | node18 |
37-
| ---------------- | ------ | ------ | ------ | ------ |
38-
| Windows x64 |||||
39-
| Windows x32 |||||
40-
| Windows arm64 |||||
41-
| macOS x64 |||||
42-
| macOS arm64 |||||
43-
| Linux x64 gnu |||||
44-
| Linux x64 musl |||||
45-
| Linux arm gnu |||||
46-
| Linux arm64 gnu |||||
47-
| Linux arm64 musl |||||
48-
| Android arm64 |||||
49-
| Android armv7 |||||
50-
| FreeBSD x64 |||||
51-
5247
## Usage
5348

5449
```javascript
55-
const { load, cut } = require('@node-rs/jieba')
50+
import { Jieba } from '@node-rs/jieba'
51+
import { dict } from '@node-rs/jieba/dict'
5652

57-
load()
58-
// loadDict(fs.readFileSync(...))
59-
// loadTFIDFDict(fs.readFileSync(...))
53+
// load jieba with the default dict
54+
const jieba = Jieba.withDict(dict)
6055

61-
cut('我们中出了一个叛徒', false)
56+
console.info(jieba.cut('我们中出了一个叛徒', false))
6257

6358
// ["我们", "中", "出", "了", "一个", "叛徒"]
6459
```
6560

6661
```javascript
67-
const { load, cut } = require('@node-rs/jieba')
62+
import { Jieba, TfIdf } from '@node-rs/jieba'
63+
import { dict, idf } from '@node-rs/jieba/dict'
6864

69-
load()
65+
const jieba = Jieba.withDict(dict)
66+
const tfIdf = TfIdf.withDict(idf)
7067

71-
extract(
68+
tfIdf.extractKeywords(
69+
jieba,
7270
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃',
7371
3,
7472
)
@@ -83,15 +81,14 @@ extract(
8381
### Load custom dictionaries
8482

8583
```javascript
86-
const { loadDict, cut } = require('@node-rs/jieba')
84+
import { Jieba } from '@node-rs/jieba'
8785
const customDict = ['哪行 50', '干一行 51', '行一行 52', '行行 53']
8886

8987
const dictBuffer = Buffer.from(customDict.join('\n'), 'utf-8')
90-
// loadDict doc: https://github.com/fxsjy/jieba?tab=readme-ov-file#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8
91-
loadDict(dictBuffer)
88+
const jieba = Jieba.withDict(dictBuffer)
9289

9390
const text = '人要是行干一行行一行,一行行行行行,行行行干哪行都行'
94-
const output = cut(text, false)
91+
const output = jieba.cut(text, false)
9592
console.log('分词结果⤵️\n', output.join('/'))
9693
// Before: 人/要是/行/干/一行行/一行/,/一行行/行/行/行/,/行/行/行/干/哪/行/都行
9794
// After: 人/要是/行/干一行/行一行/,/一行行/行行/行/,/行行/行/干/哪行/都行

packages/jieba/__tests__/__snapshots__/jieba.spec.ts.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,22 @@ Generated by [AVA](https://avajs.dev).
141141
weight: 'number',
142142
},
143143
]
144+
145+
## should be able to load custom TFID dict
146+
147+
> Snapshot 1
148+
149+
[
150+
{
151+
keyword: 'CEO',
152+
weight: 1.6825,
153+
},
154+
{
155+
keyword: '不用',
156+
weight: 1.6825,
157+
},
158+
{
159+
keyword: '专业',
160+
weight: 1.6825,
161+
},
162+
]
57 Bytes
Binary file not shown.
Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,43 @@
11
import test from 'ava'
22

3-
import { cut, tag, extract, loadTFIDFDict, loadDict } from '../index'
3+
import { Jieba, TfIdf } from '../index.js'
4+
import { dict, idf } from '../dict.js'
45

56
const sentence = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,走上人生巅峰。'
67

8+
const jieba = Jieba.withDict(dict)
9+
const tfIdf = TfIdf.withDict(idf)
10+
711
test('cut result should be equal to nodejieba', (t) => {
8-
t.snapshot(cut(sentence))
12+
t.snapshot(jieba.cut(sentence))
913
})
1014

1115
test('tag result shoule be equal to nodejieba', (t) => {
12-
t.snapshot(tag(sentence))
16+
t.snapshot(jieba.tag(sentence))
1317
})
1418

1519
test('extract should be equal to nodejieba', (t) => {
1620
const sentence =
1721
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃'
1822
const topn = 3
1923
t.snapshot(
20-
extract(sentence, topn).map((t) => ({
24+
tfIdf.extractKeywords(jieba, sentence, topn).map((t) => ({
2125
keyword: t.keyword,
2226
weight: typeof t.weight,
2327
})),
2428
)
2529
})
2630

27-
test.skip('should be able to load custom TFID dict', (t) => {
31+
test('should be able to load custom TFID dict', (t) => {
2832
const userdict = Buffer.from('专业 20.19')
29-
loadTFIDFDict(userdict)
33+
const tfIdf = TfIdf.withDict(userdict)
3034
const fixture = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。'
31-
t.snapshot(extract(fixture, 3))
35+
t.snapshot(tfIdf.extractKeywords(jieba, fixture, 3))
3236
})
3337

34-
test.skip('should be able to load custom dict', (t) => {
38+
test('should be able to load custom dict', (t) => {
3539
const userdict = Buffer.from('出了 10000')
36-
loadDict(userdict)
40+
const jieba = Jieba.withDict(userdict)
3741
const fixture = '我们中出了一个叛徒'
38-
t.notThrows(() => cut(fixture))
42+
t.notThrows(() => jieba.cut(fixture))
3943
})

0 commit comments

Comments
 (0)