Skip to content

Commit e5ffdad

Browse files
committed
feat!(jieba): upgrade to jieba-rs@0.7
Also provide customizable API for Jieba and TfIdf
1 parent ec14da0 commit e5ffdad

File tree

48 files changed

+619802
-912
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+619802
-912
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ resolver = "2"
1212
getrandom = "0.2"
1313
global_alloc = { path = "./crates/alloc" }
1414
indexmap = { version = "2", features = ["serde"] }
15-
jieba-rs = { version = "0.6", features = ["default-dict", "tfidf", "textrank"] }
15+
jieba-rs = { version = "0.7", default-features = false, features = ["tfidf", "textrank"] }
1616
jsonwebtoken = { version = "9" }
1717
mimalloc = "0.1"
1818
napi = { version = "3.0.0-alpha", default-features = false, features = ["napi3"] }

packages/jieba/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ global_alloc = { workspace = true }
1212
jieba-rs = { workspace = true }
1313
napi = { workspace = true, default-features = false, features = ["napi3"] }
1414
napi-derive = { workspace = true }
15-
once_cell = { workspace = true }
1615

1716
[build-dependencies]
1817
napi-build = { workspace = true }

packages/jieba/README.md

Lines changed: 42 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -14,61 +14,59 @@
1414
Because [jieba-rs is 33% faster than cppjieba](https://blog.paulme.ng/posts/2019-06-30-optimizing-jieba-rs-to-be-33percents-faster-than-cppjieba.html) and N-API is faster than the `v8` C++ API, `@node-rs/jieba` is faster than `nodejieba`.
1515

1616
```bash
17-
@node-rs/jieba x 3,763 ops/sec ±1.18% (92 runs sampled)
18-
nodejieba x 2,783 ops/sec ±0.67% (91 runs sampled)
19-
Cut 1184 words bench suite: Fastest is @node-rs/jieba
20-
21-
@node-rs/jieba x 16.10 ops/sec ±1.58% (44 runs sampled)
22-
nodejieba x 9.81 ops/sec ±2.39% (29 runs sampled)
23-
Cut 246568 words bench suite: Fastest is @node-rs/jieba
24-
25-
@node-rs/jieba x 1,739 ops/sec ±0.87% (92 runs sampled)
26-
nodejieba x 931 ops/sec ±1.31% (89 runs sampled)
27-
Tag 1184 words bench suite: Fastest is @node-rs/jieba
28-
29-
@node-rs/jieba x 6.19 ops/sec ±2.01% (20 runs sampled)
30-
nodejieba x 3.06 ops/sec ±5.39% (12 runs sampled)
31-
Tag 246568 words bench suite: Fastest is @node-rs/jieba
17+
Benchmark Cut 1184 words result
18+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
19+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
20+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
21+
│ 0 │ '@node-rs/jieba' │ '8,246' │ 121266.9342871014 │ '±0.17%' │ 4124 │
22+
│ 1 │ 'nodejieba' │ '6,392' │ 156439.52799499547 │ '±0.20%' │ 3197 │
23+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
24+
Benchmark Cut 246568 words result
25+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
26+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
27+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
28+
│ 0 │ '@node-rs/jieba' │ '32' │ 30760703.470588237 │ '±3.01%' │ 17 │
29+
│ 1 │ 'nodejieba' │ '19' │ 51275112.699999996 │ '±2.68%' │ 10 │
30+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
31+
Benchmark Tag 1184 words result
32+
┌─────────┬──────────────────┬─────────┬───────────────────┬──────────┬─────────┐
33+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
34+
├─────────┼──────────────────┼─────────┼───────────────────┼──────────┼─────────┤
35+
│ 0 │ '@node-rs/jieba' │ '3,174' │ 315048.8916876547 │ '±0.20%' │ 1588 │
36+
│ 1 │ 'nodejieba' │ '2,672' │ 374213.8870605615 │ '±0.23%' │ 1337 │
37+
└─────────┴──────────────────┴─────────┴───────────────────┴──────────┴─────────┘
38+
Benchmark Tag 246568 words result
39+
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
40+
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
41+
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
42+
│ 0 │ '@node-rs/jieba' │ '11' │ 84886341.7999999 │ '±5.74%' │ 10 │
43+
│ 1 │ 'nodejieba' │ '7' │ 125781083.30000004 │ '±4.75%' │ 10 │
44+
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
3245
```
3346

34-
## Support matrix
35-
36-
| | node12 | node14 | node16 | node18 |
37-
| ---------------- | ------ | ------ | ------ | ------ |
38-
| Windows x64 |||||
39-
| Windows x32 |||||
40-
| Windows arm64 |||||
41-
| macOS x64 |||||
42-
| macOS arm64 |||||
43-
| Linux x64 gnu |||||
44-
| Linux x64 musl |||||
45-
| Linux arm gnu |||||
46-
| Linux arm64 gnu |||||
47-
| Linux arm64 musl |||||
48-
| Android arm64 |||||
49-
| Android armv7 |||||
50-
| FreeBSD x64 |||||
51-
5247
## Usage
5348

5449
```javascript
55-
const { load, cut } = require('@node-rs/jieba')
50+
import { Jieba } from '@node-rs/jieba'
51+
import { dict } from '@node-rs/jieba/dict'
5652

57-
load()
58-
// loadDict(fs.readFileSync(...))
59-
// loadTFIDFDict(fs.readFileSync(...))
53+
// load jieba with the default dict
54+
const jieba = Jieba.withDict(dict)
6055

61-
cut('我们中出了一个叛徒', false)
56+
console.info(jieba.cut('我们中出了一个叛徒', false))
6257

6358
// ["我们", "中", "出", "了", "一个", "叛徒"]
6459
```
6560

6661
```javascript
67-
const { load, cut } = require('@node-rs/jieba')
62+
import { Jieba, TfIdf } from '@node-rs/jieba'
63+
import { dict, idf } from '@node-rs/jieba/dict'
6864

69-
load()
65+
const jieba = Jieba.withDict(dict)
66+
const tfIdf = TfIdf.withDict(idf)
7067

71-
extract(
68+
tfIdf.extractKeywords(
69+
jieba,
7270
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃',
7371
3,
7472
)
@@ -83,15 +81,14 @@ extract(
8381
### Load custom dictionaries
8482

8583
```javascript
86-
const { loadDict, cut } = require('@node-rs/jieba')
84+
import { Jieba } from '@node-rs/jieba'
8785
const customDict = ['哪行 50', '干一行 51', '行一行 52', '行行 53']
8886

8987
const dictBuffer = Buffer.from(customDict.join('\n'), 'utf-8')
90-
// loadDict doc: https://github.com/fxsjy/jieba?tab=readme-ov-file#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8
91-
loadDict(dictBuffer)
88+
const jieba = Jieba.withDict(dictBuffer)
9289

9390
const text = '人要是行干一行行一行,一行行行行行,行行行干哪行都行'
94-
const output = cut(text, false)
91+
const output = jieba.cut(text, false)
9592
console.log('分词结果⤵️\n', output.join('/'))
9693
// Before: 人/要是/行/干/一行行/一行/,/一行行/行/行/行/,/行/行/行/干/哪/行/都行
9794
// After: 人/要是/行/干一行/行一行/,/一行行/行行/行/,/行行/行/干/哪行/都行

packages/jieba/__tests__/__snapshots__/jieba.spec.ts.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,22 @@ Generated by [AVA](https://avajs.dev).
141141
weight: 'number',
142142
},
143143
]
144+
145+
## should be able to load custom TFID dict
146+
147+
> Snapshot 1
148+
149+
[
150+
{
151+
keyword: 'CEO',
152+
weight: 1.6825,
153+
},
154+
{
155+
keyword: '不用',
156+
weight: 1.6825,
157+
},
158+
{
159+
keyword: '专业',
160+
weight: 1.6825,
161+
},
162+
]
57 Bytes
Binary file not shown.
Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,43 @@
11
import test from 'ava'
22

3-
import { cut, tag, extract, loadTFIDFDict, loadDict } from '../index'
3+
import { Jieba, TfIdf } from '../index.js'
4+
import { dict, idf } from '../dict.js'
45

56
const sentence = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,走上人生巅峰。'
67

8+
const jieba = Jieba.withDict(dict)
9+
const tfIdf = TfIdf.withDict(idf)
10+
711
test('cut result should be equal to nodejieba', (t) => {
8-
t.snapshot(cut(sentence))
12+
t.snapshot(jieba.cut(sentence))
913
})
1014

1115
test('tag result shoule be equal to nodejieba', (t) => {
12-
t.snapshot(tag(sentence))
16+
t.snapshot(jieba.tag(sentence))
1317
})
1418

1519
test('extract should be equal to nodejieba', (t) => {
1620
const sentence =
1721
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃'
1822
const topn = 3
1923
t.snapshot(
20-
extract(sentence, topn).map((t) => ({
24+
tfIdf.extractKeywords(jieba, sentence, topn).map((t) => ({
2125
keyword: t.keyword,
2226
weight: typeof t.weight,
2327
})),
2428
)
2529
})
2630

27-
test.skip('should be able to load custom TFID dict', (t) => {
31+
test('should be able to load custom TFID dict', (t) => {
2832
const userdict = Buffer.from('专业 20.19')
29-
loadTFIDFDict(userdict)
33+
const tfIdf = TfIdf.withDict(userdict)
3034
const fixture = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。'
31-
t.snapshot(extract(fixture, 3))
35+
t.snapshot(tfIdf.extractKeywords(jieba, fixture, 3))
3236
})
3337

34-
test.skip('should be able to load custom dict', (t) => {
38+
test('should be able to load custom dict', (t) => {
3539
const userdict = Buffer.from('出了 10000')
36-
loadDict(userdict)
40+
const jieba = Jieba.withDict(userdict)
3741
const fixture = '我们中出了一个叛徒'
38-
t.notThrows(() => cut(fixture))
42+
t.notThrows(() => jieba.cut(fixture))
3943
})

packages/jieba/benchmark/jieba.js

Lines changed: 52 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
1-
const fs = require('fs')
2-
const { join } = require('path')
1+
import { readFileSync } from 'node:fs'
2+
import { join } from 'node:path'
3+
import { fileURLToPath } from 'node:url'
34

4-
const { Suite } = require('benchmark')
5-
const chalk = require('chalk')
6-
const nodejieba = require('nodejieba')
5+
import { Bench } from 'tinybench'
6+
import chalk from 'chalk'
7+
import nodejieba from 'nodejieba'
78

8-
const { load, cut, tag } = require('../index')
9+
import { Jieba, TfIdf } from '../index.js'
10+
import { dict, idf } from '../dict.js'
911

10-
load()
11-
nodejieba.load()
12+
const { load, cut, tag } = nodejieba
13+
14+
const __dirname = join(fileURLToPath(import.meta.url), '..')
1215

13-
const fixture = fs.readFileSync(join(__dirname, 'weicheng.txt'), 'utf8')
16+
const fixture = readFileSync(join(__dirname, 'weicheng.txt'), 'utf8')
1417

1518
const preface = `
1619
重印前记《围城》一九四七年在上海初版,一九四八年再版,一九四九年三版,以后国内没有重印过。偶然碰见它的新版,那都是香港的“盗印”本。没有看到台湾的“盗印”,据说在那里它是禁书。美国哥伦比亚大学夏志清教授的英文著作里对它作了过高的评价,导致了一些西方语言的译本。日本京都大学荒井健教授很久以前就通知我他要翻译,近年来也陆续在刊物上发表了译文。现在,人民文学出版社建议重新排印,以便原著在国内较易找着,我感到意外和忻辛。
@@ -23,30 +26,48 @@ const preface = `
2326

2427
const prefaceLength = preface.length
2528

26-
function createBench(suitename, transform, napi, jieba, input) {
27-
const cutSuite = new Suite(suitename)
28-
console.assert(transform(napi(input)) === transform(jieba(input)))
29-
30-
cutSuite
31-
.add('@node-rs/jieba', () => {
32-
napi(input)
33-
})
34-
.add('nodejieba', () => {
35-
jieba(input)
36-
})
37-
.on('cycle', function (event) {
38-
console.info(String(event.target))
39-
})
40-
.on('complete', function () {
41-
console.info(`${this.name} bench suite: Fastest is ${chalk.green(this.filter('fastest').map('name'))}`)
42-
})
43-
.run()
29+
async function createBench(suitename, transform, napi, jieba) {
30+
const suite = new Bench()
31+
console.assert(transform(napi()) === transform(jieba()))
32+
33+
suite.add('@node-rs/jieba', napi).add('nodejieba', jieba)
34+
35+
await suite.warmup()
36+
37+
await suite.run()
38+
39+
console.info(chalk.green(`Benchmark ${suitename} result`))
40+
console.table(suite.table())
4441
}
4542

46-
createBench(`Cut ${prefaceLength} words`, (output) => output.join(''), cut, nodejieba.cut, preface)
43+
load()
44+
const jieba = Jieba.withDict(dict)
45+
const tfIdf = TfIdf.withDict(idf)
46+
47+
await createBench(
48+
`Cut ${prefaceLength} words`,
49+
(output) => output.join(''),
50+
() => jieba.cut(preface),
51+
() => cut(preface),
52+
)
4753

48-
createBench(`Cut ${fixture.toString().length} words`, (output) => output.join(''), cut, nodejieba.cut, fixture)
54+
await createBench(
55+
`Cut ${fixture.toString().length} words`,
56+
(output) => output.join(''),
57+
() => jieba.cut(fixture),
58+
() => cut(fixture),
59+
)
4960

50-
createBench(`Tag ${prefaceLength} words`, (output) => typeof output, tag, nodejieba.tag, preface)
61+
await createBench(
62+
`Tag ${prefaceLength} words`,
63+
(output) => typeof output,
64+
() => jieba.tag(preface),
65+
() => tag(preface),
66+
)
5167

52-
createBench(`Tag ${fixture.toString().length} words`, (output) => typeof output, tag, nodejieba.tag, fixture)
68+
await createBench(
69+
`Tag ${fixture.toString().length} words`,
70+
(output) => typeof output,
71+
() => jieba.tag(fixture),
72+
() => tag(fixture),
73+
)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"type": "module"
3+
}

packages/jieba/dict.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
export const dict: Uint8Array
2+
export const idf: Uint8Array

packages/jieba/dict.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
const fs = require('fs')
2+
const { join } = require('path')
3+
4+
module.exports.dict = fs.readFileSync(join(__dirname, 'dict.txt'))
5+
module.exports.idf = fs.readFileSync(join(__dirname, 'idf.txt'))

0 commit comments

Comments
 (0)