Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,13 @@ jobs:
settings:
- host: macos-latest
target: 'x86_64-apple-darwin'
architecture: x64
- host: macos-latest
target: 'aarch64-apple-darwin'
architecture: arm64
- host: windows-latest
target: 'x86_64-pc-windows-msvc'
architecture: x64
node: ['18', '20']
runs-on: ${{ matrix.settings.host }}

Expand All @@ -263,7 +268,7 @@ jobs:
with:
node-version: ${{ matrix.node }}
cache: yarn
architecture: x64
architecture: ${{ matrix.settings.architecture }}

- name: Install dependencies
run: yarn install --immutable --mode=skip-build
Expand All @@ -274,6 +279,9 @@ jobs:
name: bindings-${{ matrix.settings.target }}
path: artifacts

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -312,6 +320,9 @@ jobs:
name: bindings-x86_64-unknown-linux-gnu
path: artifacts

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -357,6 +368,9 @@ jobs:
- name: Build TypeScript
run: yarn build:ts

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Download artifacts
uses: actions/download-artifact@v4
with:
Expand Down Expand Up @@ -404,6 +418,9 @@ jobs:
yarn config set supportedArchitectures.libc "glibc"
yarn install --immutable --mode=skip-build

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -455,6 +472,9 @@ jobs:
name: bindings-aarch64-unknown-linux-musl
path: artifacts

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -503,6 +523,9 @@ jobs:
name: bindings-armv7-unknown-linux-gnueabihf
path: artifacts

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -550,6 +573,8 @@ jobs:
with:
name: bindings-wasm32-wasip1-threads
path: artifacts
- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs
- name: Move artifacts
run: yarn artifacts
shell: bash
Expand Down Expand Up @@ -596,6 +621,9 @@ jobs:
run: ls -R artifacts
shell: bash

- name: create-npm-dirs
run: yarn workspaces foreach -A -j 1 run napi create-npm-dirs

- name: Move artifacts
run: yarn artifacts

Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ resolver = "2"
getrandom = "0.2"
global_alloc = { path = "./crates/alloc" }
indexmap = { version = "2", features = ["serde"] }
jieba-rs = { version = "0.6", features = ["default-dict", "tfidf", "textrank"] }
jieba-rs = { version = "0.7", default-features = false, features = ["tfidf", "textrank"] }
jsonwebtoken = { version = "9" }
mimalloc = "0.1"
napi = { version = "3.0.0-alpha", default-features = false, features = ["napi3"] }
Expand Down
1 change: 0 additions & 1 deletion packages/jieba/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ global_alloc = { workspace = true }
jieba-rs = { workspace = true }
napi = { workspace = true, default-features = false, features = ["napi3"] }
napi-derive = { workspace = true }
once_cell = { workspace = true }

[build-dependencies]
napi-build = { workspace = true }
87 changes: 42 additions & 45 deletions packages/jieba/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,61 +14,59 @@
Because [jieba-rs is 33% faster than cppjieba](https://blog.paulme.ng/posts/2019-06-30-optimizing-jieba-rs-to-be-33percents-faster-than-cppjieba.html) and N-API is faster than the `v8` C++ API, `@node-rs/jieba` is faster than `nodejieba`.

```bash
@node-rs/jieba x 3,763 ops/sec ±1.18% (92 runs sampled)
nodejieba x 2,783 ops/sec ±0.67% (91 runs sampled)
Cut 1184 words bench suite: Fastest is @node-rs/jieba

@node-rs/jieba x 16.10 ops/sec ±1.58% (44 runs sampled)
nodejieba x 9.81 ops/sec ±2.39% (29 runs sampled)
Cut 246568 words bench suite: Fastest is @node-rs/jieba

@node-rs/jieba x 1,739 ops/sec ±0.87% (92 runs sampled)
nodejieba x 931 ops/sec ±1.31% (89 runs sampled)
Tag 1184 words bench suite: Fastest is @node-rs/jieba

@node-rs/jieba x 6.19 ops/sec ±2.01% (20 runs sampled)
nodejieba x 3.06 ops/sec ±5.39% (12 runs sampled)
Tag 246568 words bench suite: Fastest is @node-rs/jieba
Benchmark Cut 1184 words result
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0 │ '@node-rs/jieba' │ '8,246' │ 121266.9342871014 │ '±0.17%' │ 4124 │
│ 1 │ 'nodejieba' │ '6,392' │ 156439.52799499547 │ '±0.20%' │ 3197 │
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
Benchmark Cut 246568 words result
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0 │ '@node-rs/jieba' │ '32' │ 30760703.470588237 │ '±3.01%' │ 17 │
│ 1 │ 'nodejieba' │ '19' │ 51275112.699999996 │ '±2.68%' │ 10 │
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
Benchmark Tag 1184 words result
┌─────────┬──────────────────┬─────────┬───────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼──────────────────┼─────────┼───────────────────┼──────────┼─────────┤
│ 0 │ '@node-rs/jieba' │ '3,174' │ 315048.8916876547 │ '±0.20%' │ 1588 │
│ 1 │ 'nodejieba' │ '2,672' │ 374213.8870605615 │ '±0.23%' │ 1337 │
└─────────┴──────────────────┴─────────┴───────────────────┴──────────┴─────────┘
Benchmark Tag 246568 words result
┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0 │ '@node-rs/jieba' │ '11' │ 84886341.7999999 │ '±5.74%' │ 10 │
│ 1 │ 'nodejieba' │ '7' │ 125781083.30000004 │ '±4.75%' │ 10 │
└─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘
```

## Support matrix

| | node12 | node14 | node16 | node18 |
| ---------------- | ------ | ------ | ------ | ------ |
| Windows x64 | ✓ | ✓ | ✓ | ✓ |
| Windows x32 | ✓ | ✓ | ✓ | ✓ |
| Windows arm64 | ✓ | ✓ | ✓ | ✓ |
| macOS x64 | ✓ | ✓ | ✓ | ✓ |
| macOS arm64 | ✓ | ✓ | ✓ | ✓ |
| Linux x64 gnu | ✓ | ✓ | ✓ | ✓ |
| Linux x64 musl | ✓ | ✓ | ✓ | ✓ |
| Linux arm gnu | ✓ | ✓ | ✓ | ✓ |
| Linux arm64 gnu | ✓ | ✓ | ✓ | ✓ |
| Linux arm64 musl | ✓ | ✓ | ✓ | ✓ |
| Android arm64 | ✓ | ✓ | ✓ | ✓ |
| Android armv7 | ✓ | ✓ | ✓ | ✓ |
| FreeBSD x64 | ✓ | ✓ | ✓ | ✓ |

## Usage

```javascript
const { load, cut } = require('@node-rs/jieba')
import { Jieba } from '@node-rs/jieba'
import { dict } from '@node-rs/jieba/dict'

load()
// loadDict(fs.readFileSync(...))
// loadTFIDFDict(fs.readFileSync(...))
// load jieba with the default dict
const jieba = Jieba.withDict(dict)

cut('我们中出了一个叛徒', false)
console.info(jieba.cut('我们中出了一个叛徒', false))

// ["我们", "中", "出", "了", "一个", "叛徒"]
```

```javascript
const { load, cut } = require('@node-rs/jieba')
import { Jieba, TfIdf } from '@node-rs/jieba'
import { dict, idf } from '@node-rs/jieba/dict'

load()
const jieba = Jieba.withDict(dict)
const tfIdf = TfIdf.withDict(idf)

extract(
tfIdf.extractKeywords(
jieba,
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃',
3,
)
Expand All @@ -83,15 +81,14 @@ extract(
### Load custom dictionaries

```javascript
const { loadDict, cut } = require('@node-rs/jieba')
import { Jieba } from '@node-rs/jieba'
const customDict = ['哪行 50', '干一行 51', '行一行 52', '行行 53']

const dictBuffer = Buffer.from(customDict.join('\n'), 'utf-8')
// loadDict doc: https://github.com/fxsjy/jieba?tab=readme-ov-file#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8
loadDict(dictBuffer)
const jieba = Jieba.withDict(dictBuffer)

const text = '人要是行干一行行一行,一行行行行行,行行行干哪行都行'
const output = cut(text, false)
const output = jieba.cut(text, false)
console.log('分词结果⤵️\n', output.join('/'))
// Before: 人/要是/行/干/一行行/一行/,/一行行/行/行/行/,/行/行/行/干/哪/行/都行
// After: 人/要是/行/干一行/行一行/,/一行行/行行/行/,/行行/行/干/哪行/都行
Expand Down
19 changes: 19 additions & 0 deletions packages/jieba/__tests__/__snapshots__/jieba.spec.ts.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,22 @@ Generated by [AVA](https://avajs.dev).
weight: 'number',
},
]

## should be able to load custom TFID dict

> Snapshot 1

[
{
keyword: 'CEO',
weight: 1.6825,
},
{
keyword: '不用',
weight: 1.6825,
},
{
keyword: '专业',
weight: 1.6825,
},
]
Binary file modified packages/jieba/__tests__/__snapshots__/jieba.spec.ts.snap
Binary file not shown.
24 changes: 14 additions & 10 deletions packages/jieba/__tests__/jieba.spec.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,43 @@
import test from 'ava'

import { cut, tag, extract, loadTFIDFDict, loadDict } from '../index'
import { Jieba, TfIdf } from '../index.js'
import { dict, idf } from '../dict.js'

const sentence = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,走上人生巅峰。'

const jieba = Jieba.withDict(dict)
const tfIdf = TfIdf.withDict(idf)

test('cut result should be equal to nodejieba', (t) => {
t.snapshot(cut(sentence))
t.snapshot(jieba.cut(sentence))
})

test('tag result shoule be equal to nodejieba', (t) => {
t.snapshot(tag(sentence))
t.snapshot(jieba.tag(sentence))
})

test('extract should be equal to nodejieba', (t) => {
const sentence =
'今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃'
const topn = 3
t.snapshot(
extract(sentence, topn).map((t) => ({
tfIdf.extractKeywords(jieba, sentence, topn).map((t) => ({
keyword: t.keyword,
weight: typeof t.weight,
})),
)
})

test.skip('should be able to load custom TFID dict', (t) => {
test('should be able to load custom TFID dict', (t) => {
const userdict = Buffer.from('专业 20.19')
loadTFIDFDict(userdict)
const tfIdf = TfIdf.withDict(userdict)
const fixture = '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。'
t.snapshot(extract(fixture, 3))
t.snapshot(tfIdf.extractKeywords(jieba, fixture, 3))
})

test.skip('should be able to load custom dict', (t) => {
test('should be able to load custom dict', (t) => {
const userdict = Buffer.from('出了 10000')
loadDict(userdict)
const jieba = Jieba.withDict(userdict)
const fixture = '我们中出了一个叛徒'
t.notThrows(() => cut(fixture))
t.notThrows(() => jieba.cut(fixture))
})
Loading