frankslin
diff --git a/‎wasm-lib/README.md‎
Lines changed: 5 additions & 2 deletions b/‎wasm-lib/README.md‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎wasm-lib/README.zh.md‎
Lines changed: 4 additions & 1 deletion b/‎wasm-lib/README.zh.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎wasm-lib/build.sh‎
Lines changed: 3 additions & 0 deletions b/‎wasm-lib/build.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎wasm-lib/data/config/s2twp_jieba.json‎
Lines changed: 31 additions & 0 deletions b/‎wasm-lib/data/config/s2twp_jieba.json‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎wasm-lib/data/config/tw2sp_jieba.json‎
Lines changed: 35 additions & 0 deletions b/‎wasm-lib/data/config/tw2sp_jieba.json‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎wasm-lib/data/jieba_dict/BUILD.bazel‎
Lines changed: 6 additions & 0 deletions b/‎wasm-lib/data/jieba_dict/BUILD.bazel‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎wasm-lib/data/jieba_dict/README.md‎
Lines changed: 45 additions & 0 deletions b/‎wasm-lib/data/jieba_dict/README.md‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎wasm-lib/data/jieba_dict/hmm_model.utf8‎
Lines changed: 34 additions & 0 deletions b/‎wasm-lib/data/jieba_dict/hmm_model.utf8‎
Lines changed: 34 additions & 0 deletions
@@ -78,10 +78,12 @@ const result = await converter("服务器软件");  // 伺服器軟體
 | Config | Description | Example |
 |--------|-------------|---------|
 | `s2twp` | Simplified → Taiwan Traditional (with regional phrases) | 软件 → 軟體 |
+| `s2twp_jieba` | Simplified → Taiwan Traditional (jieba segmentation) | 城堡的士兵 → 城堡的士兵 (`的士` is not mistaken for "taxi") |
 | `s2tw` | Simplified → Taiwan Traditional | 心里 → 心裡 |
 | `s2hk` | Simplified → Hong Kong Traditional | 心里 → 心裏 |
 | `s2t` | Simplified → OpenCC Standard Traditional | 简体 → 簡體 |
 | `tw2sp` | Taiwan → Simplified (with regional phrases) | 滑鼠 → 鼠标 |
+| `tw2sp_jieba` | Taiwan → Simplified (jieba segmentation) | 慰藉著 → 慰藉着 |
 | `tw2s` | Taiwan → Simplified | 軟體 → 软件 |
 | `tw2t` | Taiwan → Traditional | 吃飯 → 喫飯 |
 | `hk2s` | Hong Kong → Simplified | 打印機 → 打印机 |
@@ -276,7 +278,7 @@ console.log(await t2s("繁體"));   // 繁体
 ```typescript
 import OpenCC from 'opencc-wasm';
 
-type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
+type ConfigName = 's2t' | 's2tw' | 's2twp' | 's2twp_jieba' | 't2s' | 'tw2sp_jieba';
 
 async function convert(config: ConfigName, text: string): Promise<string> {
   const converter = OpenCC.Converter({ config });
@@ -342,7 +344,7 @@ wasm-lib/
 │   │   ├── index.cjs
 │   │   ├── opencc-wasm.cjs
 │   │   └── opencc-wasm.wasm
-│   └── data/           ← OpenCC configs + dicts
+│   └── data/           ← OpenCC configs + dicts (+ jieba files if enabled)
 ├── index.js            ← Source API
 ├── index.d.ts          ← TypeScript definitions
 └── scripts/
@@ -375,6 +377,7 @@ A: Initial load downloads configs + dicts (~1-2MB). Subsequent conversions are f
 
 - Uses persistent OpenCC handles to avoid reloading configs
 - Dictionaries stored in `/data/dict/` in virtual FS
+- Jieba assets stored in `/data/jieba_dict/` (dict, hmm_model, user dict, idf, stop_words)
 - Memory grows on demand (`ALLOW_MEMORY_GROWTH=1`)
 - Performance: Focuses on fidelity and compatibility with official OpenCC. May be slower than pure-JS implementations for raw throughput, but guarantees full OpenCC behavior.
 
 
@@ -78,10 +78,12 @@ const result = await converter("服务器软件");  // 伺服器軟體
 | 設定檔 | 說明 | 範例 |
 |--------|------|------|
 | `s2twp` | 簡體 → 台灣正體（含地域用詞轉換） | 软件 → 軟體 |
+| `s2twp_jieba` | 簡體 → 台灣正體（jieba 分詞） | 城堡的士兵 → 城堡的士兵（「的士」不會被誤轉為「計程車」） |
 | `s2tw` | 簡體 → 台灣正體 | 心里 → 心裡 |
 | `s2hk` | 簡體 → 香港繁體 | 心里 → 心裏 |
 | `s2t` | 簡體 → OpenCC 標準繁體 | 简体 → 簡體 |
 | `tw2sp` | 台灣正體 → 簡體（含地域用詞轉換） | 滑鼠 → 鼠标 |
+| `tw2sp_jieba` | 台灣正體 → 簡體（jieba 分詞） | 慰藉著 → 慰藉着 |
 | `tw2s` | 台灣正體 → 簡體 | 軟體 → 软件 |
 | `tw2t` | 台灣正體 → OpenCC 標準繁體 | 吃飯 → 喫飯 |
 | `hk2s` | 香港繁體 → 簡體 | 打印機 → 打印机 |
@@ -276,7 +278,7 @@ console.log(await t2s("繁體"));   // 繁体
 ```typescript
 import OpenCC from 'opencc-wasm';
 
-type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
+type ConfigName = 's2t' | 's2tw' | 's2twp' | 's2twp_jieba' | 't2s' | 'tw2sp_jieba';
 
 async function convert(config: ConfigName, text: string): Promise<string> {
   const converter = OpenCC.Converter({ config });
@@ -375,6 +377,7 @@ A：首次載入需要下載設定檔和字典檔（約 1-2MB）。後續轉換
 
 - 使用持久的 OpenCC 控制代碼避免重複載入設定
 - 字典儲存在虛擬檔案系統的 `/data/dict/` 中
+- Jieba 資產儲存在 `/data/jieba_dict/`（詞典、hmm_model、user dict、idf、stop_words）
 - 記憶體按需成長（`ALLOW_MEMORY_GROWTH=1`）
 - 效能：專注於精確度和與官方 OpenCC 的相容性。原始吞吐量可能比純 JavaScript 實作慢，但保證完整的 OpenCC 行為。
 
 
@@ -22,6 +22,7 @@ OPENCC_SRCS=(
   ${OPENCC_SRC_DIR}/src/Dict.cpp
   ${OPENCC_SRC_DIR}/src/DictEntry.cpp
   ${OPENCC_SRC_DIR}/src/DictGroup.cpp
+  ${OPENCC_SRC_DIR}/src/JiebaSegmentation.cpp
   ${OPENCC_SRC_DIR}/src/Lexicon.cpp
   ${OPENCC_SRC_DIR}/src/MarisaDict.cpp
   ${OPENCC_SRC_DIR}/src/MaxMatchSegmentation.cpp
@@ -49,6 +50,7 @@ MARISA_SRCS=(
 # 头文件搜索路径
 INCLUDE_FLAGS=(
   -I${OPENCC_SRC_DIR}/src
+  -I${OPENCC_SRC_DIR}/deps/libcppjieba/include
   -I${MARISA_DIR}/include
   -I${MARISA_DIR}/lib
   -I${OPENCC_SRC_DIR}/deps/rapidjson-1.1.0
@@ -63,6 +65,7 @@ INCLUDE_FLAGS=(
 # -O2: 体积/性能权衡
 COMMON_FLAGS=(
   -DOPENCC_WASM_WITH_OPENCC
+  -DENABLE_JIEBA
   "${OPENCC_SRCS[@]}"
   "${MARISA_SRCS[@]}"
   src/main.cpp
 
@@ -0,0 +1,31 @@
+{
+  "name": "Simplified Chinese to Traditional Chinese (Taiwan standard, with phrases, Jieba Segmentation - Experimental)",
+  "segmentation": {
+    "type": "jieba",
+    "dict_path": "jieba_dict/jieba.dict.utf8",
+    "model_path": "jieba_dict/hmm_model.utf8",
+    "user_dict_path": "jieba_dict/user.dict.utf8"
+  },
+  "conversion_chain": [{
+    "dict": {
+      "type": "group",
+      "dicts": [{
+        "type": "ocd2",
+        "file": "STPhrases.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "STCharacters.ocd2"
+      }]
+    }
+  }, {
+    "dict": {
+      "type": "ocd2",
+      "file": "TWPhrases.ocd2"
+    }
+  }, {
+    "dict": {
+      "type": "ocd2",
+      "file": "TWVariants.ocd2"
+    }
+  }]
+}
@@ -0,0 +1,35 @@
+{
+  "name": "Traditional Chinese (Taiwan standard) to Simplified Chinese (with phrases, Jieba Segmentation - Experimental)",
+  "segmentation": {
+    "type": "jieba",
+    "dict_path": "jieba_dict/jieba.dict.utf8",
+    "model_path": "jieba_dict/hmm_model.utf8",
+    "user_dict_path": "jieba_dict/user.dict.utf8"
+  },
+  "conversion_chain": [{
+    "dict": {
+      "type": "group",
+      "dicts": [{
+        "type": "ocd2",
+        "file": "TWPhrasesRev.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "TWVariantsRevPhrases.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "TWVariantsRev.ocd2"
+      }]
+    }
+  }, {
+    "dict": {
+      "type": "group",
+      "dicts": [{
+        "type": "ocd2",
+        "file": "TSPhrases.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "TSCharacters.ocd2"
+      }]
+    }
+  }]
+}
@@ -0,0 +1,6 @@
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "jieba_dict",
+    srcs = glob(["*.utf8", "README.md"]),
+)
@@ -0,0 +1,45 @@
+# Jieba 分词词典
+
+此目录包含 Jieba 中文分词所需的词典文件，来源于 [libcppjieba](https://github.com/yanyiwu/libcppjieba)。
+
+## 文件说明
+
+- **jieba.dict.utf8** (4.9 MB) - 主词典文件，包含词语及其词频
+- **hmm_model.utf8** (508 KB) - 隐马尔可夫模型（HMM）文件，用于识别未登录词
+- **user.dict.utf8** (33 B) - 用户自定义词典（可选）
+
+## 许可证
+
+这些词典文件继承自 jieba 项目，遵循 MIT 许可证。
+
+## 使用方式
+
+在 OpenCC 配置文件中指定这些词典的路径。IDF 和停用词数据
+会从 `deps/libcppjieba/dict/` 自动解析，无需复制到此目录：
+
+```json
+{
+  "segmentation": {
+    "type": "jieba",
+    "dict_path": "jieba_dict/jieba.dict.utf8",
+    "model_path": "jieba_dict/hmm_model.utf8",
+    "user_dict_path": "jieba_dict/user.dict.utf8"
+  }
+}
+```
+
+## 自定义用户词典
+
+您可以编辑 `user.dict.utf8` 添加自定义词语，格式为：
+
+```
+词语 词频 词性
+```
+
+例如：
+```
+云计算 5 n
+机器学习 8 n
+```
+
+每行一个词语，词频和词性可选。