Skip to content

Commit 683a0af

Browse files
authored
Add support for custom pinyin dictionary file (#201)
- Introduce pinyin_dict() SQL function to set a custom pinyin.txt
- Allow switching the pinyin mapping at runtime, with error handling
- Update docs and example.sql to demonstrate usage
- Add tests for custom and invalid pinyin files
1 parent ea854b0 commit 683a0af

File tree

9 files changed

+162
-13
lines changed

9 files changed

+162
-13
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,26 @@ sqlite> select simple_highlight(t1, 0, '[', ']') as text from t1 where text matc
6363
5. simple_snippet() 实现截取 match 片段的功能,与 sqlite 自带的 snippet 功能类似,同样是增强连续 match 的词汇分到同一组的逻辑
6464
6. jieba_query() 实现jieba分词的效果,在索引不变的情况下,可以实现更精准的匹配。可以通过 `-DSIMPLE_WITH_JIEBA=OFF ` 关掉结巴分词的功能 [#35](https://github.com/wangfenjin/simple/pull/35)
6565
7. jieba_dict() 指定 dict 的目录,只需要调用一次,需要在调用 jieba_query() 之前指定。
66+
8. pinyin_dict() 支持指定自定义的 `pinyin.txt` 文件路径。调用成功后会立即切换拼音映射;如果文件格式不正确,会返回错误并保持当前映射不变。
67+
68+
### 自定义 pinyin.txt
69+
70+
默认会使用内置在 so 中的 `contrib/pinyin.txt`。如果希望使用自己的拼音表,可以在查询前调用:
71+
72+
```sql
73+
select pinyin_dict('/path/to/pinyin.txt');
74+
```
75+
76+
`pinyin.txt` 每行格式与默认文件一致,例如:
77+
78+
```text
79+
U+3007: líng,yuán,xīng
80+
U+3007: líng,yuán,xīng # 行尾注释也支持(前面需要空格)
81+
```
82+
83+
注意:
84+
- 建议在建索引和查询前先调用一次 `pinyin_dict()`
85+
- 如果替换了拼音映射,已有索引中的拼音 token 不会自动重建,需要按你的业务策略重建索引。
6686

6787
## 开发
6888

contrib/pinyin-mini.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# demo custom pinyin dictionary for example.sql
2+
U+5468: zhōu # 周
3+
U+4F26: lún # 伦

example.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,19 @@
33
-- load so file
44
.load libsimple
55

6+
select '自定义拼音词典示例(只包含 周/伦 的拼音):';
7+
-- 在本仓库里从 output/bin 运行本例时,路径可使用 ../../contrib/pinyin-mini.txt
8+
select pinyin_dict('../../contrib/pinyin-mini.txt');
9+
CREATE VIRTUAL TABLE t0 USING fts5(x, tokenize = 'simple');
10+
insert into t0(x) values ('周杰伦');
11+
select ' 搜索 zhou,命中数量(预期 1):', count(*) from t0 where x match simple_query('zhou');
12+
select ' 搜索 lun,命中数量(预期 1):', count(*) from t0 where x match simple_query('lun');
13+
select ' 搜索 jie,命中数量(预期 0):', count(*) from t0 where x match simple_query('jie');
14+
select ' 搜索 zhou lun,命中数量(预期 1):', count(*) from t0 where x match simple_query('zhou lun');
15+
drop table t0;
16+
-- 切回默认词典,保证下面示例行为不变
17+
select pinyin_dict('../../contrib/pinyin.txt');
18+
619
select '启用拼音分词:';
720
-- set tokenize to simple
821
CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple');

src/entry.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,32 @@ static void simple_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal)
9797
sqlite3_result_null(pCtx);
9898
}
9999

100+
// SQL scalar function pinyin_dict(path).
//
// Asks SimpleTokenizer::set_pinyin_dict to load the pinyin mapping from `path`.
//   - success: the result is the path that was passed in, as text
//   - failure: the result is an SQL error carrying the loader's message
//   - no argument, or an argument whose text pointer is null: the result is NULL
static void pinyin_dict(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
  const char *raw = nVal >= 1 ? (const char *)sqlite3_value_text(apVal[0]) : nullptr;
  if (raw == nullptr) {
    sqlite3_result_null(pCtx);
    return;
  }

  const std::string path(raw);
  std::string err;
  const bool switched = simple_tokenizer::SimpleTokenizer::set_pinyin_dict(path, err);
  if (!switched) {
    sqlite3_result_error(pCtx, err.c_str(), -1);
    return;
  }
  // SQLITE_TRANSIENT: SQLite copies the bytes, so `path` may go out of scope afterwards.
  sqlite3_result_text(pCtx, path.c_str(), -1, SQLITE_TRANSIENT);
}
116+
100117
int sqlite3_simple_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi) {
101118
(void)pzErrMsg;
102119
int rc = SQLITE_OK;
103120
SQLITE_EXTENSION_INIT2(pApi)
104121

105122
rc = sqlite3_create_function(db, "simple_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &simple_query, NULL,
106123
NULL);
124+
rc = sqlite3_create_function(db, "pinyin_dict", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &pinyin_dict, NULL,
125+
NULL);
107126
#ifdef USE_JIEBA
108127
rc = sqlite3_create_function(db, "jieba_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &jieba_query, NULL,
109128
NULL);

src/pinyin.cc

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "pinyin.h"
22

33
#include <cmrc/cmrc.hpp>
4+
#include <fstream>
45
#include <map>
56
#include <regex>
67
#include <set>
@@ -12,7 +13,9 @@ CMRC_DECLARE(pinyin_text);
1213

1314
namespace simple_tokenizer {
1415

15-
PinYin::PinYin() { pinyin = build_pinyin_map(); }
16+
// Default construction delegates with an empty path; build_pinyin_map treats an empty
// path as "read the embedded contrib/pinyin.txt".
PinYin::PinYin() : PinYin(std::string()) {}

// Builds the codepoint -> pinyin table from the file at `pinyin_file_path`.
// Any exception raised by build_pinyin_map (unreadable file, malformed line)
// propagates to the caller.
PinYin::PinYin(const std::string &pinyin_file_path) {
  // Assigned in the body, as in the original, so the map is filled only after all
  // members have been initialized.
  pinyin = build_pinyin_map(pinyin_file_path);
}
1619

1720
std::set<std::string> PinYin::to_plain(const std::string &input) {
1821
std::set<std::string> s;
@@ -49,21 +52,45 @@ std::set<std::string> PinYin::to_plain(const std::string &input) {
4952
}
5053

5154
// clang-format off
52-
std::map<int, std::vector<std::string> > PinYin::build_pinyin_map() {
55+
std::map<int, std::vector<std::string> > PinYin::build_pinyin_map(const std::string &pinyin_file_path) {
5356
std::map<int, std::vector<std::string> > map;
5457
// clang-format on
55-
auto fs = cmrc::pinyin_text::get_filesystem();
56-
auto pinyin_data = fs.open("contrib/pinyin.txt");
57-
std::istringstream pinyin_file(std::string(pinyin_data.begin(), pinyin_data.end()));
58+
std::istringstream embedded_pinyin_file;
59+
std::ifstream custom_pinyin_file;
60+
std::istream *pinyin_file = nullptr;
61+
if (pinyin_file_path.empty()) {
62+
auto fs = cmrc::pinyin_text::get_filesystem();
63+
auto pinyin_data = fs.open("contrib/pinyin.txt");
64+
embedded_pinyin_file = std::istringstream(std::string(pinyin_data.begin(), pinyin_data.end()));
65+
pinyin_file = &embedded_pinyin_file;
66+
} else {
67+
custom_pinyin_file.open(pinyin_file_path);
68+
if (!custom_pinyin_file.is_open()) {
69+
throw std::runtime_error("failed to open pinyin file: " + pinyin_file_path);
70+
}
71+
pinyin_file = &custom_pinyin_file;
72+
}
5873
std::string line;
5974
char delimiter = ' ';
6075
std::string cp, py;
61-
while (std::getline(pinyin_file, line)) {
76+
int line_no = 0;
77+
while (std::getline(*pinyin_file, line)) {
78+
++line_no;
6279
if (line.length() == 0 || line[0] == '#') continue;
6380
std::stringstream tokenStream(line);
6481
std::getline(tokenStream, cp, delimiter);
6582
std::getline(tokenStream, py, delimiter);
66-
int codepoint = static_cast<int>(std::stoul(cp.substr(2, cp.length() - 3), 0, 16l));
83+
if (cp.length() < 4 || cp.rfind("U+", 0) != 0 || cp.back() != ':' || py.empty()) {
84+
throw std::runtime_error("invalid pinyin format at line " + std::to_string(line_no));
85+
}
86+
87+
int codepoint = 0;
88+
try {
89+
codepoint = static_cast<int>(std::stoul(cp.substr(2, cp.length() - 3), 0, 16l));
90+
} catch (const std::exception &) {
91+
throw std::runtime_error("invalid pinyin codepoint at line " + std::to_string(line_no));
92+
}
93+
6794
std::set<std::string> s = to_plain(py);
6895
std::vector<std::string> m(s.size());
6996
std::copy(s.begin(), s.end(), m.begin());

src/pinyin.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class PinYin {
106106
};
107107
// clang-format on
108108
std::set<std::string> to_plain(const std::string &input);
109-
std::map<int, std::vector<std::string> > build_pinyin_map();
109+
std::map<int, std::vector<std::string> > build_pinyin_map(const std::string &pinyin_file_path);
110110
static int codepoint(const std::string &u);
111111
std::vector<std::string> _split_pinyin(const std::string &input, int begin, int end);
112112

@@ -115,6 +115,7 @@ class PinYin {
115115
static int get_str_len(unsigned char byte);
116116
std::set<std::string> split_pinyin(const std::string &input);
117117
PinYin();
118+
explicit PinYin(const std::string &pinyin_file_path);
118119
};
119120

120121
} // namespace simple_tokenizer

src/simple_tokenizer.cc

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,48 @@
33
#include <algorithm>
44
#include <cctype>
55
#include <cstdlib>
6+
#include <memory>
7+
#include <mutex>
68
#include <set>
79
#include <string>
810
#include <vector>
911

1012
namespace simple_tokenizer {
13+
namespace {
14+
// Locked by get_pinyin() and set_pinyin_dict() around every access to global_pinyin.
std::mutex pinyin_mutex;
15+
// Currently active pinyin mapping. Starts out null; get_pinyin() creates the default
// instance on first use and set_pinyin_dict() replaces it.
std::shared_ptr<PinYin> global_pinyin;
16+
}
17+
1118
// Constructs a tokenizer from the tokenizer arguments.
// When at least one argument is present, the first one is parsed with atoi():
// a non-zero value turns pinyin on, zero turns it off. With no arguments the
// member keeps its in-class default.
SimpleTokenizer::SimpleTokenizer(const char **azArg, int nArg) {
  if (nArg < 1) {
    return;
  }
  const int flag = atoi(azArg[0]);
  enable_pinyin = (flag != 0);
}
1623

17-
PinYin *SimpleTokenizer::get_pinyin() {
18-
static auto *py = new PinYin();
19-
return py;
24+
std::shared_ptr<PinYin> SimpleTokenizer::get_pinyin() {
25+
std::lock_guard<std::mutex> lock(pinyin_mutex);
26+
if (global_pinyin == nullptr) {
27+
global_pinyin = std::make_shared<PinYin>();
28+
}
29+
return global_pinyin;
30+
}
31+
32+
bool SimpleTokenizer::set_pinyin_dict(const std::string &pinyin_file_path, std::string &err) {
33+
std::shared_ptr<PinYin> new_pinyin;
34+
try {
35+
if (pinyin_file_path.empty()) {
36+
new_pinyin = std::make_shared<PinYin>();
37+
} else {
38+
new_pinyin = std::make_shared<PinYin>(pinyin_file_path);
39+
}
40+
} catch (const std::exception &e) {
41+
err = e.what();
42+
return false;
43+
}
44+
45+
std::lock_guard<std::mutex> lock(pinyin_mutex);
46+
global_pinyin = new_pinyin;
47+
return true;
2048
}
2149

2250
static TokenCategory from_char(char c) {
@@ -159,7 +187,8 @@ int SimpleTokenizer::tokenize(void *pCtx, int flags, const char *text, int textL
159187

160188
rc = xToken(pCtx, 0, result.c_str(), (int)result.length(), start, index);
161189
if (enable_pinyin && category == TokenCategory::OTHER && (flags & FTS5_TOKENIZE_DOCUMENT)) {
162-
const std::vector<std::string> &pys = SimpleTokenizer::get_pinyin()->get_pinyin(result);
190+
std::shared_ptr<PinYin> pinyin = SimpleTokenizer::get_pinyin();
191+
const std::vector<std::string> &pys = pinyin->get_pinyin(result);
163192
for (const std::string &s : pys) {
164193
rc = xToken(pCtx, FTS5_TOKEN_COLOCATED, s.c_str(), (int)s.length(), start, index);
165194
}

src/simple_tokenizer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@ enum class TokenCategory {
2626

2727
class SimpleTokenizer {
2828
private:
29-
static PinYin *get_pinyin();
29+
static std::shared_ptr<PinYin> get_pinyin();
3030
bool enable_pinyin = true;
3131

3232
public:
3333
SimpleTokenizer(const char **zaArg, int nArg);
3434
int tokenize(void *pCtx, int flags, const char *text, int textLen, xTokenFn xToken) const;
3535
static std::string tokenize_query(const char *text, int textLen, int flags = 1);
36+
static bool set_pinyin_dict(const std::string &pinyin_file_path, std::string &err);
3637
#ifdef USE_JIEBA
3738
static std::string tokenize_jieba_query(const char *text, int textLen, int flags = 1);
3839
#endif

test/pinyin_test.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#include "pinyin.h"
22

3+
#include <cstdio>
4+
#include <fstream>
5+
#include <stdexcept>
6+
37
#include "gtest/gtest.h"
48

59
using namespace simple_tokenizer;
@@ -15,3 +19,35 @@ TEST(simple, pinyin_split) {
1519
for (auto r : res) std::cout << r << "\t";
1620
std::cout << std::endl;
1721
}
22+
23+
// Loads a one-entry custom dictionary (with a header comment and a trailing comment)
// and checks that the mapped character yields its first letter and its plain spelling.
TEST(simple, pinyin_custom_file) {
  const std::string path = "simple_custom_pinyin_test.txt";

  // Fatal ASSERT_* macros return from the test body, so cleanup is tied to scope exit
  // rather than to a trailing std::remove() that a failed assertion would skip.
  struct FileRemover {
    const std::string &target;
    ~FileRemover() { std::remove(target.c_str()); }
  } cleanup{path};

  {
    std::ofstream file(path);
    ASSERT_TRUE(file.is_open());
    file << "# custom pinyin file\n";
    file << "U+6770: jié # trailing comment\n";
  }  // stream closed here, before the dictionary is read back

  PinYin pinyin(path);
  // Look up the character the fixture actually defines: U+6770 is 杰.
  auto res = pinyin.get_pinyin("杰");
  ASSERT_EQ(res.size(), 2u);
  ASSERT_EQ(res[0], "j");
  ASSERT_EQ(res[1], "jie");
}
38+
39+
// A dictionary whose only line does not follow the "U+XXXX: reading" layout must make
// the PinYin constructor throw std::runtime_error.
TEST(simple, pinyin_invalid_custom_file) {
  const std::string path = "simple_invalid_pinyin_test.txt";
  {
    std::ofstream out(path);
    ASSERT_TRUE(out.is_open());
    out << "invalid line\n";
  }  // leaving the scope closes the stream

  // The cast keeps this an expression; a bare `PinYin(path);` would declare a variable.
  EXPECT_THROW(static_cast<void>(PinYin(path)), std::runtime_error);
  std::remove(path.c_str());
}

0 commit comments

Comments
 (0)