11#include " pinyin.h"
22
33#include < cmrc/cmrc.hpp>
4+ #include < fstream>
45#include < map>
56#include < regex>
67#include < set>
@@ -12,7 +13,9 @@ CMRC_DECLARE(pinyin_text);
1213
1314namespace simple_tokenizer {
1415
15- PinYin::PinYin () { pinyin = build_pinyin_map (); }
16+ PinYin::PinYin () : PinYin(" " ) {}
17+
18+ PinYin::PinYin (const std::string &pinyin_file_path) { pinyin = build_pinyin_map (pinyin_file_path); }
1619
1720std::set<std::string> PinYin::to_plain (const std::string &input) {
1821 std::set<std::string> s;
@@ -49,21 +52,45 @@ std::set<std::string> PinYin::to_plain(const std::string &input) {
4952}
5053
5154// clang-format off
52- std::map<int , std::vector<std::string> > PinYin::build_pinyin_map () {
55+ std::map<int , std::vector<std::string> > PinYin::build_pinyin_map (const std::string &pinyin_file_path ) {
5356 std::map<int , std::vector<std::string> > map;
5457 // clang-format on
55- auto fs = cmrc::pinyin_text::get_filesystem ();
56- auto pinyin_data = fs.open (" contrib/pinyin.txt" );
57- std::istringstream pinyin_file (std::string (pinyin_data.begin (), pinyin_data.end ()));
58+ std::istringstream embedded_pinyin_file;
59+ std::ifstream custom_pinyin_file;
60+ std::istream *pinyin_file = nullptr ;
61+ if (pinyin_file_path.empty ()) {
62+ auto fs = cmrc::pinyin_text::get_filesystem ();
63+ auto pinyin_data = fs.open (" contrib/pinyin.txt" );
64+ embedded_pinyin_file = std::istringstream (std::string (pinyin_data.begin (), pinyin_data.end ()));
65+ pinyin_file = &embedded_pinyin_file;
66+ } else {
67+ custom_pinyin_file.open (pinyin_file_path);
68+ if (!custom_pinyin_file.is_open ()) {
69+ throw std::runtime_error (" failed to open pinyin file: " + pinyin_file_path);
70+ }
71+ pinyin_file = &custom_pinyin_file;
72+ }
5873 std::string line;
5974 char delimiter = ' ' ;
6075 std::string cp, py;
61- while (std::getline (pinyin_file, line)) {
76+ int line_no = 0 ;
77+ while (std::getline (*pinyin_file, line)) {
78+ ++line_no;
6279 if (line.length () == 0 || line[0 ] == ' #' ) continue ;
6380 std::stringstream tokenStream (line);
6481 std::getline (tokenStream, cp, delimiter);
6582 std::getline (tokenStream, py, delimiter);
66- int codepoint = static_cast <int >(std::stoul (cp.substr (2 , cp.length () - 3 ), 0 , 16l ));
83+ if (cp.length () < 4 || cp.rfind (" U+" , 0 ) != 0 || cp.back () != ' :' || py.empty ()) {
84+ throw std::runtime_error (" invalid pinyin format at line " + std::to_string (line_no));
85+ }
86+
87+ int codepoint = 0 ;
88+ try {
89+ codepoint = static_cast <int >(std::stoul (cp.substr (2 , cp.length () - 3 ), 0 , 16l ));
90+ } catch (const std::exception &) {
91+ throw std::runtime_error (" invalid pinyin codepoint at line " + std::to_string (line_no));
92+ }
93+
6794 std::set<std::string> s = to_plain (py);
6895 std::vector<std::string> m (s.size ());
6996 std::copy (s.begin (), s.end (), m.begin ());
0 commit comments