Skip to content

Commit fd9026c

Browse files
tempstudio and qwaqrm authored
Add support for JA in runtime, and add C API for more portability (#303)
* add jpn and C API * simplify * Update README.md --------- Co-authored-by: qwaqrm <[email protected]>
1 parent 7fda17b commit fd9026c

File tree

7 files changed

+136
-1
lines changed

7 files changed

+136
-1
lines changed

runtime/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ $ cmake -B build -DCMAKE_BUILD_TYPE=Release
77
$ cmake --build build
88
```
99

10+
On Windows:
11+
``` bash
12+
$ cmake -DCMAKE_BUILD_TYPE=Release -B build -G "Visual Studio 17 2022" -DBUILD_SHARED_LIBS=0 -DCMAKE_CXX_FLAGS="/ZI"
13+
$ cmake --build build
14+
```
15+
1016
2. How to use
1117

1218
``` bash

runtime/processor/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,12 @@ else()
1111
target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
1212
endif()
1313
endif()
14+
15+
# ----------------------------------------------------------------------------
# C API shared library (wetext_processor_c)
# ----------------------------------------------------------------------------
# The SHARED keyword makes this target a shared library independently of
# BUILD_SHARED_LIBS.
add_library(wetext_processor_c SHARED
  wetext_processor_c_api.cc
)

# wetext_processor_c_api.h leaves WETEXT_API empty on Windows and relies on
# CMake exporting every symbol of this target. Request that explicitly on the
# target so the DLL exports the wetext_* functions even when
# CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS is not enabled globally. The property is
# ignored on non-Windows platforms.
set_target_properties(wetext_processor_c PROPERTIES
  WINDOWS_EXPORT_ALL_SYMBOLS ON
)

target_link_libraries(wetext_processor_c PUBLIC wetext_processor)

runtime/processor/wetext_processor.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ Processor::Processor(const std::string& tagger_path,
3030
parse_type_ = ParseType::kZH_ITN;
3131
} else if (tagger_path.find("en_tn_") != tagger_path.npos) {
3232
parse_type_ = ParseType::kEN_TN;
33+
} else if (tagger_path.find("ja_tn_") != tagger_path.npos) {
34+
parse_type_ = ParseType::kJA_TN;
3335
} else {
3436
LOG(FATAL) << "Invalid fst prefix, prefix should contain"
3537
<< " either \"_tn_\" or \"_itn_\".";
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#include "processor/wetext_processor_c_api.h"
2+
3+
#include <cstring>
4+
#include <memory>
5+
#include <string>
6+
#include <utility>
7+
8+
#include "processor/wetext_processor.h"
9+
10+
using wetext::Processor;
11+
12+
// Utility ------------------------------------------------------------------
13+
namespace {
14+
// Copies an std::string into a newly allocated C string that the caller must
15+
// free via wetext_free_string().
16+
const char* CopyToCString(const std::string& str) {
  // One extra byte so the terminating NUL provided by c_str() is copied too.
  const std::size_t byte_count = str.size() + 1;
  // Allocated with new[]; wetext_free_string() releases it with delete[].
  char* const buffer = new char[byte_count];
  return static_cast<const char*>(
      std::memcpy(buffer, str.c_str(), byte_count));
}
21+
} // namespace
22+
23+
// Public API ---------------------------------------------------------------
24+
25+
// Builds a wetext::Processor from the two paths and hands it out as an opaque
// handle. Returns nullptr when either path is null or when construction
// throws. The handle must be released with wetext_destroy_processor().
WetextProcessorHandle wetext_create_processor(const char* tagger_path,
                                              const char* verbalizer_path) {
  if (tagger_path == nullptr || verbalizer_path == nullptr) {
    return nullptr;
  }
  try {
    auto processor = std::make_unique<Processor>(tagger_path, verbalizer_path);
    // Ownership passes to the caller; wetext_destroy_processor() deletes it.
    return static_cast<WetextProcessorHandle>(processor.release());
  } catch (...) {
    // Exceptions must not leave a C entry point; report failure as nullptr.
    return nullptr;
  }
}
37+
38+
// Destroys a Processor previously returned by wetext_create_processor().
// A null handle is accepted and ignored.
void wetext_destroy_processor(WetextProcessorHandle handle) {
  // Deleting a null pointer is a no-op, so no explicit null check is needed.
  delete static_cast<Processor*>(handle);
}
43+
44+
#define WETEXT_RETURN_STRING(expr) \
45+
if (!handle || !input) return nullptr; \
46+
Processor* proc = static_cast<Processor*>(handle); \
47+
std::string result = (expr); \
48+
return CopyToCString(result);
49+
50+
const char* wetext_tag(WetextProcessorHandle handle, const char* input) {
51+
WETEXT_RETURN_STRING(proc->Tag(input));
52+
}
53+
54+
const char* wetext_verbalize(WetextProcessorHandle handle, const char* input) {
55+
WETEXT_RETURN_STRING(proc->Verbalize(input));
56+
}
57+
58+
const char* wetext_normalize(WetextProcessorHandle handle, const char* input) {
59+
WETEXT_RETURN_STRING(proc->Normalize(input));
60+
}
61+
62+
// Releases a string returned by wetext_tag(), wetext_verbalize() or
// wetext_normalize(). Passing a null pointer is allowed and does nothing.
void wetext_free_string(const char* str) {
  if (str == nullptr) {
    return;
  }
  // Matches the new[] used when the string was allocated.
  delete[] str;
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#ifndef WETEXT_PROCESSOR_C_API_H_
#define WETEXT_PROCESSOR_C_API_H_

#ifdef __cplusplus
extern "C" {
#endif

// --- Symbol visibility ------------------------------------------------------
#if defined(_WIN32) || defined(_WIN64)
// No dllexport/dllimport annotations are used on Windows: this header relies
// on the build exporting all symbols of the library
// (CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS), so WETEXT_API expands to nothing.
#define WETEXT_API
#else
// Elsewhere, mark the API functions as visible from the shared library.
#define WETEXT_API __attribute__((visibility("default")))
#endif

// Opaque handle standing in for a wetext::Processor C++ object.
typedef void* WetextProcessorHandle;

// --- Lifetime ---------------------------------------------------------------

// Creates a Processor from a tagger path and a verbalizer path.
// Returns NULL on failure (including when either path is NULL).
WETEXT_API WetextProcessorHandle wetext_create_processor(
    const char* tagger_path, const char* verbalizer_path);

// Destroys a handle obtained from wetext_create_processor().
// A NULL handle is ignored.
WETEXT_API void wetext_destroy_processor(WetextProcessorHandle handle);

// --- Processing -------------------------------------------------------------

// Each function returns a heap-allocated C string that the caller owns and
// must release with wetext_free_string(). NULL is returned when `handle` or
// `input` is NULL.
WETEXT_API const char* wetext_tag(WetextProcessorHandle handle,
                                  const char* input);
WETEXT_API const char* wetext_verbalize(WetextProcessorHandle handle,
                                        const char* input);
WETEXT_API const char* wetext_normalize(WetextProcessorHandle handle,
                                        const char* input);

// Releases a string returned by one of the processing functions above.
WETEXT_API void wetext_free_string(const char* str);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // WETEXT_PROCESSOR_C_API_H_

runtime/processor/wetext_token_parser.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ const std::unordered_map<std::string, std::vector<std::string>> ZH_TN_ORDERS = {
3232
{"measure", {"denominator", "numerator", "value"}},
3333
{"money", {"value", "currency"}},
3434
{"time", {"noon", "hour", "minute", "second"}}};
35+
// Order table that TokenParser selects for ParseType::kJA_TN.
// NOTE(review): keys look like token type names and values like the member
// names in their expected order -- confirm against the parser.
const std::unordered_map<std::string, std::vector<std::string>> JA_TN_ORDERS =
    {
        {"date", {"year", "month", "day"}},
        {"money", {"value", "currency"}},
};
38+
3539
// Order table that TokenParser selects for ParseType::kEN_TN.
// NOTE(review): keys look like token type names and values like the member
// names in their expected order -- confirm against the parser.
const std::unordered_map<std::string, std::vector<std::string>> EN_TN_ORDERS =
    {
        {"date", {"preserve_order", "text", "day", "month", "year"}},
        {"money",
         {"integer_part", "fractional_part", "quantity", "currency_maj"}},
};
@@ -49,6 +53,8 @@ TokenParser::TokenParser(ParseType type) {
4953
orders_ = ZH_ITN_ORDERS;
5054
} else if (type == ParseType::kEN_TN) {
5155
orders_ = EN_TN_ORDERS;
56+
} else if (type == ParseType::kJA_TN) {
57+
orders_ = JA_TN_ORDERS;
5258
} else {
5359
LOG(FATAL) << "Invalid order";
5460
}

runtime/processor/wetext_token_parser.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ struct Token {
6464
// Selects which language/direction a TokenParser (and Processor) handles.
enum ParseType {
  // Chinese Text Normalization
  kZH_TN = 0x00,
  // Chinese Inverse Text Normalization
  kZH_ITN = 0x01,
  // English Text Normalization
  kEN_TN = 0x02,
  // English Inverse Text Normalization (Unsupported)
  kEN_ITN = 0x03,
  // Japanese Text Normalization
  kJA_TN = 0x04
};
6971

7072
class TokenParser {

0 commit comments

Comments
 (0)