|
5 | 5 | * This source code is licensed under the BSD-style license found in the
|
6 | 6 | * LICENSE file in the root directory of this source tree.
|
7 | 7 | */
|
8 |
| -// A weak symbol for create_regex, only using RE2 regex library. |
| 8 | +// Default implementation for create_regex, only using RE2 regex library. |
9 | 9 | // regex_lookahead.cpp has the implementation of create_regex with lookahead
|
10 | 10 | // support, backed by PCRE2 and std::regex.
|
11 | 11 |
|
12 | 12 | #include <pytorch/tokenizers/re2_regex.h>
|
13 | 13 | #include <pytorch/tokenizers/regex.h>
|
14 | 14 |
|
15 |
| -#include <iostream> |
16 |
| - |
17 | 15 | namespace tokenizers {
|
18 | 16 |
|
| 17 | +// Default implementation that returns failure |
| 18 | +static Result<std::unique_ptr<IRegex>> default_create_fallback_regex( |
| 19 | + const std::string& pattern) { |
| 20 | + (void)pattern; |
| 21 | + return tokenizers::Error::RegexFailure; |
| 22 | +} |
| 23 | + |
| 24 | +FallbackRegexFn fallback_regex = default_create_fallback_regex; |
| 25 | + |
| 26 | +bool register_override_fallback_regex(FallbackRegexFn fn) { |
| 27 | + TK_LOG(Info, "Registering override fallback regex"); |
| 28 | + fallback_regex = fn; |
| 29 | + return true; |
| 30 | +} |
| 31 | + |
| 32 | +FallbackRegexFn get_fallback_regex() { |
| 33 | + return fallback_regex; |
| 34 | +} |
| 35 | + |
19 | 36 | Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
|
20 | 37 | // Try RE2 first
|
21 |
| - auto re2 = std::make_unique<Re2Regex>("(" + pattern + ")"); |
| 38 | + auto re2 = std::make_unique<Re2Regex>(); |
| 39 | + auto err = re2->compile("(" + pattern + ")"); |
22 | 40 |
|
23 |
| - if (re2->regex_->ok()) { |
| 41 | + if (err == Error::Ok) { |
24 | 42 | return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
|
25 | 43 | }
|
26 | 44 |
|
27 |
| - std::cerr << "RE2 failed to compile pattern: " << pattern << "\n"; |
28 |
| - std::cerr << "Error: " << (re2->regex_->error()) << std::endl; |
29 |
| - |
30 |
| - if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) { |
31 |
| - auto res = create_fallback_regex(pattern); |
32 |
| - if (!res.ok()) { |
33 |
| - std::cerr |
34 |
| - << "RE2 doesn't support lookahead patterns. " |
35 |
| - << "Link with the lookahead-enabled version of this library to enable support." |
36 |
| - << std::endl; |
37 |
| - } else { |
38 |
| - return res; |
39 |
| - } |
| 45 | + auto res = get_fallback_regex()(pattern); |
| 46 | + if (!res.ok()) { |
| 47 | + TK_LOG( |
| 48 | + Error, |
| 49 | + "RE2 doesn't support lookahead patterns. Link with `regex_lookahead` to enable support."); |
| 50 | + } else { |
| 51 | + return res; |
40 | 52 | }
|
41 | 53 |
|
42 | 54 | return tokenizers::Error::RegexFailure;
|
43 | 55 | }
|
44 |
| - |
45 |
| -#ifdef _MSC_VER |
46 |
| -#pragma weak create_fallback_regex |
47 |
| -#endif // _MSC_VER |
48 |
| -Result<std::unique_ptr<IRegex>> create_fallback_regex( |
49 |
| - const std::string& pattern) { |
50 |
| - (void)pattern; |
51 |
| - return tokenizers::Error::RegexFailure; |
52 |
| -} |
53 |
| - |
54 | 56 | } // namespace tokenizers
|
0 commit comments