Skip to content

Commit ed0f79e

Browse files
committed
Bug 1973862: Support locale tailoring in Intl.Segmenter. r=spidermonkey-reviewers,dminor
Call the new segmenter constructors from ICU4X 2.0 to get locale-dependent word and sentence segmentation.

Differential Revision: https://phabricator.services.mozilla.com/D255329
1 parent e2781c3 commit ed0f79e

File tree

4 files changed: 80 additions (+80) and 10 deletions (-10).

config/check_spidermonkey_style.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"icu4x/Date.hpp", # ICU4X
6969
"icu4x/GraphemeClusterSegmenter.hpp", # ICU4X
7070
"icu4x/IsoDate.hpp", # ICU4X
71+
"icu4x/Locale.hpp", # ICU4X
7172
"icu4x/SentenceSegmenter.hpp", # ICU4X
7273
"icu4x/WordSegmenter.hpp", # ICU4X
7374
"jit/ABIFunctionTypeGenerated.h", # generated in $OBJDIR"

js/src/builtin/intl/Segmenter.cpp

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818

1919
#include "builtin/Array.h"
2020
#include "builtin/intl/CommonFunctions.h"
21+
#include "builtin/intl/StringAsciiChars.h"
2122
#include "gc/AllocKind.h"
2223
#include "gc/GCContext.h"
2324
#include "icu4x/GraphemeClusterSegmenter.hpp"
25+
#include "icu4x/Locale.hpp"
2426
#include "icu4x/SentenceSegmenter.hpp"
2527
#include "icu4x/WordSegmenter.hpp"
2628
#include "js/CallArgs.h"
@@ -400,7 +402,7 @@ struct WordSegmenter {
400402
SegmenterBreakIteratorType<WordSegmenterBreakIteratorTwoByte>;
401403

402404
static constexpr auto& create =
403-
icu4x::capi::icu4x_WordSegmenter_create_auto_mv1;
405+
icu4x::capi::icu4x_WordSegmenter_create_auto_with_content_locale_mv1;
404406
static constexpr auto& destroy = icu4x::capi::icu4x_WordSegmenter_destroy_mv1;
405407
};
406408

@@ -444,19 +446,74 @@ struct SentenceSegmenter {
444446
SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorTwoByte>;
445447

446448
static constexpr auto& create =
447-
icu4x::capi::icu4x_SentenceSegmenter_create_mv1;
449+
icu4x::capi::icu4x_SentenceSegmenter_create_with_content_locale_mv1;
448450
static constexpr auto& destroy =
449451
icu4x::capi::icu4x_SentenceSegmenter_destroy_mv1;
450452
};
451453

454+
class ICU4XLocaleDeleter {
455+
public:
456+
void operator()(icu4x::capi::Locale* ptr) {
457+
icu4x::capi::icu4x_Locale_destroy_mv1(ptr);
458+
}
459+
};
460+
461+
using UniqueICU4XLocale =
462+
mozilla::UniquePtr<icu4x::capi::Locale, ICU4XLocaleDeleter>;
463+
464+
static UniqueICU4XLocale CreateICU4XLocale(JSContext* cx,
465+
Handle<JSString*> str) {
466+
auto* linear = str->ensureLinear(cx);
467+
if (!linear) {
468+
return nullptr;
469+
}
470+
471+
icu4x::capi::icu4x_Locale_from_string_mv1_result result{};
472+
{
473+
intl::StringAsciiChars chars(linear);
474+
if (!chars.init(cx)) {
475+
return nullptr;
476+
}
477+
478+
auto span = static_cast<mozilla::Span<const char>>(chars);
479+
result =
480+
icu4x::capi::icu4x_Locale_from_string_mv1({span.data(), span.size()});
481+
}
482+
483+
if (!result.is_ok) {
484+
intl::ReportInternalError(cx);
485+
return nullptr;
486+
}
487+
return UniqueICU4XLocale{result.ok};
488+
}
489+
452490
/**
453-
* Create a new ICU4X segmenter instance.
491+
* Create a new, locale-invariant ICU4X segmenter instance.
454492
*/
455493
template <typename Interface>
456494
static typename Interface::Segmenter* CreateSegmenter() {
457495
return Interface::create();
458496
}
459497

498+
/**
499+
* Create a new ICU4X segmenter instance, tailored for |locale|.
500+
*/
501+
template <typename Interface>
502+
static typename Interface::Segmenter* CreateSegmenter(
503+
JSContext* cx, Handle<JSString*> locale) {
504+
auto loc = CreateICU4XLocale(cx, locale);
505+
if (!loc) {
506+
return nullptr;
507+
}
508+
509+
auto result = Interface::create(loc.get());
510+
if (!result.is_ok) {
511+
intl::ReportInternalError(cx);
512+
return nullptr;
513+
}
514+
return result.ok;
515+
}
516+
460517
static bool EnsureInternalsResolved(JSContext* cx,
461518
Handle<SegmenterObject*> segmenter) {
462519
if (segmenter->getLocale()) {
@@ -506,15 +563,15 @@ static bool EnsureInternalsResolved(JSContext* cx,
506563
break;
507564
}
508565
case SegmenterGranularity::Word: {
509-
auto* seg = CreateSegmenter<WordSegmenter>();
566+
auto* seg = CreateSegmenter<WordSegmenter>(cx, locale);
510567
if (!seg) {
511568
return false;
512569
}
513570
segmenter->setSegmenter(seg);
514571
break;
515572
}
516573
case SegmenterGranularity::Sentence: {
517-
auto* seg = CreateSegmenter<SentenceSegmenter>();
574+
auto* seg = CreateSegmenter<SentenceSegmenter>(cx, locale);
518575
if (!seg) {
519576
return false;
520577
}

js/src/tests/non262/Intl/Segmenter/sentence-latin.js

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,8 @@ for (let [string, words] of Object.entries(strings)) {
8585
// A single sentence in English.
8686
assertEq([...english.segment(string)].length, 1);
8787

88-
// ICU4C: Two sentences in Greek.
89-
// assertEq([...greek.segment(string)].length, 2);
90-
91-
// ICU4X: A single sentence in Greek.
92-
assertEq([...greek.segment(string)].length, 1);
88+
// Two sentences in Greek.
89+
assertEq([...greek.segment(string)].length, 2);
9390
}
9491

9592
if (typeof reportCompare === "function")

js/src/tests/non262/Intl/Segmenter/word-latin1.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,5 +206,20 @@ assertSegments("_'_", ["_", "'", "_"]);
206206
assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
207207
assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");
208208

209+
// Locale-dependent word segmentation.
210+
{
211+
// https://en.wikipedia.org/wiki/Colon_(punctuation)#Abbreviation_mark
212+
let string = "Word:with:colon";
213+
214+
let english = new Intl.Segmenter("en", {granularity: "word"});
215+
let svenska = new Intl.Segmenter("sv", {granularity: "word"});
216+
217+
// Three words with two separators in English.
218+
assertEq([...english.segment(string)].length, 5);
219+
220+
// A single word in Swedish.
221+
assertEq([...svenska.segment(string)].length, 1);
222+
}
223+
209224
if (typeof reportCompare === "function")
210225
reportCompare(0, 0);

0 commit comments

Comments
 (0)