Skip to content

Commit 5a42d26

Browse files
jaynetics authored and k0kubun committed
Make word prop match join_control to conform to UTS 18
See <https://bugs.ruby-lang.org/issues/19417#note-3>.

<https://unicode.org/reports/tr18/#word> states that `word` should match join_control chars. It did not previously:

```ruby
[*0x0..0xD799, *0xE000..0x10FFFF].map { |n| n.chr 'utf-8' } => all_chars
all_chars.grep(/\p{join_control}/) => jc
jc.count # => 2
jc.grep(/\p{word}/).count # => 0
```

[Backport #19417]

---

Backporting note: I regenerated `enc/unicode/15.0.0/name2ctype.h` using `make update-unicode`.
1 parent fd036db commit 5a42d26

File tree

4 files changed

+14
-2
lines changed

4 files changed

+14
-2
lines changed

enc/unicode/15.0.0/name2ctype.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3846,7 +3846,7 @@ static const OnigCodePoint CR_XDigit[] = {
38463846

38473847
/* 'Word': [[:Word:]] */
38483848
static const OnigCodePoint CR_Word[] = {
3849-
770,
3849+
771,
38503850
0x0030, 0x0039,
38513851
0x0041, 0x005a,
38523852
0x005f, 0x005f,
@@ -4144,6 +4144,7 @@ static const OnigCodePoint CR_Word[] = {
41444144
0x1fe0, 0x1fec,
41454145
0x1ff2, 0x1ff4,
41464146
0x1ff6, 0x1ffc,
4147+
0x200c, 0x200d,
41474148
0x203f, 0x2040,
41484149
0x2054, 0x2054,
41494150
0x2071, 0x2071,

spec/ruby/language/regexp/character_classes_spec.rb

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -562,6 +562,13 @@
562562
"\u{16EE}".match(/[[:word:]]/).to_a.should == ["\u{16EE}"]
563563
end
564564

565+
ruby_bug "#19417", ""..."3.4.6" do
566+
it "matches Unicode join control characters with [[:word:]]" do
567+
"\u{200C}".match(/[[:word:]]/).to_a.should == ["\u{200C}"]
568+
"\u{200D}".match(/[[:word:]]/).to_a.should == ["\u{200D}"]
569+
end
570+
end
571+
565572
it "doesn't match Unicode No characters with [[:word:]]" do
566573
"\u{17F0}".match(/[[:word:]]/).should be_nil
567574
end

test/ruby/test_regexp.rb

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1296,6 +1296,9 @@ def test_posix_bracket
12961296
assert_match(/\A[[:space:]]+\z/, "\r\n\v\f\r\s\u0085")
12971297
assert_match(/\A[[:ascii:]]+\z/, "\x00\x7F")
12981298
assert_no_match(/[[:ascii:]]/, "\x80\xFF")
1299+
1300+
assert_match(/[[:word:]]/, "\u{200C}")
1301+
assert_match(/[[:word:]]/, "\u{200D}")
12991302
end
13001303

13011304
def test_cclass_R

tool/enc-unicode.rb

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,8 @@ def define_posix_props(data)
143143
data['Space'] = data['White_Space']
144144
data['Blank'] = data['Space_Separator'] + [0x0009]
145145
data['Cntrl'] = data['Cc']
146-
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
146+
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] +
147+
data['Connector_Punctuation'] + data['Join_Control']
147148
data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
148149
data['Surrogate'] - data['Unassigned']
149150
data['Print'] = data['Graph'] + data['Space_Separator']

0 commit comments

Comments
 (0)