Skip to content

Commit 99457f8

Browse files
committed
Support Unicode 17.0
This adds full support for this latest version of Unicode. What was essentially missing was updating the rules for the break properties, like \b{wb}. This is always a pain, but the changes made for 15.1 and 16.0 made it much easier.
1 parent c028ec7 commit 99457f8

File tree

9 files changed

+160
-157
lines changed

9 files changed

+160
-157
lines changed

charclass_invlists.inc

Lines changed: 112 additions & 113 deletions
Large diffs are not rendered by default.

lib/unicore/mktables

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ sub NON_ASCII_PLATFORM { ord("A") != 65 }
5050
# expected, a warning will be generated. If an older version is being
5151
# compiled, any bounds tests that fail in the generated test file (-maketest
5252
# option) will be marked as TODO.
53-
my $version_of_mk_invlist_bounds = v16.0.0;
53+
my $version_of_mk_invlist_bounds = v17.0.0;
5454

5555
##########################################################################
5656
#

lib/unicore/uni_keywords.pl

Lines changed: 6 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pod/perldelta.pod

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ here, but most should go in the L</Performance Enhancements> section.
2727

2828
[ List each enhancement as a =head2 entry ]
2929

30+
=head2 Unicode 17.0 is supported
31+
32+
See L<https://www.unicode.org/versions/Unicode17.0.0/>.
33+
3034
=head1 Security
3135

3236
XXX Any security-related notices go here. In particular, any security

regcharclass.h

Lines changed: 5 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

regen/mk_invlists.pl

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2499,12 +2499,12 @@ ()
24992499
match_return => 'LB_NOBREAK',
25002500
rule => '19a',
25012501
},
2502-
LB_various_then_HY_or_HH_v_AL => {
2502+
LB_various_then_HY_or_HH_v_AL_or_HL => {
25032503
enum => $lb_enum++,
25042504
match_return => 'LB_NOBREAK',
25052505
rule => '20a',
25062506
},
2507-
LB_HL_then_HY_or_BA_sans_EA_v_nonHL => {
2507+
LB_HL_then_HY_or_HH_v_nonHL => {
25082508
enum => $lb_enum++,
25092509
match_return => 'LB_NOBREAK',
25102510
rule => '21a',
@@ -2693,7 +2693,7 @@ ()
26932693
# LB12a Do not break before NBSP and related characters, except after
26942694
# spaces and hyphens.
26952695
# [^SP BA HY HH] × GL
2696-
set_lb_nobreak([ qw(^ SP BA HY) ], 'GL', '12a');
2696+
set_lb_nobreak([ qw(^ SP BA HY HH) ], 'GL', '12a');
26972697

26982698
# LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
26992699
# tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
@@ -2800,25 +2800,28 @@ ()
28002800
# LB20a Do not break after a word-initial hyphen.
28012801
# ( sot | BK | CR | LF | NL | SP | ZW | CB | GL )
28022802
# ( HY | HH )
2803-
# × AL
2804-
$dfa = 'LB_various_then_HY_or_HH_v_AL';
2805-
add_lb_dfa($_, 'AL', $dfa, '20a') for qw(HY HH);
2803+
# × ( AL | HL )
2804+
$dfa = 'LB_various_then_HY_or_HH_v_AL_or_HL';
2805+
for $lhs (qw(HY HH)) {
2806+
add_lb_dfa($lhs, $_, $dfa, '20a') for qw(AL HL);
2807+
}
28062808

28072809
# LB21 Do not break before hyphen-minus, other hyphens, fixed-width
28082810
# spaces, small kana, and other non-starters, or after acute accents.
28092811
# × BA
2812+
# × HH
28102813
# × HY
28112814
# × NS
28122815
$rule = 21;
2813-
set_lb_nobreak('*', $_, $rule) for qw(BA HY NS);
2816+
set_lb_nobreak('*', $_, $rule) for qw(BA HH HY NS);
28142817
# BB ×
28152818
set_lb_nobreak('BB', '*', $rule);
28162819

2817-
# LB21a Don't break after Hebrew + HY.
2818-
# HL (HY | [ BA - $EastAsian ]) × [^HL]
2820+
# LB21a Don't break after the hyphen in Hebrew + Hyphen + non-Hebrew.
2821+
# HL (HY | HH) × [^HL]
28192822
$rule = '21a';
2820-
$dfa = 'LB_HL_then_HY_or_BA_sans_EA_v_nonHL';
2821-
add_lb_dfa($_, [ qw(^ HL) ], $dfa, $rule) for qw(HY BA_sans_EA);
2823+
$dfa = 'LB_HL_then_HY_or_HH_v_nonHL';
2824+
add_lb_dfa($_, [ qw(^ HL) ], $dfa, $rule) for qw(HY HH);
28222825

28232826
# LB21b Don’t break between Solidus and Hebrew letters.
28242827
# SY × HL

regexec.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5605,11 +5605,11 @@ S_isLB(pTHX_ LB_enum before,
56055605
|| isLB_CP(prev);
56065606
break;
56075607

5608-
case LB_various_then_HY_or_HH_v_AL:
5608+
case LB_various_then_HY_or_HH_v_AL_or_HL:
56095609
/* LB20a Do not break after a word-initial hyphen.
56105610
* ( sot | BK | CR | LF | NL | SP | ZW | CB | GL )
56115611
* ( HY | HH )
5612-
* × AL */
5612+
* × ( AL | HL ) */
56135613
prev = backup_one_LB_but_over_CM_ZWJ(strbeg, &prev_pos,
56145614
utf8_target);
56155615
matched = ( isLB_EDGE(prev)
@@ -5637,9 +5637,10 @@ S_isLB(pTHX_ LB_enum before,
56375637
matched = isLB_B2(prev);
56385638
break;
56395639

5640-
case LB_HL_then_HY_or_BA_sans_EA_v_nonHL:
5641-
/* LB21a Don't break after Hebrew + HY.
5642-
* HL (HY | [ BA - $EastAsian ]) × [^HL] */
5640+
case LB_HL_then_HY_or_HH_v_nonHL:
5641+
/* LB21a Do not break after the hyphen in Hebrew + Hyphen +
5642+
* non-Hebrew.
5643+
* HL (HY | HH ) × [^HL] */
56435644
matched = isLB_HL(backup_one_LB_but_over_CM_ZWJ(strbeg, &prev_pos,
56445645
utf8_target));
56455646
break;

regexp_constants.h

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
* a2f16fb873ab4fcdf3221cb1a8a85a134ddd6ed03603181823ff5206af3741ce lib/unicore/BidiMirroring.txt
3636
* c0edefaf1a19771e830a82735472716af6bf3c3975f6c2a23ffbe2580fbbcb15 lib/unicore/Blocks.txt
3737
* ff8d8fefbf123574205085d6714c36149eb946d717a0c585c27f0f4ef58c4183 lib/unicore/CaseFolding.txt
38-
* 2f239196ef3b5b61db5cc476e9bd80f534d15aa1b74e1be1dea5d042a344c85f lib/unicore/CompositionExclusions.txt
38+
* 5e6e9c8f8e76561da04cb1703a9306c63707be2ed8ff2eb12cd3a942368a6f72 lib/unicore/CompositionExclusions.txt
3939
* f8ecdf768bdc210f201abd271d9bc587825618a86a7046a8146cc816393f1998 lib/unicore/DAge.txt
4040
* 24c7fed1195c482faaefd5c1e7eb821c5ee1fb6de07ecdbaa64b56a99da22c08 lib/unicore/DCoreProperties.txt
4141
* 71fd6a206a2c0cdd41feb6b7f656aa31091db45e9cedc926985d718397f9e488 lib/unicore/DNormalizationProps.txt
@@ -49,16 +49,15 @@
4949
* 17bb07f5e37f995351ddcef393c04464a9e3891ed0c0bd56a03a5c5e400a6326 lib/unicore/Jamo.txt
5050
* e6a18fa91f8f6a6f8e534b1d3f128c21ada45bfe152eb6b1bcc5e15fd8ac92e6 lib/unicore/LineBreak.txt
5151
* 793f6f1e4d15fd90f05ae66460191dc4d75d1fea90136a25f30dd6a4cb950eac lib/unicore/NameAliases.txt
52-
* 0cd1c928edac72ea1369187599727c4f0d92367316b4c5ee5140475ab9298be0 lib/unicore/NamedSequences.txt
53-
* 5019ffd530751a741900c849c0e010332f142a3612234639bd200b82138a87db lib/unicore/NormTest.txt
52+
* 360dac27d5abafdcd8c03a8597f47acf92e8ebf7f6ee28020c173ed8b2ed0cc5 lib/unicore/NamedSequences.txt
5453
* 130dcddcaadaf071008bdfce1e7743e04fdfbc910886f017d9f9ac931d8c64dd lib/unicore/PropList.txt
55-
* 64e9a5f76f7a1e8b5a47d6a1f9a26522a251208f5276bdfa1559dac7cf2e827a lib/unicore/PropValueAliases.txt
54+
* 670d2bebb48649c04fabfbf033308073dcff47946324a8033237254c048b3b01 lib/unicore/PropValueAliases.txt
5655
* 4441f573caf952ffece1d7c892e7715bd7136dfc26f96eb6f268bf1e474715fb lib/unicore/PropertyAliases.txt
5756
* ec2107e58825a1586acee8e0911ce18260394ac8b87e535ca325f1ccbeb06bc6 lib/unicore/ScriptExtensions.txt
5857
* 9f5e50d3abaee7d6ce09480f325c706f485ae3240912527e651954d2d6b035bf lib/unicore/Scripts.txt
5958
* efc25faf19de21b92c1194c111c932e03d2a5eaf18194e33f1156e96de4c9588 lib/unicore/SpecialCasing.txt
6059
* 2e1efc1dcb59c575eedf5ccae60f95229f706ee6d031835247d843c11d96470c lib/unicore/UnicodeData.txt
61-
* a7b46c19e24355257030b73be046c71b172595ec4a106867d3f988e3a4007208 lib/unicore/Unikemet.txt
60+
* 76a3081265e6eb673873f9c93d6f36062e82c7ed027c5c1a592accfbe48c20a5 lib/unicore/Unikemet.txt
6261
* dcef09c3fb24d356b042569c328ec341efc5b53447700d799f2fb4834c3cd3cd lib/unicore/VerticalOrientation.txt
6362
* e2d134d2c52919bace503ebb6a551c1855fe1a1faec18478c78fff254a1793ec lib/unicore/auxiliary/GCBTest.txt
6463
* d6b51d1d2ae5c33b451b7ed994b48f1f4dc62b2272a5831e7fd418514a6bae89 lib/unicore/auxiliary/GraphemeBreakProperty.txt
@@ -79,9 +78,9 @@
7978
* dad3ef492d198d6f1dde4922b175f7371a27dfe62fce489f3e04807015a4c682 lib/unicore/extracted/DLineBreak.txt
8079
* 7c83684d3336b698381745b78a971c3e1242cb3fcac58604469086c19b6edcee lib/unicore/extracted/DNumType.txt
8180
* 139b976bdc288be01c80f018523da769cf2845109b5a7f0f8a432db64bfedcfa lib/unicore/extracted/DNumValues.txt
82-
* a09a6a34898bdbe7bd387837f0cad2e4a035cd5340d0c12c03b4e66b0ccdb7a6 lib/unicore/mktables
81+
* 03640d8ad18fc65de766f2034a927f7442960e998d3243845ca9b9fe31bfe1ab lib/unicore/mktables
8382
* 8c30575264b2772c7a69c5bb6069a28f0e0a7a0df735871bde2d99ee674316ac lib/unicore/version
8483
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
8584
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
86-
* ec2bea650f1338fcd1f2f5b0e589f89e653c180b3994468788fd343a2869ced7 regen/mk_invlists.pl
85+
* 33a5e583d836b8bb97b6b1d1c6d1766defe4cbdeb8bcbe865d76f71206762be9 regen/mk_invlists.pl
8786
* ex: set ro ft=c: */

uni_keywords.h

Lines changed: 6 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)