From 6cc07d11de2d1e91a782e71dd73a6d2643d63027 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 25 Oct 2025 16:49:13 -0600 Subject: [PATCH 1/3] reg_mesg.t: Only one error per test This just fills out a couple of tests so that they don't prematurely end. That makes it clear that the error that does get shown isn't also due to other mistakes in the test. --- t/re/reg_mesg.t | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index b8b4ec6a3d8e..a7662e9ce0b3 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -176,7 +176,7 @@ my @death = '/(?/' => 'Sequence (? incomplete {#} m/(?{#}/', '/(?;x/' => 'Sequence (?;...) not recognized {#} m/(?;{#}x/', - '/(?<;x/' => 'Group name must start with a non-digit word character {#} m/(?<;{#}x/', + '/(?<;name>match)/' => 'Group name must start with a non-digit word character {#} m/(?<;{#}name>match)/', '/(?\ix/' => 'Sequence (?\...) not recognized {#} m/(?\{#}ix/', '/(?\mx/' => 'Sequence (?\...) not recognized {#} m/(?\{#}mx/', '/(?\:x/' => 'Sequence (?\...) not recognized {#} m/(?\{#}:x/', @@ -222,7 +222,7 @@ my @death = '/\g{-abc}/' => 'Group name must start with a non-digit word character {#} m/\g{-{#}abc}/', '/\g{1-1}/' => 'Sequence \g{... not terminated {#} m/\g{1{#}-1}/', '/\g{ -1 foo }/' => 'Sequence \g{... not terminated {#} m/\g{ -1 {#}foo }/', - '/(?<;x/' => 'Group name must start with a non-digit word character {#} m/(?<;{#}x/', + '/(?<;name>match)/' => 'Group name must start with a non-digit word character {#} m/(?<;{#}name>match)/', 'my $m = "\\\"; $m =~ $m', => 'Trailing \ in regex m/\/', @@ -310,7 +310,7 @@ my @death = 'm/(?&a/' => 'Sequence (?&... not terminated {#} m/(?&a{#}/', 'm/(?P=/' => 'Sequence ?P=... not terminated {#} m/(?P={#}/', "m/(?'/" => "Sequence (?'... not terminated {#} m/(?'{#}/", - "m/(? "Sequence (?<... not terminated {#} m/(?<{#}/", + "m/(? "Sequence (?<... not terminated {#} m/(? 'Sequence (?&... 
not terminated {#} m/(?&{#}/', 'm/(?( 'Sequence (?(<... not terminated {#} m/(?(<{#}/', "m/(?('/" => "Sequence (?('... not terminated {#} m/(?('{#}/", @@ -485,7 +485,7 @@ my @death_utf8 = mark_as_utf8( '/ネ(?/' => 'Sequence (? incomplete {#} m/ネ(?{#}/', '/ネ(?;ネ/' => 'Sequence (?;...) not recognized {#} m/ネ(?;{#}ネ/', - '/ネ(?<;ネ/' => 'Group name must start with a non-digit word character {#} m/ネ(?<;{#}ネ/', + '/ネ(?<;name>match)ネ/' => 'Group name must start with a non-digit word character {#} m/ネ(?<;{#}name>match)ネ/', '/ネ(?\ixネ/' => 'Sequence (?\...) not recognized {#} m/ネ(?\{#}ixネ/', '/ネ(?^lu:ネ)/' => 'Regexp modifiers "l" and "u" are mutually exclusive {#} m/ネ(?^lu{#}:ネ)/', '/ネ(?lil:ネ)/' => 'Regexp modifier "l" may not appear twice {#} m/ネ(?lil{#}:ネ)/', From ba0080604745130e108aa0cfdcb041950dbe3583 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 25 Oct 2025 09:21:55 -0600 Subject: [PATCH 2/3] perldiag: Update description for regex group names This was written before Unicode, and its wording does not accurately extend beyond ASCII. This commit clarifies the description. --- pod/perldiag.pod | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index e6250bd970ee..1c9eec84a13a 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2866,9 +2866,19 @@ has since been undefined. =item Group name must start with a non-digit word character in regex; marked by S<<-- HERE> in m/%s/ -(F) Group names must follow the rules for perl identifiers, meaning -they must start with a non-digit word character. A common cause of -this error is using (?&0) instead of (?0). See L<perlre>. +(F) Group names must follow the rules for Perl identifiers, meaning +they must start with a character that matches C<\p{XID_Start}> plus the +underscore. This means the first character may not be a digit. +Subsequent characters must match C<\p{XID_Continue}>. + +A common cause of this error is using (?&0) instead of (?0). 
+ +This message was formulated before Perl supported Unicode; so it is +not accurate for Unicode characters outside the ASCII-range. There are +many word characters in Unicode that may not start a group name, and a +few that may not be a continuation character. + +See L<perlre>. =item ()-group starts with a count From bdade0c2738adbbeb5aafaa4bf47083168afe079 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 25 Oct 2025 17:33:05 -0600 Subject: [PATCH 3/3] regcomp.c: Need to account for UTF group name I found this by reading the code. Prior to this commit, the parse pointer was advanced by one byte; it should be advanced by one character. As long as the character was ASCII, things worked. I looked through the regcomp.c source for other misuse of the macro changed by this commit; none were obvious. --- regcomp.c | 4 ++-- t/re/reg_mesg.t | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/regcomp.c b/regcomp.c index 774311f12dc5..c6a2e6ee0e14 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2533,8 +2533,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) && (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse, (U8 *) RExC_end))); } else { - RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending - character */ + /* so the <- from the vFAIL is after the offending character */ + RExC_parse_inc_safe(); vFAIL("Group name must start with a non-digit word character"); } sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index a7662e9ce0b3..676843fb1624 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -547,6 +547,7 @@ my @death_utf8 = mark_as_utf8( '/[\cネ]/' => "Character following \"\\c\" must be printable ASCII {#} m/[\\cネ{#}]/", '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/", '/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/", + '/ネ(?<‿name>match)ネ/; #no latin1' => 'Group name must start with a non-digit word character {#} 
m/ネ(?<‿{#}name>match)ネ/', ); push @death, @death_utf8;