Use valid_utf8_to_uv() consistently in core

khwilliamson · khwilliamson · commit 6a491f827f10 · 2025-09-01T07:49:14.000-06:00
valid_utf8_to_uv() is the new preferred synonym for valid_utf8_to_uvchr().
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
@@ -1788,7 +1788,7 @@ test_valid_utf8_to_uvchr(s)
          */
         RETVAL = newAV_mortal();
 
-        ret = valid_utf8_to_uvchr((U8*) SvPV_nolen(s), &retlen);
+        ret = valid_utf8_to_uv((U8*) SvPV_nolen(s), &retlen);
 
         /* Returns the return value in [0]; <retlen> in [1] */
         av_push_simple(RETVAL, newSVuv(ret));
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
@@ -883,12 +883,12 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     $ret_ref = test_valid_utf8_to_uvchr($bytes);
     is($ret_ref->[0], $n,
-                   "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
+                   "Verify valid_utf8_to_uv($display_bytes) returns $hex_n");
     is($ret_ref->[1], $len,
-       "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
+       "Verify valid_utf8_to_uv() for $hex_n returns expected length: $len");
 
     is(scalar @warnings, 0,
-               "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings")
+               "Verify valid_utf8_to_uv() for $hex_n generated no warnings")
       or output_warnings(@warnings);
 
     # Similarly for uvchr_to_utf8
diff --git a/mathoms.c b/mathoms.c
@@ -139,7 +139,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
     PERL_UNUSED_CONTEXT;
     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 
-    return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
+    return NATIVE_TO_UNI(valid_utf8_to_uv(s, retlen));
 }
 
 U8 *
diff --git a/op.c b/op.c
@@ -6585,7 +6585,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                 Size_t t_char_len;
 
                 /* Get the first character */
-                t_cp = valid_utf8_to_uvchr(t, &t_char_len);
+                t_cp = valid_utf8_to_uv(t, &t_char_len);
                 t += t_char_len;
 
                 /* If the next byte indicates that this wasn't the first
@@ -6596,7 +6596,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                 else { /* Otherwise, ignore the indicator byte, and get the
                           final element, and add the whole range */
                     t++;
-                    t_cp_end = valid_utf8_to_uvchr(t, &t_char_len);
+                    t_cp_end = valid_utf8_to_uv(t, &t_char_len);
                     t += t_char_len;
 
                     inverted_tlist = _add_range_to_invlist(inverted_tlist,
@@ -6781,7 +6781,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      * next code point is the next UTF-8 char in the input.  We
                      * know the input is valid, because the toker constructed
                      * it */
-                    t_cp = CP_ADJUST(valid_utf8_to_uvchr(t, &t_char_len));
+                    t_cp = CP_ADJUST(valid_utf8_to_uv(t, &t_char_len));
                     t += t_char_len;
 
                     /* UTF-8 strings (only) have been parsed in toke.c to have
@@ -6793,7 +6793,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                         && ! FORCE_RANGE_LEN_1(t_cp))
                     {
                         t++;
-                        t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
+                        t_range_count = valid_utf8_to_uv(t, &t_char_len)
                                       - t_cp + 1;
                         t += t_char_len;
                     }
@@ -6836,13 +6836,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                     else {
                         Size_t r_char_len;
 
-                        r_cp = CP_ADJUST(valid_utf8_to_uvchr(r, &r_char_len));
+                        r_cp = CP_ADJUST(valid_utf8_to_uv(r, &r_char_len));
                         r += r_char_len;
                         if (   r < rend && *r == RANGE_INDICATOR
                             && ! FORCE_RANGE_LEN_1(r_cp))
                         {
                             r++;
-                            r_range_count = valid_utf8_to_uvchr(r,
+                            r_range_count = valid_utf8_to_uv(r,
                                                     &r_char_len) - r_cp + 1;
                             r += r_char_len;
                         }
diff --git a/regcomp.c b/regcomp.c
@@ -5294,7 +5294,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 
             /* Convert from string to numeric code point */
             *code_point_p = (SvUTF8(value_sv))
-                            ? valid_utf8_to_uvchr(value, NULL)
+                            ? valid_utf8_to_uv(value, NULL)
                             : *value;
 
             /* Have parsed this entire single code point \N{...}.  *cp_count
@@ -15045,7 +15045,7 @@ S_parse_uniprop_string(pTHX_
                 goto failed;
             }
 
-            cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
+            cp = valid_utf8_to_uv((U8 *) SvPVX(character), &character_len);
             if (character_len == SvCUR(character)) {
                 prop_definition = add_cp_to_invlist(NULL, cp);
             }
@@ -15068,7 +15068,7 @@ S_parse_uniprop_string(pTHX_
                 av_push_simple(this_string, newSVuv(cp));
 
                 do {
-                    cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
+                    cp = valid_utf8_to_uv((U8 *) remaining, &character_len);
                     av_push_simple(this_string, newSVuv(cp));
                     remaining += character_len;
                 } while (remaining < SvEND(character));
diff --git a/regcomp_trie.c b/regcomp_trie.c
@@ -414,7 +414,7 @@ is the recommended Unicode-aware way of saying
     if ( UTF ) {                                                              \
         /* if it is UTF then it is either already folded, or does not need    \
          * folding */                                                         \
-        uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
+        uvc = valid_utf8_to_uv( (const U8*) uc, &len);                     \
     }                                                                         \
     else if (folder == PL_fold_latin1) {                                      \
         /* This folder implies Unicode rules, which in the range expressible  \
diff --git a/regexec.c b/regexec.c
@@ -4939,7 +4939,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
          * case.  We set 'multi_fold_from' to the single folded-from character,
          * which is handled in an extra iteration below */
         if (utf8_pat) {
-            folded = valid_utf8_to_uvchr(pat, NULL);
+            folded = valid_utf8_to_uv(pat, NULL);
             multi_fold_from
                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
         }
@@ -11836,7 +11836,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
         /* Here, isn't an ASCII digit.  Find the code point of the character */
         if (! UTF8_IS_INVARIANT(*s)) {
             Size_t len;
-            cp = valid_utf8_to_uvchr((U8 *) s, &len);
+            cp = valid_utf8_to_uv((U8 *) s, &len);
             s += len;
         }
         else {
diff --git a/toke.c b/toke.c
@@ -3325,8 +3325,8 @@ S_scan_const(pTHX_ char *start)
                     /* We know the utf8 is valid, because we just constructed
                      * it ourselves in previous loop iterations */
                     min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
-                    range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
-                    range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
+                    range_min = valid_utf8_to_uv( (U8*) min_ptr, NULL);
+                    range_max = valid_utf8_to_uv( (U8*) max_ptr, NULL);
 
                     /* This compensates for not all code setting
                      * 'has_above_latin1', so that we don't skip stuff that
@@ -11712,7 +11712,7 @@ Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int
         }
 
         close_delim_code = (UTF)
-                           ? valid_utf8_to_uvchr((U8 *) close_delim_str, NULL)
+                           ? valid_utf8_to_uv((U8 *) close_delim_str, NULL)
                            : * (U8 *) close_delim_str;
     }
     else {  /* Here, the delimiter isn't paired, hence the close is the same as
diff --git a/utf8.c b/utf8.c
@@ -4198,7 +4198,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
   bad_crossing:
 
     /* Failed, have to return the original */
-    original = valid_utf8_to_uvchr(p, lenp);
+    original = valid_utf8_to_uv(p, lenp);
 
     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
     ck_warner(packWARN(WARN_LOCALE),
@@ -4575,7 +4575,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
             while (s < send) {
                 if (isASCII(*s)) {
                     /* Crossed, have to return the original */
-                    original = valid_utf8_to_uvchr(p, lenp);
+                    original = valid_utf8_to_uv(p, lenp);
 
                     /* But in these instances, there is an alternative we can
                      * return that is valid */

Original file line number	Diff line number	Diff line change
`@@ -139,7 +139,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)`
`139`	`139`	`PERL_UNUSED_CONTEXT;`
`140`	`140`	`PERL_ARGS_ASSERT_UTF8_TO_UVUNI;`
`141`	`141`
`142`		`- return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));`
	`142`	`+ return NATIVE_TO_UNI(valid_utf8_to_uv(s, retlen));`
`143`	`143`	`}`
`144`	`144`
`145`	`145`	`U8 *`