@@ -2650,46 +2650,76 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
2650
2650
2651
2651
/*
2652
2652
=for apidoc utf8_hop_forward
2653
+ =for apidoc utf8_hop_forward_overshoot
2653
2654
2654
- Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2655
- forward. C<s> does not need to be pointing to the starting byte of a
2656
- character. If it isn't, one count of C<off> will be used up to get to the
2657
- start of the next character.
2655
+ These each take as input a position, C<s0>, into a string encoded as UTF-8
2656
+ which ends at the byte before C<end>, and return the position within it that is
2657
+ C<s0> displaced by up to C<off> characters forwards.
2658
2658
2659
- C<off> must be non-negative.
2659
+ If there are fewer than C<off> characters between C<s0> and C<end>, the
2660
+ functions return C<end>.
2660
2661
2661
- C<s> must be before or equal to C<end>. If after, the function panics.
2662
+ The functions differ in two ways:
2662
2663
2663
- When moving forward it will not move beyond C<end>.
2664
+ =over 4
2665
+
2666
+ =item *
2667
+
2668
+ C<utf8_hop_forward_overshoot> can return how many characters beyond the edge
2669
+ the request was for. When its parameter, C<&remaining>, is not NULL, the
2670
+ function stores into it the count of the excess; zero if the request was
2671
+ completely fulfilled. The actual number of characters that were displaced can
2672
+ then be calculated as S<C<off - remaining>>.
2673
+
2674
+ =item *
2675
+
2676
+ C<utf8_hop_forward> will panic if called with C<s0> already positioned at or
2677
+ beyond the edge of the string ending at C<end> and the request is to go even
2678
+ further over the edge. C<utf8_hop_forward_overshoot> presumes the caller will
2679
+ handle any errors, and just stores C<off> into C<remaining> without doing
2680
+ anything else.
2681
+
2682
+ =back
2683
+
2684
+ (The above contains a slight lie. When C<remaining> is NULL, the two functions
2685
+ act identically.)
2686
+
2687
+ C<s0> does not need to be pointing to the starting byte of a character. If it
2688
+ isn't, one count of C<off> will be used up to get to the start of the next
+ character.
2664
2689
2665
- Will not exceed this limit even if the string is not valid "UTF-8".
2690
+ C<off> must be non-negative, and if zero, no action is taken; C<s0> is returned
2691
+ unchanged.
2666
2692
2667
2693
=cut
2668
2694
*/
2695
+ /* Plain forward hop: the overshoot variant called with a NULL 'remaining',
+  * so no excess count is stored and a start at or beyond 'end' still panics.
+  * No whitespace may separate the macro name from '(' or the macro becomes
+  * object-like and the parameter list turns into replacement text. */
+ #define Perl_utf8_hop_forward(s, off, end) \
2696
+     Perl_utf8_hop_forward_overshoot(s, off, end, NULL)
2669
2697
2670
2698
PERL_STATIC_INLINE U8 *
2671
- Perl_utf8_hop_forward (const U8 * s , SSize_t off , const U8 * end )
2699
+ Perl_utf8_hop_forward_overshoot (const U8 * s , SSize_t off ,
2700
+ const U8 * const end , SSize_t * remaining )
2672
2701
{
2673
- PERL_ARGS_ASSERT_UTF8_HOP_FORWARD ;
2702
+ PERL_ARGS_ASSERT_UTF8_HOP_FORWARD_OVERSHOOT ;
2674
2703
assert (off >= 0 );
2675
2704
2676
2705
if (off != 0 ) {
2677
- if (UNLIKELY (s >= end )) {
2706
+ if (UNLIKELY (s >= end && ! remaining )) {
2678
2707
Perl_croak_nocontext ("panic: Start of forward hop (0x%p) is %zd"
2679
2708
" bytes beyond legal end position (0x%p)" ,
2680
2709
s , 1 + s - end , end );
2681
2710
}
2682
2711
2683
2712
if (UNLIKELY (UTF8_IS_CONTINUATION (* s ))) {
2684
- /* Get to next non-continuation byte */
2685
- do {
2713
+ do { /* Get to next non-continuation byte */
2714
+ if (! UTF8_IS_CONTINUATION (* s )) {
2715
+ off -- ;
2716
+ break ;
2717
+ }
2686
2718
s ++ ;
2687
- }
2688
- while (s < end && UTF8_IS_CONTINUATION (* s ));
2689
- off -- ;
2719
+ } while (s < end );
2690
2720
}
2691
2721
2692
- while (off -- && s < end ) {
2722
+ while (off > 0 && s < end ) {
2693
2723
STRLEN skip = UTF8SKIP (s );
2694
2724
2695
2725
/* Quit without counting this character if it overshoots the edge.
@@ -2698,10 +2728,16 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2698
2728
s = end ;
2699
2729
break ;
2700
2730
}
2731
+
2701
2732
s += skip ;
2733
+ off -- ;
2702
2734
}
2703
2735
}
2704
2736
2737
+ if (remaining ) {
2738
+ * remaining = off ;
2739
+ }
2740
+
2705
2741
GCC_DIAG_IGNORE (- Wcast - qual )
2706
2742
return (U8 * )s ;
2707
2743
GCC_DIAG_RESTORE
0 commit comments