@@ -2816,39 +2816,51 @@ Perl_utf8_hop_back_overshoot(const U8 *s, SSize_t off,
2816
2816
}
2817
2817
2818
2818
/*
2819
- =for apidoc utf8_hop_safe
2819
+ =for apidoc utf8_hop_safe
2820
+ =for apidoc_item utf8_hop_overshoot
2820
2821
2821
- Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2822
- either forward or backward. C<s> does not need to be pointing to the starting
2823
- byte of a character. If it isn't, one count of C<off> will be used up to get
2824
- to the start of the next character for forward hops, and to the start of the
2825
- current character for negative ones.
2822
+ These each take as input a string encoded as UTF-8 which starts at C<start>,
2823
+ ending at C<end>, and a position into it given by C<s>, and return the
2824
+ position within it that is C<s> displaced by up to C<off> characters, either
2825
+ forwards if C<off> is positive, or backwards if C<off> is negative. (Nothing
2826
+ is done if C<off> is 0.)
2827
+
2828
+ If there are fewer than C<off> characters between C<s> and the respective edge,
2829
+ the functions return that edge.
2826
2830
2827
- When moving backward it will not move before C<start>.
2831
+ The functions differ in that C<utf8_hop_overshoot> can return how many
2832
+ characters beyond the edge the request was for. When its parameter,
2833
+ C<&remaining>, is not NULL, the function stores into it the count of the
2834
+ excess; zero if the request was completely fulfilled. The actual number of
2835
+ characters that were displaced can then be calculated as S<C<off - remaining>>.
2836
+ This function acts identically to plain C<utf8_hop_safe> when this parameter is
2837
+ NULL.
2828
2838
2829
- When moving forward it will not move beyond C<end>.
2839
+ C<s> does not need to be pointing to the starting byte of a character. If it
2840
+ isn't, one count of C<off> will be used up to get to that start.
2830
2841
2831
- Will not exceed those limits even if the string is not valid "UTF-8".
2842
+ To be more precise, the displacement is by the absolute value of C<off>, and
2843
+ the excess count is the absolute value of C<remaining>.
2832
2844
2833
2845
=cut
2834
2846
*/
2835
2847
2848
+ /* utf8_hop_safe is utf8_hop_overshoot called with no 'remaining' out-parameter.
+  * The '(' must directly follow the macro name; with whitespace in between,
+  * the preprocessor defines an object-like macro instead of a function-like
+  * one, and every call site would expand incorrectly. */
+ #define Perl_utf8_hop_safe(s, o, b, e) Perl_utf8_hop_overshoot(s, o, b, e, NULL)
2849
+
2836
2850
/* Direction dispatcher for bounded hops: a non-negative off is handed to
 * utf8_hop_forward_overshoot (bounded by end), a negative off to
 * utf8_hop_back_overshoot (bounded by start).  The remaining pointer is
 * passed to the chosen helper unchanged; this function never dereferences
 * it itself. */
PERL_STATIC_INLINE U8 *
2837
- Perl_utf8_hop_safe (const U8 * s , SSize_t off , const U8 * start , const U8 * end )
2851
+ Perl_utf8_hop_overshoot (const U8 * s , SSize_t off ,
2852
+ const U8 * const start , const U8 * const end ,
2853
+ SSize_t * remaining )
2838
2854
{
2839
- PERL_ARGS_ASSERT_UTF8_HOP_SAFE ;
2840
-
2841
- /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2842
- * the bitops (especially ~) can create illegal UTF-8.
2843
- * In other words: in Perl UTF-8 is not just for Unicode. */
2855
+ PERL_ARGS_ASSERT_UTF8_HOP_OVERSHOOT ;
2844
2856
2845
2857
/* Caller contract: s must lie within [start, end] (both ends inclusive). */
assert (start <= s && s <= end );
2846
2858
2847
2859
/* off == 0 takes the forward branch. */
if (off >= 0 ) {
2848
- return utf8_hop_forward (s , off , end );
2860
+ return utf8_hop_forward_overshoot (s , off , end , remaining );
2849
2861
}
2850
2862
else {
2851
- return utf8_hop_back (s , off , start );
2863
+ return utf8_hop_back_overshoot (s , off , start , remaining );
2852
2864
}
2853
2865
}
2854
2866
0 commit comments