@@ -2708,28 +2708,48 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2708
2708
}
2709
2709
2710
2710
/*
2711
- =for apidoc utf8_hop_back
2712
-
2713
- Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2714
- backward. C<s> does not need to be pointing to the starting byte of a
2715
- character. If it isn't, one count of C<off> will be used up to get to that
2716
- start.
2717
-
2718
- C<off> must be non-positive.
2719
-
2720
- C<s> must be after or equal to C<start>.
2721
-
2722
- When moving backward it will not move before C<start>.
2723
-
2724
- Will not exceed this limit even if the string is not valid "UTF-8".
2711
+ =for apidoc utf8_hop_back
2712
+ =for apidoc_item utf8_hop_back_overshoot
2713
+
2714
+ These each take as input a string encoded as UTF-8 which starts at C<start>,
2715
+ and a position into it given by C<s>, and return the position within it that is
2716
+ C<s> displaced by up to C<off> characters backwards.
2717
+
2718
+ If there are fewer than C<off> characters between C<start> and C<s>, the
2719
+ functions return C<start>.
2720
+
2721
+ The functions differ in that C<utf8_hop_back_overshoot> can report by how many
2722
+ characters the request in C<off> went beyond the edge. When its parameter,
2723
+ C<remaining>, is not NULL, the function stores into it the count of the
2724
+ excess; zero if the request was completely fulfilled. The actual number of
2725
+ characters that were displaced can then be calculated as S<C<off - remaining>>.
2726
+ This function acts identically to plain C<utf8_hop_back> when this parameter is
2727
+ NULL.
2728
+
2729
+ C<s> does not need to be pointing to the starting byte of a character. If it
2730
+ isn't, one count of C<off> will be used up to get to that start.
2731
+
2732
+ C<off> must be non-positive, and if zero, no action is taken; C<s> is returned
2733
+ unchanged. That it otherwise must be negative means that the earlier
2734
+ description is a lie, to avoid burdening you with this detail too soon. An
2735
+ C<off> of C<-2> means to displace two characters backwards, so the displacement
2736
+ is actually the absolute value of C<off>. C<remaining> will also be
2737
+ non-positive. If there was only one character between C<start> and C<s>, and a
2738
+ displacement of C<-2> was requested, C<remaining> would be set to C<-1>. The
2739
+ subtraction formula still works, yielding C<-1>; that is, only one character
2740
+ was displaced backwards.
2725
2741
2726
2742
=cut
2727
2743
*/
2728
2744
2745
+ # define Perl_utf8_hop_back ( s , off , start ) \
2746
+ Perl_utf8_hop_back_overshoot(s, off, start, NULL)
2747
+
2729
2748
PERL_STATIC_INLINE U8 *
2730
- Perl_utf8_hop_back (const U8 * s , SSize_t off , const U8 * start )
2749
+ Perl_utf8_hop_back_overshoot (const U8 * s , SSize_t off ,
2750
+ const U8 * const start , SSize_t * remaining )
2731
2751
{
2732
- PERL_ARGS_ASSERT_UTF8_HOP_BACK ;
2752
+ PERL_ARGS_ASSERT_UTF8_HOP_BACK_OVERSHOOT ;
2733
2753
assert (start <= s );
2734
2754
assert (off <= 0 );
2735
2755
@@ -2740,10 +2760,18 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2740
2760
* moved is large, and core perl doesn't currently move more than a few
2741
2761
* characters at a time. You can reinstate it if it does become
2742
2762
* advantageous. */
2743
- while (off ++ && s > start ) {
2744
- do {
2763
+ while (off < 0 && s > start ) {
2764
+ do { /* Find the beginning of this character */
2745
2765
s -- ;
2746
- } while (s > start && UTF8_IS_CONTINUATION (* s ));
2766
+ if (! UTF8_IS_CONTINUATION (* s )) {
2767
+ off ++ ;
2768
+ break ;
2769
+ }
2770
+ } while (s > start );
2771
+ }
2772
+
2773
+ if (remaining ) {
2774
+ * remaining = off ;
2747
2775
}
2748
2776
2749
2777
GCC_DIAG_IGNORE (- Wcast - qual )
0 commit comments