Skip to content

Commit 9b82965

Browse files
committed
Add utf8_hop_overshoot()
This is like utf8_hop_safe(), but it can also return the number of characters by which the request would have overshot the edge, had it been allowed to go beyond that edge.
1 parent 4334805 commit 9b82965

File tree

4 files changed

+45
-22
lines changed

4 files changed

+45
-22
lines changed

embed.fnc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3630,10 +3630,16 @@ ARTdip |U8 * |utf8_hop_forward_overshoot \
36303630
|SSize_t off \
36313631
|NN const U8 * const end \
36323632
|NULLOK SSize_t *remaining
3633-
ARTdip |U8 * |utf8_hop_safe |NN const U8 *s \
3633+
ARTdip |U8 * |utf8_hop_overshoot \
3634+
|NN const U8 *s \
36343635
|SSize_t off \
3635-
|NN const U8 *start \
3636-
|NN const U8 *end
3636+
|NN const U8 * const start \
3637+
|NN const U8 * const end \
3638+
|NULLOK SSize_t *remaining
3639+
ARTdmp |U8 * |utf8_hop_safe |NN const U8 *s \
3640+
|SSize_t off \
3641+
|NN const U8 * const start \
3642+
|NN const U8 * const end
36373643
ARdp |STRLEN |utf8_length |NN const U8 *s0 \
36383644
|NN const U8 *e
36393645

embed.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,7 @@
790790
# define utf8_hop_back_overshoot Perl_utf8_hop_back_overshoot
791791
# define utf8_hop_forward Perl_utf8_hop_forward
792792
# define utf8_hop_forward_overshoot Perl_utf8_hop_forward_overshoot
793+
# define utf8_hop_overshoot Perl_utf8_hop_overshoot
793794
# define utf8_hop_safe Perl_utf8_hop_safe
794795
# define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
795796
# define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)

inline.h

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2816,39 +2816,51 @@ Perl_utf8_hop_back_overshoot(const U8 *s, SSize_t off,
28162816
}
28172817

28182818
/*
2819-
=for apidoc utf8_hop_safe
2819+
=for apidoc utf8_hop_safe
2820+
=for apidoc_item utf8_hop_overshoot
28202821
2821-
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2822-
either forward or backward. C<s> does not need to be pointing to the starting
2823-
byte of a character. If it isn't, one count of C<off> will be used up to get
2824-
to the start of the next character for forward hops, and to the start of the
2825-
current character for negative ones.
2822+
These each take as input a string encoded as UTF-8 which starts at C<start>,
2823+
ending at C<end>, and a position into it given by C<s>, and return the
2824+
position within it that is C<s> displaced by up to C<off> characters, either
2825+
forwards if C<off> is positive, or backwards if C<off> is negative. (Nothing
2826+
is done if C<off> is 0.)
2827+
2828+
If there are fewer than C<off> characters between C<s> and the respective edge,
2829+
the functions return that edge.
28262830
2827-
When moving backward it will not move before C<start>.
2831+
The functions differ in that C<utf8_hop_overshoot> can return how many
2832+
characters beyond the edge the request was for. When its parameter,
2833+
C<&remaining>, is not NULL, the function stores into it the count of the
2834+
excess; zero if the request was completely fulfilled. The actual number of
2835+
characters that were displaced can then be calculated as S<C<off - remaining>>.
2836+
This function acts identically to plain C<utf8_hop_safe> when this parameter is
2837+
NULL.
28282838
2829-
When moving forward it will not move beyond C<end>.
2839+
C<s> does not need to be pointing to the starting byte of a character. If it
2840+
isn't, one count of C<off> will be used up to get to the start of the next character for forward hops, and to the start of the current character for backward ones.
28302841
2831-
Will not exceed those limits even if the string is not valid "UTF-8".
2842+
To be more precise, the displacement is by the absolute value of C<off>, and
2843+
the excess count is the absolute value of C<remaining>.
28322844
28332845
=cut
28342846
*/
28352847

2848+
/* utf8_hop_safe() is utf8_hop_overshoot() with no overshoot count requested:
 * a NULL 'remaining' pointer tells the callee not to report the excess. */
#define Perl_utf8_hop_safe(s, o, b, e) Perl_utf8_hop_overshoot(s, o, b, e, NULL)
2849+
28362850
PERL_STATIC_INLINE U8 *
2837-
Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2851+
Perl_utf8_hop_overshoot(const U8 *s, SSize_t off,
2852+
const U8 * const start, const U8 * const end,
2853+
SSize_t * remaining)
28382854
{
2839-
PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2840-
2841-
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2842-
* the bitops (especially ~) can create illegal UTF-8.
2843-
* In other words: in Perl UTF-8 is not just for Unicode. */
2855+
PERL_ARGS_ASSERT_UTF8_HOP_OVERSHOOT;
28442856

28452857
assert(start <= s && s <= end);
28462858

28472859
if (off >= 0) {
2848-
return utf8_hop_forward(s, off, end);
2860+
return utf8_hop_forward_overshoot(s, off, end, remaining);
28492861
}
28502862
else {
2851-
return utf8_hop_back(s, off, start);
2863+
return utf8_hop_back_overshoot(s, off, start, remaining);
28522864
}
28532865
}
28542866

proto.h

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)