Skip to content

Commit 4334805

Browse files
committed
Add utf8_hop_forward_overshoot()
This is like plain utf8_hop_forward(), except that it can also report by how many characters the request would have overshot the edge, had it been allowed to go beyond it. This allows the caller to do error handling. The code has to be more careful than it was before this commit about counting the actual number of characters consumed in the hop.
1 parent 075b057 commit 4334805

File tree

4 files changed

+67
-21
lines changed

4 files changed

+67
-21
lines changed

embed.fnc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3621,10 +3621,15 @@ ARTdip |U8 * |utf8_hop_back_overshoot \
36213621
|SSize_t off \
36223622
|NN const U8 * const start \
36233623
|NULLOK SSize_t *remaining
3624-
ARTdip |U8 * |utf8_hop_forward \
3624+
ARTdmp |U8 * |utf8_hop_forward \
36253625
|NN const U8 *s \
36263626
|SSize_t off \
3627-
|NN const U8 *end
3627+
|NN const U8 * const end
3628+
ARTdip |U8 * |utf8_hop_forward_overshoot \
3629+
|NN const U8 *s \
3630+
|SSize_t off \
3631+
|NN const U8 * const end \
3632+
|NULLOK SSize_t *remaining
36283633
ARTdip |U8 * |utf8_hop_safe |NN const U8 *s \
36293634
|SSize_t off \
36303635
|NN const U8 *start \

embed.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,7 @@
789789
# define utf8_hop_back Perl_utf8_hop_back
790790
# define utf8_hop_back_overshoot Perl_utf8_hop_back_overshoot
791791
# define utf8_hop_forward Perl_utf8_hop_forward
792+
# define utf8_hop_forward_overshoot Perl_utf8_hop_forward_overshoot
792793
# define utf8_hop_safe Perl_utf8_hop_safe
793794
# define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
794795
# define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)

inline.h

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2650,46 +2650,76 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
26502650

26512651
/*
26522652
=for apidoc utf8_hop_forward
2653+
=for apidoc utf8_hop_forward_overshoot
26532654
2654-
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2655-
forward. C<s> does not need to be pointing to the starting byte of a
2656-
character. If it isn't, one count of C<off> will be used up to get to the
2657-
start of the next character.
2655+
These each take as input a position, C<s0>, into a string encoded as UTF-8
2656+
which ends at the byte before C<end>, and return the position within it that is
2657+
C<s0> displaced by up to C<off> characters forwards.
26582658
2659-
C<off> must be non-negative.
2659+
If there are fewer than C<off> characters between C<s0> and C<end>, the
2660+
functions return C<end>.
26602661
2661-
C<s> must be before or equal to C<end>. If after, the function panics.
2662+
The functions differ in two ways:
26622663
2663-
When moving forward it will not move beyond C<end>.
2664+
=over 4
2665+
2666+
=item *
2667+
2668+
C<utf8_hop_forward_overshoot> can return how many characters beyond the edge
2669+
the request was for.  When its parameter, C<remaining>, is not NULL, the
2670+
function stores into it the count of the excess; zero if the request was
2671+
completely fulfilled. The actual number of characters that were displaced can
2672+
then be calculated as S<C<off - remaining>>.
2673+
2674+
=item *
2675+
2676+
C<utf8_hop_forward> will panic if called with C<s0> already positioned at or
2677+
beyond the edge of the string ending at C<end> and the request is to go even
2678+
further over the edge. C<utf8_hop_forward_overshoot> presumes the caller will
2679+
handle any errors, and just stores C<off> into C<remaining> without doing
2680+
anything else.
2681+
2682+
=back
2683+
2684+
(The above contains a slight lie. When C<remaining> is NULL, the two functions
2685+
act identically.)
2686+
2687+
C<s0> does not need to be pointing to the starting byte of a character. If it
2688+
isn't, one count of C<off> will be used up to get to the start of the next character.
26642689
2665-
Will not exceed this limit even if the string is not valid "UTF-8".
2690+
C<off> must be non-negative, and if zero, no action is taken; C<s0> is returned
2691+
unchanged.
26662692
26672693
=cut
26682694
*/
2695+
# define Perl_utf8_hop_forward( s, off, end) \
2696+
Perl_utf8_hop_forward_overshoot(s, off, end, NULL)
26692697

26702698
PERL_STATIC_INLINE U8 *
2671-
Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2699+
Perl_utf8_hop_forward_overshoot(const U8 * s, SSize_t off,
2700+
const U8 * const end, SSize_t *remaining)
26722701
{
2673-
PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2702+
PERL_ARGS_ASSERT_UTF8_HOP_FORWARD_OVERSHOOT;
26742703
assert(off >= 0);
26752704

26762705
if (off != 0) {
2677-
if (UNLIKELY(s >= end)) {
2706+
if (UNLIKELY(s >= end && ! remaining)) {
26782707
Perl_croak_nocontext("panic: Start of forward hop (0x%p) is %zd"
26792708
" bytes beyond legal end position (0x%p)",
26802709
s, 1 + s - end, end);
26812710
}
26822711

26832712
if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2684-
/* Get to next non-continuation byte */
2685-
do {
2713+
do { /* Get to next non-continuation byte */
2714+
if (! UTF8_IS_CONTINUATION(*s)) {
2715+
off--;
2716+
break;
2717+
}
26862718
s++;
2687-
}
2688-
while (s < end && UTF8_IS_CONTINUATION(*s));
2689-
off--;
2719+
} while (s < end);
26902720
}
26912721

2692-
while (off-- && s < end) {
2722+
while (off > 0 && s < end) {
26932723
STRLEN skip = UTF8SKIP(s);
26942724

26952725
/* Quit without counting this character if it overshoots the edge.
@@ -2698,10 +2728,16 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
26982728
s = end;
26992729
break;
27002730
}
2731+
27012732
s += skip;
2733+
off--;
27022734
}
27032735
}
27042736

2737+
if (remaining) {
2738+
*remaining = off;
2739+
}
2740+
27052741
GCC_DIAG_IGNORE(-Wcast-qual)
27062742
return (U8 *)s;
27072743
GCC_DIAG_RESTORE

proto.h

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)