Skip to content

Commit 0a5edc8

Browse files
committed
utf8_to_bytes_: Add non-destructive write option
This enables the function either to overwrite its input in place or, instead, to leave the input untouched and return the result in newly allocated memory. bytes_from_utf8() is changed to use this new capability instead of being a near duplication of the core code of this function. Prior to this commit, bytes_from_utf8() just allocated memory the size of the original string, and started copying into it. When it came to a sequence that wasn't convertible, it stopped, and freed the copy. The new behavior checks that the string is convertible before doing the malloc. That has the advantage that there is no malloc without being sure it will be useful, and the disadvantage that there is an extra pass through the input string, though that pass is per-word. The next commit will introduce another advantage. Thanks to Tony Cook for the 'free_me' idea.
1 parent 5ebf771 commit 0a5edc8

File tree

5 files changed

+62
-56
lines changed

5 files changed

+62
-56
lines changed

embed.fnc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3700,7 +3700,12 @@ Adpx |U8 * |utf8_to_bytes |NN U8 *s \
37003700
|NN STRLEN *lenp
37013701
Cp |bool |utf8_to_bytes_ |NN U8 **s_ptr \
37023702
|NN STRLEN *lenp \
3703+
|NN U8 **free_me \
37033704
|Perl_utf8_to_bytes_arg result_as
3705+
Admp |bool |utf8_to_bytes_new_pv \
3706+
|NN U8 const **s_ptr \
3707+
|NN STRLEN *lenp \
3708+
|NN U8 *free_me
37043709
Admp |bool |utf8_to_bytes_overwrite \
37053710
|NN U8 **s_ptr \
37063711
|NN STRLEN *lenp

embed.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,8 @@
859859
# define utf8_hop_safe Perl_utf8_hop_safe
860860
# define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
861861
# define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
862-
# define utf8_to_bytes_(a,b,c) Perl_utf8_to_bytes_(aTHX_ a,b,c)
862+
# define utf8_to_bytes_(a,b,c,d) Perl_utf8_to_bytes_(aTHX_ a,b,c,d)
863+
# define utf8_to_bytes_new_pv(a,b,c) Perl_utf8_to_bytes_new_pv(aTHX,a,b,c)
863864
# define utf8_to_bytes_overwrite(a,b) Perl_utf8_to_bytes_overwrite(aTHX,a,b)
864865
# define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
865866
# define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs

proto.h

Lines changed: 5 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utf8.c

Lines changed: 42 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2378,12 +2378,16 @@ If you need a copy of the string, see L</bytes_from_utf8>.
23782378
*/
23792379

23802380
bool
2381-
Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
2381+
Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
23822382
Perl_utf8_to_bytes_arg result_as)
23832383
{
23842384
PERL_ARGS_ASSERT_UTF8_TO_BYTES_;
23852385
PERL_UNUSED_CONTEXT;
23862386

2387+
if (result_as == PL_utf8_to_bytes_new_memory) {
2388+
*free_me = NULL;
2389+
}
2390+
23872391
U8 * first_variant;
23882392

23892393
/* This is a no-op if no variants at all in the input */
@@ -2505,7 +2509,15 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25052509
s++;
25062510
}
25072511

2508-
U8 *d0 = s0;
2512+
U8 *d0;
2513+
if (result_as == PL_utf8_to_bytes_overwrite) {
2514+
d0 = s0;
2515+
}
2516+
else {
2517+
Newx(d0, *lenp + 1, U8);
2518+
Copy(s0, d0, invariant_length, U8);
2519+
}
2520+
25092521
U8 * d = d0 + invariant_length;
25102522

25112523
/* For the cases where the per-word algorithm wasn't used, everything is
@@ -2546,6 +2558,10 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25462558
*d = '\0';
25472559
*lenp = d - d0;
25482560

2561+
if (result_as != PL_utf8_to_bytes_overwrite) {
2562+
*s_ptr = *free_me = d0;
2563+
}
2564+
25492565
return true;
25502566

25512567
cant_convert: ;
@@ -2556,10 +2572,16 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25562572
* text are C2 and C3, but didn't examine it to make sure each of those was
25572573
* followed by precisely one continuation, for example.
25582574
*
2559-
* We have to undo all we've done before, back down to the first UTF-8
2560-
* variant. Note that each 2-byte variant we've done so far (converted to
2561-
* single byte) slides things to the left one byte, and so we have bytes
2562-
* that haven't been written over.
2575+
* If the result is in newly allocated memory, just free it */
2576+
if (result_as != PL_utf8_to_bytes_overwrite) {
2577+
Safefree(d0);
2578+
return false;
2579+
}
2580+
2581+
/* Otherwise, we have to undo all we've done before, back down to the first
2582+
* UTF-8 variant. Note that each 2-byte variant we've done so far
2583+
* (converted to single byte) slides things to the left one byte, and so we
2584+
* have bytes that haven't been written over.
25632585
*
25642586
* Here, 'd' points to the next position to overwrite, and 's' points to
25652587
* the first invalid byte. That means 'd's contents haven't been changed
@@ -2641,57 +2663,25 @@ U8 *
26412663
Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
26422664
{
26432665
PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2644-
PERL_UNUSED_CONTEXT;
2645-
2646-
if (! *is_utf8p) {
2647-
return (U8 *) s;
2648-
}
2649-
2650-
const U8 * const s0 = s;
2651-
const U8 * const send = s + *lenp;
2652-
const U8 * first_variant;
2653-
2654-
/* The initial portion of 's' that consists of invariants can be Copied
2655-
* as-is. If it is entirely invariant, the whole thing can be Copied. */
2656-
if (is_utf8_invariant_string_loc(s, *lenp, &first_variant)) {
2657-
first_variant = send;
2658-
}
2659-
2660-
U8 *d;
2661-
Newx(d, (*lenp) + 1, U8);
2662-
Copy(s, d, first_variant - s, U8);
2663-
2664-
U8 *converted_start = d;
2665-
d += first_variant - s;
2666-
s = first_variant;
2667-
2668-
while (s < send) {
2669-
U8 c = *s++;
2670-
if (! UTF8_IS_INVARIANT(c)) {
26712666

2672-
/* Then it is multi-byte encoded. If the code point is above 0xFF,
2673-
* have to stop now */
2674-
if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2675-
Safefree(converted_start);
2676-
return (U8 *) s0;
2667+
if (*is_utf8p) {
2668+
U8 * new_memory = NULL;
2669+
if (utf8_to_bytes_new_pv(&s, lenp, &new_memory)) {
2670+
*is_utf8p = false;
2671+
2672+
/* Our callers are always expecting new memory upon success. Give
2673+
* it to them, adding a trailing NUL if not already there */
2674+
if (new_memory == NULL) {
2675+
U8 * new_s;
2676+
Newx(new_s, *lenp + 1, U8);
2677+
Copy(s, new_s, *lenp, U8);
2678+
new_s[*lenp] = '\0';
2679+
s = new_s;
26772680
}
2678-
2679-
c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2680-
s++;
26812681
}
2682-
*d++ = c;
26832682
}
26842683

2685-
/* Here, converted the whole of the input */
2686-
*is_utf8p = FALSE;
2687-
2688-
*d = '\0';
2689-
*lenp = d - converted_start;
2690-
2691-
/* Trim unused space */
2692-
Renew(converted_start, *lenp + 1, U8);
2693-
2694-
return converted_start;
2684+
return (U8 *) s;
26952685
}
26962686

26972687
/*

utf8.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,15 @@ typedef enum {
13121312
PL_utf8_to_bytes_use_temporary,
13131313
} Perl_utf8_to_bytes_arg;
13141314

1315+
/* INT2PTR() is because this parameter should not be used in this case, but
1316+
* there is a NN assertion for it. It causes that to pass but to still
1317+
* segfault if it wrongly gets used */
13151318
#define Perl_utf8_to_bytes_overwrite(mTHX, s, l) \
1316-
Perl_utf8_to_bytes_(aTHX_ s, l, PL_utf8_to_bytes_overwrite)
1319+
Perl_utf8_to_bytes_(aTHX_ s, l, INT2PTR(U8 **, 1), \
1320+
PL_utf8_to_bytes_overwrite)
1321+
#define Perl_utf8_to_bytes_new_pv(mTHX, s, l, f) \
1322+
Perl_utf8_to_bytes_(aTHX_ (U8 **) s, l, f, \
1323+
PL_utf8_to_bytes_new_memory)
13171324

13181325
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
13191326
* retained solely for backwards compatibility */

0 commit comments

Comments
 (0)