Skip to content

Commit 6d61748

Browse files
committed
Add preliminary utf8_to_bytes_()
This is an internal function, designed to be an extension of utf8_to_bytes(), with a slightly different API. This commit just adds it and calls it only from utf8_to_bytes(). Future commits will extend this API.
1 parent 5137fe1 commit 6d61748

File tree

4 files changed

+36
-17
lines changed

4 files changed

+36
-17
lines changed

embed.fnc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3698,6 +3698,8 @@ CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \
36983698
|U32 flags
36993699
Adpx |U8 * |utf8_to_bytes |NN U8 *s \
37003700
|NN STRLEN *lenp
3701+
Cp |bool |utf8_to_bytes_ |NN U8 **s_ptr \
3702+
|NN STRLEN *lenp
37013703
EMXp |U8 * |utf16_to_utf8 |NN U8 *p \
37023704
|NN U8 *d \
37033705
|Size_t bytelen \

embed.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,7 @@
859859
# define utf8_hop_safe Perl_utf8_hop_safe
860860
# define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
861861
# define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
862+
# define utf8_to_bytes_(a,b) Perl_utf8_to_bytes_(aTHX_ a,b)
862863
# define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
863864
# define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs
864865
# define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX,a,b)

proto.h

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utf8.c

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2377,25 +2377,27 @@ If you need a copy of the string, see L</bytes_from_utf8>.
23772377
=cut
23782378
*/
23792379

2380-
U8 *
2381-
Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2380+
bool
2381+
Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp)
23822382
{
2383-
U8 * first_variant;
2384-
2385-
PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2383+
PERL_ARGS_ASSERT_UTF8_TO_BYTES_;
23862384
PERL_UNUSED_CONTEXT;
23872385

2386+
U8 * first_variant;
2387+
23882388
/* This is a no-op if no variants at all in the input */
2389-
if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2390-
return s;
2389+
if (is_utf8_invariant_string_loc(*s_ptr, *lenp,
2390+
(const U8 **) &first_variant))
2391+
{
2392+
return true;
23912393
}
23922394

23932395
/* Nothing before 'first_variant' needs to be changed, so start the real
23942396
* work there */
23952397

2396-
U8 * const s0 = s;
2398+
U8 * const s0 = *s_ptr;
23972399
U8 * const send = s0 + *lenp;
2398-
s = first_variant;
2400+
U8 * s = first_variant;
23992401

24002402
#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
24012403

@@ -2425,8 +2427,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
24252427
while (s < partial_word_end) {
24262428
if (! UTF8_IS_INVARIANT(*s)) {
24272429
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2428-
*lenp = ((STRLEN) -1);
2429-
return NULL;
2430+
return false;
24302431
}
24312432
s++;
24322433
}
@@ -2476,8 +2477,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
24762477
* If they're not equal, there are start bytes that aren't C2
24772478
* nor C3, hence this is not downgradable */
24782479
if (start_bytes != C2_C3_start_bytes) {
2479-
*lenp = ((STRLEN) -1);
2480-
return NULL;
2480+
return false;
24812481
}
24822482

24832483
s += PERL_WORDSIZE;
@@ -2496,8 +2496,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
24962496
while (s < send) {
24972497
if (! UTF8_IS_INVARIANT(*s)) {
24982498
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2499-
*lenp = ((STRLEN) -1);
2500-
return NULL;
2499+
return false;
25012500
}
25022501
s++;
25032502
}
@@ -2543,7 +2542,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
25432542
*d = '\0';
25442543
*lenp = d - s0;
25452544

2546-
return s0;
2545+
return true;
25472546

25482547
cant_convert: ;
25492548

@@ -2590,7 +2589,19 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
25902589
}
25912590
}
25922591

2593-
*lenp = ((STRLEN) -1);
2592+
return false;
2593+
}
2594+
2595+
U8 *
2596+
Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2597+
{
2598+
PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2599+
2600+
if (utf8_to_bytes_(&s, lenp)) {
2601+
return s;
2602+
}
2603+
2604+
*lenp = (STRLEN) -1;
25942605
return NULL;
25952606
}
25962607

0 commit comments

Comments
 (0)