@@ -2378,12 +2378,16 @@ If you need a copy of the string, see L</bytes_from_utf8>.
2378
2378
*/
2379
2379
2380
2380
bool
2381
- Perl_utf8_to_bytes_ (pTHX_ U8 * * s_ptr , STRLEN * lenp ,
2381
+ Perl_utf8_to_bytes_ (pTHX_ U8 * * s_ptr , STRLEN * lenp , U8 * * free_me ,
2382
2382
Perl_utf8_to_bytes_arg result_as )
2383
2383
{
2384
2384
PERL_ARGS_ASSERT_UTF8_TO_BYTES_ ;
2385
2385
PERL_UNUSED_CONTEXT ;
2386
2386
2387
+ if (result_as == PL_utf8_to_bytes_new_memory ) {
2388
+ * free_me = NULL ;
2389
+ }
2390
+
2387
2391
U8 * first_variant ;
2388
2392
2389
2393
/* This is a no-op if no variants at all in the input */
@@ -2505,7 +2509,15 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
2505
2509
s ++ ;
2506
2510
}
2507
2511
2508
- U8 * d0 = s0 ;
2512
+ U8 * d0 ;
2513
+ if (result_as == PL_utf8_to_bytes_overwrite ) {
2514
+ d0 = s0 ;
2515
+ }
2516
+ else {
2517
+ Newx (d0 , * lenp + 1 , U8 );
2518
+ Copy (s0 , d0 , invariant_length , U8 );
2519
+ }
2520
+
2509
2521
U8 * d = d0 + invariant_length ;
2510
2522
2511
2523
/* For the cases where the per-word algorithm wasn't used, everything is
@@ -2546,6 +2558,10 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
2546
2558
* d = '\0' ;
2547
2559
* lenp = d - d0 ;
2548
2560
2561
+ if (result_as != PL_utf8_to_bytes_overwrite ) {
2562
+ * s_ptr = * free_me = d0 ;
2563
+ }
2564
+
2549
2565
return true;
2550
2566
2551
2567
cant_convert : ;
@@ -2556,10 +2572,16 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
2556
2572
* text are C2 and C3, but didn't examine it to make sure each of those was
2557
2573
* followed by precisely one continuation, for example.
2558
2574
*
2559
- * We have to undo all we've done before, back down to the first UTF-8
2560
- * variant. Note that each 2-byte variant we've done so far (converted to
2561
- * single byte) slides things to the left one byte, and so we have bytes
2562
- * that haven't been written over.
2575
+ * If the result is in newly allocated memory, just free it */
2576
+ if (result_as != PL_utf8_to_bytes_overwrite ) {
2577
+ Safefree (d0 );
2578
+ return false;
2579
+ }
2580
+
2581
+ /* Otherwise, we have to undo all we've done before, back down to the first
2582
+ * UTF-8 variant. Note that each 2-byte variant we've done so far
2583
+ * (converted to single byte) slides things to the left one byte, and so we
2584
+ * have bytes that haven't been written over.
2563
2585
*
2564
2586
* Here, 'd' points to the next position to overwrite, and 's' points to
2565
2587
* the first invalid byte. That means 'd's contents haven't been changed
@@ -2641,57 +2663,25 @@ U8 *
2641
2663
Perl_bytes_from_utf8 (pTHX_ const U8 * s , STRLEN * lenp , bool * is_utf8p )
2642
2664
{
2643
2665
PERL_ARGS_ASSERT_BYTES_FROM_UTF8 ;
2644
- PERL_UNUSED_CONTEXT ;
2645
-
2646
- if (! * is_utf8p ) {
2647
- return (U8 * ) s ;
2648
- }
2649
-
2650
- const U8 * const s0 = s ;
2651
- const U8 * const send = s + * lenp ;
2652
- const U8 * first_variant ;
2653
-
2654
- /* The initial portion of 's' that consists of invariants can be Copied
2655
- * as-is. If it is entirely invariant, the whole thing can be Copied. */
2656
- if (is_utf8_invariant_string_loc (s , * lenp , & first_variant )) {
2657
- first_variant = send ;
2658
- }
2659
-
2660
- U8 * d ;
2661
- Newx (d , (* lenp ) + 1 , U8 );
2662
- Copy (s , d , first_variant - s , U8 );
2663
-
2664
- U8 * converted_start = d ;
2665
- d += first_variant - s ;
2666
- s = first_variant ;
2667
-
2668
- while (s < send ) {
2669
- U8 c = * s ++ ;
2670
- if (! UTF8_IS_INVARIANT (c )) {
2671
2666
2672
- /* Then it is multi-byte encoded. If the code point is above 0xFF,
2673
- * have to stop now */
2674
- if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE (s - 1 , send ))) {
2675
- Safefree (converted_start );
2676
- return (U8 * ) s0 ;
2667
+ if (* is_utf8p ) {
2668
+ U8 * new_memory = NULL ;
2669
+ if (utf8_to_bytes_new_pv (& s , lenp , & new_memory )) {
2670
+ * is_utf8p = false;
2671
+
2672
+ /* Our callers are always expecting new memory upon success. If the
2673
+ * conversion allocated none, hand back a fresh NUL-terminated copy */
2674
+ if (new_memory == NULL ) {
2675
+ U8 * new_s ;
2676
+ Newx (new_s , * lenp + 1 , U8 );
2677
+ Copy (s , new_s , * lenp , U8 );
2678
+ new_s [* lenp ] = '\0' ;
2679
+ s = new_s ;
2677
2680
}
2678
-
2679
- c = EIGHT_BIT_UTF8_TO_NATIVE (c , * s );
2680
- s ++ ;
2681
2681
}
2682
- * d ++ = c ;
2683
2682
}
2684
2683
2685
- /* Here, converted the whole of the input */
2686
- * is_utf8p = FALSE;
2687
-
2688
- * d = '\0' ;
2689
- * lenp = d - converted_start ;
2690
-
2691
- /* Trim unused space */
2692
- Renew (converted_start , * lenp + 1 , U8 );
2693
-
2694
- return converted_start ;
2684
+ return (U8 * ) s ;
2695
2685
}
2696
2686
2697
2687
/*
0 commit comments