@@ -104,9 +104,7 @@ zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retva
104104
105105static void dom_decoding_encoding_ctx_init (dom_decoding_encoding_ctx * ctx )
106106{
107- ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
108- ctx -> decode_data = NULL ;
109- /* Set fast path on by default so that the decoder finishing is skipped if this was never initialised properly. */
107+ ctx -> decode_data = ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
110108 ctx -> fast_path = true;
111109 (void ) lxb_encoding_encode_init (
112110 & ctx -> encode ,
@@ -115,6 +113,13 @@ static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
115113 sizeof (ctx -> encoding_output ) / sizeof (* ctx -> encoding_output )
116114 );
117115 (void ) lxb_encoding_encode_replace_set (& ctx -> encode , LXB_ENCODING_REPLACEMENT_BYTES , LXB_ENCODING_REPLACEMENT_SIZE );
116+ (void ) lxb_encoding_decode_init (
117+ & ctx -> decode ,
118+ ctx -> decode_data ,
119+ ctx -> codepoints ,
120+ sizeof (ctx -> codepoints ) / sizeof (* ctx -> codepoints )
121+ );
122+ (void ) lxb_encoding_decode_replace_set (& ctx -> decode , LXB_ENCODING_REPLACEMENT_BUFFER , LXB_ENCODING_REPLACEMENT_BUFFER_LEN );
118123}
119124
120125static const char * dom_lexbor_tokenizer_error_code_to_string (lxb_html_tokenizer_error_id_t id )
@@ -523,6 +528,8 @@ static bool dom_decode_encode_fast_path(
523528 size_t * tree_error_offset
524529)
525530{
531+ decoding_encoding_ctx -> decode .status = LXB_STATUS_OK ;
532+
526533 const lxb_char_t * buf_ref = * buf_ref_ref ;
527534 const lxb_char_t * last_output = buf_ref ;
528535 while (buf_ref != buf_end ) {
@@ -551,6 +558,17 @@ static bool dom_decode_encode_fast_path(
551558 )) {
552559 goto fail_oom ;
553560 }
561+
562+ if (codepoint == LXB_ENCODING_DECODE_CONTINUE ) {
563+ ZEND_ASSERT (buf_ref == buf_end );
564+ /* The decoder needs more data but the entire buffer is consumed.
565+ * All valid data is outputted, and if the remaining data for the code point
566+ * is invalid, the next call will output the replacement bytes. */
567+ * buf_ref_ref = buf_ref ;
568+ decoding_encoding_ctx -> decode .status = LXB_STATUS_CONTINUE ;
569+ return true;
570+ }
571+
554572 if (!dom_process_parse_chunk (
555573 ctx ,
556574 document ,
@@ -563,6 +581,7 @@ static bool dom_decode_encode_fast_path(
563581 )) {
564582 goto fail_oom ;
565583 }
584+
566585 last_output = buf_ref ;
567586 }
568587 }
@@ -676,29 +695,22 @@ static bool dom_parse_decode_encode_finish(
676695 size_t * tree_error_offset
677696)
678697{
679- if (!decoding_encoding_ctx -> fast_path ) {
680- /* Fast path handles codepoints one by one, so this part is not applicable in that case */
681- (void ) lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
682- size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
683- if (decoding_buffer_size > 0 ) {
684- const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
685- const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
686- (void ) decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
687- if (!dom_process_parse_chunk (
688- ctx ,
689- document ,
690- parser ,
691- lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode ),
692- decoding_encoding_ctx -> encoding_output ,
693- decoding_buffer_size ,
694- tokenizer_error_offset ,
695- tree_error_offset
696- )) {
697- return false;
698- }
699- }
698+ lxb_status_t status ;
699+
700+ status = lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
701+ ZEND_ASSERT (status == LXB_STATUS_OK );
702+
703+ size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
704+ if (decoding_buffer_size > 0 ) {
705+ const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
706+ const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
707+ status = decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
708+ ZEND_ASSERT (status == LXB_STATUS_OK );
709+ /* No need to produce output here, as we finish the encoder below and pass the chunk. */
700710 }
701- (void ) lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
711+
712+ status = lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
713+ ZEND_ASSERT (status == LXB_STATUS_OK );
702714 if (lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode )
703715 && !dom_process_parse_chunk (
704716 ctx ,
0 commit comments