@@ -236,6 +236,9 @@ def _extract_text_recursive(element, text_parts: list):
236236 """
237237 Recursively extract text from element and its children.
238238
239+ Block-level elements accumulate their inline content with spaces,
240+ then add the complete block as a single text part.
241+
239242 Args:
240243 element: lxml element
241244 text_parts: List to append text to
@@ -255,25 +258,63 @@ def _extract_text_recursive(element, text_parts: list):
255258 else :
256259 tag = ''
257260
261+ tag_lower = tag .lower ()
262+
263+ # If this is a block element, accumulate all inline content with spaces
264+ if tag_lower in block_tags :
265+ inline_parts = []
266+ _extract_inline_text (element , inline_parts )
267+ block_text = ' ' .join (inline_parts )
268+ # Clean up multiple spaces
269+ block_text = ' ' .join (block_text .split ())
270+ if block_text :
271+ text_parts .append (block_text )
272+ text_parts .append ('' ) # Empty string creates paragraph break
273+ else :
274+ # For non-block elements, process normally (handles nested structures)
275+ # Handle element text
276+ if hasattr (element , 'text' ) and element .text :
277+ text = element .text .strip ()
278+ if text :
279+ text_parts .append (text )
280+
281+ # Process children
282+ for child in element :
283+ _extract_text_recursive (child , text_parts )
284+
285+ # Handle tail text (text after child element)
286+ if hasattr (child , 'tail' ) and child .tail :
287+ tail = child .tail .strip ()
288+ if tail :
289+ text_parts .append (tail )
290+
291+
def _extract_inline_text(element, inline_parts: list):
    """
    Extract all text from an element and its children as inline content.

    Text is gathered in document order: the element's own leading text,
    then, for each child, the child's content followed by the child's
    tail (the text that follows the child inside this element). Every
    fragment is whitespace-stripped and fragments that become empty are
    dropped. No paragraph-break markers are added, so the result is
    suitable for gathering all content within a block-level element.

    Comment and processing-instruction nodes contribute only their tail:
    their ``.text`` holds the comment/instruction body, which is markup
    rather than document content.

    Args:
        element: lxml element
        inline_parts: List to append text parts to (mutated in place)
    """
    # Special nodes (comments, processing instructions, entities) expose a
    # factory function as ``tag`` instead of a string name; skip their text
    # so e.g. "<!-- note -->" does not leak into the extracted content.
    is_special_node = callable(getattr(element, 'tag', None))

    if not is_special_node:
        text = getattr(element, 'text', None)
        if text:
            text = text.strip()
            if text:
                inline_parts.append(text)

    for child in element:
        _extract_inline_text(child, inline_parts)

        # Tail text belongs to the parent's content flow, so it is kept
        # even when the child itself is a comment.
        tail = getattr(child, 'tail', None)
        if tail:
            tail = tail.strip()
            if tail:
                inline_parts.append(tail)
277318
278319
279320def _clean_translation_tags (text : str ) -> str :
0 commit comments