Skip to content

Commit 8687219

Browse files
committed
Improve Fast Translation Formatting (html)
1 parent 6d65408 commit 8687219

File tree

1 file changed

+49
-8
lines changed

1 file changed

+49
-8
lines changed

src/core/epub/epub_fast_processor.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ def _extract_text_recursive(element, text_parts: list):
236236
"""
237237
Recursively extract text from element and its children.
238238
239+
Block-level elements accumulate their inline content with spaces,
240+
then add the complete block as a single text part.
241+
239242
Args:
240243
element: lxml element
241244
text_parts: List to append text to
@@ -255,25 +258,63 @@ def _extract_text_recursive(element, text_parts: list):
255258
else:
256259
tag = ''
257260

261+
tag_lower = tag.lower()
262+
263+
# If this is a block element, accumulate all inline content with spaces
264+
if tag_lower in block_tags:
265+
inline_parts = []
266+
_extract_inline_text(element, inline_parts)
267+
block_text = ' '.join(inline_parts)
268+
# Clean up multiple spaces
269+
block_text = ' '.join(block_text.split())
270+
if block_text:
271+
text_parts.append(block_text)
272+
text_parts.append('') # Empty string creates paragraph break
273+
else:
274+
# For non-block elements, process normally (handles nested structures)
275+
# Handle element text
276+
if hasattr(element, 'text') and element.text:
277+
text = element.text.strip()
278+
if text:
279+
text_parts.append(text)
280+
281+
# Process children
282+
for child in element:
283+
_extract_text_recursive(child, text_parts)
284+
285+
# Handle tail text (text after child element)
286+
if hasattr(child, 'tail') and child.tail:
287+
tail = child.tail.strip()
288+
if tail:
289+
text_parts.append(tail)
290+
291+
292+
def _extract_inline_text(element, inline_parts: list):
293+
"""
294+
Extract all text from an element and its children as inline content.
295+
296+
This accumulates text without adding paragraph breaks, suitable for
297+
gathering all content within a block-level element.
298+
299+
Args:
300+
element: lxml element
301+
inline_parts: List to append text parts to
302+
"""
258303
# Handle element text
259304
if hasattr(element, 'text') and element.text:
260305
text = element.text.strip()
261306
if text:
262-
text_parts.append(text)
307+
inline_parts.append(text)
263308

264-
# Process children
309+
# Process children recursively
265310
for child in element:
266-
_extract_text_recursive(child, text_parts)
311+
_extract_inline_text(child, inline_parts)
267312

268313
# Handle tail text (text after child element)
269314
if hasattr(child, 'tail') and child.tail:
270315
tail = child.tail.strip()
271316
if tail:
272-
text_parts.append(tail)
273-
274-
# Add paragraph break after block elements
275-
if tag.lower() in block_tags and text_parts and text_parts[-1] != '':
276-
text_parts.append('') # Empty string creates paragraph break
317+
inline_parts.append(tail)
277318

278319

279320
def _clean_translation_tags(text: str) -> str:

0 commit comments

Comments
 (0)