Skip to content

Commit fd43617

Browse files
committed
progress bar
1 parent e7125e0 commit fd43617

File tree

4 files changed

+406
-87
lines changed

4 files changed

+406
-87
lines changed

src/core/epub/translator.py

Lines changed: 166 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import zipfile
1313
import tempfile
1414
import aiofiles
15-
from typing import Dict, Any, Optional, Callable, Tuple
15+
from typing import Dict, Any, Optional, Callable, Tuple, List
1616
from lxml import etree
1717
from tqdm.auto import tqdm
1818

@@ -425,15 +425,8 @@ async def _translate_single_file(
425425
log_callback("epub_file_translate_start",
426426
f"Translating file {file_idx + 1}/{total_files}: {content_href}")
427427

428-
# Create file-specific progress callback
429-
def file_progress_callback(file_progress):
430-
if progress_callback:
431-
# Map file progress (0-100) to overall progress
432-
base_progress = ((file_idx / total_files) * 90) + 5
433-
file_contribution = (file_progress / 100) * (90 / total_files)
434-
progress_callback(base_progress + file_contribution)
435-
436428
# Translate using simplified mode
429+
# Note: progress_callback is now token-aware wrapper from _process_all_content_files
437430
success = await translate_xhtml_simplified(
438431
doc_root=doc_root,
439432
source_language=source_language,
@@ -442,7 +435,7 @@ def file_progress_callback(file_progress):
442435
llm_client=llm_client,
443436
max_tokens_per_chunk=max_tokens_per_chunk,
444437
log_callback=log_callback,
445-
progress_callback=file_progress_callback,
438+
progress_callback=progress_callback, # Pass through token-based wrapper directly
446439
context_manager=context_manager,
447440
max_retries=max_attempts
448441
)
@@ -470,6 +463,82 @@ def file_progress_callback(file_progress):
470463
return None, file_path_abs, False
471464

472465

466+
async def _count_all_chunks(
    content_files: list,
    opf_dir: str,
    max_tokens_per_chunk: int,
    log_callback: Optional[Callable] = None
) -> List[Tuple[str, List[Dict], int]]:
    """
    Pre-count all translation chunks across all XHTML content files.

    Runs the same setup/tag-preservation/chunking pipeline used by the real
    translation pass (but without calling the LLM), so the chunk counts match
    exactly what translation will later process. This enables accurate,
    chunk-based progress tracking instead of coarse file-based progress.

    Args:
        content_files: Hrefs of the XHTML content files, relative to the OPF dir.
        opf_dir: Absolute directory containing the OPF file (base for hrefs).
        max_tokens_per_chunk: Token budget per chunk, forwarded to the chunker.
        log_callback: Optional ``callback(event_key, message)`` for progress logs.

    Returns:
        A list of ``(file_href, chunks, total_tokens)`` tuples, one per input
        file and in the same order. Missing or failed files yield
        ``(href, [], 0)`` so the list always aligns index-for-index with
        ``content_files``.
    """
    # Deferred imports to avoid import cycles with the translator modules.
    from .xhtml_translator import _setup_translation, _preserve_tags, _create_chunks
    import aiofiles
    from lxml import etree
    from src.core.chunking.token_chunker import TokenChunker

    # The token counter is stateless across files — build it once, not per file.
    token_counter = TokenChunker(max_tokens=max_tokens_per_chunk)

    file_chunk_info: List[Tuple[str, List[Dict], int]] = []
    total_chunks_all_files = 0

    if log_callback:
        log_callback("epub_precount_start", f"📊 Pre-counting chunks across {len(content_files)} files...")

    for content_href in content_files:
        file_path_abs = os.path.normpath(os.path.join(opf_dir, content_href))
        if not os.path.exists(file_path_abs):
            # File not found, will be skipped during translation
            file_chunk_info.append((content_href, [], 0))
            continue

        try:
            async with aiofiles.open(file_path_abs, 'r', encoding='utf-8') as f:
                content = await f.read()

            # recover=True tolerates malformed XHTML — same parser settings
            # as the translation pass, so chunk counts stay consistent.
            parser = etree.XMLParser(encoding='utf-8', recover=True, remove_blank_text=False)
            doc_root = etree.fromstring(content.encode('utf-8'), parser)

            # Extract body and count chunks (same logic as translation)
            body_html, body_element, tag_preserver = _setup_translation(doc_root, log_callback, None)

            if not body_html:
                # No translatable body — record an empty entry to keep alignment.
                file_chunk_info.append((content_href, [], 0))
                continue

            # Preserve tags (replace markup with placeholders before chunking)
            text_with_placeholders, global_tag_map, _ = _preserve_tags(body_html, tag_preserver, log_callback)

            # Create chunks
            chunks = _create_chunks(text_with_placeholders, global_tag_map, max_tokens_per_chunk, log_callback, None)

            # Calculate total tokens for this file
            file_total_tokens = sum(token_counter.count_tokens(chunk['text']) for chunk in chunks)

            file_chunk_info.append((content_href, chunks, file_total_tokens))
            total_chunks_all_files += len(chunks)

        except Exception as e:
            # Best-effort: a single unreadable file must not abort the pre-count;
            # translation itself will surface the real error for this file.
            if log_callback:
                log_callback("epub_precount_error", f"Error pre-counting chunks in '{content_href}': {e}")
            file_chunk_info.append((content_href, [], 0))

    if log_callback:
        log_callback("epub_precount_complete",
                     f"📊 Pre-count complete: {total_chunks_all_files} total chunks across {len(content_files)} files")
        # Log per-file breakdown for debugging
        for i, (href, chunks, tokens) in enumerate(file_chunk_info[:5]):  # Show first 5 files
            # NOTE(review): separator between href and chunk count appeared
            # missing in the original message — restored here; confirm intent.
            log_callback("epub_precount_file_detail",
                         f"  File {i+1}: {href} → {len(chunks)} chunks, {tokens} tokens")
        if len(file_chunk_info) > 5:
            log_callback("epub_precount_more", f"  ... and {len(file_chunk_info) - 5} more files")

    return file_chunk_info
540+
541+
473542
async def _process_all_content_files(
474543
content_files: list,
475544
opf_dir: str,
@@ -488,7 +557,7 @@ async def _process_all_content_files(
488557
stats_callback: Optional[Callable] = None,
489558
check_interruption_callback: Optional[Callable] = None
490559
) -> Dict:
491-
"""Process all XHTML content files.
560+
"""Process all XHTML content files with accurate token-based progress tracking.
492561
493562
Args:
494563
content_files: List of content file hrefs
@@ -511,17 +580,41 @@ async def _process_all_content_files(
511580
Returns:
512581
Dictionary with processing results and parsed documents
513582
"""
583+
from src.core.progress_tracker import TokenProgressTracker
584+
585+
# Pre-count all chunks across all files for accurate progress
586+
file_chunk_info = await _count_all_chunks(content_files, opf_dir, max_tokens_per_chunk, log_callback)
587+
588+
# Initialize token-based progress tracker
589+
progress_tracker = TokenProgressTracker()
590+
progress_tracker.start()
591+
592+
# Register all chunks with their token counts
593+
total_registered_chunks = 0
594+
total_registered_tokens = 0
595+
for file_href, chunks, file_tokens in file_chunk_info:
596+
if chunks:
597+
from src.core.chunking.token_chunker import TokenChunker
598+
token_counter = TokenChunker(max_tokens=max_tokens_per_chunk)
599+
for chunk in chunks:
600+
token_count = token_counter.count_tokens(chunk['text'])
601+
progress_tracker.register_chunk(token_count)
602+
total_registered_chunks += 1
603+
total_registered_tokens += token_count
604+
605+
if log_callback:
606+
log_callback("epub_tracker_initialized",
607+
f"📈 Progress tracker initialized: {total_registered_chunks} chunks, {total_registered_tokens} tokens")
608+
609+
# Initial stats
610+
if stats_callback:
611+
stats_callback(progress_tracker.get_stats().to_dict())
612+
514613
parsed_xhtml_docs: Dict[str, etree._Element] = {}
515614
total_files = len(content_files)
516615
completed_files = 0
517616
failed_files = 0
518-
519-
if stats_callback:
520-
stats_callback({
521-
'total_chunks': total_files,
522-
'completed_chunks': 0,
523-
'failed_chunks': 0
524-
})
617+
global_chunk_index = 0 # Track global chunk index across all files
525618

526619
iterator = tqdm(
527620
enumerate(content_files),
@@ -538,17 +631,52 @@ async def _process_all_content_files(
538631
f"Translation interrupted at file {file_idx + 1}/{total_files}")
539632
break
540633

634+
# Get chunk info for this file
635+
_, file_chunks, _ = file_chunk_info[file_idx]
636+
file_chunk_count = len(file_chunks)
637+
541638
# Skip already processed files on resume
542639
if file_idx < resume_from_index:
543640
completed_files += 1
641+
# Mark all chunks in this file as completed for progress tracker
642+
for chunk_idx in range(file_chunk_count):
643+
progress_tracker.mark_completed(global_chunk_index + chunk_idx, 0.0)
644+
global_chunk_index += file_chunk_count
544645
continue
545646

546-
# Update progress
547-
if progress_callback:
548-
progress_percent = ((file_idx / total_files) * 90) + 5
549-
progress_callback(progress_percent)
647+
# Create wrapper progress callback
648+
# The existing code calls progress_callback with percentage (0-100) per file
649+
# We intercept that and update the global tracker based on chunk completion
650+
file_start_chunk_idx = global_chunk_index
651+
last_reported_chunk = [0] # Mutable to capture in closure
652+
653+
def file_progress_wrapper(file_percent: float):
654+
"""
655+
Intercept file-level progress (0-100%) and convert to chunk-level progress.
656+
657+
file_percent: Progress within this file (0-100)
658+
"""
659+
if file_chunk_count == 0:
660+
return
661+
662+
# Estimate which chunk we're on based on percentage
663+
chunks_completed_in_file = int((file_percent / 100) * file_chunk_count)
664+
chunks_completed_in_file = min(chunks_completed_in_file, file_chunk_count)
665+
666+
# Mark newly completed chunks since last call
667+
for chunk_offset in range(last_reported_chunk[0], chunks_completed_in_file):
668+
actual_global_idx = file_start_chunk_idx + chunk_offset
669+
progress_tracker.mark_completed(actual_global_idx, 0.0) # No time measurement available
550670

551-
# Translate the file
671+
last_reported_chunk[0] = chunks_completed_in_file
672+
673+
# Update global progress
674+
if progress_callback:
675+
progress_callback(progress_tracker.get_progress_percent())
676+
if stats_callback:
677+
stats_callback(progress_tracker.get_stats().to_dict())
678+
679+
# Translate the file (reuse existing function)
552680
doc_root, file_path_abs, success = await _translate_single_file(
553681
file_idx=file_idx,
554682
content_href=content_href,
@@ -563,9 +691,17 @@ async def _process_all_content_files(
563691
translation_id=translation_id,
564692
total_files=total_files,
565693
log_callback=log_callback,
566-
progress_callback=progress_callback
694+
progress_callback=file_progress_wrapper # Use our wrapper
567695
)
568696

697+
# Ensure all chunks in this file are marked complete after translation finishes
698+
for chunk_offset in range(last_reported_chunk[0], file_chunk_count):
699+
actual_global_idx = file_start_chunk_idx + chunk_offset
700+
progress_tracker.mark_completed(actual_global_idx, 0.0)
701+
702+
# Advance global chunk index by number of chunks in this file
703+
global_chunk_index += file_chunk_count
704+
569705
# Save the document if translation succeeded
570706
# Note: doc_root is modified in-place only if _replace_body succeeds
571707
# If it fails, doc_root still contains the original content (no data loss)
@@ -583,24 +719,22 @@ async def _process_all_content_files(
583719
else:
584720
failed_files += 1
585721

586-
# Update statistics
722+
# Update statistics from progress tracker
587723
if stats_callback:
588-
stats_callback({
589-
'completed_chunks': completed_files,
590-
'failed_chunks': failed_files
591-
})
724+
stats_callback(progress_tracker.get_stats().to_dict())
592725

593726
# Save checkpoint after each file
594727
if checkpoint_manager and translation_id:
728+
stats = progress_tracker.get_stats()
595729
checkpoint_manager.save_checkpoint(
596730
translation_id=translation_id,
597731
chunk_index=file_idx + 1,
598732
original_text=content_href,
599733
translated_text=content_href,
600734
chunk_data={'last_file': content_href},
601-
total_chunks=len(content_files),
602-
completed_chunks=completed_files,
603-
failed_chunks=failed_files
735+
total_chunks=stats.total_chunks,
736+
completed_chunks=stats.completed_chunks,
737+
failed_chunks=stats.failed_chunks
604738
)
605739

606740
return {

src/core/epub/xhtml_translator.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -603,9 +603,6 @@ async def _translate_all_chunks(
603603
translated_chunks = []
604604

605605
for i, chunk in enumerate(chunks):
606-
if progress_callback:
607-
progress_callback((i / len(chunks)) * 100)
608-
609606
translated = await translate_chunk_with_fallback(
610607
chunk_text=chunk['text'],
611608
local_tag_map=chunk['local_tag_map'],
@@ -622,6 +619,10 @@ async def _translate_all_chunks(
622619
)
623620
translated_chunks.append(translated)
624621

622+
# Report progress after completing each chunk
623+
if progress_callback:
624+
progress_callback(((i + 1) / len(chunks)) * 100)
625+
625626
return translated_chunks, stats
626627

627628

0 commit comments

Comments
 (0)