1212import zipfile
1313import tempfile
1414import aiofiles
15- from typing import Dict , Any , Optional , Callable , Tuple
15+ from typing import Dict , Any , Optional , Callable , Tuple , List
1616from lxml import etree
1717from tqdm .auto import tqdm
1818
@@ -425,15 +425,8 @@ async def _translate_single_file(
425425 log_callback ("epub_file_translate_start" ,
426426 f"Translating file { file_idx + 1 } /{ total_files } : { content_href } " )
427427
428- # Create file-specific progress callback
429- def file_progress_callback (file_progress ):
430- if progress_callback :
431- # Map file progress (0-100) to overall progress
432- base_progress = ((file_idx / total_files ) * 90 ) + 5
433- file_contribution = (file_progress / 100 ) * (90 / total_files )
434- progress_callback (base_progress + file_contribution )
435-
436428 # Translate using simplified mode
429+ # Note: progress_callback is now token-aware wrapper from _process_all_content_files
437430 success = await translate_xhtml_simplified (
438431 doc_root = doc_root ,
439432 source_language = source_language ,
@@ -442,7 +435,7 @@ def file_progress_callback(file_progress):
442435 llm_client = llm_client ,
443436 max_tokens_per_chunk = max_tokens_per_chunk ,
444437 log_callback = log_callback ,
445- progress_callback = file_progress_callback ,
438+ progress_callback = progress_callback , # Pass through token-based wrapper directly
446439 context_manager = context_manager ,
447440 max_retries = max_attempts
448441 )
@@ -470,6 +463,82 @@ def file_progress_callback(file_progress):
470463 return None , file_path_abs , False
471464
472465
async def _count_all_chunks(
    content_files: list,
    opf_dir: str,
    max_tokens_per_chunk: int,
    log_callback: Optional[Callable] = None
) -> List[Tuple[str, List[Dict], int]]:
    """
    Pre-count the translation chunks of every XHTML content file.

    Each file is read, parsed and run through the setup / tag-preservation /
    chunking helpers of ``xhtml_translator`` so that progress can be tracked
    by chunk and token count rather than by file count.

    Args:
        content_files: Content file hrefs, resolved relative to ``opf_dir``.
        opf_dir: Directory the hrefs are joined onto.
        max_tokens_per_chunk: Token budget handed to the chunker.
        log_callback: Optional ``(event_name, message)`` logger.

    Returns:
        One ``(file_href, chunks, total_tokens)`` tuple per entry of
        ``content_files``, in the same order. Files that are missing, have
        no body, or raise while being pre-counted are reported as
        ``(file_href, [], 0)`` so positions stay aligned with
        ``content_files``.
    """
    from .xhtml_translator import _setup_translation, _preserve_tags, _create_chunks

    # Number of files listed individually in the debug breakdown below.
    preview_limit = 5

    file_chunk_info: List[Tuple[str, List[Dict], int]] = []
    total_chunks_all_files = 0
    # Built on first use *inside* the per-file try block: the counter is
    # loop-invariant, but a failure to import/construct it must remain a
    # logged, per-file best-effort error instead of aborting the pre-count.
    token_counter = None

    if log_callback:
        log_callback("epub_precount_start", f"📊 Pre-counting chunks across {len(content_files)} files...")

    for content_href in content_files:
        file_path_abs = os.path.normpath(os.path.join(opf_dir, content_href))
        if not os.path.exists(file_path_abs):
            # File not found: record an empty entry to keep indices aligned.
            file_chunk_info.append((content_href, [], 0))
            continue

        try:
            async with aiofiles.open(file_path_abs, 'r', encoding='utf-8') as f:
                content = await f.read()

            parser = etree.XMLParser(encoding='utf-8', recover=True, remove_blank_text=False)
            doc_root = etree.fromstring(content.encode('utf-8'), parser)

            # Extract the body; the body element itself is not needed here.
            body_html, _body_element, tag_preserver = _setup_translation(doc_root, log_callback, None)

            if not body_html:
                file_chunk_info.append((content_href, [], 0))
                continue

            # Replace tags with placeholders, then split into chunks.
            text_with_placeholders, global_tag_map, _ = _preserve_tags(body_html, tag_preserver, log_callback)
            chunks = _create_chunks(text_with_placeholders, global_tag_map, max_tokens_per_chunk, log_callback, None)

            if token_counter is None:
                from src.core.chunking.token_chunker import TokenChunker
                token_counter = TokenChunker(max_tokens=max_tokens_per_chunk)

            # Total tokens for this file.
            file_total_tokens = sum(token_counter.count_tokens(chunk['text']) for chunk in chunks)

            file_chunk_info.append((content_href, chunks, file_total_tokens))
            total_chunks_all_files += len(chunks)

        except Exception as e:
            # Pre-counting only feeds progress reporting, so a bad file is
            # logged and counted as empty rather than propagated.
            if log_callback:
                log_callback("epub_precount_error", f"Error pre-counting chunks in '{content_href}': {e}")
            file_chunk_info.append((content_href, [], 0))

    if log_callback:
        log_callback("epub_precount_complete",
                     f"📊 Pre-count complete: {total_chunks_all_files} total chunks across {len(content_files)} files")
        # Per-file breakdown for debugging (first few files only).
        for position, (href, file_chunks, tokens) in enumerate(file_chunk_info[:preview_limit], start=1):
            log_callback("epub_precount_file_detail",
                         f" File {position}: {href} → {len(file_chunks)} chunks, {tokens} tokens")
        if len(file_chunk_info) > preview_limit:
            log_callback("epub_precount_more", f" ... and {len(file_chunk_info) - preview_limit} more files")

    return file_chunk_info
540+
541+
473542async def _process_all_content_files (
474543 content_files : list ,
475544 opf_dir : str ,
@@ -488,7 +557,7 @@ async def _process_all_content_files(
488557 stats_callback : Optional [Callable ] = None ,
489558 check_interruption_callback : Optional [Callable ] = None
490559) -> Dict :
491- """Process all XHTML content files.
560+ """Process all XHTML content files with accurate token-based progress tracking .
492561
493562 Args:
494563 content_files: List of content file hrefs
@@ -511,17 +580,41 @@ async def _process_all_content_files(
511580 Returns:
512581 Dictionary with processing results and parsed documents
513582 """
583+ from src .core .progress_tracker import TokenProgressTracker
584+
585+ # Pre-count all chunks across all files for accurate progress
586+ file_chunk_info = await _count_all_chunks (content_files , opf_dir , max_tokens_per_chunk , log_callback )
587+
588+ # Initialize token-based progress tracker
589+ progress_tracker = TokenProgressTracker ()
590+ progress_tracker .start ()
591+
592+ # Register all chunks with their token counts
593+ total_registered_chunks = 0
594+ total_registered_tokens = 0
595+ for file_href , chunks , file_tokens in file_chunk_info :
596+ if chunks :
597+ from src .core .chunking .token_chunker import TokenChunker
598+ token_counter = TokenChunker (max_tokens = max_tokens_per_chunk )
599+ for chunk in chunks :
600+ token_count = token_counter .count_tokens (chunk ['text' ])
601+ progress_tracker .register_chunk (token_count )
602+ total_registered_chunks += 1
603+ total_registered_tokens += token_count
604+
605+ if log_callback :
606+ log_callback ("epub_tracker_initialized" ,
607+ f"📈 Progress tracker initialized: { total_registered_chunks } chunks, { total_registered_tokens } tokens" )
608+
609+ # Initial stats
610+ if stats_callback :
611+ stats_callback (progress_tracker .get_stats ().to_dict ())
612+
514613 parsed_xhtml_docs : Dict [str , etree ._Element ] = {}
515614 total_files = len (content_files )
516615 completed_files = 0
517616 failed_files = 0
518-
519- if stats_callback :
520- stats_callback ({
521- 'total_chunks' : total_files ,
522- 'completed_chunks' : 0 ,
523- 'failed_chunks' : 0
524- })
617+ global_chunk_index = 0 # Track global chunk index across all files
525618
526619 iterator = tqdm (
527620 enumerate (content_files ),
@@ -538,17 +631,52 @@ async def _process_all_content_files(
538631 f"Translation interrupted at file { file_idx + 1 } /{ total_files } " )
539632 break
540633
634+ # Get chunk info for this file
635+ _ , file_chunks , _ = file_chunk_info [file_idx ]
636+ file_chunk_count = len (file_chunks )
637+
541638 # Skip already processed files on resume
542639 if file_idx < resume_from_index :
543640 completed_files += 1
641+ # Mark all chunks in this file as completed for progress tracker
642+ for chunk_idx in range (file_chunk_count ):
643+ progress_tracker .mark_completed (global_chunk_index + chunk_idx , 0.0 )
644+ global_chunk_index += file_chunk_count
544645 continue
545646
546- # Update progress
547- if progress_callback :
548- progress_percent = ((file_idx / total_files ) * 90 ) + 5
549- progress_callback (progress_percent )
647+ # Create wrapper progress callback
648+ # The existing code calls progress_callback with percentage (0-100) per file
649+ # We intercept that and update the global tracker based on chunk completion
650+ file_start_chunk_idx = global_chunk_index
651+ last_reported_chunk = [0 ] # Mutable to capture in closure
652+
def file_progress_wrapper(file_percent: float):
    """
    Convert file-level progress (0-100%) into chunk-level tracker updates.

    Reads ``file_chunk_count``, ``file_start_chunk_idx``,
    ``last_reported_chunk``, ``progress_tracker``, ``progress_callback`` and
    ``stats_callback`` from the enclosing scope.

    file_percent: Progress within the current file (0-100).
    """
    if file_chunk_count == 0:
        return

    # Estimate how many of this file's chunks are done from the percentage.
    estimated_done = int((file_percent / 100) * file_chunk_count)

    # Clamp to the file's chunk count and never move backwards: a negative
    # or regressing percentage must not rewind the counter, otherwise
    # chunks would be marked completed a second time (or indices before
    # this file's range would be touched) on a later call.
    chunks_completed_in_file = max(
        last_reported_chunk[0],
        min(estimated_done, file_chunk_count),
    )

    # Mark the chunks newly completed since the previous call.
    for chunk_offset in range(last_reported_chunk[0], chunks_completed_in_file):
        actual_global_idx = file_start_chunk_idx + chunk_offset
        progress_tracker.mark_completed(actual_global_idx, 0.0)  # No time measurement available

    last_reported_chunk[0] = chunks_completed_in_file

    # Publish overall progress and statistics from the tracker.
    if progress_callback:
        progress_callback(progress_tracker.get_progress_percent())
    if stats_callback:
        stats_callback(progress_tracker.get_stats().to_dict())
678+
679+ # Translate the file (reuse existing function)
552680 doc_root , file_path_abs , success = await _translate_single_file (
553681 file_idx = file_idx ,
554682 content_href = content_href ,
@@ -563,9 +691,17 @@ async def _process_all_content_files(
563691 translation_id = translation_id ,
564692 total_files = total_files ,
565693 log_callback = log_callback ,
566- progress_callback = progress_callback
694+ progress_callback = file_progress_wrapper # Use our wrapper
567695 )
568696
697+ # Ensure all chunks in this file are marked complete after translation finishes
698+ for chunk_offset in range (last_reported_chunk [0 ], file_chunk_count ):
699+ actual_global_idx = file_start_chunk_idx + chunk_offset
700+ progress_tracker .mark_completed (actual_global_idx , 0.0 )
701+
702+ # Advance global chunk index by number of chunks in this file
703+ global_chunk_index += file_chunk_count
704+
569705 # Save the document if translation succeeded
570706 # Note: doc_root is modified in-place only if _replace_body succeeds
571707 # If it fails, doc_root still contains the original content (no data loss)
@@ -583,24 +719,22 @@ async def _process_all_content_files(
583719 else :
584720 failed_files += 1
585721
586- # Update statistics
722+ # Update statistics from progress tracker
587723 if stats_callback :
588- stats_callback ({
589- 'completed_chunks' : completed_files ,
590- 'failed_chunks' : failed_files
591- })
724+ stats_callback (progress_tracker .get_stats ().to_dict ())
592725
593726 # Save checkpoint after each file
594727 if checkpoint_manager and translation_id :
728+ stats = progress_tracker .get_stats ()
595729 checkpoint_manager .save_checkpoint (
596730 translation_id = translation_id ,
597731 chunk_index = file_idx + 1 ,
598732 original_text = content_href ,
599733 translated_text = content_href ,
600734 chunk_data = {'last_file' : content_href },
601- total_chunks = len ( content_files ) ,
602- completed_chunks = completed_files ,
603- failed_chunks = failed_files
735+ total_chunks = stats . total_chunks ,
736+ completed_chunks = stats . completed_chunks ,
737+ failed_chunks = stats . failed_chunks
604738 )
605739
606740 return {
0 commit comments