Skip to content

Commit 2665178

Browse files
committed
.
1 parent be51307 commit 2665178

File tree

5 files changed

+309
-31
lines changed

5 files changed

+309
-31
lines changed

src/core/epub/translation_metrics.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,27 @@ class TranslationMetrics:
2020
1. Phase 1: Normal translation (with retry attempts)
2121
2. Phase 2: Token alignment fallback (translate without placeholders, reinsert proportionally)
2222
3. Phase 3: Untranslated fallback (if all retries fail, returns original text)
23+
24+
Refinement support:
25+
- When refinement is enabled, total_chunks represents the ORIGINAL chunk count
26+
- enable_refinement flag tracks if this is a two-phase workflow
27+
- In refinement phase, use refinement_chunks_completed to track progress
2328
"""
2429
# === Counts ===
2530
total_chunks: int = 0
2631
successful_first_try: int = 0
2732
successful_after_retry: int = 0
2833
fallback_used: int = 0 # Phase 3: Chunks returned untranslated after all phases failed
2934
failed_chunks: int = 0
35+
36+
# === Progress tracking ===
37+
processed_chunks: int = 0 # Chunks fully processed (regardless of success/failure)
38+
# This is used for progress calculation to avoid fluctuations during retries
39+
40+
# === Refinement tracking ===
41+
enable_refinement: bool = False # If True, this is a two-phase workflow
42+
refinement_phase: bool = False # If True, currently in refinement phase
43+
refinement_chunks_completed: int = 0 # Chunks completed in refinement phase
3044

3145
# === Retry & Error Tracking ===
3246
retry_attempts: int = 0 # Total number of retry attempts made
@@ -97,6 +111,13 @@ def record_failure(self, chunk_size: int) -> None:
97111
# Note: total_chunks is initialized in _translate_all_chunks, not incremented here
98112
self.failed_chunks += 1
99113
self._update_chunk_stats(chunk_size)
114+
115+
def record_processed(self) -> None:
    """Mark one chunk as fully processed, whether it succeeded or failed.

    Progress is derived from this counter so the progress bar advances
    monotonically and never fluctuates during retry attempts.
    """
    self.processed_chunks = self.processed_chunks + 1
100121

101122
def _update_chunk_stats(self, chunk_size: int) -> None:
102123
"""Update chunk size statistics."""
@@ -139,10 +160,37 @@ def first_try_rate(self) -> float:
139160
return self.successful_first_try / self.total_chunks
140161

141162
def to_dict(self) -> Dict:
142-
"""Convert metrics to dictionary for serialization."""
163+
"""Convert metrics to dictionary for serialization.
164+
165+
For two-phase workflows (translation + refinement):
166+
- total_chunks is doubled to reflect both phases
167+
- completed_chunks accounts for both translation and refinement progress
168+
- Phase 1 (translation): 0-50% of total work (0 to N chunks)
169+
- Phase 2 (refinement): 50-100% of total work (N to 2N chunks)
170+
171+
Note: We use processed_chunks for translation progress to avoid fluctuations
172+
during retries. A chunk is only counted when fully processed (success or failure).
173+
"""
174+
# Calculate total chunks and completed chunks based on refinement status
175+
if self.enable_refinement:
176+
# Two-phase workflow: double the total chunks
177+
effective_total_chunks = self.total_chunks * 2
178+
179+
if self.refinement_phase:
180+
# In refinement phase: translation complete (N) + refinement progress
181+
effective_completed = self.total_chunks + self.refinement_chunks_completed
182+
else:
183+
# In translation phase: use processed_chunks to avoid retry fluctuations
184+
effective_completed = self.processed_chunks
185+
else:
186+
# Single-phase workflow: no adjustment needed
187+
effective_total_chunks = self.total_chunks
188+
# Use processed_chunks for consistent progress tracking
189+
effective_completed = self.processed_chunks
190+
143191
return {
144-
"total_chunks": self.total_chunks,
145-
"completed_chunks": self.successful_first_try + self.successful_after_retry,
192+
"total_chunks": effective_total_chunks,
193+
"completed_chunks": effective_completed,
146194
"successful_first_try": self.successful_first_try,
147195
"successful_after_retry": self.successful_after_retry,
148196
"fallback_used": self.fallback_used,
@@ -165,7 +213,13 @@ def to_dict(self) -> Dict:
165213
"max_chunk_size": self.max_chunk_size,
166214
"success_rate": self.success_rate,
167215
"first_try_rate": self.first_try_rate,
168-
"retry_distribution": self.retry_distribution
216+
"retry_distribution": self.retry_distribution,
217+
# Add refinement info for debugging
218+
"enable_refinement": self.enable_refinement,
219+
"refinement_phase": self.refinement_phase,
220+
"refinement_chunks_completed": self.refinement_chunks_completed,
221+
# Progress tracking
222+
"processed_chunks": self.processed_chunks
169223
}
170224

171225
@classmethod
@@ -365,6 +419,12 @@ def merge(self, other: 'TranslationMetrics') -> None:
365419
self.total_tokens_processed += other.total_tokens_processed
366420
self.total_tokens_generated += other.total_tokens_generated
367421
self.total_chunk_size += other.total_chunk_size
422+
423+
# Merge refinement tracking (needed for accurate progress across multiple files)
424+
self.refinement_chunks_completed += other.refinement_chunks_completed
425+
426+
# Merge progress tracking
427+
self.processed_chunks += other.processed_chunks
368428

369429
# Merge min/max
370430
if other.min_chunk_size != float('inf'):

src/core/epub/translator.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,10 @@ async def _process_all_content_files(
673673
content_files, opf_dir, max_tokens_per_chunk, log_callback
674674
)
675675

676+
# Check if refinement is enabled - this doubles the total work
677+
enable_refinement = prompt_options and prompt_options.get('refine', False)
678+
effective_total_chunks = total_chunks * 2 if enable_refinement else total_chunks
679+
676680
# Start with restored documents
677681
parsed_xhtml_docs: Dict[str, etree._Element] = restored_docs.copy() if restored_docs else {}
678682
total_files = len(content_files)
@@ -691,7 +695,7 @@ async def _process_all_content_files(
691695
# Send initial stats if resuming (to update UI immediately)
692696
if stats_callback and resume_from_index > 0:
693697
stats_callback({
694-
'total_chunks': total_chunks,
698+
'total_chunks': effective_total_chunks,
695699
'completed_chunks': completed_chunks_global,
696700
'failed_chunks': 0,
697701
'total_tokens': 0
@@ -730,9 +734,14 @@ def file_stats_wrapper(file_stats_dict: Dict):
730734
current_file_completed = file_stats_dict.get('completed_chunks', 0)
731735
global_completed = completed_chunks_global + current_file_completed
732736

737+
# Handle refinement mode: when refinement is enabled, the total work doubles
738+
# (translation phase + refinement phase), so we need to use the doubled total
739+
enable_refinement = file_stats_dict.get('enable_refinement', False)
740+
effective_total = total_chunks * 2 if enable_refinement else total_chunks
741+
733742
# Report combined stats (accumulated + current file)
734743
stats_callback({
735-
'total_chunks': total_chunks,
744+
'total_chunks': effective_total,
736745
'completed_chunks': global_completed,
737746
'failed_chunks': accumulated_stats.failed_chunks + file_stats_dict.get('failed_chunks', 0),
738747
'total_tokens': accumulated_stats.total_tokens_processed + accumulated_stats.total_tokens_generated + file_stats_dict.get('total_tokens_processed', 0) + file_stats_dict.get('total_tokens_generated', 0)
@@ -768,9 +777,21 @@ def file_stats_wrapper(file_stats_dict: Dict):
768777

769778
# Report stats if callback provided
770779
if stats_callback and file_stats:
780+
# Calculate effective completed chunks
781+
# When refinement is enabled, we need to account for both phases
782+
if enable_refinement:
783+
# Calculate base completed chunks (without refinement doubling)
784+
base_completed = accumulated_stats.successful_first_try + accumulated_stats.successful_after_retry
785+
# Add refinement progress if any files have completed refinement
786+
# Note: accumulated_stats.refinement_chunks_completed only tracks current file's refinement
787+
# We need to add completed_chunks_global (which counts base chunks) + any refinement progress
788+
effective_completed = completed_chunks_global + accumulated_stats.refinement_chunks_completed
789+
else:
790+
effective_completed = completed_chunks_global
791+
771792
stats_callback({
772-
'total_chunks': total_chunks,
773-
'completed_chunks': completed_chunks_global,
793+
'total_chunks': effective_total_chunks,
794+
'completed_chunks': effective_completed,
774795
'failed_chunks': accumulated_stats.failed_chunks,
775796
'total_tokens': accumulated_stats.total_tokens_processed + accumulated_stats.total_tokens_generated
776797
})
@@ -804,7 +825,7 @@ def file_stats_wrapper(file_stats_dict: Dict):
804825
'parsed_docs': parsed_xhtml_docs,
805826
'completed_files': completed_files,
806827
'failed_files': failed_files,
807-
'total_chunks': total_chunks,
828+
'total_chunks': effective_total_chunks,
808829
'completed_chunks': completed_chunks_global,
809830
'failed_chunks': accumulated_stats.failed_chunks,
810831
'translation_stats': accumulated_stats

src/core/epub/xhtml_translator.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ async def translate_chunk_with_fallback(
475475
log_callback("retry_success", f"✓ Translation succeeded after {attempt + 1} attempt(s)")
476476

477477
result = placeholder_mgr.restore_to_global(translated, global_indices)
478+
stats.record_processed() # Mark chunk as fully processed
478479
return result
479480
else:
480481
# Track placeholder error
@@ -545,6 +546,7 @@ async def translate_chunk_with_fallback(
545546

546547
# 6. Restore global indices and return
547548
result = placeholder_mgr.restore_to_global(result_with_placeholders, global_indices)
549+
stats.record_processed() # Mark chunk as fully processed
548550
return result
549551
else:
550552
_log_error(log_callback, "phase2_validation_failed", "✗ Phase 2 validation failed")
@@ -565,6 +567,7 @@ async def translate_chunk_with_fallback(
565567

566568
# Return the original chunk_text with global indices restored
567569
result_final = placeholder_mgr.restore_to_global(chunk_text, global_indices)
570+
stats.record_processed() # Mark chunk as fully processed (even on failure)
568571
return result_final
569572

570573

@@ -1102,7 +1105,9 @@ async def _refine_epub_chunks(
11021105
context_manager: Optional[AdaptiveContextManager],
11031106
placeholder_format: Tuple[str, str],
11041107
log_callback: Optional[Callable],
1105-
prompt_options: Optional[Dict]
1108+
prompt_options: Optional[Dict],
1109+
stats_callback: Optional[Callable] = None,
1110+
stats: Optional['TranslationMetrics'] = None
11061111
) -> List[str]:
11071112
"""
11081113
Refine translated EPUB chunks using a second LLM pass.
@@ -1119,7 +1124,10 @@ async def _refine_epub_chunks(
11191124
llm_client: LLM client instance
11201125
context_manager: Optional context manager
11211126
placeholder_format: Placeholder format tuple (prefix, suffix)
1122-
log_callback: Optional logging callback prompt_options: Prompt options dict
1127+
log_callback: Optional logging callback
1128+
prompt_options: Prompt options dict
1129+
stats_callback: Optional callback for progress updates during refinement
1130+
stats: Optional TranslationMetrics to update during refinement
11231131
11241132
Returns:
11251133
List of refined chunk texts
@@ -1265,7 +1273,11 @@ async def _refine_epub_chunks(
12651273
refined_chunks.append(translated_text)
12661274
_log_error(log_callback, "epub_refinement_error", f"Chunk {idx + 1}/{total_chunks}: error during refinement: {e}")
12671275

1268-
# Update progress
1276+
# Update progress after each refinement chunk
1277+
# Since refinement is Phase 2 of a two-phase workflow, increment refinement counter
1278+
if stats_callback and stats:
1279+
stats.refinement_chunks_completed = len(refined_chunks)
1280+
stats_callback(stats.to_dict())
12691281
if log_callback:
12701282
successful_refinements = sum(1 for orig, ref in zip(translated_chunks, refined_chunks) if orig != ref)
12711283
log_callback("epub_refinement_complete",
@@ -1434,6 +1446,10 @@ async def translate_xhtml_simplified(
14341446
# Check if refinement is enabled
14351447
enable_refinement = prompt_options and prompt_options.get('refine')
14361448

1449+
# Configure stats for refinement tracking
1450+
stats.enable_refinement = enable_refinement
1451+
stats.refinement_phase = False # Start in translation phase
1452+
14371453
if log_callback:
14381454
log_callback("epub_refinement_config",
14391455
f"Refinement enabled: {enable_refinement} (prompt_options={prompt_options})")
@@ -1477,6 +1493,10 @@ async def translate_xhtml_simplified(
14771493

14781494
# 4.5. Refinement (optional - only if not interrupted)
14791495
if enable_refinement and translated_chunks:
1496+
# Switch stats to refinement phase
1497+
stats.refinement_phase = True
1498+
stats.refinement_chunks_completed = 0
1499+
14801500
if log_callback:
14811501
log_callback("epub_refinement_start",
14821502
f"✨ Starting EPUB refinement pass to polish translation quality... ({len(translated_chunks)} chunks)")
@@ -1490,7 +1510,9 @@ async def translate_xhtml_simplified(
14901510
context_manager=context_manager,
14911511
placeholder_format=placeholder_format,
14921512
log_callback=log_callback, # Pass through to parent's token tracker
1493-
prompt_options=prompt_options
1513+
prompt_options=prompt_options,
1514+
stats_callback=stats_callback, # Pass stats callback for progress updates
1515+
stats=stats # Pass stats object to update during refinement
14941516
)
14951517

14961518
if refined_result:

src/core/progress_tracker.py

Lines changed: 57 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -120,42 +120,81 @@ def start_refinement_phase(self):
120120

121121
def get_progress_percent(self) -> float:
    """Return overall progress as a percentage of total work.

    Two-phase workflows (``_enable_refinement`` True) treat the total work
    as ``total_tokens * 2``: the translation pass maps onto 0-50% and the
    refinement pass onto 50-100%. Single-phase workflows map tokens
    completed directly onto 0-100%.
    """
    if self._total_tokens == 0:
        return 0.0

    # Fraction of the current pass that is done (0.0 .. 1.0).
    fraction_done = self._completed_tokens / self._total_tokens

    if not self._enable_refinement:
        # Single pass: completed tokens map straight onto 0-100%.
        return fraction_done * 100

    # Two passes: each pass contributes half of the overall progress.
    if self._current_phase == 1:
        return fraction_done * 50.0
    # Refinement pass: the finished translation pass accounts for the
    # first 50%, refinement fills in the second half.
    return 50.0 + fraction_done * 50.0
145153

146154
def get_estimated_remaining_seconds(self) -> float:
    """Estimate the remaining processing time in seconds.

    Before any chunk has completed, a fixed up-front estimate for the
    whole workload is returned; afterwards the calibrated per-token rate
    is used. For two-phase workflows (translation + refinement) the
    estimate covers the remaining work of both passes.
    """
    def cost(chunks: float, tokens: float) -> float:
        # Fixed per-chunk prompt overhead plus token-proportional work.
        return self.FIXED_PROMPT_OVERHEAD * chunks + tokens * self._token_rate

    two_phase = self._enable_refinement

    if self._completed_chunks == 0:
        # No real timing data yet: estimate the entire workload up front
        # (doubled when a refinement pass will follow).
        factor = 2 if two_phase else 1
        return cost(self._total_chunks * factor, self._total_tokens * factor)

    # Work left in the pass we are currently in.
    chunks_left = self._total_chunks - self._completed_chunks
    tokens_left = self._total_tokens - self._completed_tokens
    remaining = cost(chunks_left, tokens_left)

    if two_phase and self._current_phase == 1:
        # Still translating: the entire refinement pass lies ahead too.
        remaining += cost(self._total_chunks, self._total_tokens)
    return remaining
159198

160199
def get_stats(self) -> ProgressStats:
161200
"""Get immutable snapshot of current progress statistics."""

0 commit comments

Comments
 (0)