Skip to content

Commit 2665178

Browse files
committed
.
1 parent be51307 commit 2665178

File tree

5 files changed

+309
-31
lines changed

5 files changed

+309
-31
lines changed

src/core/epub/translation_metrics.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,27 @@ class TranslationMetrics:
2020
1. Phase 1: Normal translation (with retry attempts)
2121
2. Phase 2: Token alignment fallback (translate without placeholders, reinsert proportionally)
2222
3. Phase 3: Untranslated fallback (if all retries fail, returns original text)
23+
24+
Refinement support:
25+
- When refinement is enabled, total_chunks represents the ORIGINAL chunk count
26+
- enable_refinement flag tracks if this is a two-phase workflow
27+
- In refinement phase, use refinement_chunks_completed to track progress
2328
"""
2429
# === Counts ===
2530
total_chunks: int = 0
2631
successful_first_try: int = 0
2732
successful_after_retry: int = 0
2833
fallback_used: int = 0 # Phase 3: Chunks returned untranslated after all phases failed
2934
failed_chunks: int = 0
35+
36+
# === Progress tracking ===
37+
processed_chunks: int = 0 # Chunks fully processed (regardless of success/failure)
38+
# This is used for progress calculation to avoid fluctuations during retries
39+
40+
# === Refinement tracking ===
41+
enable_refinement: bool = False # If True, this is a two-phase workflow
42+
refinement_phase: bool = False # If True, currently in refinement phase
43+
refinement_chunks_completed: int = 0 # Chunks completed in refinement phase
3044

3145
# === Retry & Error Tracking ===
3246
retry_attempts: int = 0 # Total number of retry attempts made
@@ -97,6 +111,13 @@ def record_failure(self, chunk_size: int) -> None:
97111
# Note: total_chunks is initialized in _translate_all_chunks, not incremented here
98112
self.failed_chunks += 1
99113
self._update_chunk_stats(chunk_size)
114+
115+
def record_processed(self) -> None:
    """Mark one chunk as fully processed, whether it succeeded or failed.

    Progress is derived from this counter so the progress bar advances
    monotonically and never fluctuates during retry attempts.
    """
    self.processed_chunks = self.processed_chunks + 1
100121

101122
def _update_chunk_stats(self, chunk_size: int) -> None:
102123
"""Update chunk size statistics."""
@@ -139,10 +160,37 @@ def first_try_rate(self) -> float:
139160
return self.successful_first_try / self.total_chunks
140161

141162
def to_dict(self) -> Dict:
142-
"""Convert metrics to dictionary for serialization."""
163+
"""Convert metrics to dictionary for serialization.
164+
165+
For two-phase workflows (translation + refinement):
166+
- total_chunks is doubled to reflect both phases
167+
- completed_chunks accounts for both translation and refinement progress
168+
- Phase 1 (translation): 0-50% of total work (0 to N chunks)
169+
- Phase 2 (refinement): 50-100% of total work (N to 2N chunks)
170+
171+
Note: We use processed_chunks for translation progress to avoid fluctuations
172+
during retries. A chunk is only counted when fully processed (success or failure).
173+
"""
174+
# Calculate total chunks and completed chunks based on refinement status
175+
if self.enable_refinement:
176+
# Two-phase workflow: double the total chunks
177+
effective_total_chunks = self.total_chunks * 2
178+
179+
if self.refinement_phase:
180+
# In refinement phase: translation complete (N) + refinement progress
181+
effective_completed = self.total_chunks + self.refinement_chunks_completed
182+
else:
183+
# In translation phase: use processed_chunks to avoid retry fluctuations
184+
effective_completed = self.processed_chunks
185+
else:
186+
# Single-phase workflow: no adjustment needed
187+
effective_total_chunks = self.total_chunks
188+
# Use processed_chunks for consistent progress tracking
189+
effective_completed = self.processed_chunks
190+
143191
return {
144-
"total_chunks": self.total_chunks,
145-
"completed_chunks": self.successful_first_try + self.successful_after_retry,
192+
"total_chunks": effective_total_chunks,
193+
"completed_chunks": effective_completed,
146194
"successful_first_try": self.successful_first_try,
147195
"successful_after_retry": self.successful_after_retry,
148196
"fallback_used": self.fallback_used,
@@ -165,7 +213,13 @@ def to_dict(self) -> Dict:
165213
"max_chunk_size": self.max_chunk_size,
166214
"success_rate": self.success_rate,
167215
"first_try_rate": self.first_try_rate,
168-
"retry_distribution": self.retry_distribution
216+
"retry_distribution": self.retry_distribution,
217+
# Add refinement info for debugging
218+
"enable_refinement": self.enable_refinement,
219+
"refinement_phase": self.refinement_phase,
220+
"refinement_chunks_completed": self.refinement_chunks_completed,
221+
# Progress tracking
222+
"processed_chunks": self.processed_chunks
169223
}
170224

171225
@classmethod
@@ -365,6 +419,12 @@ def merge(self, other: 'TranslationMetrics') -> None:
365419
self.total_tokens_processed += other.total_tokens_processed
366420
self.total_tokens_generated += other.total_tokens_generated
367421
self.total_chunk_size += other.total_chunk_size
422+
423+
# Merge refinement tracking (needed for accurate progress across multiple files)
424+
self.refinement_chunks_completed += other.refinement_chunks_completed
425+
426+
# Merge progress tracking
427+
self.processed_chunks += other.processed_chunks
368428

369429
# Merge min/max
370430
if other.min_chunk_size != float('inf'):

src/core/epub/translator.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,10 @@ async def _process_all_content_files(
673673
content_files, opf_dir, max_tokens_per_chunk, log_callback
674674
)
675675

676+
# Check if refinement is enabled - this doubles the total work
677+
enable_refinement = prompt_options and prompt_options.get('refine', False)
678+
effective_total_chunks = total_chunks * 2 if enable_refinement else total_chunks
679+
676680
# Start with restored documents
677681
parsed_xhtml_docs: Dict[str, etree._Element] = restored_docs.copy() if restored_docs else {}
678682
total_files = len(content_files)
@@ -691,7 +695,7 @@ async def _process_all_content_files(
691695
# Send initial stats if resuming (to update UI immediately)
692696
if stats_callback and resume_from_index > 0:
693697
stats_callback({
694-
'total_chunks': total_chunks,
698+
'total_chunks': effective_total_chunks,
695699
'completed_chunks': completed_chunks_global,
696700
'failed_chunks': 0,
697701
'total_tokens': 0
@@ -730,9 +734,14 @@ def file_stats_wrapper(file_stats_dict: Dict):
730734
current_file_completed = file_stats_dict.get('completed_chunks', 0)
731735
global_completed = completed_chunks_global + current_file_completed
732736

737+
# Handle refinement mode: when refinement is enabled, the total work doubles
738+
# (translation phase + refinement phase), so we need to use the doubled total
739+
enable_refinement = file_stats_dict.get('enable_refinement', False)
740+
effective_total = total_chunks * 2 if enable_refinement else total_chunks
741+
733742
# Report combined stats (accumulated + current file)
734743
stats_callback({
735-
'total_chunks': total_chunks,
744+
'total_chunks': effective_total,
736745
'completed_chunks': global_completed,
737746
'failed_chunks': accumulated_stats.failed_chunks + file_stats_dict.get('failed_chunks', 0),
738747
'total_tokens': accumulated_stats.total_tokens_processed + accumulated_stats.total_tokens_generated + file_stats_dict.get('total_tokens_processed', 0) + file_stats_dict.get('total_tokens_generated', 0)
@@ -768,9 +777,21 @@ def file_stats_wrapper(file_stats_dict: Dict):
768777

769778
# Report stats if callback provided
770779
if stats_callback and file_stats:
780+
# Calculate effective completed chunks
781+
# When refinement is enabled, we need to account for both phases
782+
if enable_refinement:
783+
# Calculate base completed chunks (without refinement doubling)
784+
base_completed = accumulated_stats.successful_first_try + accumulated_stats.successful_after_retry
785+
# Add refinement progress if any files have completed refinement
786+
# Note: accumulated_stats.refinement_chunks_completed only tracks current file's refinement
787+
# We need to add completed_chunks_global (which counts base chunks) + any refinement progress
788+
effective_completed = completed_chunks_global + accumulated_stats.refinement_chunks_completed
789+
else:
790+
effective_completed = completed_chunks_global
791+
771792
stats_callback({
772-
'total_chunks': total_chunks,
773-
'completed_chunks': completed_chunks_global,
793+
'total_chunks': effective_total_chunks,
794+
'completed_chunks': effective_completed,
774795
'failed_chunks': accumulated_stats.failed_chunks,
775796
'total_tokens': accumulated_stats.total_tokens_processed + accumulated_stats.total_tokens_generated
776797
})
@@ -804,7 +825,7 @@ def file_stats_wrapper(file_stats_dict: Dict):
804825
'parsed_docs': parsed_xhtml_docs,
805826
'completed_files': completed_files,
806827
'failed_files': failed_files,
807-
'total_chunks': total_chunks,
828+
'total_chunks': effective_total_chunks,
808829
'completed_chunks': completed_chunks_global,
809830
'failed_chunks': accumulated_stats.failed_chunks,
810831
'translation_stats': accumulated_stats

src/core/epub/xhtml_translator.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ async def translate_chunk_with_fallback(
475475
log_callback("retry_success", f"✓ Translation succeeded after {attempt + 1} attempt(s)")
476476

477477
result = placeholder_mgr.restore_to_global(translated, global_indices)
478+
stats.record_processed() # Mark chunk as fully processed
478479
return result
479480
else:
480481
# Track placeholder error
@@ -545,6 +546,7 @@ async def translate_chunk_with_fallback(
545546

546547
# 6. Restore global indices and return
547548
result = placeholder_mgr.restore_to_global(result_with_placeholders, global_indices)
549+
stats.record_processed() # Mark chunk as fully processed
548550
return result
549551
else:
550552
_log_error(log_callback, "phase2_validation_failed", "✗ Phase 2 validation failed")
@@ -565,6 +567,7 @@ async def translate_chunk_with_fallback(
565567

566568
# Return the original chunk_text with global indices restored
567569
result_final = placeholder_mgr.restore_to_global(chunk_text, global_indices)
570+
stats.record_processed() # Mark chunk as fully processed (even on failure)
568571
return result_final
569572

570573

@@ -1102,7 +1105,9 @@ async def _refine_epub_chunks(
11021105
context_manager: Optional[AdaptiveContextManager],
11031106
placeholder_format: Tuple[str, str],
11041107
log_callback: Optional[Callable],
1105-
prompt_options: Optional[Dict]
1108+
prompt_options: Optional[Dict],
1109+
stats_callback: Optional[Callable] = None,
1110+
stats: Optional['TranslationMetrics'] = None
11061111
) -> List[str]:
11071112
"""
11081113
Refine translated EPUB chunks using a second LLM pass.
@@ -1119,7 +1124,10 @@ async def _refine_epub_chunks(
11191124
llm_client: LLM client instance
11201125
context_manager: Optional context manager
11211126
placeholder_format: Placeholder format tuple (prefix, suffix)
1122-
log_callback: Optional logging callback prompt_options: Prompt options dict
1127+
log_callback: Optional logging callback
1128+
prompt_options: Prompt options dict
1129+
stats_callback: Optional callback for progress updates during refinement
1130+
stats: Optional TranslationMetrics to update during refinement
11231131
11241132
Returns:
11251133
List of refined chunk texts
@@ -1265,7 +1273,11 @@ async def _refine_epub_chunks(
12651273
refined_chunks.append(translated_text)
12661274
_log_error(log_callback, "epub_refinement_error", f"Chunk {idx + 1}/{total_chunks}: error during refinement: {e}")
12671275

1268-
# Update progress
1276+
# Update progress after each refinement chunk
1277+
# Since refinement is Phase 2 of a two-phase workflow, increment refinement counter
1278+
if stats_callback and stats:
1279+
stats.refinement_chunks_completed = len(refined_chunks)
1280+
stats_callback(stats.to_dict())
12691281
if log_callback:
12701282
successful_refinements = sum(1 for orig, ref in zip(translated_chunks, refined_chunks) if orig != ref)
12711283
log_callback("epub_refinement_complete",
@@ -1434,6 +1446,10 @@ async def translate_xhtml_simplified(
14341446
# Check if refinement is enabled
14351447
enable_refinement = prompt_options and prompt_options.get('refine')
14361448

1449+
# Configure stats for refinement tracking
1450+
stats.enable_refinement = enable_refinement
1451+
stats.refinement_phase = False # Start in translation phase
1452+
14371453
if log_callback:
14381454
log_callback("epub_refinement_config",
14391455
f"Refinement enabled: {enable_refinement} (prompt_options={prompt_options})")
@@ -1477,6 +1493,10 @@ async def translate_xhtml_simplified(
14771493

14781494
# 4.5. Refinement (optional - only if not interrupted)
14791495
if enable_refinement and translated_chunks:
1496+
# Switch stats to refinement phase
1497+
stats.refinement_phase = True
1498+
stats.refinement_chunks_completed = 0
1499+
14801500
if log_callback:
14811501
log_callback("epub_refinement_start",
14821502
f"✨ Starting EPUB refinement pass to polish translation quality... ({len(translated_chunks)} chunks)")
@@ -1490,7 +1510,9 @@ async def translate_xhtml_simplified(
14901510
context_manager=context_manager,
14911511
placeholder_format=placeholder_format,
14921512
log_callback=log_callback, # Pass through to parent's token tracker
1493-
prompt_options=prompt_options
1513+
prompt_options=prompt_options,
1514+
stats_callback=stats_callback, # Pass stats callback for progress updates
1515+
stats=stats # Pass stats object to update during refinement
14941516
)
14951517

14961518
if refined_result:

src/core/progress_tracker.py

Lines changed: 57 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -120,42 +120,81 @@ def start_refinement_phase(self):
120120

121121
def get_progress_percent(self) -> float:
    """Return overall progress as a percentage of total work.

    Two-phase workflows (``_enable_refinement`` True) treat the total work
    as ``total_tokens * 2``: the translation pass maps onto 0-50% and the
    refinement pass onto 50-100%. Single-phase workflows map tokens
    completed directly onto 0-100%.
    """
    if self._total_tokens == 0:
        return 0.0

    # Fraction of the current pass that is done (0.0 .. 1.0).
    fraction_done = self._completed_tokens / self._total_tokens

    if not self._enable_refinement:
        # Single pass: completed tokens map straight onto 0-100%.
        return fraction_done * 100

    # Two passes: each pass contributes half of the overall progress.
    if self._current_phase == 1:
        return fraction_done * 50.0
    # Refinement pass: the finished translation pass accounts for the
    # first 50%, refinement fills in the second half.
    return 50.0 + fraction_done * 50.0
145153

146154
def get_estimated_remaining_seconds(self) -> float:
    """Estimate the remaining processing time in seconds.

    Before any chunk has completed, a fixed up-front estimate for the
    whole workload is returned; afterwards the calibrated per-token rate
    is used. For two-phase workflows (translation + refinement) the
    estimate covers the remaining work of both passes.
    """
    def cost(chunks: float, tokens: float) -> float:
        # Fixed per-chunk prompt overhead plus token-proportional work.
        return self.FIXED_PROMPT_OVERHEAD * chunks + tokens * self._token_rate

    two_phase = self._enable_refinement

    if self._completed_chunks == 0:
        # No real timing data yet: estimate the entire workload up front
        # (doubled when a refinement pass will follow).
        factor = 2 if two_phase else 1
        return cost(self._total_chunks * factor, self._total_tokens * factor)

    # Work left in the pass we are currently in.
    chunks_left = self._total_chunks - self._completed_chunks
    tokens_left = self._total_tokens - self._completed_tokens
    remaining = cost(chunks_left, tokens_left)

    if two_phase and self._current_phase == 1:
        # Still translating: the entire refinement pass lies ahead too.
        remaining += cost(self._total_chunks, self._total_tokens)
    return remaining
159198

160199
def get_stats(self) -> ProgressStats:
161200
"""Get immutable snapshot of current progress statistics."""

0 commit comments

Comments
 (0)