Skip to content

Commit d46e1a2

Browse files
committed
Docx
1 parent 6aabdc2 commit d46e1a2

19 files changed

+1703
-597
lines changed

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ jinja2>=3.0
1515
langdetect>=1.0.9
1616
Pillow>=10.0.0
1717

18+
# DOCX support
19+
mammoth>=1.0.0
20+
python-docx>=1.1.0
21+
1822
# Chatterbox TTS (GPU-accelerated local TTS) - OPTIONAL
1923
# These dependencies have strict numpy version requirements that may conflict.
2024
# Install separately if needed:

src/api/services/path_validator.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Path validation utilities for secure file operations
33
"""
4+
import os
45
from typing import Tuple
56

67

@@ -23,10 +24,24 @@ def validate_filename(filename: str) -> Tuple[bool, str]:
2324
if not filename:
2425
return False, "Filename cannot be empty"
2526

26-
# Prevent directory traversal
27-
if '..' in filename or filename.startswith('/') or filename.startswith('\\'):
27+
# Prevent directory traversal - check for path separators with '..'
28+
# This allows '...' or '....' but blocks '../' or '..\' patterns
29+
if filename.startswith('/') or filename.startswith('\\'):
30+
return False, "Invalid filename: absolute path not allowed"
31+
32+
# Check for directory traversal patterns
33+
# Block: /.. or \.. (a path separator immediately followed by '..')
34+
if '/../' in filename or '\\..\\' in filename or '/..' in filename or '\\..' in filename:
2835
return False, "Invalid filename: directory traversal not allowed"
2936

37+
# Also check whether the raw filename contains path separators (no normalization is applied)
38+
# This catches cases like "foo/../bar" or "foo/bar"
39+
if os.path.sep in filename or ('/' in filename or '\\' in filename):
40+
# Exception: allow if it's just the filename itself (no actual traversal)
41+
# Use os.path.basename to check if it's a pure filename
42+
if os.path.basename(filename) != filename:
43+
return False, "Invalid filename: path separators not allowed"
44+
3045
# Check filename length
3146
if len(filename) > PathValidator.MAX_FILENAME_LENGTH:
3247
return False, f"Filename too long (max {PathValidator.MAX_FILENAME_LENGTH} characters)"

src/core/adapters/translate_file.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,40 @@ async def translate_file(
137137
)
138138
return True # Legacy function doesn't return success status
139139

140+
# DOCX translation using EPUB pipeline (Phase 1 implementation)
141+
# Similar to EPUB, DOCX requires HTML chunking, tag preservation, etc.
142+
# This reuses the EPUB pipeline for rapid deployment
143+
if ext == '.docx':
144+
from src.core.docx.translator import translate_docx_file
145+
from src.core.llm import create_llm_provider
146+
147+
# Create LLM client
148+
llm_client = create_llm_provider(
149+
provider_type=llm_provider,
150+
endpoint=llm_api_endpoint,
151+
model=model_name,
152+
gemini_api_key=gemini_api_key,
153+
openai_api_key=openai_api_key,
154+
openrouter_api_key=openrouter_api_key
155+
)
156+
157+
result = await translate_docx_file(
158+
input_filepath=input_filepath,
159+
output_filepath=output_filepath,
160+
source_language=source_language,
161+
target_language=target_language,
162+
model_name=model_name,
163+
llm_client=llm_client,
164+
max_tokens_per_chunk=context_window or 450,
165+
log_callback=log_callback,
166+
progress_callback=progress_callback,
167+
stats_callback=stats_callback,
168+
prompt_options=prompt_options,
169+
max_retries=1,
170+
context_manager=None
171+
)
172+
return result['success']
173+
140174
# Map file extensions to adapters
141175
adapter_map = {
142176
'.txt': TxtAdapter,
@@ -146,7 +180,7 @@ async def translate_file(
146180

147181
adapter_class = adapter_map.get(ext)
148182
if not adapter_class:
149-
supported = ', '.join(['.txt', '.srt', '.epub']) # Include .epub in supported formats
183+
supported = ', '.join(['.txt', '.srt', '.epub', '.docx']) # Include .epub and .docx in supported formats
150184
raise UnsupportedFormatError(
151185
f"Unsupported file format: {ext}. Supported formats: {supported}"
152186
)
@@ -214,6 +248,7 @@ def get_file_type_from_path(filepath: str) -> str:
214248
'.txt': 'txt',
215249
'.srt': 'srt',
216250
'.epub': 'epub',
251+
'.docx': 'docx',
217252
}
218253

219254
return type_map.get(ext, 'unknown')
@@ -266,6 +301,7 @@ async def build_translated_output(
266301
'txt': TxtAdapter,
267302
'srt': SrtAdapter,
268303
'epub': EpubAdapter,
304+
# Note: docx doesn't support checkpoint reconstruction yet
269305
}
270306

271307
adapter_class = adapter_map.get(file_type)

src/core/common/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""
2+
Common translation components.
3+
4+
This module contains reusable components for translation:
5+
- GenericTranslationOrchestrator: Unified pipeline for all formats
6+
- TranslationAdapter: Interface for format-specific adapters
7+
- TranslationMetrics: Shared metrics tracking (not re-exported by this package's __all__)
8+
"""
9+
10+
from .translation_orchestrator import (
11+
GenericTranslationOrchestrator,
12+
TranslationAdapter
13+
)
14+
15+
__all__ = [
16+
'GenericTranslationOrchestrator',
17+
'TranslationAdapter',
18+
]

0 commit comments

Comments
 (0)