Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ dependencies = [
"isort==5.13.2",
"tomli==2.2.1",
"claude-agent-sdk>=0.1.0",
"tiktoken==0.12.0",
"genai-prices==0.0.51",
]
classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
128 changes: 86 additions & 42 deletions python_gpt_po/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@
import traceback
from argparse import Namespace
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

from .models.config import TranslationConfig, TranslationFlags
from .models.enums import ModelProvider
from .models.provider_clients import ProviderClients
from .services.language_detector import LanguageDetector
from .services.model_manager import ModelManager
from .services.translation_service import TranslationService
from .utils.cli import (auto_select_provider, create_language_mapping, get_provider_from_args, parse_args,
show_help_and_exit, validate_provider_key)
from .utils.config_loader import ConfigLoader
from .utils.cost_estimator import CostEstimator


def setup_logging(verbose: int = 0, quiet: bool = False):
Expand Down Expand Up @@ -53,20 +53,11 @@ def setup_logging(verbose: int = 0, quiet: bool = False):
logging.getLogger().setLevel(level)


def initialize_provider(args: Namespace) -> tuple[ProviderClients, ModelProvider, str]:
def get_offline_provider_info(args: Namespace) -> Tuple[Any, Any, str]:
"""
Initialize the provider client and determine the appropriate model.

Args:
args: Command line arguments from argparse

Returns:
tuple: (provider_clients, provider, model)

Raises:
SystemExit: If no valid provider can be found or initialized
Get provider and model information without making network calls.
"""
# Initialize provider clients
# Initialize provider clients (reads environment variables and args)
provider_clients = ProviderClients()
api_keys = provider_clients.initialize_clients(args)

Expand All @@ -82,40 +73,43 @@ def initialize_provider(args: Namespace) -> tuple[ProviderClients, ModelProvider
if not validate_provider_key(provider, api_keys):
sys.exit(1)

# Determine model - use CLI arg or default
model = args.model
if not model:
model = ModelManager.get_default_model(provider)

return provider_clients, provider, model


def initialize_provider(args: Namespace, provider_clients: Any, provider: Any, model: str) -> Tuple[Any, Any, str]:
"""
Finalize provider initialization with network validation if needed.
"""
# Create model manager for model operations
model_manager = ModelManager()

# List models if requested and exit
# List models if requested and exit (this makes network calls)
if args.list_models:
models = model_manager.get_available_models(provider_clients, provider)
print(f"Available models for {provider.value}:")
for model in models:
print(f" - {model}")
for m in models:
print(f" - {m}")
sys.exit(0)

# Determine appropriate model
model = get_appropriate_model(provider, provider_clients, model_manager, args.model)
# Validate model (this makes network calls)
final_model = get_appropriate_model(provider, provider_clients, model_manager, model)

return provider_clients, provider, model
return provider_clients, provider, final_model


def get_appropriate_model(
provider: ModelProvider,
provider_clients: ProviderClients,
model_manager: ModelManager,
provider: Any,
provider_clients: Any,
model_manager: Any,
requested_model: Optional[str]
) -> str:
"""
Get the appropriate model for the provider.

Args:
provider (ModelProvider): The selected provider
provider_clients (ProviderClients): The initialized provider clients
model_manager (ModelManager): The model manager instance
requested_model (Optional[str]): Model requested by the user

Returns:
str: The appropriate model ID
"""
# If a specific model was requested, validate it
if requested_model:
Expand Down Expand Up @@ -143,7 +137,7 @@ def get_appropriate_model(
@dataclass
class TranslationTask:
"""Parameters for translation processing."""
config: TranslationConfig
config: Any
folder: str
languages: List[str]
detail_languages: Dict[str, str]
Expand All @@ -154,9 +148,6 @@ class TranslationTask:
def process_translations(task: TranslationTask):
"""
Process translations for the given task parameters.

Args:
task: TranslationTask containing all processing parameters
"""
# Initialize translation service
translation_service = TranslationService(task.config, task.batch_size)
Expand Down Expand Up @@ -192,12 +183,9 @@ def main():
setup_logging(verbose=args.verbose, quiet=args.quiet)

try:
# Initialize provider
provider_clients, provider, model = initialize_provider(args)

# Get languages - either from args or auto-detect from PO files
# 1. Get languages (Pure logic)
try:
respect_gitignore = not args.no_gitignore # Invert the flag
respect_gitignore = not args.no_gitignore
languages = LanguageDetector.validate_or_detect_languages(
folder=args.folder,
lang_arg=args.lang,
Expand All @@ -208,7 +196,63 @@ def main():
logging.error(str(e))
sys.exit(1)

# Create mapping between language codes and detailed names
# 2. Extract model name for offline estimation (Purely offline)
# Defaults to gpt-4o-mini if not specified. Avoids ModelManager to prevent early side-effects.
estimated_model = args.model or "gpt-4o-mini"

# 3. Estimate cost if requested (Strictly Offline Terminal Flow)
if args.estimate_cost:
estimation = CostEstimator.estimate_cost(
args.folder,
languages,
estimated_model,
fix_fuzzy=args.fix_fuzzy,
respect_gitignore=respect_gitignore
)

print(f"\n{'=' * 40}")
print(" OFFLINE TOKEN ESTIMATION REPORT")
print(f"{'=' * 40}")
print(f"Model: {estimation['model']}")
print(f"Rate: {estimation['rate_info']}")
print(f"Unique msgids: {estimation['unique_texts']:,}")
print(f"Total Tokens: {estimation['total_tokens']:,} (estimated expansion included)")

if estimation['estimated_cost'] is not None:
print(f"Estimated Cost: ${estimation['estimated_cost']:.4f}")

print("\nPer-language Breakdown:")
for lang, data in estimation['breakdown'].items():
cost_str = f"${data['cost']:.4f}" if data['cost'] is not None else "unavailable"
print(f" - {lang:5}: {data['tokens']:8,} tokens | {cost_str}")

print("\nNote: Cost estimates are approximate and may not reflect current provider pricing.")
print(f"{'=' * 40}\n")

if estimation['total_tokens'] == 0:
logging.info("No entries require translation.")
return

if not args.yes:
confirm = input("Run actual translation with these settings? (y/n): ").lower()
if confirm != 'y':
logging.info("Cancelled by user.")
return

# Issue #57: Hard exit after estimation to ensure zero side effects.
# Estimation is a terminal dry-run. This prevents "Registered provider" logs
# or connection attempts from leaking into the audit output.
print(
"\n[Audit Successful] To proceed with actual translation, "
"run the command again WITHOUT --estimate-cost."
)
return

# 4. Initialize providers (Online Execution Path Starts Here)
provider_clients, provider, final_model_id = get_offline_provider_info(args)
provider_clients, provider, model = initialize_provider(args, provider_clients, provider, final_model_id)

# 5. Create mapping between language codes and detailed names
try:
detail_languages = create_language_mapping(languages, args.detail_lang)
except ValueError as e:
Expand Down
13 changes: 9 additions & 4 deletions python_gpt_po/services/po_file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,20 @@ def get_file_language(po_file_path, po_file, languages, folder_language):

if folder_language:
for part in po_file_path.split(os.sep):
# Try variants of the folder part
variant_match = POFileHandler._try_language_variants(part, languages)
# Clean part (strip .po if it's the filename)
clean_part = part
if part.endswith('.po'):
clean_part = part[:-3]

# Try variants of the folder/file part
variant_match = POFileHandler._try_language_variants(clean_part, languages)
if variant_match:
logging.info("Inferred language for .po file: %s as %s", po_file_path, variant_match)
return variant_match

# Try base language fallback
if not POFileHandler._should_skip_fallback(part):
norm_part = POFileHandler.normalize_language_code(part)
if not POFileHandler._should_skip_fallback(clean_part):
norm_part = POFileHandler.normalize_language_code(clean_part)
if norm_part and norm_part in languages:
logging.info("Inferred language for .po file: %s as %s (base of %s)",
po_file_path, norm_part, part)
Expand Down
66 changes: 66 additions & 0 deletions python_gpt_po/tests/unit/test_cost_estimator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os
import shutil
import unittest

import polib

from python_gpt_po.utils.cost_estimator import CostEstimator


class TestCostEstimatorMinimal(unittest.TestCase):
def setUp(self):
self.test_dir = os.path.abspath("test_cost_est_minimal")
if os.path.exists(self.test_dir):
shutil.rmtree(self.test_dir)
os.makedirs(self.test_dir)

def tearDown(self):
if os.path.exists(self.test_dir):
shutil.rmtree(self.test_dir)

def test_minimal_token_math(self):
"""Verify tokenize once and multiply by languages."""
po_path = os.path.join(self.test_dir, "test.po")
po = polib.POFile()
# "Hello" is approx 1-2 tokens.
po.append(polib.POEntry(msgid="Hello", msgstr=""))
po.save(po_path)

# 1 language
est1 = CostEstimator.estimate_cost(self.test_dir, ["fr"], "gpt-4o-mini")
t1 = est1['total_tokens']

# 3 languages
est3 = CostEstimator.estimate_cost(self.test_dir, ["fr", "es", "de"], "gpt-4o-mini")
t3 = est3['total_tokens']

self.assertEqual(t3, t1 * 3)

def test_pricing_lookup(self):
"""Verify dynamic pricing lookup via genai-prices."""
po_path = os.path.join(self.test_dir, "test.po")
po = polib.POFile()
po.append(polib.POEntry(msgid="Test", msgstr=""))
po.save(po_path)

# Known model
est_known = CostEstimator.estimate_cost(self.test_dir, ["fr"], "gpt-4o-mini")
self.assertIsNotNone(est_known['estimated_cost'])

# Unknown model
est_unknown = CostEstimator.estimate_cost(self.test_dir, ["fr"], "unknown-model")
self.assertIsNone(est_unknown['estimated_cost'])

def test_zero_work(self):
"""Verify zero tokens when everything is translated."""
po_path = os.path.join(self.test_dir, "test.po")
po = polib.POFile()
po.append(polib.POEntry(msgid="Hello", msgstr="Bonjour"))
po.save(po_path)

est = CostEstimator.estimate_cost(self.test_dir, ["fr"], "gpt-4o-mini")
self.assertEqual(est['total_tokens'], 0)


if __name__ == '__main__':
unittest.main()
10 changes: 10 additions & 0 deletions python_gpt_po/utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,16 @@ def parse_args() -> Namespace:
metavar="SIZE",
help="Number of strings to translate in each batch (default: 50)"
)
advanced_group.add_argument(
"--estimate-cost",
action="store_true",
help="Estimate token usage and cost before translating"
)
advanced_group.add_argument(
"-y", "--yes",
action="store_true",
help="Skip confirmation prompt when using --estimate-cost"
)
fuzzy_group.add_argument(
"--fuzzy",
action="store_true",
Expand Down
Loading