Skip to content

Commit a1571c0

Browse files
author
bram
committed
Updated gitignore
1 parent 232bbb0 commit a1571c0

File tree

9 files changed

+297
-157
lines changed

9 files changed

+297
-157
lines changed

pyproject.toml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,17 @@ requires-python = ">=3.9"
2222
license = {text = "MIT"}
2323
dependencies = [
2424
"polib==1.2.0",
25-
"openai==1.58.1",
26-
"python-dotenv==1.0.0",
27-
"pytest==8.2.2",
25+
"openai==1.99.9",
26+
"python-dotenv==1.0.1",
27+
"pytest==8.3.4",
2828
"tenacity==9.0.0",
2929
"setuptools-scm==8.1.0",
3030
"pycountry==24.6.1",
31-
"anthropic==0.48.0",
31+
"anthropic==0.63.0",
3232
"requests==2.32.3",
33-
"responses==0.25.6",
34-
"isort==6.0.1",
35-
"tomli>=1.2.0; python_version<'3.11'",
33+
"responses==0.25.8",
34+
"isort==5.13.2",
35+
"tomli==2.2.1",
3636
]
3737
classifiers = [
3838
"Development Status :: 5 - Production/Stable",

python_gpt_po/main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,11 @@ def main():
199199
languages = parse_languages(args.lang)
200200
logging.info("Using specified languages: %s", ', '.join(languages))
201201
else:
202+
respect_gitignore = not args.no_gitignore # Invert the flag
202203
languages = LanguageDetector.detect_languages_from_folder(
203204
args.folder,
204-
use_folder_structure=args.folder_language
205+
use_folder_structure=args.folder_language,
206+
respect_gitignore=respect_gitignore
205207
)
206208
detection_method = "folder structure" if args.folder_language else "metadata"
207209
logging.info("Auto-detected languages from %s: %s", detection_method, ', '.join(languages))

python_gpt_po/models/config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22
Configuration classes for the PO translator application.
33
"""
44

5+
from __future__ import annotations
56
from dataclasses import dataclass
7+
from typing import TYPE_CHECKING
68

79
from .enums import ModelProvider
8-
from .provider_clients import ProviderClients
10+
11+
if TYPE_CHECKING:
12+
from .provider_clients import ProviderClients
913

1014

1115
@dataclass

python_gpt_po/services/language_detector.py

Lines changed: 181 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,76 @@
44

55
import logging
66
import os
7-
import re
8-
from typing import List
7+
from typing import List, Optional, Set
98

109
import polib
10+
import pycountry
11+
12+
from ..utils.gitignore import create_gitignore_parser
1113

1214

1315
class LanguageDetector:
1416
"""Detects languages from PO files in a directory."""
1517

18+
# Cache for valid language codes
19+
_valid_codes_cache: Optional[Set[str]] = None
20+
21+
@classmethod
22+
def _get_valid_language_codes(cls) -> Set[str]:
    """Return the set of language codes the detector accepts.

    The set is built on the first call from pycountry's language and
    country tables plus a fixed list of Django-style codes, stored on
    ``cls._valid_codes_cache``, and returned from that cache afterwards.

    Returns:
        Set of accepted codes: ISO 639 2- and 3-letter codes,
        ``lang_COUNTRY`` / ``lang-COUNTRY`` combinations for a fixed list of
        common languages, and the Django special codes listed below.
    """
    if cls._valid_codes_cache is not None:
        return cls._valid_codes_cache

    valid_codes: Set[str] = set()

    # ISO 639 codes: 2-letter ('en', 'fr') and 3-letter, where pycountry has them.
    for lang in pycountry.languages:
        if hasattr(lang, 'alpha_2'):
            valid_codes.add(lang.alpha_2)
        if hasattr(lang, 'alpha_3'):
            valid_codes.add(lang.alpha_3)

    # Languages combined with every country code below. Defined once here
    # instead of being rebuilt on every iteration of the country loop.
    common_langs = (
        'en', 'es', 'fr', 'de', 'pt', 'zh', 'ar', 'ru', 'ja', 'ko',
        'it', 'nl', 'pl', 'tr', 'sv', 'da', 'no', 'fi', 'uk', 'cs',
        'hu', 'ro', 'bg', 'hr', 'sr', 'sl', 'sk', 'lt', 'lv', 'et',
        'hi', 'ur', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'ne',
    )

    # Locale codes with a country part, in both separators ('en_US', 'en-US').
    for country in pycountry.countries:
        if not hasattr(country, 'alpha_2'):
            continue
        country_code = country.alpha_2
        for lang_code in common_langs:
            valid_codes.add(f"{lang_code}_{country_code}")
            valid_codes.add(f"{lang_code}-{country_code}")

    # Django special codes that the combinations above do not produce.
    valid_codes.update({
        'zh_Hans', 'zh_Hant', 'zh-hans', 'zh-hant',  # Chinese variants
        'sr_Latn', 'sr-latn', 'sr@latin',  # Serbian variants
        'be@tarask',  # Belarusian variant
        'en-us', 'en-gb', 'en-au', 'en-ca',  # English variants
        'es-es', 'es-mx', 'es-ar',  # Spanish variants
        'pt-pt', 'pt-br',  # Portuguese variants
        'no', 'nb', 'nn',  # Norwegian variants
    })

    cls._valid_codes_cache = valid_codes
    return valid_codes
66+
1667
@staticmethod
17-
def detect_languages_from_folder(folder: str, use_folder_structure: bool = False) -> List[str]:
68+
def detect_languages_from_folder(folder: str, use_folder_structure: bool = False,
69+
respect_gitignore: bool = True) -> List[str]:
1870
"""
1971
Scan all PO files in a folder and detect their languages.
2072
2173
Args:
2274
folder: Path to folder containing PO files
2375
use_folder_structure: If True, detect languages from folder names instead of metadata
76+
respect_gitignore: If True, respect .gitignore patterns when scanning
2477
2578
Returns:
2679
List of unique language codes found in PO files
@@ -29,46 +82,120 @@ def detect_languages_from_folder(folder: str, use_folder_structure: bool = False
2982
ValueError: If no languages could be detected
3083
"""
3184
if use_folder_structure:
32-
return LanguageDetector._detect_from_folder_structure(folder)
85+
return LanguageDetector._detect_from_folder_structure(folder, respect_gitignore)
3386

34-
return LanguageDetector._detect_from_metadata(folder)
87+
return LanguageDetector._detect_from_metadata(folder, respect_gitignore)
3588

3689
@staticmethod
37-
def _detect_from_folder_structure(folder: str) -> List[str]:
38-
"""Detect languages from folder structure (e.g., locale/it/LC_MESSAGES/)."""
90+
def _detect_from_folder_structure(folder: str, respect_gitignore: bool = True) -> List[str]:
91+
"""Detect languages from folder structure based on common framework patterns."""
3992
languages = set()
4093
po_files_found = 0
94+
valid_codes = LanguageDetector._get_valid_language_codes()
95+
gitignore_parser = create_gitignore_parser(folder, respect_gitignore)
4196

42-
# Scan all PO files
43-
for root, _, files in os.walk(folder):
44-
for file in files:
45-
if not file.endswith('.po'):
46-
continue
97+
for root, dirs, files in os.walk(folder):
98+
dirs[:], files = gitignore_parser.filter_walk_results(root, dirs, files)
99+
po_files_in_dir = [f for f in files if f.endswith('.po')]
47100

48-
po_files_found += 1
49-
po_path = os.path.join(root, file)
101+
if not po_files_in_dir:
102+
continue
50103

51-
# Extract language code from path
52-
# Common patterns: locale/it/LC_MESSAGES/, it/, locale/it/, etc.
53-
relative_path = os.path.relpath(po_path, folder)
104+
for file in po_files_in_dir:
105+
po_files_found += 1
106+
relative_path = os.path.relpath(os.path.join(root, file), folder)
54107
path_parts = relative_path.split(os.sep)
55108

56-
# Look for language codes in path components
57-
for part in path_parts:
58-
# Skip common directory names
59-
if part in ['locale', 'locales', 'LC_MESSAGES', 'po', 'i18n', 'translations']:
60-
continue
61-
if part.endswith('.po'):
62-
continue
109+
detected_lang = LanguageDetector._detect_language_from_path(
110+
path_parts, file, valid_codes
111+
)
112+
113+
if detected_lang:
114+
languages.add(detected_lang)
115+
logging.debug("Found language '%s' from path %s", detected_lang,
116+
os.path.join(root, file))
117+
118+
LanguageDetector._validate_detection_results(po_files_found, languages)
119+
detected = sorted(list(languages))
120+
logging.info("Auto-detected languages from folder structure in %d PO files: %s",
121+
po_files_found, ', '.join(detected))
122+
return detected
123+
124+
@staticmethod
125+
def _detect_language_from_path(path_parts: List[str], filename: str,
126+
valid_codes: set) -> Optional[str]:
127+
"""Detect language from a single file path using various patterns."""
128+
# Pattern 1: LC_MESSAGES structure
129+
detected_lang = LanguageDetector._detect_from_lc_messages(path_parts, valid_codes)
130+
if detected_lang:
131+
return detected_lang
132+
133+
# Pattern 2: WordPress filename pattern
134+
detected_lang = LanguageDetector._detect_from_filename(filename, valid_codes)
135+
if detected_lang:
136+
return detected_lang
137+
138+
# Pattern 3: Directory structure
139+
detected_lang = LanguageDetector._detect_from_directories(path_parts, valid_codes)
140+
if detected_lang:
141+
return detected_lang
142+
143+
# Pattern 4: Flat structure
144+
return LanguageDetector._detect_from_flat_structure(filename, valid_codes)
145+
146+
@staticmethod
147+
def _detect_from_lc_messages(path_parts: List[str], valid_codes: set) -> Optional[str]:
148+
"""Detect language from LC_MESSAGES pattern."""
149+
if 'LC_MESSAGES' in path_parts:
150+
lc_idx = path_parts.index('LC_MESSAGES')
151+
if lc_idx > 0:
152+
potential_lang = path_parts[lc_idx - 1]
153+
if potential_lang in valid_codes or potential_lang.lower() in valid_codes:
154+
return potential_lang
155+
return None
156+
157+
@staticmethod
158+
def _detect_from_filename(filename: str, valid_codes: set) -> Optional[str]:
159+
"""Detect language from WordPress-style filename pattern."""
160+
if '-' in filename:
161+
parts = filename.replace('.po', '').split('-')
162+
if len(parts) >= 2:
163+
potential_lang = parts[-1]
164+
if potential_lang in valid_codes or potential_lang.lower() in valid_codes:
165+
return potential_lang
166+
return None
167+
168+
@staticmethod
169+
def _detect_from_directories(path_parts: List[str], valid_codes: set) -> Optional[str]:
170+
"""Detect language from directory structure."""
171+
locale_dirs = ['locale', 'locales', 'i18n', 'translations', 'lang', 'languages', 'po']
172+
173+
for i, part in enumerate(path_parts[:-1]): # Exclude filename
174+
if part in locale_dirs:
175+
continue
176+
177+
if part in valid_codes or part.lower() in valid_codes:
178+
prev_is_locale = i > 0 and path_parts[i - 1] in locale_dirs
179+
is_po_parent = i == len(path_parts) - 2
63180

64-
# Check if this looks like a language code
65-
if LanguageDetector._is_language_code(part):
66-
languages.add(part)
67-
logging.debug("Found language '%s' from path %s", part, po_path)
68-
break
181+
if prev_is_locale or is_po_parent:
182+
return part
183+
return None
69184

185+
@staticmethod
186+
def _detect_from_flat_structure(filename: str, valid_codes: set) -> Optional[str]:
187+
"""Detect language from flat structure where filename is the language code."""
188+
if filename.endswith('.po'):
189+
lang_candidate = filename.replace('.po', '')
190+
if lang_candidate in valid_codes or lang_candidate.lower() in valid_codes:
191+
return lang_candidate
192+
return None
193+
194+
@staticmethod
195+
def _validate_detection_results(po_files_found: int, languages: set):
196+
"""Validate detection results and raise appropriate errors."""
70197
if not po_files_found:
71-
raise ValueError(f"No .po files found in folder: {folder}")
198+
raise ValueError("No .po files found in folder")
72199

73200
if not languages:
74201
raise ValueError(
@@ -77,23 +204,20 @@ def _detect_from_folder_structure(folder: str) -> List[str]:
77204
f"or use -l to specify languages."
78205
)
79206

80-
# Convert to sorted list for consistent ordering
81-
detected = sorted(list(languages))
82-
83-
logging.info("Auto-detected languages from folder structure in %d PO files: %s",
84-
po_files_found, ', '.join(detected))
85-
86-
return detected
87-
88207
@staticmethod
89-
def _detect_from_metadata(folder: str) -> List[str]:
208+
def _detect_from_metadata(folder: str, respect_gitignore: bool = True) -> List[str]:
90209
"""Detect languages from PO file metadata."""
91210
languages = set()
92211
po_files_found = 0
93212
files_with_language = 0
94213

214+
# Create gitignore parser for filtering
215+
gitignore_parser = create_gitignore_parser(folder, respect_gitignore)
216+
95217
# Scan all PO files
96-
for root, _, files in os.walk(folder):
218+
for root, dirs, files in os.walk(folder):
219+
# Filter directories and files using gitignore parser
220+
dirs[:], files = gitignore_parser.filter_walk_results(root, dirs, files)
97221
for file in files:
98222
if not file.endswith('.po'):
99223
continue
@@ -137,38 +261,36 @@ def _detect_from_metadata(folder: str) -> List[str]:
137261

138262
@staticmethod
139263
def _is_language_code(code: str) -> bool:
140-
"""Check if a string looks like a language code."""
141-
# Basic validation
142-
if not code or len(code) < 2 or len(code) > 10 or not code[0].isalpha():
264+
"""Check if a string is a valid language code."""
265+
if not code or len(code) < 2:
143266
return False
144267

145-
# Common language code patterns:
146-
# - 2-letter codes: en, fr, de, etc.
147-
# - 2+2 codes: en_US, fr_FR, etc.
148-
# - Special codes: zh_Hans, sr_Latn, be@tarask, etc.
268+
# Get valid codes
269+
valid_codes = LanguageDetector._get_valid_language_codes()
149270

150-
# Special cases used in Django
151-
special_codes = {
152-
'zh-hans', 'zh-hant', 'sr-latn', 'sr-cyrl', 'az-latn', 'az-cyrl',
153-
'uz-latn', 'uz-cyrl', 'kk-latn', 'kk-cyrl', 'ky-latn', 'ky-cyrl'
154-
}
271+
# Check if code is valid (case-insensitive)
272+
if code in valid_codes or code.lower() in valid_codes:
273+
return True
274+
275+
# Check with normalization
276+
normalized = code.replace('-', '_')
277+
if normalized in valid_codes or normalized.lower() in valid_codes:
278+
return True
155279

156-
# Check all valid patterns
157-
pattern_match = re.match(r'^[a-z]{2,3}(_[A-Z][a-z]+|_[A-Z]{2}|@[a-z]+)?$', code) is not None
158-
in_special = code.lower() in special_codes
159-
basic_match = re.match(r'^[a-z]{2,3}$', code) is not None
160-
return pattern_match or in_special or basic_match
280+
return False
161281

162282
@staticmethod
163283
def validate_or_detect_languages(folder: str, lang_arg: str = None,
164-
use_folder_structure: bool = False) -> List[str]:
284+
use_folder_structure: bool = False,
285+
respect_gitignore: bool = True) -> List[str]:
165286
"""
166287
Get languages from command line or auto-detect from PO files.
167288
168289
Args:
169290
folder: Path to folder containing PO files
170291
lang_arg: Language argument from command line (optional)
171292
use_folder_structure: If True, detect languages from folder names instead of metadata
293+
respect_gitignore: If True, respect .gitignore patterns when scanning
172294
173295
Returns:
174296
List of language codes to process
@@ -187,7 +309,7 @@ def validate_or_detect_languages(folder: str, lang_arg: str = None,
187309
logging.info("No languages specified with -l, auto-detecting from %s...", detection_method)
188310

189311
try:
190-
return LanguageDetector.detect_languages_from_folder(folder, use_folder_structure)
312+
return LanguageDetector.detect_languages_from_folder(folder, use_folder_structure, respect_gitignore)
191313
except ValueError as e:
192314
# Re-raise with more helpful message
193315
if use_folder_structure:

python_gpt_po/services/providers/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
Provider implementations for model management.
33
"""
4-
from . import provider_init # noqa: F401 - Auto-registers providers
54
from .base import ModelProviderInterface
65
from .registry import ProviderRegistry
76

0 commit comments

Comments
 (0)