44
55import logging
66import os
7- import re
8- from typing import List
7+ from typing import List , Optional , Set
98
109import polib
10+ import pycountry
11+
12+ from ..utils .gitignore import create_gitignore_parser
1113
1214
1315class LanguageDetector :
1416 """Detects languages from PO files in a directory."""
1517
18+ # Cache for valid language codes
19+ _valid_codes_cache : Optional [Set [str ]] = None
20+
@classmethod
def _get_valid_language_codes(cls) -> Set[str]:
    """Return the set of language and locale codes the detector accepts.

    The set is built once and stored on ``cls._valid_codes_cache``; later
    calls return the cached set object directly.

    The set contains:
      * every ``alpha_2`` and ``alpha_3`` attribute exposed by the entries
        of ``pycountry.languages``;
      * ``<lang>_<COUNTRY>`` and ``<lang>-<COUNTRY>`` for each language in
        a fixed list of common languages combined with every
        ``pycountry.countries`` ``alpha_2`` code;
      * a fixed list of special-case codes (script variants, lowercase
        regional codes, Norwegian variants).

    Returns:
        The (cached) set of accepted codes.
    """
    if cls._valid_codes_cache is not None:
        return cls._valid_codes_cache

    valid_codes: Set[str] = set()

    # Plain language codes (2-letter like 'en', 3-letter like 'eng').
    # Not every language entry carries both attributes, hence hasattr.
    for lang in pycountry.languages:
        if hasattr(lang, 'alpha_2'):
            valid_codes.add(lang.alpha_2)
        if hasattr(lang, 'alpha_3'):
            valid_codes.add(lang.alpha_3)

    # Loop-invariant: defined once instead of once per country.
    common_langs = (
        'en', 'es', 'fr', 'de', 'pt', 'zh', 'ar', 'ru', 'ja', 'ko',
        'it', 'nl', 'pl', 'tr', 'sv', 'da', 'no', 'fi', 'uk', 'cs',
        'hu', 'ro', 'bg', 'hr', 'sr', 'sl', 'sk', 'lt', 'lv', 'et',
        'hi', 'ur', 'bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'ne',
    )

    # Locale codes with a country part (like 'en_US', 'pt-BR').
    for country in pycountry.countries:
        if not hasattr(country, 'alpha_2'):
            continue
        country_code = country.alpha_2
        for lang_code in common_langs:
            valid_codes.add(f"{lang_code}_{country_code}")
            valid_codes.add(f"{lang_code}-{country_code}")

    # Special codes that the combinations above do not produce.
    valid_codes.update({
        'zh_Hans', 'zh_Hant', 'zh-hans', 'zh-hant',  # Chinese variants
        'sr_Latn', 'sr-latn', 'sr@latin',  # Serbian variants
        'be@tarask',  # Belarusian variant
        'en-us', 'en-gb', 'en-au', 'en-ca',  # English variants
        'es-es', 'es-mx', 'es-ar',  # Spanish variants
        'pt-pt', 'pt-br',  # Portuguese variants
        'no', 'nb', 'nn',  # Norwegian variants
    })

    cls._valid_codes_cache = valid_codes
    return valid_codes
66+
1667 @staticmethod
17- def detect_languages_from_folder (folder : str , use_folder_structure : bool = False ) -> List [str ]:
68+ def detect_languages_from_folder (folder : str , use_folder_structure : bool = False ,
69+ respect_gitignore : bool = True ) -> List [str ]:
1870 """
1971 Scan all PO files in a folder and detect their languages.
2072
2173 Args:
2274 folder: Path to folder containing PO files
2375 use_folder_structure: If True, detect languages from folder names instead of metadata
76+ respect_gitignore: If True, respect .gitignore patterns when scanning
2477
2578 Returns:
2679 List of unique language codes found in PO files
@@ -29,46 +82,120 @@ def detect_languages_from_folder(folder: str, use_folder_structure: bool = False
2982 ValueError: If no languages could be detected
3083 """
3184 if use_folder_structure :
32- return LanguageDetector ._detect_from_folder_structure (folder )
85+ return LanguageDetector ._detect_from_folder_structure (folder , respect_gitignore )
3386
34- return LanguageDetector ._detect_from_metadata (folder )
87+ return LanguageDetector ._detect_from_metadata (folder , respect_gitignore )
3588
@staticmethod
def _detect_from_folder_structure(folder: str, respect_gitignore: bool = True) -> List[str]:
    """Detect languages from the paths and names of PO files under ``folder``.

    Args:
        folder: Root directory to walk.
        respect_gitignore: Forwarded to ``create_gitignore_parser``;
            presumably enables .gitignore-based filtering (confirm
            against that helper).

    Returns:
        Sorted list of the distinct language codes detected.

    Raises:
        Whatever ``_validate_detection_results`` raises for the collected
        file count and language set.
    """
    known_codes = LanguageDetector._get_valid_language_codes()
    ignore_filter = create_gitignore_parser(folder, respect_gitignore)
    found_langs = set()
    po_count = 0

    for root, dirs, files in os.walk(folder):
        # Slice-assign so os.walk sees the filtered directory list.
        dirs[:], files = ignore_filter.filter_walk_results(root, dirs, files)

        for file in files:
            if not file.endswith('.po'):
                continue

            po_count += 1
            po_path = os.path.join(root, file)
            segments = os.path.relpath(po_path, folder).split(os.sep)

            lang = LanguageDetector._detect_language_from_path(segments, file, known_codes)
            if not lang:
                continue

            found_langs.add(lang)
            logging.debug("Found language '%s' from path %s", lang, po_path)

    LanguageDetector._validate_detection_results(po_count, found_langs)

    detected = sorted(found_langs)
    logging.info("Auto-detected languages from folder structure in %d PO files: %s",
                 po_count, ', '.join(detected))
    return detected
123+
@staticmethod
def _detect_language_from_path(path_parts: List[str], filename: str,
                               valid_codes: set) -> Optional[str]:
    """Run the path-based detectors in priority order and return the first hit.

    Order of evaluation:
      1. directory preceding ``LC_MESSAGES``;
      2. ``-<code>`` suffix in the file name;
      3. language-named directory in the path;
      4. file name that is itself the code.

    Args:
        path_parts: Path of the PO file split into components, file name last.
        filename: Base name of the PO file.
        valid_codes: Set of accepted language codes.

    Returns:
        The first truthy detector result, otherwise the result of the
        last detector.
    """
    # `or` short-circuits, so later detectors only run when earlier ones miss.
    return (
        LanguageDetector._detect_from_lc_messages(path_parts, valid_codes)
        or LanguageDetector._detect_from_filename(filename, valid_codes)
        or LanguageDetector._detect_from_directories(path_parts, valid_codes)
        or LanguageDetector._detect_from_flat_structure(filename, valid_codes)
    )
145+
146+ @staticmethod
147+ def _detect_from_lc_messages (path_parts : List [str ], valid_codes : set ) -> Optional [str ]:
148+ """Detect language from LC_MESSAGES pattern."""
149+ if 'LC_MESSAGES' in path_parts :
150+ lc_idx = path_parts .index ('LC_MESSAGES' )
151+ if lc_idx > 0 :
152+ potential_lang = path_parts [lc_idx - 1 ]
153+ if potential_lang in valid_codes or potential_lang .lower () in valid_codes :
154+ return potential_lang
155+ return None
156+
157+ @staticmethod
158+ def _detect_from_filename (filename : str , valid_codes : set ) -> Optional [str ]:
159+ """Detect language from WordPress-style filename pattern."""
160+ if '-' in filename :
161+ parts = filename .replace ('.po' , '' ).split ('-' )
162+ if len (parts ) >= 2 :
163+ potential_lang = parts [- 1 ]
164+ if potential_lang in valid_codes or potential_lang .lower () in valid_codes :
165+ return potential_lang
166+ return None
167+
168+ @staticmethod
169+ def _detect_from_directories (path_parts : List [str ], valid_codes : set ) -> Optional [str ]:
170+ """Detect language from directory structure."""
171+ locale_dirs = ['locale' , 'locales' , 'i18n' , 'translations' , 'lang' , 'languages' , 'po' ]
172+
173+ for i , part in enumerate (path_parts [:- 1 ]): # Exclude filename
174+ if part in locale_dirs :
175+ continue
176+
177+ if part in valid_codes or part .lower () in valid_codes :
178+ prev_is_locale = i > 0 and path_parts [i - 1 ] in locale_dirs
179+ is_po_parent = i == len (path_parts ) - 2
63180
64- # Check if this looks like a language code
65- if LanguageDetector ._is_language_code (part ):
66- languages .add (part )
67- logging .debug ("Found language '%s' from path %s" , part , po_path )
68- break
181+ if prev_is_locale or is_po_parent :
182+ return part
183+ return None
69184
185+ @staticmethod
186+ def _detect_from_flat_structure (filename : str , valid_codes : set ) -> Optional [str ]:
187+ """Detect language from flat structure where filename is the language code."""
188+ if filename .endswith ('.po' ):
189+ lang_candidate = filename .replace ('.po' , '' )
190+ if lang_candidate in valid_codes or lang_candidate .lower () in valid_codes :
191+ return lang_candidate
192+ return None
193+
194+ @staticmethod
195+ def _validate_detection_results (po_files_found : int , languages : set ):
196+ """Validate detection results and raise appropriate errors."""
70197 if not po_files_found :
71- raise ValueError (f "No .po files found in folder: { folder } " )
198+ raise ValueError ("No .po files found in folder" )
72199
73200 if not languages :
74201 raise ValueError (
@@ -77,23 +204,20 @@ def _detect_from_folder_structure(folder: str) -> List[str]:
77204 f"or use -l to specify languages."
78205 )
79206
80- # Convert to sorted list for consistent ordering
81- detected = sorted (list (languages ))
82-
83- logging .info ("Auto-detected languages from folder structure in %d PO files: %s" ,
84- po_files_found , ', ' .join (detected ))
85-
86- return detected
87-
88207 @staticmethod
89- def _detect_from_metadata (folder : str ) -> List [str ]:
208+ def _detect_from_metadata (folder : str , respect_gitignore : bool = True ) -> List [str ]:
90209 """Detect languages from PO file metadata."""
91210 languages = set ()
92211 po_files_found = 0
93212 files_with_language = 0
94213
214+ # Create gitignore parser for filtering
215+ gitignore_parser = create_gitignore_parser (folder , respect_gitignore )
216+
95217 # Scan all PO files
96- for root , _ , files in os .walk (folder ):
218+ for root , dirs , files in os .walk (folder ):
219+ # Filter directories and files using gitignore parser
220+ dirs [:], files = gitignore_parser .filter_walk_results (root , dirs , files )
97221 for file in files :
98222 if not file .endswith ('.po' ):
99223 continue
@@ -137,38 +261,36 @@ def _detect_from_metadata(folder: str) -> List[str]:
137261
@staticmethod
def _is_language_code(code: str) -> bool:
    """Report whether ``code`` is one of the accepted language codes.

    The code is accepted when any of these forms is in the set returned by
    ``_get_valid_language_codes``: the code as given, its lowercase form,
    the code with hyphens replaced by underscores, or the lowercase form
    of that.

    Args:
        code: Candidate language code.

    Returns:
        ``True`` if a form of ``code`` is accepted; ``False`` otherwise,
        including for empty values and values shorter than two characters.
    """
    if not code or len(code) < 2:
        return False

    known_codes = LanguageDetector._get_valid_language_codes()
    underscored = code.replace('-', '_')
    candidate_forms = (code, code.lower(), underscored, underscored.lower())
    return any(form in known_codes for form in candidate_forms)
161281
162282 @staticmethod
163283 def validate_or_detect_languages (folder : str , lang_arg : str = None ,
164- use_folder_structure : bool = False ) -> List [str ]:
284+ use_folder_structure : bool = False ,
285+ respect_gitignore : bool = True ) -> List [str ]:
165286 """
166287 Get languages from command line or auto-detect from PO files.
167288
168289 Args:
169290 folder: Path to folder containing PO files
170291 lang_arg: Language argument from command line (optional)
171292 use_folder_structure: If True, detect languages from folder names instead of metadata
293+ respect_gitignore: If True, respect .gitignore patterns when scanning
172294
173295 Returns:
174296 List of language codes to process
@@ -187,7 +309,7 @@ def validate_or_detect_languages(folder: str, lang_arg: str = None,
187309 logging .info ("No languages specified with -l, auto-detecting from %s..." , detection_method )
188310
189311 try :
190- return LanguageDetector .detect_languages_from_folder (folder , use_folder_structure )
312+ return LanguageDetector .detect_languages_from_folder (folder , use_folder_structure , respect_gitignore )
191313 except ValueError as e :
192314 # Re-raise with more helpful message
193315 if use_folder_structure :
0 commit comments