diff --git a/python_twine/twine/formatters/__init__.py b/python_twine/twine/formatters/__init__.py index 7a11e25..4f3e2b4 100644 --- a/python_twine/twine/formatters/__init__.py +++ b/python_twine/twine/formatters/__init__.py @@ -12,6 +12,14 @@ from twine.twine_file import TwineFile, TwineDefinition, TwineSection from twine.output_processor import OutputProcessor + +LANGUAGE_CODE_WITH_OPTIONAL_REGION_CODE = r"[a-z]{2}(?:-[A-Za-z]{2})?" + +ONLY_LANGUAGE_AND_REGION_REGEX = re.compile( + rf"^{LANGUAGE_CODE_WITH_OPTIONAL_REGION_CODE}$", re.IGNORECASE +) + + def flatten(input: Optional[List[List[str]]]) -> List[str]: if input is None: return [] @@ -34,7 +42,6 @@ class AbstractFormatter(ABC): """Base class for all format formatters.""" SUPPORTS_PLURAL = False - LANGUAGE_CODE_WITH_OPTIONAL_REGION_CODE = r"[a-z]{2}(?:-[A-Za-z]{2})?" def __init__(self): self.twine_file = TwineFile() @@ -204,15 +211,12 @@ def set_comment_for_key(self, key: str, comment: str): def determine_language_given_path(self, path: str) -> Optional[str]: """Determine the language code from a file path.""" - only_language_and_region = re.compile( - rf"^{self.LANGUAGE_CODE_WITH_OPTIONAL_REGION_CODE}$", re.IGNORECASE - ) path_obj = Path(path) basename = path_obj.stem # Check if basename is a language code - if only_language_and_region.match(basename): + if ONLY_LANGUAGE_AND_REGION_REGEX.match(basename): return basename # Check if basename is in known language codes @@ -222,7 +226,7 @@ def determine_language_given_path(self, path: str) -> Optional[str]: # Check path segments in reverse order parts = path_obj.parts for segment in reversed(parts): - if only_language_and_region.match(segment): + if ONLY_LANGUAGE_AND_REGION_REGEX.match(segment): return segment return None diff --git a/python_twine/twine/formatters/android.py b/python_twine/twine/formatters/android.py index 9039a27..c542443 100644 --- a/python_twine/twine/formatters/android.py +++ b/python_twine/twine/formatters/android.py @@ -16,6 +16,11 @@ 
number_of_twine_placeholders, ) +REGEX_CDATA_BRACKET = re.compile(r"<(?!(\/?(\!\[CDATA)))") +REGEX_TAG_BRACKET = re.compile( + r"<(?!(\/?(b|em|i|cite|dfn|big|small|font|tt|s|strike|del|u|super|sub|ul|li|br|div|span|p|a|\!\[CDATA))\b)") +REGEX_RESORCE_IDENTIFIER = re.compile(r"@(?!([a-z\.]+:)?[a-z+]+\/[a-zA-Z_]+)") # @[<package_name>:]<resource_type>/<resource_name> + def inner_xml(node:Element) -> str: # Get inner XML (text + nested elements) @@ -277,12 +282,10 @@ def inside_opening_tag(text: str, pos: int) -> bool: if has_placeholders or self.options.get("escape_all_tags"): # Escape all < except :]/ - result = resource_identifier_regex.sub(r"\\@", result) + result = REGEX_RESORCE_IDENTIFIER.sub(r"\\@", result) return result diff --git a/python_twine/twine/formatters/django.py b/python_twine/twine/formatters/django.py index 0d0ee33..30611a5 100644 --- a/python_twine/twine/formatters/django.py +++ b/python_twine/twine/formatters/django.py @@ -7,6 +7,10 @@ from twine.formatters import AbstractFormatter +COMMENT_REGEX = re.compile(r'^\s*#\. *"?(.*)"?$') +SECTION_REGEX = re.compile(r'^\s*# -{9} (.+) -{9} #$') +KEY_REGEX = re.compile(r'^msgid *"(.*)"$') +VALUE_REGEX = re.compile(r'^msgstr *"(.*)"$', re.MULTILINE) class DjangoFormatter(AbstractFormatter): """Formatter for Django .po files.""" @@ -26,10 +30,6 @@ def default_file_name(self) -> str: def read(self, io: TextIO, lang: str): """Read Django .po file.""" - comment_regex = re.compile(r'^\s*#\. 
*"?(.*)"?$') - section_regex = re.compile(r'^\s*# -{9} (.+) -{9} #$') - key_regex = re.compile(r'^msgid *"(.*)"$') - value_regex = re.compile(r'^msgstr *"(.*)"$', re.MULTILINE) key = None value = None @@ -38,24 +38,24 @@ def read(self, io: TextIO, lang: str): for line in io: # Extract comment - comment_match = comment_regex.match(line) + comment_match = COMMENT_REGEX.match(line) if comment_match: comment = comment_match.group(1) continue - section_match = section_regex.match(line) + section_match = SECTION_REGEX.match(line) if section_match: current_section = section_match.group(1) comment = None continue # Extract key (msgid) - key_match = key_regex.match(line) + key_match = KEY_REGEX.match(line) if key_match: key = key_match.group(1).replace('\\"', '"') # Extract value (msgstr) - value_match = value_regex.match(line) + value_match = VALUE_REGEX.match(line) if value_match: # Handle multiline strings value = value_match.group(1) diff --git a/python_twine/twine/formatters/gettext.py b/python_twine/twine/formatters/gettext.py index ec3f382..6c5437d 100644 --- a/python_twine/twine/formatters/gettext.py +++ b/python_twine/twine/formatters/gettext.py @@ -8,6 +8,11 @@ from twine.formatters import AbstractFormatter from twine import __version__ +COMMENT_REGEX = re.compile(r'#\.\s*"(.*)"$', re.MULTILINE) +SECTION_REGEX = re.compile(r'# SECTION: (.+)$', re.MULTILINE) +KEY_REGEX = re.compile(r'msgctxt\s+"(.*)"$', re.MULTILINE) +VALUE_REGEX = re.compile(r'msgid\s+"(.*)"$', re.MULTILINE) + class GettextFormatter(AbstractFormatter): """Formatter for Gettext .po files.""" @@ -27,10 +32,6 @@ def default_file_name(self) -> str: def read(self, io: TextIO, lang: str): """Read Gettext .po file.""" - comment_regex = re.compile(r'#\.\s*"(.*)"$', re.MULTILINE) - section_regex = re.compile(r'# SECTION: (.+)$', re.MULTILINE) - key_regex = re.compile(r'msgctxt\s+"(.*)"$', re.MULTILINE) - value_regex = re.compile(r'msgid\s+"(.*)"$', re.MULTILINE) # Read file in chunks separated by double 
newlines content = io.read() @@ -46,22 +47,22 @@ def read(self, io: TextIO, lang: str): comment = None # Extract comment - comment_match = comment_regex.search(item) + comment_match = COMMENT_REGEX.search(item) if comment_match: comment = comment_match.group(1) # Extract section - section_match = section_regex.search(item) + section_match = SECTION_REGEX.search(item) if section_match: current_sections = section_match.group(1) # Extract key (msgctxt) - key_match = key_regex.search(item) + key_match = KEY_REGEX.search(item) if key_match: key = key_match.group(1).replace('\\"', '"') # Extract value (msgid) - value_match = value_regex.search(item) + value_match = VALUE_REGEX.search(item) if value_match: # Handle multiline strings: "string"\n"continuation" value = value_match.group(1) diff --git a/python_twine/twine/formatters/jquery.py b/python_twine/twine/formatters/jquery.py index 3642375..c66c1fd 100644 --- a/python_twine/twine/formatters/jquery.py +++ b/python_twine/twine/formatters/jquery.py @@ -21,6 +21,10 @@ def extension(self) -> str: def default_file_name(self) -> str: return "localize.json" + def output_path_for_language(self, lang: str) -> str: + """Return the output path component for a language.""" + return f"{lang}.json" + def determine_language_given_path(self, path: str) -> Optional[str]: """Extract language from filename like strings-en-US.json.""" from pathlib import Path diff --git a/python_twine/twine/output_processor.py b/python_twine/twine/output_processor.py index 7dce07c..1e34112 100644 --- a/python_twine/twine/output_processor.py +++ b/python_twine/twine/output_processor.py @@ -3,8 +3,8 @@ """ import re -import copy from typing import Optional, List, Dict + from twine.twine_file import TwineFile, TwineSection @@ -107,7 +107,7 @@ def process(self, language: str) -> TwineFile: continue # Create new definition with the translation - new_definition = copy.deepcopy(definition) + new_definition = definition.copy_lang(language) 
new_definition.translations[language] = value # Handle plural translations diff --git a/python_twine/twine/placeholders.py b/python_twine/twine/placeholders.py index bc92566..79fc8c9 100644 --- a/python_twine/twine/placeholders.py +++ b/python_twine/twine/placeholders.py @@ -17,6 +17,28 @@ r"%" + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + PLACEHOLDER_TYPES ) +TWINE_PLACEHOLDER_REGEX = re.compile( + r"(%" + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + r")@" +) + +PLACEHOLDER_SYNTAX = ( + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + PLACEHOLDER_TYPES +) +SINGLE_PERCENT_REGEX = re.compile(r"([^%])(%)(?!(%|" + PLACEHOLDER_SYNTAX + r"))") + +NON_NUMBERED_PLACEHOLDER_REGEX = re.compile( + "%(" + PLACEHOLDER_FLAGS_WIDTH_PRECISION_LENGTH + PLACEHOLDER_TYPES + ")" +) + +ANDROID_PLACEHOLDER_REGEX = re.compile( + "(%" + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + ")s" +) +PYTHON_PLACEHOLDER_REGEX = re.compile( + r"%\([a-zA-Z0-9_-]+\)" + + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + + PLACEHOLDER_TYPES +) + def number_of_twine_placeholders(input_str: str) -> int: """Count the number of printf-style placeholders in a string.""" @@ -25,10 +47,7 @@ def number_of_twine_placeholders(input_str: str) -> int: def convert_twine_string_placeholder(input_str: str) -> str: """Convert Twine string placeholder from %@ to %s.""" - pattern = re.compile( - r"(%" + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + r")@" - ) - return pattern.sub(r"\1s", input_str) + return TWINE_PLACEHOLDER_REGEX.sub(r"\1s", input_str) def convert_placeholders_from_twine_to_android(input_str: str) -> str: @@ -53,21 +72,13 @@ def convert_placeholders_from_twine_to_android(input_str: str) -> str: # Got placeholders -> need to double single percent signs # % -> %% (but %% -> %%, %d -> %d) - placeholder_syntax = ( - PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + PLACEHOLDER_TYPES - ) - single_percent_regex = re.compile(r"([^%])(%)(?!(%|" + placeholder_syntax + 
r"))") - value = single_percent_regex.sub(r"\1%%", value) + value = SINGLE_PERCENT_REGEX.sub(r"\1%%", value) if num_placeholders < 2: return value # Number placeholders if there are multiple - non_numbered_placeholder_regex = re.compile( - r"%(" + PLACEHOLDER_FLAGS_WIDTH_PRECISION_LENGTH + PLACEHOLDER_TYPES + r")" - ) - - non_numbered_matches = non_numbered_placeholder_regex.findall(value) + non_numbered_matches = NON_NUMBERED_PLACEHOLDER_REGEX.findall(value) num_non_numbered = len(non_numbered_matches) if num_non_numbered == 0: @@ -86,17 +97,14 @@ def number_placeholder(match): index += 1 return f"%{index}${match.group(1)}" - value = non_numbered_placeholder_regex.sub(number_placeholder, value) + value = NON_NUMBERED_PLACEHOLDER_REGEX.sub(number_placeholder, value) return value def convert_placeholders_from_android_to_twine(input_str: str) -> str: """Convert Android string placeholders (%s) to Twine format (%@).""" - placeholder_regex = re.compile( - r"(%" + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH + r")s" - ) - return placeholder_regex.sub(r"\1@", input_str) + return ANDROID_PLACEHOLDER_REGEX.sub(r"\1@", input_str) def convert_placeholders_from_twine_to_flash(input_str: str) -> str: @@ -132,9 +140,4 @@ def contains_python_specific_placeholder(input_str: str) -> bool: Python supports placeholders like %(amount)03d See https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting """ - pattern = re.compile( - r"%\([a-zA-Z0-9_-]+\)" - + PLACEHOLDER_PARAMETER_FLAGS_WIDTH_PRECISION_LENGTH - + PLACEHOLDER_TYPES - ) - return pattern.search(input_str) is not None + return PYTHON_PLACEHOLDER_REGEX.search(input_str) is not None diff --git a/python_twine/twine/twine_file.py b/python_twine/twine/twine_file.py index b0698e9..33910ae 100644 --- a/python_twine/twine/twine_file.py +++ b/python_twine/twine/twine_file.py @@ -4,6 +4,7 @@ import re from typing import Dict, List, Optional, Any +import copy FALLBACK_LANGS_MAPPING = { "zh-CN": "zh-Hans", # 
Chinese Simplified @@ -158,6 +159,18 @@ def is_plural(self) -> bool: """Check if this definition has plural translations.""" return bool(self.plural_translations) + def copy_lang(self, lang: str) -> "TwineDefinition": + """Return a new definition with this key, comment and tags, carrying only the translation and plural translation for ``lang``.""" + new_def = TwineDefinition(self.key) + new_def._comment = self._comment + new_def.tags = copy.deepcopy(self.tags) + if lang in self.translations: + new_def.translations[lang] = self.translations[lang] + + if lang in self.plural_translations: + new_def.plural_translations[lang] = self.plural_translations[lang] + return new_def + class TwineSection: """Represents a section grouping multiple definitions."""