Skip to content

Add a script to produce translation information #229

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 272 additions & 0 deletions translators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Python script used to provide a list of translation contributors and
translation completion information for each locale (language).
"""
# Version 1.0
# Copyright (C) 2025 Bob Swift

import os
import re
import subprocess

from conf import locale_dirs


# Debug switch. When True, every line read from the git log output and the
# list of authors found for each checked file are printed. The output is very
# long, so it is best redirected to a log file for review.
DEBUG = False

# Alternate names mapped to the name used for the credit, so a contributor is
# not credited more than once under different names.
ALIASES = {
    'bob': 'Bob Swift',
    'phw': 'Philipp Wolfer',
    'rdswift': 'Bob Swift',
}

# Contributor names that never receive a credit (compared in lower case).
IGNORE = {
    'anonymous',
    'hosted weblate',
    'languages add-on',
    'weblate',
}

# Restricts selected authors to the listed locales (languages). Some authors
# appear on commits for every translation file even though they did not
# contribute a translation for that locale.
OK_LANGUAGES = {
    'Bob Swift': {'en', 'fr'},
    'Philipp Wolfer': {'de'},
}

# Email domains whose authors are left out of the translator credits.
BAD_DOMAINS = {'hostux.ninja'}

# Compiled regular expressions:
#   RE_LANGUAGE - locale code taken from the directory just above LC_MESSAGES.
#   RE_GITLOG   - one "email¤name" line of the git log output.
#   RE_TEAM     - team title from the "Language-Team" header line of a PO file.
RE_LANGUAGE = re.compile(r'^.*/(?P<language>[^/]+)/LC_MESSAGES')
RE_GITLOG = re.compile(r'^(?P<email>[^¤]*)¤(?P<name>.*)$')
RE_TEAM = re.compile(r'^"Language-Team: (?P<team>[^<\\]*)')


######################################################################################

def get_domain(email: str) -> str:
    """Extract the domain portion of an email address.

    The domain is everything after the last ``@`` in the address, so a quoted
    local part that itself contains ``@`` does not end up in the result. The
    value is stripped and lower-cased so that it can be compared directly
    against lower-case domain lists.

    Args:
        email (str): Email address to process.

    Returns:
        str: Lower-case domain portion of the address if it exists, otherwise
            an empty string.
    """
    _local_part, separator, domain = email.rpartition('@')
    # rpartition() returns an empty separator when there is no '@' at all.
    return domain.strip().lower() if separator else ''


######################################################################################

def extract_authors_from_gitlog(path: str, debug: bool = False) -> set:
    """Read the git commit log and extract the set of authors for a file.

    Runs ``git log`` for the file with one ``email¤name`` line per commit,
    maps each author through ALIASES, and drops authors whose name is in
    IGNORE or whose email domain is in BAD_DOMAINS.

    Args:
        path (str): Path of the file to check.
        debug (bool, optional): Print debug information while processing.
            Defaults to False.

    Returns:
        set: Set containing the authors for the file. The set is empty when
            the git command exits with a non-zero return code.

    Raises:
        subprocess.TimeoutExpired: If the git command runs longer than 30 seconds.
    """
    authors = set()
    cmd = ['git', 'log', r'--pretty=format:%aE¤%aN', r'--', path]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=30, check=False)
    if result.returncode != 0:
        return authors

    # Decode leniently: one badly encoded author name in the history must not
    # abort the whole report with a UnicodeDecodeError.
    output = result.stdout.decode('utf-8', errors='replace')
    for line in output.split("\n"):
        if debug:
            print(f"Checking: {line}")
        matched = RE_GITLOG.search(line)
        if not matched:
            continue
        author = matched.group('name')
        email = matched.group('email')
        # Get standard name for the author if there is an alias. The most
        # specific form ("name <email>") is tried first.
        for candidate in (f"{author} <{email}>", email, author):
            if candidate in ALIASES:
                author = ALIASES[candidate]
                break
        # Only add author if name and email domain are not blocked.
        if author.lower() not in IGNORE and get_domain(email) not in BAD_DOMAINS:
            authors.add(author)
    if debug:
        print(f"Authors: {', '.join(sorted(authors))}")
    return authors


######################################################################################

def _collect_po_string(lines: list, line_num: int, first: str) -> tuple:
    """Join a quoted PO string with any continuation lines that follow it.

    Args:
        lines (list): All lines of the translation file.
        line_num (int): Index of the line following the keyword line.
        first (str): Quoted string from the keyword line, including its quotes.

    Returns:
        tuple: (text without the surrounding quotes, index of the first line
            that was not consumed).
    """
    text = first[1:-1]
    line_count = len(lines)
    # A continuation line starts with a double quote in the first column.
    while line_num < line_count and lines[line_num].startswith('"'):
        text += lines[line_num].strip()[1:-1]
        line_num += 1
    return text, line_num


def get_translation_counts(file: str, translation_counts: tuple, language: str, language_titles: dict) -> tuple:
    """Update the counts of total strings to translate and translated strings
    with counts from the specified file. Also updates the language title dictionary.

    An entry is counted when its ``msgid`` is not empty, and counted as
    translated when its ``msgstr`` (or ``msgstr[0]`` for an entry with plural
    forms) is not empty.

    NOTE(review): entry flags are not inspected, so an entry marked fuzzy is
    counted as translated when its msgstr is not empty -- confirm this is intended.

    Args:
        file (str): Translation file to process.
        translation_counts (tuple): Starting value of (total translation strings,
            number of translated strings) for the language.
        language (str): Localization (language) code for the file.
        language_titles (dict): Language titles dictionary to update. The title
            is only set when the language has no non-empty title yet.

    Returns:
        tuple: Updated tuple of (total translation strings, number of translated
            strings) for the language.
    """
    total, translated = translation_counts
    processing = False
    # Read the translation file into a list to allow looking ahead for continuation lines.
    with open(file, 'r', encoding='utf8') as f:
        lines = f.readlines()

    line_count = len(lines)
    line_num = 0
    while line_num < line_count:
        line = lines[line_num].strip()
        line_num += 1

        # Ignore blank lines
        if not line:
            continue

        # Get the locale (language) title if it isn't already set.
        if line.startswith('"Language-Team:') and not language_titles.get(language):
            matches = RE_TEAM.match(line)
            # The pattern can fail to match (for example no space after the colon).
            text = matches.group('team').strip() if matches else ''
            if text and text.lower() != 'none':
                language_titles[language] = text

        if line.startswith('msgid "'):
            # Get the translation key, including continuation lines.
            text, line_num = _collect_po_string(lines, line_num, line[6:])
            # Only update count if the translation key is not empty. This
            # also skips the file header entry, whose key is empty.
            processing = bool(text.strip())
            if processing:
                total += 1

        elif processing and line.startswith(('msgstr "', 'msgstr[0] "')):
            # Get the translation value. For entries with plural forms the
            # first form decides whether the entry is translated.
            text, line_num = _collect_po_string(lines, line_num, line[line.index('"'):])
            # Only update count if the translation value is not empty.
            if text.strip():
                translated += 1
            processing = False

    return total, translated


######################################################################################

def get_po_files():
    """Gets the translation files to process.

    Walks every directory listed in ``locale_dirs`` and reports the ``*.po``
    files found in or below a ``<locale>/LC_MESSAGES`` directory. Directories
    and files are visited in sorted order so that repeated runs process the
    files in the same sequence.

    Yields:
        Iterable: Iterable of tuples: (locale, file path).
    """
    for base_path in locale_dirs:
        for dirpath, dirnames, filenames in os.walk(base_path):
            # Sorting in place makes os.walk() descend in a stable order.
            dirnames.sort()
            # RE_LANGUAGE is written for '/' separators, so normalize the
            # platform separator before matching (a no-op where it is '/').
            matches = RE_LANGUAGE.search(dirpath.replace(os.sep, '/'))
            if not matches:
                continue
            language = matches.group('language')
            for filename in sorted(filenames):
                # Only include translation *.po files.
                if filename.endswith('.po'):
                    yield language, os.path.join(dirpath, filename)


######################################################################################

def main() -> None:
    """Main processing method. Reviews the git history for all translation files and
    prints:

    1) A list of all translators and the locales they translated; and
    2) A list of the locales that have translations, with the translation
       progress and the translators contributing to that locale.

    Only locales with at least one credited translator appear in the second list.
    """
    translators = {}        # author -> set of locales
    languages = {}          # locale -> set of authors
    completion = {}         # locale -> (total strings, translated strings)
    language_titles = {}    # locale -> display title ('' until one is found)

    for language, filepath in get_po_files():
        # Progress line, padded / truncated to 79 columns and overwritten in place.
        text = f"Processing: {filepath}{' ' * 79}"[:79]
        print(text, end='\r', flush=True)

        language_titles.setdefault(language, '')

        # Tuple of (total translation strings, number of translated strings)
        completion[language] = get_translation_counts(
            filepath, completion.get(language, (0, 0)),
            language, language_titles)

        # Example of only displaying debug output for a single locale.
        # authors = extract_authors_from_gitlog(filepath, debug=language == 'pt_BR')
        authors = extract_authors_from_gitlog(filepath, debug=DEBUG)
        for author in authors:
            # Don't add authors that have language restrictions
            allowed = OK_LANGUAGES.get(author)
            if allowed is not None and not any(language.startswith(i) for i in allowed):
                continue

            translators.setdefault(author, set()).add(language)
            languages.setdefault(language, set()).add(author)

    # Clear the progress line.
    print(' ' * 79, end='\r', flush=True)

    def _name_sorter(key: str) -> str:
        """Parses the name to return a sort key based on last name.

        Args:
            key (str): Name to parse.

        Returns:
            str: Lower case name in the format "last name, first name", or an
                empty string if the name contains no words.
        """
        parts = str(key).lower().split()
        if not parts:
            # An empty or whitespace-only name has no last name to index.
            return ''
        return f"{parts[-1]}{(', ' + ' '.join(parts[:-1])) if len(parts) > 1 else ''}"

    print("\nTranslators:")
    for author in sorted(translators, key=_name_sorter):
        print(f"  - {author} ({', '.join(sorted(translators[author]))})")

    print(f"\n{'-' * 79}")

    for language in sorted(languages):
        total, translated = completion[language]
        # A locale whose files hold no countable strings would divide by zero.
        completed = round(100 * translated / total, 1) if total else 0.0
        title = language_titles.get(language) or 'Unknown Language'
        print(
            f"\nLanguage: [{language}] "
            f"{title}\n"
            f"Progress: Translated {translated:,} of {total:,} strings ({completed}%)"
        )
        for author in sorted(languages[language], key=_name_sorter):
            print(f"  - {author}")


######################################################################################

# Produce the report only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()