diff --git a/CHANGELOG.md b/CHANGELOG.md index 8934b5d..d044f37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## 0.5.5 + +* Ensure that POT content is now sorted by path when merging POTs from multiple sources (i.e., templates and content). +* `xgettext` is used to merge POT files instead of `msgcat`, providing a better header and merging of same strings from different sources. +* The initially generated PO files will now have a header compatible with GNOME's Translation Editor, since they will have a non-placeholder `Project-Id-Version`. Existing users hitting this problem will need to fill in the `Project-Id-Version` header manually. +* Translations in templates now provide `pgettext` and `npgettext` methods. +* The bug where strings were deleted from the English PO file with non-English content has been resolved. +* When updating translated PO files, the content-language PO file strings are automatically filled with the message IDs. Side effects with plurals are documented. +* A bug with discriminating between Markdown headings and Lektor block separations has been fixed; the older heuristic of checking for colons in the previous line is replaced with simply checking for 3 dashes after stripping. + ## 0.5.4 * POT content is now sorted by path. diff --git a/README.md b/README.md index 507020b..755ab2e 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,20 @@ A `babel.cfg` must be created in your project root with the following content: [jinja2: **/templates/**.html] encoding = utf-8 +If you plan to extract from your templates, and the templates use functionality provided in Jinja2 extensions, specify +something like the following on an additional line in the config file. 
For example, if you use ``do`` statements, the +configuration file shall be: +``` +[jinja2: **/templates/**.html] +encoding = utf-8 +extensions = jinja2.ext.do +``` + +#### Whitespace Trimming during Extraction + +If you're using `{% trans %}` blocks in your template files, the `trimmed` policy is enabled for Jinja's i18n plugin, so all whitespaces would be trimmed at the beginning and end of those +blocks. However, in order for PyBabel's extraction to also work properly this way, one shall add `trimmed = True` to the jinja2 section of the `babel.cfg` configuration file. + ### Translatable fields In order for a field to be marked as translatable, an option has to be set in the field definition. Both blocks and flowblocks fields are translatable. @@ -96,12 +110,6 @@ For example: As with the previous example, `body` and `title` field content will be translated. However, in this example, `image` and `image_position` will not. -### Non-english content - -Due to a limitation of `msginit`, it is difficult to translate a site when the primary language is set to anything but English. - -If your default content language is not English, you will have to edit the first `contents-en.po` file and remove the translations. - ## Installation ### Prerequisites @@ -166,6 +174,12 @@ All translation files (`contents-*.po`) are then compiled and merged with the or You must run `lektor build` once to generate the list of `contents-xx.po` files. After that, once a translation change is applied to a `contents-xx.po` file, the site must be built again for the changes to be applied to the associated `contents-xx.lr` file. This results in the changes being rendered on the site. +### Plural Forms + +If you're using `{% pluralize %}` or `ngettext` or the like in your Jinja templates, make sure you fill in the plural forms in the PO headers manually, then make sure you have the correct +number of `msgstr[x]`s. 
The plugin automatically fills `msgstr`s into the PO file of your source language (which msginit only does for English), but since it doesn't parse plural forms, +any non-English PO file will not have its plural message strings filled in. Those must be done manually in the source-language PO file if simply singular and plural strings do not suffice. + ### Project file You must modify the `.lektorproject` file to include the expected languages. diff --git a/lektor_i18n.py b/lektor_i18n.py index d865521..8fb0ab9 100644 --- a/lektor_i18n.py +++ b/lektor_i18n.py @@ -5,10 +5,11 @@ import re import tempfile import time -from os.path import exists, join, relpath +from os.path import exists, join, relpath, basename from pprint import PrettyPrinter from textwrap import dedent from urllib.parse import urljoin +import polib from lektor.context import get_ctx from lektor.db import Page @@ -55,6 +56,14 @@ def ngettext(self, *x): self.init_translator() return self.translator.ngettext(*x) + def pgettext(self, *x): + self.init_translator() + return self.translator.pgettext(*x) + + def npgettext(self, *x): + self.init_translator() + return self.translator.npgettext(*x) + class Translations: """Memory of translations""" @@ -136,15 +145,29 @@ def write_pot(self, pot_filename, language): f.write(self.as_pot(language, header)) @staticmethod - def merge_pot(from_filenames, to_filename): - msgcat = locate_executable("msgcat") - if msgcat is None: - msgcat = "/usr/bin/msgcat" - cmdline = [msgcat, "--use-first"] + def merge_pot(from_filenames, to_filename, projectname): + # Get the POT Creation Date of the first file and inject it later. 
+ pattern = r'("POT-Creation-Date:\s*)(\d{4}-\d{2}-\d{2}.*)(\\n")' + with open(from_filenames[0], 'r', encoding='utf-8') as f: + original_file1 = f.read() + date1 = re.search(pattern, original_file1).group(2) + + xgettext = locate_executable("xgettext") + if xgettext is None: + xgettext = "/usr/bin/xgettext" + cmdline = [xgettext, "--sort-by-file", "--package-name=" + projectname, "--package-version=1.0"] cmdline.extend(from_filenames) cmdline.extend(("-o", to_filename)) - reporter.report_debug_info("msgcat cmd line", cmdline) + reporter.report_debug_info("xgettext cmd line", cmdline) portable_popen(cmdline).wait() + + # Inject the creation date back into the produced file + with open(to_filename, 'r', encoding='utf-8') as f: + finishedfile_orig = f.read() + replacement = r'\g<1>' + date1 + r'\g<3>' + finishedcontent = re.sub(pattern, replacement, finishedfile_orig, count=1) + with open(to_filename, 'w', encoding='utf-8') as f: + f.write(finishedcontent) @staticmethod def parse_templates(to_filename): @@ -158,6 +181,51 @@ def parse_templates(to_filename): translations = Translations() # let's have a singleton +def clear_entry(entry): + entry.msgstr = '' + if entry.msgstr_plural: + for idx in entry.msgstr_plural: + entry.msgstr_plural[idx] = '' + if 'fuzzy' in entry.flags: + entry.flags.remove('fuzzy') + +def clear_translations(po_filepath, save_path=None): + po = polib.pofile(po_filepath) + for entry in po: + clear_entry(entry) + + po.save(save_path or po_filepath) + +def fill_translations(po_filepath, save_path=None): + po = polib.pofile(po_filepath) + + for entry in po: + # If we fuzzy-matched, we'd need to properly re-fill + # the entries so we clear. Particularly important is + # that when you add the plural form of a string... + # msgmerge seem to fill the plural field with the + # singular one, and mark it fuzzy... incorrect within + # source language. 
+ if entry.fuzzy: + clear_entry(entry) + + # Actually fill in the entries with msgid within the + # source language. + if not entry.msgstr: + entry.msgstr = entry.msgid + + need_plural_fill = False + if entry.msgstr_plural: + for idx in entry.msgstr_plural: + if not entry.msgstr_plural[idx]: + need_plural_fill = True + if need_plural_fill and '+en.po' in basename(po_filepath): + for idx in entry.msgstr_plural: + if not entry.msgstr_plural[idx]: + entry.msgstr_plural[idx] = entry.msgid if int(idx) == 0 else entry.msgid_plural + + po.save(save_path or po_filepath) + class POFile: FILENAME_PATTERN = "contents+{}.po" @@ -186,6 +254,8 @@ def _msg_init(self): ] reporter.report_debug_info("msginit cmd line", cmdline) portable_popen(cmdline, cwd=self.i18npath).wait() + clear_translations(os.path.join(self.i18npath, self.FILENAME_PATTERN.format(self.language))) + self.reformat() def _msg_merge(self): """Merges an existing .po file with .pot file""" @@ -201,6 +271,11 @@ def _msg_merge(self): ] reporter.report_debug_info("msgmerge cmd line", cmdline) portable_popen(cmdline, cwd=self.i18npath).wait() + + def reformat(self): + msgcat = locate_executable("msgcat") + cmdline = [msgcat, self.FILENAME_PATTERN.format(self.language), "-o", self.FILENAME_PATTERN.format(self.language)] + portable_popen(cmdline, cwd=self.i18npath).wait() def _prepare_locale_dir(self): """Prepares the i18n//LC_MESSAGES/ to store the .mo file; @@ -238,20 +313,6 @@ def compile(self): self._msg_fmt(locale_dirname) -def line_starts_new_block(line, prev_line): - """ - Detect a new block in a Lektor document. Blocks are delimited by a line - containing 3 or more dashes. This actually matches the definition of a - markdown level 2 heading, so this function returns False if no colon was - found in the line before, e.g. it isn't a new block with a key: value pair - before. 
- """ - if not prev_line or ":" not in prev_line: - return False # could be a Markdown heading - line = line.strip() - return line == "-" * len(line) and len(line) >= 3 - - def split_paragraphs(document): if isinstance(document, (list, tuple)): document = "".join(document) # list of lines @@ -394,19 +455,30 @@ def __parse_source_structure(lines): blocks = [] count_lines_block = 0 # counting the number of lines of the current block is_content = False - prev_line = None + flow_level = 3 for line in lines: stripped_line = line.strip() if not stripped_line: # empty line blocks.append(("raw", "\n")) continue - # line like "---*" or a new block tag - if line_starts_new_block(stripped_line, prev_line) or block2re.search( - stripped_line - ): + # New block tag. + # The following two ifs will determine the start of a new "block" of content that we can further + # parse. Special care is needed, as the amount of allowed -s dictate whether it's a Markdown heading + # or a flow / field seperation. + if block2re.search(stripped_line): + count_lines_block = 0 + is_content = False + blocks.append(("raw", line)) + # Count the amount of preceding #s, as that determines the amount of -s allowed + # before it gets counted as a Markdown heading. + flow_level = len(stripped_line) - len(stripped_line.lstrip('#')) + # You're allowed to have between 3 and your maximum allowed number of -s. + elif stripped_line == '-' * len(stripped_line) and 3 <= len(stripped_line) <= flow_level: count_lines_block = 0 is_content = False blocks.append(("raw", line)) + # If there's less -s than the flow level, back down on the amount of allowed -s. 
+ flow_level = len(stripped_line) else: count_lines_block += 1 match = command_re.search(stripped_line) @@ -423,7 +495,6 @@ def __parse_source_structure(lines): is_content = True if is_content: blocks.append(("translatable", line)) - prev_line = line # join neighbour blocks of same type newblocks = [] for type, data in blocks: @@ -558,7 +629,7 @@ def on_after_build_all(self, builder, **extra): reporter.report_generic(f"{relpath(pots[0], builder.env.root_path)} generated") pots = [p for p in pots if os.path.exists(p)] # only keep existing ones if len(pots) > 1: - translations.merge_pot(pots, contents_pot_filename) + translations.merge_pot(pots, contents_pot_filename, self.env.project.name) reporter.report_generic( f"Merged POT files " f"{', '.join(relpath(p, builder.env.root_path) for p in pots)}" @@ -567,3 +638,6 @@ def on_after_build_all(self, builder, **extra): for language in self.translations_languages: po_file = POFile(language, self.i18npath) po_file.generate() + if language == self.content_language: + fill_translations(os.path.join(po_file.i18npath, po_file.FILENAME_PATTERN.format(po_file.language))) + po_file.reformat() diff --git a/pyproject.toml b/pyproject.toml index 9182383..1a45731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ authors = [ maintainers = [ {name="BeeWare Team", email="team@beeware.org"}, ] +dependencies = [ + "polib", +] [project.optional-dependencies] # Extras used by developers *of* briefcase are pinned to specific versions to