diff --git a/Cargo.lock b/Cargo.lock index 9720eea..dbf6442 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -389,14 +389,14 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.3" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" dependencies = [ "anstream", "anstyle", "env_filter", - "humantime", + "jiff", "log", ] @@ -452,6 +452,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -536,10 +545,21 @@ dependencies = [ ] [[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +name = "i18n-book-to-po" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger", + "lcs-diff", + "log", + "mdbook", + "mdbook-i18n-helpers", + "polib", + "pulldown-cmark 0.13.0", + "seal", + "slice-diff-patch", +] [[package]] name = "i18n-report" @@ -602,6 +622,30 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "jobserver" version = "0.1.31" @@ -626,6 +670,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lcs-diff" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c172ea7099cef89eb5a1a6e1f55d79a753823a0201d362f6ec5508028efcf4ed" + [[package]] name = "libc" version = "0.2.171" @@ -671,9 +721,9 @@ checksum = "fe7db12097d22ec582439daf8618b8fdd1a7bef6270e9af3b1ebcd30893cf413" [[package]] name = "log" -version = "0.4.21" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "mdbook" @@ -752,6 +802,15 @@ version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + [[package]] name = "miniz_oxide" version = "0.7.3" @@ -918,6 +977,21 @@ dependencies = [ "linereader", ] +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + 
[[package]] name = "ppv-lite86" version = "0.2.17" @@ -936,9 +1010,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.83" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -962,6 +1036,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" dependencies = [ "bitflags 2.5.0", + "getopts", "memchr", "pulldown-cmark-escape 0.11.0", "unicase", @@ -990,9 +1065,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -1097,6 +1172,18 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "seal" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a64fede6ff71647707a5f6ec4472eb96477e42eaf13f5a6b9328e291e1c61e" +dependencies = [ + "bitflags 1.3.2", + "memmap2", + "tempfile", + "uuid", +] + [[package]] name = "semver" version = "1.0.26" @@ -1158,6 +1245,17 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "slice-diff-patch" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68668e75729c5a3c1dc0d76ad4c95a72299f92a4657e1bcb1661c02c0e8a4978" +dependencies = [ + "diff", + "lcs-diff", + "wu-diff", +] + [[package]] name = "slug" version = "0.1.5" @@ -1176,9 +1274,9 @@ checksum = 
"7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.86" +version = "2.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89275301d38033efb81a6e60e3497e734dfcc62571f2854bf4b16690398824c" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" dependencies = [ "proc-macro2", "quote", @@ -1368,12 +1466,27 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "uuid" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +dependencies = [ + "getrandom 0.3.1", +] + [[package]] name = "version_check" version = "0.9.4" @@ -1574,6 +1687,12 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "wu-diff" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e3e6735fcde06432870db8dc9d7e3ab1b93727c14eaef329969426299f28893" + [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index 31988c4..4ebd20f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] -members = ["i18n-helpers", "i18n-report", "mdbook-tera-backend", "fuzz"] -default-members = ["i18n-helpers", "i18n-report", "mdbook-tera-backend"] +members = ["i18n-book-to-po", "i18n-helpers", "i18n-report", "mdbook-tera-backend", "fuzz"] +default-members = ["i18n-book-to-po", "i18n-helpers", 
"i18n-report", "mdbook-tera-backend"] resolver = "2" [workspace.lints.clippy] diff --git a/i18n-book-to-po/Cargo.toml b/i18n-book-to-po/Cargo.toml new file mode 100644 index 0000000..4c924f0 --- /dev/null +++ b/i18n-book-to-po/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "i18n-book-to-po" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow.workspace = true +clap = { version = "4.5.38", features = ["derive"] } +env_logger = "0.11.8" +lcs-diff = "0.1.1" +log = "0.4.27" +mdbook.workspace = true +mdbook-i18n-helpers = { version = "0.3.6", path = "../i18n-helpers" } +polib.workspace = true +pulldown-cmark = "0.13" +seal = "0.1.6" +slice-diff-patch = "1.2.4" + +[lints] +workspace = true diff --git a/i18n-book-to-po/de.po b/i18n-book-to-po/de.po new file mode 100644 index 0000000..6c22ee3 --- /dev/null +++ b/i18n-book-to-po/de.po @@ -0,0 +1,88 @@ + +msgid "" +msgstr "" +"Project-Id-Version: \n" +"POT-Creation-Date: \n" +"PO-Revision-Date: \n" +"Last-Translator: \n" +"Language-Team: \n" +"MIME-Version: \n" +"Content-Type: \n" +"Content-Transfer-Encoding: \n" +"Language: \n" +"Plural-Forms: nplurals=1; plural=0;\n" + +#: introduction.md:1 +msgid "Chapter 1: Introduction to Markdown" +msgstr "Kapitel 1: Einführung in Markdown" + +#: introduction.md:3 +msgid "" +"This is the first chapter of our example book. It shows what this tool can " +"do." +msgstr "" +"Dies ist das erste Kapitel unseres Beispielbuchs. Es zeigt, was dieses Tool " +"kann." 
+ +#: introduction.md:5 +msgid "Subchapter" +msgstr "Unterkapitel" + +#: introduction.md:7 +msgid "" +"All pages of this book are written in Markdown and can contain different " +"Markdown elements:" +msgstr "" +"Alle Seiten dieses Buches sind in Markdown geschrieben und kann verschiedene " +"Markdown Elemente enthalten:" + +#: introduction.md:9 +msgid "Item 1" +msgstr "Punkt 1" + +#: introduction.md:10 +msgid "Item 2" +msgstr "Punkt 2" + +#: introduction.md:11 +msgid "Sub-item 2.1" +msgstr "Unterpunkt 2.1" + +#: introduction.md:12 +msgid "Item 3" +msgstr "Punkt 3" + +#: introduction.md:14 +msgid "" +"You can also include links like \\[appendix\\]\\[appendix\\] to other " +"resources easily" +msgstr "" +"Sie können auch problemlos Links wie \\[Anhang\\]\\[appendix\\] zu anderen " +"Ressourcen einfügen." + +#: introduction.md:16 +msgid "Code Blocks" +msgstr "Codeblöcke" + +#: introduction.md:18 +msgid "Displaying code is straightforward. Here's an example in Rust:" +msgstr "Das Anzeigen von Code ist unkompliziert. Hier ist ein Beispiel in Rust:" + +#: introduction.md:22 +msgid "\"Hello, {}!\"" +msgstr "\"Hallo, {}!\"" + +#: introduction.md:26 +msgid "\"World\"" +msgstr "\"Welt\"" + +#: introduction.md:30 +msgid "Further Examples" +msgstr "Weitere Beispiele" + +#: introduction.md:32 +msgid "You can use **bold text**, _italic text_, or even `inline code blocks`." +msgstr "" +"Sie können **fettgedruckten Text**, _kursiven Text_ oder sogar `inline Code " +"Blöcke` verwenden." + diff --git a/i18n-book-to-po/src/catalog.rs b/i18n-book-to-po/src/catalog.rs new file mode 100644 index 0000000..5089ed0 --- /dev/null +++ b/i18n-book-to-po/src/catalog.rs @@ -0,0 +1,42 @@ +use std::path::Path; + +use polib::catalog::Catalog; +use polib::message::Message; +use polib::metadata::CatalogMetadata; +use polib::po_file; + +// Create a catalog from the translation pairs given. 
+/// Add one singular message per translation pair to `catalog` and return the catalog.
+///
+/// Every message gets the source reference `<file name>:<idx>`, where `<file name>` is the
+/// last component of `source_file` and `<idx>` is the first element of the translation
+/// tuple. Messages are stored with `Catalog::append_or_update`.
+///
+/// A path without a final component (for example `..`) falls back to the whole path, and
+/// names that are not valid UTF-8 are converted lossily, so this function does not panic on
+/// unusual paths.
+pub fn update_catalog(
+    source_file: &Path,
+    mut catalog: Catalog,
+    translations: Vec<(usize, (String, String))>,
+) -> Catalog {
+    // The name does not depend on the message, so resolve it once outside the loop.
+    let source_name = source_file
+        .file_name()
+        .unwrap_or(source_file.as_os_str())
+        .to_string_lossy();
+
+    for (idx, (msgid, msgstr)) in translations {
+        let message = Message::build_singular()
+            .with_source(format!("{source_name}:{idx}"))
+            .with_msgid(msgid)
+            .with_msgstr(msgstr)
+            .done();
+        catalog.append_or_update(message);
+    }
+    catalog
+}
+
+/// Merge the translation pairs of `source_file` into the PO file at `output`.
+///
+/// If `output` already exists it is parsed and extended, otherwise a new catalog with empty
+/// metadata is started. The resulting catalog is written back to `output`.
+///
+/// # Errors
+/// Fails if an existing `output` cannot be parsed or if the catalog cannot be written.
+pub fn update_po_file(
+    output: &Path,
+    source_file: &Path,
+    translations: Vec<(usize, (String, String))>,
+) -> anyhow::Result<()> {
+    let existing = if output.exists() {
+        po_file::parse(output)?
+    } else {
+        Catalog::new(CatalogMetadata::new())
+    };
+
+    let updated = update_catalog(source_file, existing, translations);
+    po_file::write(&updated, output)?;
+    Ok(())
+}
diff --git a/i18n-book-to-po/src/file_map.rs b/i18n-book-to-po/src/file_map.rs
new file mode 100644
index 0000000..b5a6937
--- /dev/null
+++ b/i18n-book-to-po/src/file_map.rs
@@ -0,0 +1,49 @@
+use std::{
+    fs,
+    path::{Path, PathBuf},
+};
+
+use log::warn;
+
+/// Try to map markdown files from the two given folders.
Missing files will be ignored +pub fn auto_folders_match( + source_base: &Path, + translation_base: &Path, +) -> anyhow::Result> { + // discover all relevant files + let mut source_filenames = discover_markdown_files(source_base)?; + let translation_filenames = discover_markdown_files(translation_base)?; + + source_filenames.sort(); + + // match the files according to their (identical) filenames + let mut map = Vec::new(); + for source_file in source_filenames { + // keep relative path inside the book as this should match + let source_relative_path = source_file.strip_prefix(source_base)?; + // try to find the same file in the translation files + let translation_target_file = translation_base.join(source_relative_path); + if translation_filenames.contains(&translation_target_file) { + map.push((source_file, translation_target_file)); + } else { + warn!( + "no matching translation file found for '{}'", + source_file.display() + ); + } + } + Ok(map) +} + +/// discover all markdown files in a given path +fn discover_markdown_files(path: &Path) -> anyhow::Result> { + let mut files = Vec::new(); + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let path = entry.path(); + if path.is_file() && path.extension() == Some("md".as_ref()) { + files.push(path); + } + } + Ok(files) +} diff --git a/i18n-book-to-po/src/lib.rs b/i18n-book-to-po/src/lib.rs new file mode 100644 index 0000000..78dbf8b --- /dev/null +++ b/i18n-book-to-po/src/lib.rs @@ -0,0 +1,3 @@ +pub mod catalog; +pub mod file_map; +pub mod structure; diff --git a/i18n-book-to-po/src/main.rs b/i18n-book-to-po/src/main.rs new file mode 100644 index 0000000..7c2d7da --- /dev/null +++ b/i18n-book-to-po/src/main.rs @@ -0,0 +1,80 @@ +use std::{ + fs, + path::{Path, PathBuf}, +}; + +use anyhow::Ok; +use clap::{Parser as _, arg}; +use i18n_book_to_po::{ + catalog, file_map, + structure::{align::align_markdown_docs, types::DiffAlgorithm}, +}; +use log::{info, warn}; +use mdbook_i18n_helpers::extract_messages; + +#[derive(clap::Parser)] +struct Cli { + #[arg[short, long, value_name = "source/src"]] + source: PathBuf, + #[arg[short, long, value_name = "translation/src"]] + translation: PathBuf, + #[arg[short, long, value_name = "translation.po"]] + output: PathBuf, + #[arg[short, long, value_name = "diff_algorithm", value_enum, default_value_t = DiffAlgorithm::default()]] + diff_algorithm: DiffAlgorithm, +} + +/// +/// create a translation file for a given source and translation file +/// +/// This function takes the paths to a source markdown file, a translated +/// markdown file, and an output PO file. It aligns the source and translated +/// documents, extracts messages from the aligned documents, pairs them up, +/// and updates or creates a PO file with these translation pairs. 
+///
+/// # Errors
+/// Fails if one of the input files cannot be read, if the documents cannot be aligned, if
+/// message extraction fails, or if the PO file cannot be updated.
+fn create_translation_for(
+    source: &Path,
+    translation: &Path,
+    output: &Path,
+    diff_algorithm: &DiffAlgorithm,
+) -> anyhow::Result<()> {
+    use anyhow::Context as _;
+
+    let source_content = fs::read_to_string(source)
+        .with_context(|| format!("cannot read source file '{}'", source.display()))?;
+    let translation_content = fs::read_to_string(translation)
+        .with_context(|| format!("cannot read translation file '{}'", translation.display()))?;
+
+    let (source_doc, translation_doc) =
+        align_markdown_docs(&source_content, &translation_content, true, diff_algorithm)?;
+
+    // Propagate extraction failures to the caller instead of panicking.
+    let source_messages = extract_messages(&source_doc)
+        .with_context(|| format!("cannot extract messages from '{}'", source.display()))?;
+    let translation_messages = extract_messages(&translation_doc)
+        .with_context(|| format!("cannot extract messages from '{}'", translation.display()))?;
+
+    // `zip` stops at the shorter side, so surplus messages would be dropped without a trace
+    // and the remaining pairs may be shifted against each other.
+    if source_messages.len() != translation_messages.len() {
+        warn!(
+            "message count differs for '{}': {} in source, {} in translation",
+            source.display(),
+            source_messages.len(),
+            translation_messages.len()
+        );
+    }
+
+    let translated_message_pairs = source_messages
+        .into_iter()
+        .zip(translation_messages)
+        .map(|((src_msg_id, src_msg), (_tr_msg_id, tr_msg))| {
+            (src_msg_id, (src_msg.message, tr_msg.message))
+        })
+        .collect::<Vec<_>>();
+    catalog::update_po_file(output, source, translated_message_pairs)?;
+    Ok(())
+}
+
+/// Entry point: pair the markdown files of the source and translation folders and merge the
+/// translations of every pair into the output PO file.
+///
+/// Logging defaults to level `info` unless `RUST_LOG` is set.
+fn main() -> anyhow::Result<()> {
+    env_logger::init_from_env(env_logger::Env::default().filter_or("RUST_LOG", "info"));
+    let cli = Cli::parse();
+    info!("Reconstruct po file from translation of a book");
+
+    let matched_files = file_map::auto_folders_match(&cli.source, &cli.translation)?;
+
+    for (source, translation) in &matched_files {
+        info!("Processing {}", source.display());
+        if source.file_name() == translation.file_name() {
+            create_translation_for(source, translation, &cli.output, &cli.diff_algorithm)?;
+        } else {
+            warn!("filenames don't match");
+        }
+    }
+    Ok(())
+}
diff --git a/i18n-book-to-po/src/structure/align.rs b/i18n-book-to-po/src/structure/align.rs
new file mode 100644
index 0000000..ba093c8
--- /dev/null
+++ b/i18n-book-to-po/src/structure/align.rs
@@ -0,0 +1,658 @@
+use std::vec;
+
+use log::error;
+use mdbook_i18n_helpers::reconstruct_markdown;
+use pulldown_cmark::Event;
+
+use crate::structure::{
+    diff::diff_structure,
+    types::{AlignAction, CmarkEvent, DiffAlgorithm},
+};
+
+///
generate a sentence structure based on the amount of sentences (separated by dots). +/// This is splitting by "." and replacing these elements by a Vector of Sentence Elements +fn generate_sentence_structure(text: &str) -> Vec { + text.split(".") + .filter_map(|sentence| { + if sentence.is_empty() { + None + } else { + Some(CmarkEvent::SentenceElement) + } + }) + .collect() +} + +/// Parse the structure of the provided cmark events and return a Markdown structure +/// It leverages additional sentence elements as structural elements if feature_sentence is set +fn parse_structure( + markdown: &[pulldown_cmark::Event<'_>], + feature_sentence: bool, +) -> Vec { + let structure: Vec<_> = markdown + .iter() + .flat_map(|event| { + if feature_sentence { + // if sentences should be used to align the documents, split the + // Text Elements into sentences. + if let pulldown_cmark::Event::Text(text) = event { + // prepend the sentence elements with the Text variant + let mut result = vec![(event).into()]; + result.extend(generate_sentence_structure(text)); + return result; + } + }; + vec![(event).into()] + }) + .collect(); + + structure +} + +/// normalize the event stream +/// This is done to avoid issues with Softbreak and similar events that might be in different +/// positions in the translation. 
+/// Currently this removes SoftBreaks and merges Text blocks after SoftBreaks to get a normalized structure
+///
+/// Only the text event that directly follows a removed soft break is merged (joined with a
+/// single space) into the text event in front of the break. Later text events are kept
+/// as they are.
+fn normalize_events(events: Vec<Event<'_>>) -> Vec<Event<'_>> {
+    let mut normalized_events = Vec::with_capacity(events.len());
+    let mut removed_softbreak = false;
+    for event in events {
+        match event {
+            Event::SoftBreak => removed_softbreak = true,
+            Event::Text(text) => {
+                match normalized_events.last_mut() {
+                    // a soft break was just removed: merge with the text in front of it
+                    // (with a space, as this is what a soft break stands for)
+                    Some(Event::Text(prev_text)) if removed_softbreak => {
+                        *prev_text = format!("{prev_text} {text}").into();
+                    }
+                    // NOTE(review): if the element in front of the soft break is not a
+                    // text event, the whitespace of the break is dropped - confirm that
+                    // this is intended.
+                    _ => normalized_events.push(Event::Text(text)),
+                }
+                // The soft break is consumed by this text event. Without the reset every
+                // following adjacent text event would be merged with an extra space.
+                removed_softbreak = false;
+            }
+            other => {
+                removed_softbreak = false;
+                normalized_events.push(other);
+            }
+        }
+    }
+    normalized_events
+}
+
+/// Helper function to read the markdown events from `content`.
+///
+/// This can be replaced by mdbook::utils::new_cmark_parser() once the version of
+/// pulldown-cmark used there is up-to-date. Then replace this call with
+/// `mdbook::utils::new_cmark_parser(&content, false).collect();`
+///
+/// Tables, footnotes, strikethrough, task lists and heading attributes are enabled.
+fn read_structure(content: &str) -> anyhow::Result<Vec<Event<'_>>> {
+    // mdbook is still using pulldown-cmark 0.10, so its parser cannot be used here:
+    // let parser = mdbook::utils::new_cmark_parser(&content, false);
+    let opts = pulldown_cmark::Options::ENABLE_TABLES
+        | pulldown_cmark::Options::ENABLE_FOOTNOTES
+        | pulldown_cmark::Options::ENABLE_STRIKETHROUGH
+        | pulldown_cmark::Options::ENABLE_TASKLISTS
+        | pulldown_cmark::Options::ENABLE_HEADING_ATTRIBUTES;
+
+    Ok(pulldown_cmark::Parser::new_ext(content, opts).collect())
+}
+
+/// Apply the diff to align the markdown events.
+/// If an element is not available in the other document, this will output None in its place.
+///
+/// Both returned vectors have one entry per diff action, so entries with the same index
+/// belong together.
+///
+/// # Panics
+/// Panics if the diff does not consume both event streams completely, as this would
+/// indicate a bug in the diff.
+fn align_markdown_events<'a>(
+    diff: Vec<AlignAction>,
+    source: Vec<Event<'a>>,
+    translated: Vec<Event<'a>>,
+) -> (Vec<Option<Event<'a>>>, Vec<Option<Event<'a>>>) {
+    // Consume both streams front to back while walking the diff.
+    let mut remaining_source = source.into_iter();
+    let mut remaining_translated = translated.into_iter();
+
+    // These store the aligned source and translation events wrapped in Some;
+    // if something is missing in one stream, a None is placed instead.
+    let mut aligned_source = Vec::with_capacity(diff.len());
+    let mut aligned_translated = Vec::with_capacity(diff.len());
+
+    for action in diff {
+        let (source_event, translated_event) = match action {
+            AlignAction::Source(_) => (remaining_source.next(), None),
+            AlignAction::Translation(_) => (None, remaining_translated.next()),
+            AlignAction::Both(_) => (remaining_source.next(), remaining_translated.next()),
+            AlignAction::Different(_, _) => {
+                // discard these elements and show this with None on both sides
+                let _discarded = (remaining_source.next(), remaining_translated.next());
+                (None, None)
+            }
+        };
+        // Pushing in lockstep keeps both aligned streams equal in length.
+        aligned_source.push(source_event);
+        aligned_translated.push(translated_event);
+    }
+
+    // both streams need to be empty, otherwise this would indicate a bug
+    assert!(remaining_source.next().is_none());
+    assert!(remaining_translated.next().is_none());
+    (aligned_source, aligned_translated)
+}
+
+/// Filter the aligned source and translation events to only return elements that are
+/// available in both. Positions where either side is None are dropped from both outputs.
+fn minimize_aligned_events<'a>(
+    source: Vec<Option<Event<'a>>>,
+    translated: Vec<Option<Event<'a>>>,
+) -> (Vec<Event<'a>>, Vec<Event<'a>>) {
+    source
+        .into_iter()
+        .zip(translated)
+        .filter_map(|(source_event, translated_event)| source_event.zip(translated_event))
+        .unzip()
+}
+
+/// This is a debug variant of minimize_aligned_events() that returns all events on
both sides +/// that don't have a pendant in the other document. This is mostly useful for debugging if the +/// markdown cannot be properly reconstructed. +fn debug_get_unaligned_events<'a>( + source: &'a [Option>], + translated: &'a [Option>], +) -> (Vec>>, Vec>>) { + source + .iter() + .zip(translated) + .filter_map(|(s, t)| { + if s.is_none() || t.is_none() { + Some((s.as_ref(), t.as_ref())) + } else { + None + } + }) + .unzip() +} + +/// This is the main worker function. +/// It aligns two markdown documents based on their structure. +/// +/// This function has the steps: +/// - read markdown structure from both documents (read_structure) +/// - prepare both event streams by removing content from structure elements (parse_structure) +/// - diff the structural elements (without content) +/// - apply the diff to both event streams that still contain content (align_markdown_events) +/// - minimize the aligned aligned markdown event streams by removing everything that is not in both (minimize_aligned_events) +/// - reconstruct the markdown from the minimized streams and return both documents (reconstruct_markdown) +pub fn align_markdown_docs( + source: &str, + translation: &str, + normalize: bool, + diff_algorithm: &DiffAlgorithm, +) -> anyhow::Result<(String, String)> { + let source_events = read_structure(source)?; + let translated_events = read_structure(translation)?; + // remove some events if normalization is used that are not needed for alignment, e.g. 
soft breaks + let source_events = if normalize { + normalize_events(source_events) + } else { + source_events + }; + let translated_events = if normalize { + normalize_events(translated_events) + } else { + translated_events + }; + + let source_structure = parse_structure(&source_events, false); + let translated_structure = parse_structure(&translated_events, false); + + let diff = diff_structure(&source_structure, &translated_structure, diff_algorithm); + + let (aligned_source, aligned_translated) = + align_markdown_events(diff, source_events, translated_events); + + let (minimized_source, minimized_translated) = + minimize_aligned_events(aligned_source.clone(), aligned_translated.clone()); + + let reconstructed_source = reconstruct_markdown( + &minimized_source + .iter() + .map(|event| (0_usize, (*event).clone())) + .collect::>(), + None, + ); + if let Err(e) = reconstructed_source { + error!( + "Reconstructing source markdown: {e:?}\n{:?}\n{:?}\n{:?}", + &aligned_source, + &aligned_translated, + debug_get_unaligned_events(&aligned_source, &aligned_translated) + ); + return Err(e.into()); + } + let reconstructed_source = reconstructed_source.unwrap(); + + let reconstructed_translated = reconstruct_markdown( + &minimized_translated + .iter() + .map(|event| (0_usize, (*event).clone())) + .collect::>(), + None, + ); + if let Err(e) = reconstructed_translated { + error!( + "Reconstructing translated markdown: {e:?}\n{:?}\n{:?}\n{:?}", + &aligned_source, + &aligned_translated, + debug_get_unaligned_events(&aligned_source, &aligned_translated) + ); + return Err(e.into()); + } + let reconstructed_translated = reconstructed_translated.unwrap(); + + Ok((reconstructed_source.0, reconstructed_translated.0)) +} + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + + use crate::structure::diff::diff_structure; + use crate::structure::types::{ + AlignAction, CmarkEvent, CmarkTagEnd, CmarkTagStart, DiffAlgorithm, + }; + use pulldown_cmark::{Event, HeadingLevel, Tag, TagEnd}; + 
+ use crate::structure::align::{ + align_markdown_docs, align_markdown_events, minimize_aligned_events, parse_structure, + read_structure, + }; + + /// test reading text into a pulldown_cmark::Event vector + #[test] + fn test_read_structure() { + let markdown_doc = "# Title 1 +First paragraph. Second sentence. + +Second paragraph. 2nd sentence. 3rd sentence. + "; + let got_markdown_events: Vec> = + read_structure(markdown_doc).unwrap(); + let want_markdown_events = [ + Event::Start(Tag::Heading { + level: HeadingLevel::H1, + id: None, + classes: vec![], + attrs: vec![], + }), + Event::Text(Cow::Borrowed("Title 1").into()), + Event::End(TagEnd::Heading(HeadingLevel::H1)), + Event::Start(Tag::Paragraph), + Event::Text(Cow::Borrowed("First paragraph. Second sentence.").into()), + Event::End(TagEnd::Paragraph), + Event::Start(Tag::Paragraph), + Event::Text(Cow::Borrowed("Second paragraph. 2nd sentence. 3rd sentence.").into()), + Event::End(TagEnd::Paragraph), + ]; + assert_eq!(got_markdown_events, want_markdown_events) + } + + /// test parsing the structure from text (without content and without the sentence feature). + /// reading the structure is assumed to be correct in this test. + #[test] + fn test_parse_structure_without_sentence() { + let markdown_doc = "# Title 1 +First paragraph. Second sentence. + +Second paragraph. 2nd sentence. 3rd sentence. 
+ "; + let events = read_structure(markdown_doc).unwrap(); + let got_markdown_events = parse_structure(&events, false); + let want_markdown_events = [ + CmarkEvent::Start(CmarkTagStart::Heading { + level: HeadingLevel::H1, + }), + CmarkEvent::Text, + CmarkEvent::End(CmarkTagEnd::Heading(HeadingLevel::H1)), + CmarkEvent::Start(CmarkTagStart::Paragraph), + CmarkEvent::Text, + CmarkEvent::End(CmarkTagEnd::Paragraph), + CmarkEvent::Start(CmarkTagStart::Paragraph), + CmarkEvent::Text, + CmarkEvent::End(CmarkTagEnd::Paragraph), + ]; + assert_eq!(got_markdown_events, want_markdown_events); + } + + /// test parsing the structure from text (without content but with the sentence feature) + /// reading the structure is assumed to be correct in this test. + #[test] + fn test_parse_structure_with_sentence() { + let markdown_doc = "# Title 1 +First paragraph. Second sentence. + +Second paragraph. 2nd sentence. 3rd sentence. + "; + let events = read_structure(markdown_doc).unwrap(); + let got_markdown_events = parse_structure(&events, true); + let want_markdown_events = [ + CmarkEvent::Start(CmarkTagStart::Heading { + level: HeadingLevel::H1, + }), + CmarkEvent::Text, + CmarkEvent::SentenceElement, + CmarkEvent::End(CmarkTagEnd::Heading(HeadingLevel::H1)), + CmarkEvent::Start(CmarkTagStart::Paragraph), + CmarkEvent::Text, + CmarkEvent::SentenceElement, + CmarkEvent::SentenceElement, + CmarkEvent::End(CmarkTagEnd::Paragraph), + CmarkEvent::Start(CmarkTagStart::Paragraph), + CmarkEvent::Text, + CmarkEvent::SentenceElement, + CmarkEvent::SentenceElement, + CmarkEvent::SentenceElement, + CmarkEvent::End(CmarkTagEnd::Paragraph), + ]; + assert_eq!(got_markdown_events, want_markdown_events); + } + + /// test if two documents with the same structure but different content are considered equal + #[test] + fn test_equal_structure() { + let original_doc = "# Title 1 +First paragraph. Second sentence. + +Second paragraph. 2nd sentence. 3rd sentence. 
+ "; + let translated_doc = "# Foobar et 1 +Bla Baz. Foobar bar 42. + +Baz Bla. Lorem. Ipsum. + "; + let original_structure = parse_structure(&read_structure(original_doc).unwrap(), true); + let translated_structure = parse_structure(&read_structure(translated_doc).unwrap(), true); + + assert_eq!(original_structure, translated_structure); + } + + /// test if the diff between two markdown source texts generates correct AlignActions. + /// Some text in the source is not in the translation and vice versa. This should show + /// up in the AlignActions + #[test] + fn test_diff_structure() { + let original_doc = "# Title 1 +translated sentence. + +untranslated sentence + +# Title 2 + "; + let translated_doc = "# Title 1 +Bla Baz. Foobar bar 42. + +# Title 2 + +new sentence"; + let original_structure = parse_structure(&read_structure(original_doc).unwrap(), false); + let translated_structure = parse_structure(&read_structure(translated_doc).unwrap(), false); + + let got_diff = diff_structure( + &original_structure, + &translated_structure, + &DiffAlgorithm::default(), + ); + let want_diff = [ + AlignAction::Both(CmarkEvent::Start(CmarkTagStart::Heading { + level: HeadingLevel::H1, + })), + AlignAction::Both(CmarkEvent::Text), + AlignAction::Both(CmarkEvent::End(CmarkTagEnd::Heading(HeadingLevel::H1))), + AlignAction::Both(CmarkEvent::Start(CmarkTagStart::Paragraph)), + AlignAction::Both(CmarkEvent::Text), + AlignAction::Both(CmarkEvent::End(CmarkTagEnd::Paragraph)), + AlignAction::Source(CmarkEvent::Start(CmarkTagStart::Paragraph)), + AlignAction::Source(CmarkEvent::Text), + AlignAction::Source(CmarkEvent::End(CmarkTagEnd::Paragraph)), + AlignAction::Both(CmarkEvent::Start(CmarkTagStart::Heading { + level: HeadingLevel::H1, + })), + AlignAction::Both(CmarkEvent::Text), + AlignAction::Both(CmarkEvent::End(CmarkTagEnd::Heading(HeadingLevel::H1))), + AlignAction::Translation(CmarkEvent::Start(CmarkTagStart::Paragraph)), + AlignAction::Translation(CmarkEvent::Text), + 
AlignAction::Translation(CmarkEvent::End(CmarkTagEnd::Paragraph)), + ]; + assert_eq!(got_diff, want_diff); + } + + /// test if two streams of pulldown_cmark::Events are diffed correctly + /// and aligned properly. The structure will be a vector of of Option + /// with None being inserted in a stream if an event in the other stream is not + /// available in it + #[test] + fn test_align_markdown_events() { + let translated_a = vec![ + Event::Start(Tag::Heading { + level: HeadingLevel::H1, + id: None, + classes: vec![], + attrs: vec![], + }), + Event::Text(Cow::Borrowed("Title 1").into()), + Event::End(TagEnd::Heading(HeadingLevel::H1)), + Event::Start(Tag::Paragraph), + Event::Text(Cow::Borrowed("to translate sentence").into()), + Event::End(TagEnd::Paragraph), + ]; + let untranslated_paragraph = vec![ + Event::Start(Tag::Paragraph), + Event::Text(Cow::Borrowed("untranslated sentence").into()), + Event::End(TagEnd::Paragraph), + ]; + let translated_b = vec![ + Event::Start(Tag::Heading { + level: HeadingLevel::H1, + id: None, + classes: vec![], + attrs: vec![], + }), + Event::Text(Cow::Borrowed("Title 2").into()), + Event::End(TagEnd::Heading(HeadingLevel::H1)), + ]; + let new_paragraph_in_translation = vec![ + Event::Start(Tag::Paragraph), + Event::Text(Cow::Borrowed("new sentence").into()), + Event::End(TagEnd::Paragraph), + ]; + + // this assumes that there is a new untranslated pragraph in between + let original_events = [&translated_a, &untranslated_paragraph, &translated_b] + .into_iter() + .flatten() + .cloned() + .collect::>(); + + // the untranslated paragraph is missing but a new sentence was added by the translator + let translated_events = [&translated_a, &translated_b, &new_paragraph_in_translation] + .into_iter() + .flatten() + .cloned() + .collect::>(); + let original_structure = parse_structure(&original_events, false); + let translated_structure = parse_structure(&translated_events, false); + let diff = diff_structure( + &original_structure, + 
&translated_structure, + &DiffAlgorithm::default(), + ); + + let (got_aligned_source, got_aligned_translated) = + align_markdown_events(diff, original_events, translated_events); + + let want_aligned_source: Vec<_> = translated_a + .iter() + .map(|e| Some(e.clone())) + .chain(untranslated_paragraph.iter().map(|e| Some(e.clone()))) + .chain(translated_b.iter().map(|e| Some(e.clone()))) + .chain(new_paragraph_in_translation.iter().map(|_| None)) + .collect(); + + let want_aligned_translated: Vec<_> = translated_a + .iter() + .map(|e| Some(e.clone())) + .chain(untranslated_paragraph.iter().map(|_| None)) + .chain(translated_b.iter().map(|e| Some(e.clone()))) + .chain(new_paragraph_in_translation.iter().map(|e| Some(e.clone()))) + .collect(); + + assert_eq!(got_aligned_source, want_aligned_source); + assert_eq!(got_aligned_translated, want_aligned_translated); + } + + /// E2E test for + /// - reading the pulldown_cmark::Events from text + /// - converting into content-less CmarkEvents to get the raw structure + /// - diff the structure and generate a stream of AlignActions + /// - align the pulldown_cmark::Events with the created AlignActions + /// - minimize these aligned events (keep only Events that occur in both docs) + /// - compare against a generated Event stream from a known good document + #[test] + fn test_align_markdown_events_full() { + let original_doc = "# Title 1 +translated sentence. + +untranslated sentence + +# Title 2 + "; + let translated_doc = "# Title 1 +Bla Baz. Foobar bar 42. 
+ +# Title 2 + +new sentence"; + let original_events = read_structure(original_doc).unwrap(); + let translated_events = read_structure(translated_doc).unwrap(); + let original_structure = parse_structure(&original_events, false); + let translated_structure = parse_structure(&translated_events, false); + let diff = diff_structure( + &original_structure, + &translated_structure, + &DiffAlgorithm::default(), + ); + + let (got_aligned_source, got_aligned_translated) = + align_markdown_events(diff, original_events, translated_events); + + let (got_aligned_source, got_aligned_translated) = + minimize_aligned_events(got_aligned_source, got_aligned_translated); + + let want_aligned_source = read_structure( + "# Title 1 +translated sentence. + +# Title 2", + ) + .unwrap() + .into_iter() + .collect::>(); + let want_aligned_translated = read_structure( + "# Title 1 +Bla Baz. Foobar bar 42. + +# Title 2", + ) + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(got_aligned_source, want_aligned_source); + assert_eq!(got_aligned_translated, want_aligned_translated); + } + + /// test minimizing the aligned event streams. 
+ /// This should emit only Events that occur in both streams + #[test] + fn test_minimize_aligned_events() { + let aligned_source = vec![ + Some(Event::Text(Cow::Borrowed("translated sentence").into())), + Some(Event::Text(Cow::Borrowed("untranslated sentence").into())), + None, + ]; + let aligned_translated = vec![ + Some(Event::Text(Cow::Borrowed("translated sentence").into())), + None, + Some(Event::Text(Cow::Borrowed("new sentence").into())), + ]; + let (got_aligned_source, got_aligned_translated) = + minimize_aligned_events(aligned_source, aligned_translated); + + let want = [Event::Text(Cow::Borrowed("translated sentence").into())]; + + assert_eq!(got_aligned_source, want); + assert_eq!(got_aligned_translated, want); + } + + /// full E2E test that is creating fully aligned markdown docs + /// containing only content that is available in both documents. + #[test] + fn test_align_markdown_docs() { + // original has one sentence more than translation in section 1 + // but translation has an added sentence in section 2 + let original_doc = "# source title 1 +translated source sentence. + +untranslated source sentence + +# source title 2 + +translated source sentence"; + let translated_doc = "# target title 1 + +translated target sentence + +# target title 2 + +translated target sentence + +new target sentence"; + + let (got_source, got_translated) = align_markdown_docs( + original_doc, + translated_doc, + true, + &DiffAlgorithm::default(), + ) + .unwrap(); + + // they should both have only the translated sentences + let want_source = "# source title 1 + +translated source sentence. 
+ +# source title 2 + +translated source sentence"; + let want_translated = "# target title 1 + +translated target sentence + +# target title 2 + +translated target sentence"; + + assert_eq!(got_source, want_source); + assert_eq!(got_translated, want_translated); + } +} diff --git a/i18n-book-to-po/src/structure/diff.rs b/i18n-book-to-po/src/structure/diff.rs new file mode 100644 index 0000000..19cf65e --- /dev/null +++ b/i18n-book-to-po/src/structure/diff.rs @@ -0,0 +1,74 @@ +use lcs_diff::DiffResult; + +use crate::structure::types::{AlignAction, CmarkEvent, DiffAlgorithm}; + +/// this diffs the structure in how the original needs to be modified in order to create the translation. +/// lcs_diff::diff() already does the job, but we transform its result into an easier-to-understand data structure +fn diff_structure_lcs(source: &[CmarkEvent], translated: &[CmarkEvent]) -> Vec { + lcs_diff::diff(translated, source) + .into_iter() + .map(|change| { + match change { + // this element does not exist in the original + DiffResult::Removed(diff_element) => AlignAction::Translation(diff_element.data), + // both sides are equal + DiffResult::Common(diff_element) => AlignAction::Both(diff_element.data), + // this element does not exist in the translation + DiffResult::Added(diff_element) => AlignAction::Source(diff_element.data), + } + }) + .collect() +} + +/// this diffs the structure in how the original needs to be modified in order to create the translation.
+/// We use the global alignment algorithm NeedlemanWunsch and transform the result into an understandable data structure +fn diff_structure_seal(source: &[CmarkEvent], translation: &[CmarkEvent]) -> Vec { + // equal is good, align operation is not good + let strategy = seal::pair::NeedlemanWunsch::new(1, -1, -1, 0); + let set: seal::pair::AlignmentSet = + seal::pair::AlignmentSet::new(translation.len(), source.len(), strategy, |x, y| { + translation[x] == source[y] + }) + .unwrap(); + let global_alignment = set.global_alignment(); + global_alignment + .steps() + .map(|step| { + // x indexes the translation and y indexes the source + match step { + // this element only exists in the translation and not in the source + seal::pair::Step::Delete { x } => { + let translation_element = translation.get(x).unwrap().clone(); + AlignAction::Translation(translation_element) + } + // both sides are aligned; they are either equal or differ + seal::pair::Step::Align { x, y } => { + let translation_element = translation.get(x).unwrap().clone(); + let source_element = source.get(y).unwrap().clone(); + if translation_element == source_element { + AlignAction::Both(translation_element) + } else { + AlignAction::Different(translation_element, source_element) + } + } + // this element only exists in the source and not in the translation + seal::pair::Step::Insert { y } => { + let source_element = source.get(y).unwrap().clone(); + AlignAction::Source(source_element) + } + } + }) + .collect() +} + +/// diff the structure of two content-less CmarkEvent streams with the specified algorithm +pub fn diff_structure( + source: &[CmarkEvent], + translated: &[CmarkEvent], + algorithm: &DiffAlgorithm, +) -> Vec { + match algorithm { + DiffAlgorithm::Lcs => diff_structure_lcs(source, translated), + DiffAlgorithm::NeedlemanWunsch => diff_structure_seal(source, translated), + } +} diff --git a/i18n-book-to-po/src/structure/mod.rs b/i18n-book-to-po/src/structure/mod.rs new file mode 100644 index
0000000..f97b71e --- /dev/null +++ b/i18n-book-to-po/src/structure/mod.rs @@ -0,0 +1,3 @@ +pub mod align; +mod diff; +pub mod types; diff --git a/i18n-book-to-po/src/structure/types.rs b/i18n-book-to-po/src/structure/types.rs new file mode 100644 index 0000000..c02388f --- /dev/null +++ b/i18n-book-to-po/src/structure/types.rs @@ -0,0 +1,186 @@ +use clap::ValueEnum; +use pulldown_cmark::{BlockQuoteKind, HeadingLevel, LinkType, MetadataBlockKind}; + +/// copy of pulldown_cmark::Tag without any data +#[derive(Debug, PartialEq, Clone)] +pub enum CmarkTagStart { + Paragraph, + Heading { level: HeadingLevel }, + BlockQuote(Option), + CodeBlock, + HtmlBlock, + List(Option), + Item, + FootnoteDefinition, + Table, + TableHead, + TableRow, + TableCell, + Emphasis, + Strong, + Strikethrough, + // link type might be worthwhile but this could also be different + // skipping for now + // Link { link_type: LinkType }, + Link, + Image { link_type: LinkType }, + MetadataBlock(MetadataBlockKind), + DefinitionList, + DefinitionListTitle, + DefinitionListDefinition, + Superscript, + Subscript, +} + +impl From<&pulldown_cmark::Tag<'_>> for CmarkTagStart { + fn from(value: &pulldown_cmark::Tag) -> Self { + match value { + pulldown_cmark::Tag::Paragraph => Self::Paragraph, + pulldown_cmark::Tag::Heading { level, .. } => Self::Heading { level: *level }, + pulldown_cmark::Tag::BlockQuote(kind) => Self::BlockQuote(*kind), + pulldown_cmark::Tag::CodeBlock(..) => Self::CodeBlock, + pulldown_cmark::Tag::HtmlBlock => Self::HtmlBlock, + pulldown_cmark::Tag::List(number) => Self::List(*number), + pulldown_cmark::Tag::Item => Self::Item, + pulldown_cmark::Tag::FootnoteDefinition(..) => Self::FootnoteDefinition, + pulldown_cmark::Tag::Table(..) 
=> Self::Table, + pulldown_cmark::Tag::TableHead => Self::TableHead, + pulldown_cmark::Tag::TableRow => Self::TableRow, + pulldown_cmark::Tag::TableCell => Self::TableCell, + pulldown_cmark::Tag::Emphasis => Self::Emphasis, + pulldown_cmark::Tag::Strong => Self::Strong, + pulldown_cmark::Tag::Strikethrough => Self::Strikethrough, + pulldown_cmark::Tag::Link { .. } => Self::Link, + pulldown_cmark::Tag::Image { link_type, .. } => Self::Image { + link_type: *link_type, + }, + pulldown_cmark::Tag::MetadataBlock(kind) => Self::MetadataBlock(*kind), + pulldown_cmark::Tag::DefinitionList => Self::DefinitionList, + pulldown_cmark::Tag::DefinitionListTitle => Self::DefinitionListTitle, + pulldown_cmark::Tag::DefinitionListDefinition => Self::DefinitionListDefinition, + pulldown_cmark::Tag::Superscript => Self::Superscript, + pulldown_cmark::Tag::Subscript => Self::Subscript, + } + } +} + +/// copy of pulldown_cmark::TagEnd without any data +#[derive(Debug, PartialEq, Clone)] +pub enum CmarkTagEnd { + Paragraph, + Heading(HeadingLevel), + BlockQuote(Option), + CodeBlock, + HtmlBlock, + List, + Item, + FootnoteDefinition, + Table, + TableHead, + TableRow, + TableCell, + Emphasis, + Strong, + Strikethrough, + Link, + Image, + MetadataBlock(MetadataBlockKind), + DefinitionList, + DefinitionListTitle, + DefinitionListDefinition, + Superscript, + Subscript, +} + +impl From<&pulldown_cmark::TagEnd> for CmarkTagEnd { + fn from(value: &pulldown_cmark::TagEnd) -> Self { + match value { + pulldown_cmark::TagEnd::Paragraph => Self::Paragraph, + pulldown_cmark::TagEnd::Heading(heading_level) => Self::Heading(*heading_level), + pulldown_cmark::TagEnd::BlockQuote(kind, ..) 
=> Self::BlockQuote(*kind), + pulldown_cmark::TagEnd::CodeBlock => Self::CodeBlock, + pulldown_cmark::TagEnd::HtmlBlock => Self::HtmlBlock, + pulldown_cmark::TagEnd::List(_) => Self::List, + pulldown_cmark::TagEnd::Item => Self::Item, + pulldown_cmark::TagEnd::FootnoteDefinition => Self::FootnoteDefinition, + pulldown_cmark::TagEnd::Table => Self::Table, + pulldown_cmark::TagEnd::TableHead => Self::TableHead, + pulldown_cmark::TagEnd::TableRow => Self::TableRow, + pulldown_cmark::TagEnd::TableCell => Self::TableCell, + pulldown_cmark::TagEnd::Emphasis => Self::Emphasis, + pulldown_cmark::TagEnd::Strong => Self::Strong, + pulldown_cmark::TagEnd::Strikethrough => Self::Strikethrough, + pulldown_cmark::TagEnd::Link => Self::Link, + pulldown_cmark::TagEnd::Image => Self::Image, + pulldown_cmark::TagEnd::MetadataBlock(kind) => Self::MetadataBlock(*kind), + pulldown_cmark::TagEnd::DefinitionList => Self::DefinitionList, + pulldown_cmark::TagEnd::DefinitionListTitle => Self::DefinitionListTitle, + pulldown_cmark::TagEnd::DefinitionListDefinition => Self::DefinitionListDefinition, + pulldown_cmark::TagEnd::Superscript => Self::Superscript, + pulldown_cmark::TagEnd::Subscript => Self::Subscript, + } + } +} + +/// copy of pulldown_cmark::Event without data +#[derive(Debug, PartialEq, Clone)] +pub enum CmarkEvent { + Start(CmarkTagStart), + End(CmarkTagEnd), + Text, + Code, + Html, + InlineHtml, + FootnoteReference, + SoftBreak, + HardBreak, + Rule, + TaskListMarker, + InlineMath, + DisplayMath, + /// custom variant to support more (paragraph) internal structure + SentenceElement, +} + +impl From<&pulldown_cmark::Event<'_>> for CmarkEvent { + fn from(value: &pulldown_cmark::Event) -> Self { + match value { + pulldown_cmark::Event::Start(start_tag) => Self::Start(start_tag.into()), + pulldown_cmark::Event::End(end_tag) => Self::End(end_tag.into()), + pulldown_cmark::Event::Text(_) => Self::Text, + pulldown_cmark::Event::Code(_) => Self::Code, + pulldown_cmark::Event::Html(_) 
=> Self::Html, + pulldown_cmark::Event::InlineHtml(_) => Self::InlineHtml, + pulldown_cmark::Event::FootnoteReference(_) => Self::FootnoteReference, + pulldown_cmark::Event::SoftBreak => Self::SoftBreak, + pulldown_cmark::Event::HardBreak => Self::HardBreak, + pulldown_cmark::Event::Rule => Self::Rule, + pulldown_cmark::Event::TaskListMarker(_) => Self::TaskListMarker, + pulldown_cmark::Event::InlineMath(_) => Self::InlineMath, + pulldown_cmark::Event::DisplayMath(_) => Self::DisplayMath, + } + } +} + +/// describes an action to modify the original documents. +/// Source(Event) indicates that Source has a Event element that is missing in the translation +/// Both(Event) indicates that both sides have the given element +#[derive(Debug, PartialEq)] +pub enum AlignAction { + /// only available in the source + Source(CmarkEvent), + /// only available in the translation + Translation(CmarkEvent), + /// available in source and translation + Both(CmarkEvent), + /// this element seems to have changed in the translation + Different(CmarkEvent, CmarkEvent), +} + +/// Supported Diff algorithms +#[derive(Default, Clone, ValueEnum)] +pub enum DiffAlgorithm { + Lcs, + #[default] + NeedlemanWunsch, +} diff --git a/i18n-book-to-po/testcases/original/introduction.md b/i18n-book-to-po/testcases/original/introduction.md new file mode 100644 index 0000000..3da11ad --- /dev/null +++ b/i18n-book-to-po/testcases/original/introduction.md @@ -0,0 +1,40 @@ +# Chapter 1: Introduction to Markdown + +This is the first chapter of our example book. It shows what this tool can do. + +## Subchapter + +All pages of this book are written in Markdown and can contain different +Markdown elements: + +- Item 1 +- Item 2 + - Sub-item 2.1 +- Item 3 + +You can also include links like [appendix][appendix] to other resources easily + +## Code Blocks + +Displaying code is straightforward. 
Here's an example in Rust: + +```rust +fn greet(name: &str) { + println!("Hello, {}!", name); +} + +fn main() { + greet("World"); +} +``` + +## Untranslated sub chapter + +This entire subchapter is not translated but the tool can deal with it and align +everything properly. + +### Further Examples + +You can use **bold text**, _italic text_, or even `inline code blocks`. + +[appendix]: some-appendix.html diff --git a/i18n-book-to-po/testcases/translation/introduction.md b/i18n-book-to-po/testcases/translation/introduction.md new file mode 100644 index 0000000..ec4bf13 --- /dev/null +++ b/i18n-book-to-po/testcases/translation/introduction.md @@ -0,0 +1,40 @@ +# Kapitel 1: Einführung in Markdown + +Dies ist das erste Kapitel unseres Beispielbuchs. Es zeigt, was dieses Tool +kann. + +## Unterkapitel + +Alle Seiten dieses Buches sind in Markdown geschrieben und kann verschiedene +Markdown Elemente enthalten: + +> Dieser Einschub der nur in der Übersetzung zu finden ist, wird ignoriert + +- Punkt 1 +- Punkt 2 + - Unterpunkt 2.1 +- Punkt 3 + +Sie können auch problemlos Links wie [Anhang][appendix] zu anderen Ressourcen +einfügen. + +## Codeblöcke + +Das Anzeigen von Code ist unkompliziert. Hier ist ein Beispiel in Rust: + +```rust +fn gruessen(name: &str) { + println!("Hallo, {}!", name); +} + +fn main() { + gruessen("Welt"); +} +``` + +### Weitere Beispiele + +Sie können **fettgedruckten Text**, _kursiven Text_ oder sogar +`inline Code Blöcke` verwenden. + +[appendix]: some-appendix.html diff --git a/i18n-helpers/src/xgettext.rs b/i18n-helpers/src/xgettext.rs index d2213c5..4b66798 100644 --- a/i18n-helpers/src/xgettext.rs +++ b/i18n-helpers/src/xgettext.rs @@ -41,7 +41,7 @@ fn strip_link(text: &str) -> String { }) .collect::>(); let (without_link, _) = reconstruct_markdown(&events, None) - .unwrap_or_else(|_| panic!("Couldn't strip link \"{}\"", text)); + .unwrap_or_else(|_| panic!("Couldn't strip link \"{text}\"")); without_link }