Skip to content

Commit 492e509

Browse files
committed
Fix: support re-translation of document-only text in the Client.
1 parent eb482e1 commit 492e509

File tree

3 files changed

+61
-44
lines changed

3 files changed

+61
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Changelog
2222
[Github master](https://github.com/bjones1/CodeChat_Editor)
2323
--------------------------------------------------------------------------------
2424

25-
* No changes.
25+
* Correctly re-translate Markdown documents.
2626

2727
Version 0.1.47 -- 2025-Dec-19
2828
--------------------------------------------------------------------------------

server/src/processing.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1063,7 +1063,7 @@ fn html_to_tree(html: &str) -> io::Result<Rc<Node>> {
10631063

10641064
// A framework to transform HTML by parsing it to a DOM tree, walking the tree,
10651065
// then serializing the tree back to an HTML string.
1066-
fn transform_html<T: FnOnce(Rc<Node>)>(html: &str, transform: T) -> io::Result<String> {
1066+
pub fn transform_html<T: FnOnce(Rc<Node>)>(html: &str, transform: T) -> io::Result<String> {
10671067
let tree = html_to_tree(html)?;
10681068
transform(tree.clone());
10691069

server/src/translation.rs

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ use crate::{
226226
CodeChatForWeb, CodeMirror, CodeMirrorDiff, CodeMirrorDiffable, CodeMirrorDocBlock,
227227
CodeMirrorDocBlockVec, SourceFileMetadata, TranslationResultsString,
228228
codechat_for_web_to_source, diff_code_mirror_doc_blocks, diff_str,
229-
source_to_codechat_for_web_string,
229+
source_to_codechat_for_web_string, transform_html,
230230
},
231231
queue_send, queue_send_func,
232232
webserver::{
@@ -1052,14 +1052,12 @@ impl TranslationTask {
10521052
// Translate back to the Client to see if there are
10531053
// any changes after this conversion. Check both
10541054
// CodeChat documents and Markdown docs.
1055-
if cfw.metadata.mode != MARKDOWN_MODE
1056-
&& let Ok(ccfws) = source_to_codechat_for_web_string(
1057-
&new_source_code,
1058-
&clean_file_path,
1059-
cfw.version,
1060-
false,
1061-
)
1062-
&& let TranslationResultsString::CodeChat(ccfw) = ccfws.0
1055+
if let Ok(ccfws) = source_to_codechat_for_web_string(
1056+
&new_source_code,
1057+
&clean_file_path,
1058+
cfw.version,
1059+
false,
1060+
) && let TranslationResultsString::CodeChat(ccfw) = ccfws.0
10631061
&& let CodeMirrorDiffable::Plain(code_mirror_translated) =
10641062
ccfw.source
10651063
&& self.sent_full
@@ -1068,27 +1066,31 @@ impl TranslationTask {
10681066
// changes (such as line wrapping in doc blocks
10691067
// which changes line numbering, creation of a
10701068
// new doc block from previous code block text,
1071-
// or updates from future document intelligence
1072-
// such as renamed headings, etc.) For doc
1073-
// blocks that haven't been edited by TinyMCE,
1074-
// this is easy; equality is sufficient. Doc
1075-
// blocks that have been edited are a different
1076-
// case: TinyMCE removes newlines, causing a lot
1077-
// of "changes" to re-insert these. Therefore,
1078-
// use the following approach:
1069+
// insertion of math, or updates from future
1070+
// document intelligence such as renamed
1071+
// headings, etc.). Note that strict HTML
1072+
// comparison fails, since TinyMCE modifies
1073+
// whitespace and some characters; for example, `👨‍👦` becomes `👨&zwj;👦` after processing by TinyMCE. Therefore, HTML processed by TinyMCE needs to be normalized before it can be compared.
1074+
//
1075+
// 1. Document-only: HTML is stored in `.doc`;
1076+
// ignore `.doc_blocks`.
1077+
// 2. Normal mode: HTML is stored in
1078+
// `.doc_blocks`; perform a binary compare of
1079+
// `.doc`, then an HTML comparison of `.doc_blocks`.
10791080
//
1080-
// 1. Compare the `doc` values. If they differ,
1081-
// then the Client needs an update.
1082-
// 2. Compare each code block using simple
1083-
// equality. If this fails, compare the doc
1084-
// block text excluding newlines. If still
1085-
// different, then the Client needs an
1086-
// update.
1087-
if code_mirror_translated.doc != self.code_mirror_doc
1088-
|| !doc_block_compare(
1089-
&code_mirror_translated.doc_blocks,
1090-
self.code_mirror_doc_blocks.as_ref().unwrap(),
1091-
)
1081+
// It looks like comparing HTML is risky, since TinyMCE (or something) stores emojis differently. Perhaps we need to compare Markdown instead of HTML? Or HTML after processing? The second seems easier.
1082+
let is_markdown_mode = cfw.metadata.mode == MARKDOWN_MODE;
1083+
if (is_markdown_mode
1084+
&& !compare_html(
1085+
&code_mirror_translated.doc,
1086+
&self.code_mirror_doc,
1087+
))
1088+
|| (!is_markdown_mode
1089+
&& (code_mirror_translated.doc != self.code_mirror_doc
1090+
|| !doc_block_compare(
1091+
&code_mirror_translated.doc_blocks,
1092+
self.code_mirror_doc_blocks.as_ref().unwrap(),
1093+
)))
10921094
{
10931095
// Use a whole number to avoid encoding
10941096
// differences with fractional values.
@@ -1192,6 +1194,31 @@ fn eol_convert(s: String, eol_type: &EolType) -> String {
11921194
}
11931195
}
11941196

1197+
// TinyMCE replaces newlines inside paragraphs with a space and (I think) avoids surrogate pairs by breaking them into a series of UTF-16 characters. Therefore, to compare HTML, normalize HTML touched by TinyMCE first.
1198+
fn compare_html(
1199+
// A string containing HTML that's been normalized by html5ever.
1200+
normalized_html: &str,
1201+
// A string containing HTML that's not normalized; for example, data after processing by TinyMCE.
1202+
raw_html: &str,
1203+
) -> bool {
1204+
// The normalized HTML is word-wrapped, while the raw HTML is not. Map newlines to spaces so that the comparison ignores this difference.
1205+
fn map_newlines_to_spaces<'a>(
1206+
s: &'a str,
1207+
) -> std::iter::Map<std::str::Chars<'a>, impl FnMut(char) -> char> {
1208+
s.trim()
1209+
.chars()
1210+
.map(|c: char| if c == '\n' { ' ' } else { c })
1211+
}
1212+
1213+
// Transforming the HTML with an empty transform normalizes it but leaves it otherwise unchanged.
1214+
if let Ok(normalized_raw_html) = transform_html(raw_html, |_node| {}) {
1215+
// Ignore word wrapping and leading/trailing whitespace in the comparison.
1216+
map_newlines_to_spaces(normalized_html).eq(map_newlines_to_spaces(&normalized_raw_html))
1217+
} else {
1218+
false
1219+
}
1220+
}
1221+
11951222
// Given two vectors of doc blocks, compare them, ignoring newlines.
11961223
fn doc_block_compare(a: &CodeMirrorDocBlockVec, b: &CodeMirrorDocBlockVec) -> bool {
11971224
if a.len() != b.len() {
@@ -1205,22 +1232,12 @@ fn doc_block_compare(a: &CodeMirrorDocBlockVec, b: &CodeMirrorDocBlockVec) -> bo
12051232
&& a.to == b.to
12061233
&& a.indent == b.indent
12071234
&& a.delimiter == b.delimiter
1208-
&& (a.contents == b.contents
1209-
// TinyMCE replaces newlines inside paragraphs with a space; for
1210-
// a crude comparison, translate all newlines back to spaces,
1211-
// then ignore leading/trailing newlines.
1212-
|| map_newlines_to_spaces(&a.contents).eq(map_newlines_to_spaces(&b.contents)))
1235+
// Most doc blocks haven't been touched by TinyMCE. Try a fast binary compare first.
1236+
&& (a.contents == b.contents ||
1237+
compare_html(&a.contents, &b.contents))
12131238
})
12141239
}
12151240

1216-
fn map_newlines_to_spaces<'a>(
1217-
s: &'a str,
1218-
) -> std::iter::Map<std::str::Chars<'a>, impl FnMut(char) -> char> {
1219-
s.trim()
1220-
.chars()
1221-
.map(|c: char| if c == '\n' { ' ' } else { c })
1222-
}
1223-
12241241
// Provide a simple debug function that prints only the first
12251242
// `MAX_MESSAGE_LENGTH` characters of the provided value.
12261243
fn debug_shorten<T: Debug>(val: T) -> String {

0 commit comments

Comments
 (0)