Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions docs/src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ impl Html {
description = document.metadata.description.clone();
}

// Apply minimal CJK-friendly preprocessing so emphasis works when
// CJK characters are adjacent to `*` or `_` without spaces.
let preprocessed = cjk_friendly_preprocess(text);
let text = preprocessed.as_str();

let options = md::Options::ENABLE_TABLES
| md::Options::ENABLE_FOOTNOTES
| md::Options::ENABLE_STRIKETHROUGH
Expand Down Expand Up @@ -97,6 +102,7 @@ impl Html {

let mut raw = String::new();
md::html::push_html(&mut raw, iter);
remove_zwsp_from_html(&mut raw);
raw.truncate(raw.trim_end().len());

Html {
Expand Down Expand Up @@ -156,6 +162,138 @@ struct Metadata {
description: Option<EcoString>,
}

/// Reports whether `ch` belongs to one of the CJK-related Unicode blocks that
/// the emphasis preprocessing treats as "CJK text".
///
/// Only the blocks listed in the table are recognised; every other character
/// (including ideographs outside these ranges) yields `false`.
fn is_cjk(ch: char) -> bool {
    // Inclusive (first, last) bounds of each recognised block.
    const BLOCKS: [(char, char); 7] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
        ('\u{FF00}', '\u{FFEF}'), // Fullwidth forms
    ];

    BLOCKS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}

fn cjk_friendly_preprocess(input: &str) -> String {
let mut out = String::with_capacity(input.len());
let mut in_fence = false;
let mut fence_ticks = 0usize;

for line in input.lines() {
// detect fenced code block boundaries (```...)
if !in_fence && line.starts_with("```") {
in_fence = true;
fence_ticks = line.chars().take_while(|&c| c == '`').count();
out.push_str(line);
out.push('\n');
continue;
} else if in_fence && line.starts_with(&"`".repeat(fence_ticks)) {
Comment on lines +182 to +191
Copy link

Copilot AI Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The \"".repeat(fence_ticks)` creates a new String allocation on each line check inside fenced blocks. Consider pre-computing this string or using a more efficient comparison method.

Suggested change
for line in input.lines() {
// detect fenced code block boundaries (```...)
if !in_fence && line.starts_with("```") {
in_fence = true;
fence_ticks = line.chars().take_while(|&c| c == '`').count();
out.push_str(line);
out.push('\n');
continue;
} else if in_fence && line.starts_with(&"`".repeat(fence_ticks)) {
let mut fence_str = String::new();
for line in input.lines() {
// detect fenced code block boundaries (```...)
if !in_fence && line.starts_with("```") {
in_fence = true;
fence_ticks = line.chars().take_while(|&c| c == '`').count();
fence_str = "`".repeat(fence_ticks);
out.push_str(line);
out.push('\n');
continue;
} else if in_fence && line.starts_with(&fence_str) {

Copilot uses AI. Check for mistakes.

in_fence = false;
out.push_str(line);
out.push('\n');
continue;
}

if in_fence {
out.push_str(line);
out.push('\n');
continue;
}

let chars: Vec<char> = line.chars().collect();
Copy link

Copilot AI Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Collecting all characters into a Vec for each line could be memory-intensive for large documents. Consider using char_indices() iterator to avoid the allocation while still supporting indexing.

Copilot uses AI. Check for mistakes.

let mut i = 0usize;
let mut inline_backtick_len: Option<usize> = None;

while i < chars.len() {
let c = chars[i];

// handle inline code spans using backtick lengths
if c == '`' {
let mut j = i;
while j < chars.len() && chars[j] == '`' {
j += 1;
}
let n = j - i;
// toggle inline span when same length backticks close it
if inline_backtick_len.is_none() {
inline_backtick_len = Some(n);
} else if inline_backtick_len == Some(n) {
inline_backtick_len = None;
}
for _ in 0..n {
out.push('`');
}
i = j;
continue;
}

// if inside inline code, copy verbatim
if inline_backtick_len.is_some() {
out.push(c);
i += 1;
continue;
}

// emphasis markers
if c == '*' || c == '_' {
// count run length
let marker = c;
let mut j = i;
while j < chars.len() && chars[j] == marker {
j += 1;
}
let next_char = chars.get(j).copied();

// find previous non-newline char from out
let prev_char = out.chars().rev().find(|ch| *ch != '\n' && *ch != '\r');

let prev_is_cjk = prev_char.map(is_cjk).unwrap_or(false);
let next_is_cjk = next_char.map(is_cjk).unwrap_or(false);

if prev_is_cjk && prev_char.map(|ch| !ch.is_whitespace()).unwrap_or(false)
{
// Insert a numeric HTML entity for ZWSP so it survives
// Markdown parsing and becomes an invisible character in
// the final HTML.
out.push_str("&#8203;");
}

for _ in 0..(j - i) {
out.push(marker);
}

if next_is_cjk && next_char.map(|ch| !ch.is_whitespace()).unwrap_or(false)
{
out.push_str("&#8203;");
}

i = j;
continue;
}

out.push(c);
i += 1;
}

out.push('\n');
}

out
}

/// Strips every zero-width-space marker from rendered HTML, in place.
///
/// Three spellings are removed: the HTML-escaped entity `&amp;#8203;`, the
/// numeric entity `&#8203;`, and the literal character U+200B. Any other text,
/// including other entities and bare `&`, is kept as is.
///
/// The string is scanned once, left to right; text that only starts to look
/// like a marker because a neighbouring marker was removed is not rescanned.
/// `s` is reassigned only when at least one marker was actually found.
fn remove_zwsp_from_html(s: &mut String) {
    // Every removable spelling starts with one of these characters.
    const TRIGGERS: &[char] = &['&', '\u{200B}'];
    const REMOVABLE: [&str; 3] = ["&amp;#8203;", "&#8203;", "\u{200B}"];

    let mut cleaned = String::new();
    let mut stripped = false;
    let mut rest = s.as_str();

    while let Some(at) = rest.find(TRIGGERS) {
        let (head, tail) = rest.split_at(at);
        cleaned.push_str(head);
        match REMOVABLE.iter().find_map(|pat| tail.strip_prefix(*pat)) {
            Some(after) => {
                stripped = true;
                rest = after;
            }
            None => {
                // A literal U+200B always matches above, so the trigger here
                // is an `&` that starts something else; keep it and move on.
                cleaned.push('&');
                rest = &tail[1..];
            }
        }
    }

    if stripped {
        cleaned.push_str(rest);
        *s = cleaned;
    }
}

struct Handler<'a> {
text: &'a str,
resolver: &'a dyn Resolver,
Expand Down Expand Up @@ -515,3 +653,26 @@ impl World for DocWorld {
Some(Datetime::from_ymd(1970, 1, 1).unwrap())
}
}

#[cfg(test)]
mod tests {
    use super::{cjk_friendly_preprocess, remove_zwsp_from_html};

    /// Emphasis markers wedged between CJK characters must gain the
    /// zero-width-space entity during preprocessing.
    #[test]
    fn cjk_preprocess_inserts_entity() {
        let processed = cjk_friendly_preprocess("これは**強調**です。");
        let has_entity = processed.contains("&#8203;");
        assert!(
            has_entity,
            "preprocessed output did not contain &#8203;: {}",
            processed
        );
    }

    /// The numeric entity, its escaped form, and the raw character are all
    /// removed from rendered HTML.
    #[test]
    fn remove_zwsp_from_html_cleans() {
        let mut rendered = "<p>foo&#8203;bar&amp;#8203;baz\u{200B}</p>".to_string();
        remove_zwsp_from_html(&mut rendered);
        assert_eq!(rendered, "<p>foobarbaz</p>");
    }
}