Skip to content

Commit f3f7e08

Browse files
committed
feat: add escapeNumberedList option (fixes #165)
1 parent 96ad1ab commit f3f7e08

File tree

6 files changed

+106
-10
lines changed

6 files changed

+106
-10
lines changed

include/html2md.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,22 @@ struct Options {
124124
*/
125125
bool compressWhitespace = false;
126126

127+
/*!
128+
* \brief Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent them
129+
* from being interpreted as lists in Markdown.
130+
*
131+
* When enabled, a '.' that directly follows digits at the start of a line is written as "\.".
132+
* Default is true.
133+
*/
134+
bool escapeNumberedList = true;
135+
127136
inline bool operator==(html2md::Options o) const {
128137
return splitLines == o.splitLines && unorderedList == o.unorderedList &&
129138
orderedList == o.orderedList && includeTitle == o.includeTitle &&
130139
softBreak == o.softBreak && hardBreak == o.hardBreak &&
131140
formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
132-
compressWhitespace == o.compressWhitespace;
141+
compressWhitespace == o.compressWhitespace &&
142+
escapeNumberedList == o.escapeNumberedList;
133143
};
134144
};
135145

python/bindings.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ PYBIND11_MODULE(pyhtml2md, m) {
2929
.def_readwrite("compressWhitespace", &html2md::Options::compressWhitespace,
3030
"Whether to compress whitespace (tabs, multiple spaces) "
3131
"into a single space")
32+
.def_readwrite("escapeNumberedList", &html2md::Options::escapeNumberedList,
33+
"Whether to escape numbered lists (e.g. '4.' -> '4\\.')")
3234
.def("__eq__", &html2md::Options::operator==);
3335

3436
py::class_<html2md::Converter>(m, "Converter")

reproduce_issue.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
11
import pyhtml2md
22

33
html = """
4-
<p>
5-
<strong>1. Title</strong></p>
6-
<p>
7-
paragraph with tabs and spaces</p>
4+
4.<br />
5+
Please implement as requested.
86
"""
97

108
print("--- Original Output ---")
119
converter = pyhtml2md.Converter(html)
12-
# Enable compressWhitespace as user mentioned it was part of their partial solution
13-
options = pyhtml2md.Options()
14-
options.compressWhitespace = True
15-
converter = pyhtml2md.Converter(html, options)
1610
print(converter.convert())
1711

18-
expected = "**1. Title**\n\nparagraph with tabs and spaces\n"
12+
expected = "4\\. \nPlease implement as requested.\n"
1913
print("\n--- Expected Output ---")
2014
print(expected)

src/html2md.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,35 @@ bool Converter::ParseCharInTagContent(char ch) {
583583
case '\\':
584584
appendToMd("\\\\");
585585
break;
586+
case '.': {
587+
bool is_ordered_list_start = false;
588+
if (chars_in_curr_line_ > 0) {
589+
size_t start_idx = md_.length() - chars_in_curr_line_;
590+
size_t idx = start_idx;
591+
// Skip leading whitespace on the current line
592+
while (idx < md_.length() && isspace(md_[idx])) {
593+
idx++;
594+
}
595+
// Check digits
596+
bool has_digits = false;
597+
while (idx < md_.length() && isdigit(md_[idx])) {
598+
has_digits = true;
599+
idx++;
600+
}
601+
// If only digits (after optional whitespace) precede this '.', it would start an ordered list
602+
if (has_digits && idx == md_.length()) {
603+
is_ordered_list_start = true;
604+
}
605+
}
606+
607+
if (is_ordered_list_start && option.escapeNumberedList) {
608+
appendToMd("\\.");
609+
} else {
610+
md_ += ch;
611+
++chars_in_curr_line_;
612+
}
613+
break;
614+
}
586615
default:
587616
md_ += ch;
588617
++chars_in_curr_line_;

tests/main.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,26 @@ bool testInvalidTags() {
439439
return true;
440440
}
441441

442+
bool testEscapingNumberedList() {
443+
testOption("escapingNumberedList");
444+
445+
string html = "4.<br />\nPlease implement as requested.";
446+
447+
html2md::Converter c(html);
448+
auto md = c.convert();
449+
450+
string expected = "4\\. \nPlease implement as requested.\n";
451+
452+
if (md != expected) {
453+
cout << "Failed to escape numbered list:\n"
454+
<< "Input: " << html << "\n"
455+
<< "Expected: " << expected << "\n"
456+
<< "Got: " << md << "\n";
457+
return false;
458+
}
459+
return true;
460+
}
461+
442462
int main(int argc, const char **argv) {
443463
// List to store all markdown files in this dir
444464
vector<string> files;
@@ -500,6 +520,7 @@ int main(int argc, const char **argv) {
500520
&testSelfClosingTags,
501521
&testZeroWidthSpaceWithBlockquote,
502522
&testInvalidTags,
523+
&testEscapingNumberedList,
503524
};
504525

505526
for (const auto &test : tests)

tests/python/test_option_escape.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import pytest
2+
import pyhtml2md
3+
4+
def test_escape_numbered_list_enabled_by_default():
5+
html = "4.<br />\nPlease implement as requested."
6+
expected = "4\\. \nPlease implement as requested.\n"
7+
assert pyhtml2md.convert(html) == expected
8+
9+
def test_escape_numbered_list_disabled():
10+
html = "4.<br />\nPlease implement as requested."
11+
options = pyhtml2md.Options()
12+
options.escapeNumberedList = False
13+
14+
# When disabled, it should be interpreted as a list item (or at least not escaped)
15+
# The original issue was that "4." became "1." because it was seen as a list.
16+
# So we expect "1. \nPlease..." or similar if it's interpreted as a list,
17+
# OR just "4. \n..." if it's not escaped but also not renumbered (depending on md4c/parser behavior).
18+
# However, based on the issue description: "The issue is, that this text is renumbered by Markdown and rendered as '1.\nPlease...'."
19+
# So if we disable escaping, we expect the output to contain "4. \n" which might then be rendered as "1." by a viewer,
20+
# BUT html2md output itself is what we check.
21+
# If we don't escape, html2md outputs "4. \n".
22+
23+
expected = "4. \nPlease implement as requested.\n"
24+
converter = pyhtml2md.Converter(html, options)
25+
assert converter.convert() == expected
26+
27+
def test_escape_numbered_list_with_other_content():
28+
html = "<p>1. Item</p>"
29+
# In a paragraph, it might be different.
30+
# But our fix is in ParseCharInTagContent which handles text content.
31+
# If it's "1. Item", it should be escaped to "1\. Item" if enabled.
32+
33+
expected = "1\\. Item\n"
34+
assert pyhtml2md.convert(html) == expected
35+
36+
options = pyhtml2md.Options()
37+
options.escapeNumberedList = False
38+
expected_disabled = "1. Item\n"
39+
converter = pyhtml2md.Converter(html, options)
40+
assert converter.convert() == expected_disabled

0 commit comments

Comments
 (0)