feat: add escapeNumberedList option (fixes #165)

tim-gromeyer · tim-gromeyer · commit f3f7e08ffcb0 · 2025-11-25T08:42:42.000+01:00
diff --git a/include/html2md.h b/include/html2md.h
@@ -124,12 +124,22 @@ struct Options {
    */
   bool compressWhitespace = false;
 
+  /*!
+   * \brief Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent them
+   * from being interpreted as lists in Markdown.
+   *
+   * When enabled, a '.' that directly follows a line-leading run of digits
+   * is written as "\.". Default is true.
+   */
+  bool escapeNumberedList = true;
+
   inline bool operator==(html2md::Options o) const {
     return splitLines == o.splitLines && unorderedList == o.unorderedList &&
            orderedList == o.orderedList && includeTitle == o.includeTitle &&
            softBreak == o.softBreak && hardBreak == o.hardBreak &&
            formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
-           compressWhitespace == o.compressWhitespace;
+           compressWhitespace == o.compressWhitespace &&
+           escapeNumberedList == o.escapeNumberedList;
   };
 };
 
diff --git a/python/bindings.cpp b/python/bindings.cpp
@@ -29,6 +29,8 @@ PYBIND11_MODULE(pyhtml2md, m) {
       .def_readwrite("compressWhitespace", &html2md::Options::compressWhitespace,
                      "Whether to compress whitespace (tabs, multiple spaces) "
                      "into a single space")
+      .def_readwrite("escapeNumberedList", &html2md::Options::escapeNumberedList,
+                     "Whether to escape numbered lists (e.g. '4.' -> '4\\.')")
       .def("__eq__", &html2md::Options::operator==);
 
   py::class_<html2md::Converter>(m, "Converter")
diff --git a/reproduce_issue.py b/reproduce_issue.py
@@ -1,20 +1,14 @@
 import pyhtml2md
 
 html = """
-<p>
-	<strong>1. Title</strong></p>
-<p>
-	paragraph   with tabs	and    spaces</p>
+4.<br />
+Please implement as requested.
 """
 
 print("--- Original Output ---")
 converter = pyhtml2md.Converter(html)
-# Enable forceLeftTrim as user mentioned it was part of their partial solution
-options = pyhtml2md.Options()
-options.compressWhitespace = True
-converter = pyhtml2md.Converter(html, options)
 print(converter.convert())
 
-expected = "**1. Title**\n\nparagraph with tabs and spaces\n"
+expected = "4\\.  \nPlease implement as requested.\n"
 print("\n--- Expected Output ---")
 print(expected)
diff --git a/src/html2md.cpp b/src/html2md.cpp
@@ -583,6 +583,35 @@ bool Converter::ParseCharInTagContent(char ch) {
   case '\\':
     appendToMd("\\\\");
     break;
+  case '.': {
+    // "4." at the start of a line reads as an ordered-list marker in
+    // Markdown, so such a '.' is escaped when escapeNumberedList is set.
+    bool is_ordered_list_start = false;
+    if (option.escapeNumberedList && chars_in_curr_line_ > 0) {
+      size_t idx = md_.length() - chars_in_curr_line_;
+      // <cctype> classifiers are undefined for negative arguments, so each
+      // char is converted to unsigned char first (non-ASCII bytes).
+      while (idx < md_.length() &&
+             isspace(static_cast<unsigned char>(md_[idx]))) {
+        ++idx;
+      }
+      // Consume the digit run; a marker needs at least one digit.
+      const size_t digits_begin = idx;
+      while (idx < md_.length() &&
+             isdigit(static_cast<unsigned char>(md_[idx]))) {
+        ++idx;
+      }
+      // Match only when the digits run right up to this '.'.
+      is_ordered_list_start = idx > digits_begin && idx == md_.length();
+    }
+    if (is_ordered_list_start) {
+      appendToMd("\\.");
+    } else {
+      md_ += ch;
+      ++chars_in_curr_line_;
+    }
+    break;
+  }
   default:
     md_ += ch;
     ++chars_in_curr_line_;
diff --git a/tests/main.cpp b/tests/main.cpp
@@ -439,6 +439,26 @@ bool testInvalidTags() {
   return true;
 }
 
+// Checks that a line-leading "4." is written as "4\." with default options.
+bool testEscapingNumberedList() {
+  testOption("escapingNumberedList");
+
+  string html = "4.<br />\nPlease implement as requested.";
+  const string expected = "4\\.  \nPlease implement as requested.\n";
+
+  html2md::Converter c(html);
+  const auto md = c.convert();
+
+  if (md == expected)
+    return true;
+
+  cout << "Failed to escape numbered list:\n"
+       << "Input: " << html << "\n"
+       << "Expected: " << expected << "\n"
+       << "Got: " << md << "\n";
+  return false;
+}
+
 int main(int argc, const char **argv) {
   // List to store all markdown files in this dir
   vector<string> files;
@@ -500,6 +520,7 @@ int main(int argc, const char **argv) {
                 &testSelfClosingTags,
                 &testZeroWidthSpaceWithBlockquote,
                 &testInvalidTags,
+                &testEscapingNumberedList,
               };
 
   for (const auto &test : tests)
diff --git a/tests/python/test_option_escape.py b/tests/python/test_option_escape.py
@@ -0,0 +1,40 @@
+import pytest
+import pyhtml2md
+
+def test_escape_numbered_list_enabled_by_default():
+    # No Options passed: the line-leading "4." must come back as "4\.".
+    actual = pyhtml2md.convert("4.<br />\nPlease implement as requested.")
+    assert actual == "4\\.  \nPlease implement as requested.\n"
+
+def test_escape_numbered_list_disabled():
+    """With escapeNumberedList off, the leading "4." is emitted unescaped.
+
+    The original report was that Markdown viewers renumber such a line and
+    show it as "1."; that happens at render time. This test only pins the
+    raw html2md output, which keeps the digits and the plain '.'.
+    """
+    html = "4.<br />\nPlease implement as requested."
+    expected = "4.  \nPlease implement as requested.\n"
+
+    opts = pyhtml2md.Options()
+    opts.escapeNumberedList = False
+
+    converter = pyhtml2md.Converter(html, opts)
+    actual = converter.convert()
+
+    assert actual == expected
+
+def test_escape_numbered_list_with_other_content():
+    """A numbered prefix followed by more text inside <p> is escaped too."""
+    html = "<p>1. Item</p>"
+
+    # Default options: the '.' after the line-leading "1" gets a backslash.
+    escaped = pyhtml2md.convert(html)
+    assert escaped == "1\\. Item\n"
+
+    # Escaping switched off: the text passes through unchanged.
+    opts = pyhtml2md.Options()
+    opts.escapeNumberedList = False
+    converter = pyhtml2md.Converter(html, opts)
+    unescaped = converter.convert()
+    assert unescaped == "1. Item\n"