Skip to content

Commit b07e348

Browse files
committed
feat: add keepHtmlEntities option (closes #166)
1 parent 377b81c commit b07e348

File tree

5 files changed

+90
-20
lines changed

5 files changed

+90
-20
lines changed

cli/main.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,20 @@ constexpr const char *const DESCRIPTION =
6060
" -r, --replace\tOverwrite the output file (if it already exists) without "
6161
"asking.\n";
6262

63+
// Help text for the -E / --preserve-entities flag. printHelp() streams it
// directly after DESCRIPTION. The example entity is spelled out as the
// literal text "&nbsp;" so that it is actually visible in a terminal; a raw
// non-breaking space would render as blank.
constexpr const char *const EXTRA_OPTIONS =
    " -E, --preserve-entities\tKeep HTML entities (e.g. &nbsp;) in output.\n";
65+
6366
struct Options {
6467
bool print = false;
6568
bool replace = false;
69+
bool preserveEntities = false;
6670
string inputFile;
6771
string outputFile;
6872
string inputText;
6973
};
7074

7175
// Prints the usage text to stdout: the program name, followed by the
// DESCRIPTION block and then the EXTRA_OPTIONS block.
// The text is emitted exactly once; the older statement that printed only
// DESCRIPTION is superseded by this one.
void printHelp(const string &programName) {
  cout << programName << DESCRIPTION << EXTRA_OPTIONS;
}
7478

7579
// Writes "Version " followed by VERSION to stdout, then ends the line with
// endl, which also flushes the stream.
void printVersion() {
  cout << "Version ";
  cout << VERSION;
  cout << endl;
}
@@ -115,6 +119,8 @@ Options parseCommandLine(int argc, char **argv) {
115119
options.print = true;
116120
} else if (arg == "-r" || arg == "--replace") {
117121
options.replace = true;
122+
} else if (arg == "-E" || arg == "--preserve-entities") {
123+
options.preserveEntities = true;
118124
} else if (arg == "-o" || arg == "--output") {
119125
if (i + 1 < argc) {
120126
options.outputFile = argv[i + 1];
@@ -153,7 +159,10 @@ int main(int argc, char **argv) {
153159
return EXIT_FAILURE;
154160
}
155161

156-
html2md::Converter converter(input);
162+
// Pass CLI-driven option to the converter
163+
html2md::Options copt;
164+
copt.keepHtmlEntities = options.preserveEntities;
165+
html2md::Converter converter(input, &copt);
157166
string md = converter.convert();
158167

159168
if (options.print) {

include/html2md.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,22 @@ struct Options {
133133
*/
134134
bool escapeNumberedList = true;
135135

136+
/*!
137+
* \brief Whether to keep HTML entities (e.g. `&nbsp;`) in the output
138+
*
139+
* If true, the converter will not replace HTML entities configured in the
140+
* internal conversion map. Default is false (current behaviour).
141+
*/
142+
bool keepHtmlEntities = false;
143+
136144
inline bool operator==(html2md::Options o) const {
137145
return splitLines == o.splitLines && unorderedList == o.unorderedList &&
138146
orderedList == o.orderedList && includeTitle == o.includeTitle &&
139147
softBreak == o.softBreak && hardBreak == o.hardBreak &&
140148
formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
141149
compressWhitespace == o.compressWhitespace &&
142-
escapeNumberedList == o.escapeNumberedList;
150+
escapeNumberedList == o.escapeNumberedList &&
151+
keepHtmlEntities == o.keepHtmlEntities;
143152
};
144153
};
145154

python/bindings.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ PYBIND11_MODULE(pyhtml2md, m) {
3131
"into a single space")
3232
.def_readwrite("escapeNumberedList", &html2md::Options::escapeNumberedList,
3333
"Whether to escape numbered lists (e.g. '4.' -> '4\\.')")
34+
.def_readwrite("keepHtmlEntities", &html2md::Options::keepHtmlEntities,
35+
"Whether to keep HTML entities (e.g. '&nbsp;') in the output")
3436
.def("__eq__", &html2md::Options::operator==);
3537

3638
py::class_<html2md::Converter>(m, "Converter")

src/html2md.cpp

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -164,26 +164,32 @@ void Converter::CleanUpMarkdown() {
164164
std::string buffer;
165165
buffer.reserve(md_.size());
166166

167-
// Replace HTML symbols during the initial pass
168-
for (size_t i = 0; i < md_.size();) {
169-
bool replaced = false;
170-
171-
// C++11 compatible iteration over htmlSymbolConversions_
172-
for (const auto &symbol_replacement : htmlSymbolConversions_) {
173-
const std::string &symbol = symbol_replacement.first;
174-
const std::string &replacement = symbol_replacement.second;
175-
176-
if (md_.compare(i, symbol.size(), symbol) == 0) {
177-
buffer.append(replacement);
178-
i += symbol.size();
179-
replaced = true;
180-
break;
167+
// Replace HTML symbols during the initial pass unless the user requested
168+
// to keep HTML entities intact (e.g. keep `&nbsp;`)
169+
if (!option.keepHtmlEntities) {
170+
for (size_t i = 0; i < md_.size();) {
171+
bool replaced = false;
172+
173+
// C++11 compatible iteration over htmlSymbolConversions_
174+
for (const auto &symbol_replacement : htmlSymbolConversions_) {
175+
const std::string &symbol = symbol_replacement.first;
176+
const std::string &replacement = symbol_replacement.second;
177+
178+
if (md_.compare(i, symbol.size(), symbol) == 0) {
179+
buffer.append(replacement);
180+
i += symbol.size();
181+
replaced = true;
182+
break;
183+
}
181184
}
182-
}
183185

184-
if (!replaced) {
185-
buffer.push_back(md_[i++]);
186+
if (!replaced) {
187+
buffer.push_back(md_[i++]);
188+
}
186189
}
190+
} else {
191+
// Keep entities as-is: copy through without transforming
192+
buffer.append(md_);
187193
}
188194

189195
// Use swap instead of move assignment for better pre-C++11 compatibility

tests/main.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,49 @@ bool testTableFormatting() {
533533
return true;
534534
}
535535

536+
bool testPreserveNbsp() {
537+
testOption("preserveNbsp");
538+
539+
string html = R"(<p class="" style="text-autospace:none"><span style="font-size:11.0pt;mso-fareast-language:EN-US">redacted1</span></p>
540+
<p class=""><span style="font-size:11.0pt;mso-fareast-language:EN-US">&nbsp;</span></p>
541+
<p class=""><span style="font-size:11.0pt;mso-fareast-language:EN-US">&nbsp;</span></p>
542+
<div>
543+
<p class=""><span style="font-size:11.0pt;mso-ligatures:standardcontextual;mso-fareast-language:EN-US">&nbsp;</span></p>
544+
<p class="" style="text-autospace:none"><b><span style="font-size:10.0pt;font-family:&quot;Arial&quot;,sans-serif;color:#244061"><img width="239" height="30" style="width: 2.493in; height: 0.3125in; max-width: 100%;" id="" src=""></span></b><b><span style="font-size:10.0pt;font-family:&quot;Arial&quot;,sans-serif;color:#244061"></span></b></p>
545+
<p class="" style="text-autospace:none"><b><span style="font-size:10.0pt;font-family:&quot;Arial&quot;,sans-serif;color:#244061;mso-ligatures:standardcontextual">redacted2</span></b></p>)";
546+
547+
html2md::Options o;
548+
o.splitLines = false;
549+
o.keepHtmlEntities = true;
550+
551+
html2md::Converter c(html, &o);
552+
auto md = c.convert();
553+
554+
// We expect the output to contain HTML non-breaking spaces preserved
555+
size_t count = 0;
556+
size_t pos = 0;
557+
while ((pos = md.find("&nbsp;", pos)) != string::npos) {
558+
++count;
559+
pos += 6;
560+
}
561+
562+
if (count < 3) {
563+
cout << "Failed preserve &nbsp;: only " << count << " occurrences found\n"
564+
<< "Generated Markdown:\n" << md << "\n";
565+
return false;
566+
}
567+
568+
// Also ensure that redacted1 and redacted2 are present in correct order
569+
auto p1 = md.find("redacted1");
570+
auto p2 = md.find("redacted2");
571+
if (p1 == string::npos || p2 == string::npos || p1 >= p2) {
572+
cout << "redacted texts missing or in wrong order\n" << md << "\n";
573+
return false;
574+
}
575+
576+
return true;
577+
}
578+
536579
int main(int argc, const char **argv) {
537580
// List to store all markdown files in this dir
538581
vector<string> files;
@@ -596,6 +639,7 @@ int main(int argc, const char **argv) {
596639
&testInvalidTags,
597640
&testEscapingNumberedList,
598641
&testTableFormatting,
642+
&testPreserveNbsp,
599643
};
600644

601645
for (const auto &test : tests)

0 commit comments

Comments
 (0)