Skip to content

Commit e028400

Browse files
committed
qownnotes/web-companion#35 htmltomarkdown: start reimplementation
Signed-off-by: Patrizio Bekerle <[email protected]>
1 parent 5cd13e8 commit e028400

File tree

3 files changed

+305
-21
lines changed

3 files changed

+305
-21
lines changed

src/utils/misc.cpp

Lines changed: 234 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,144 @@ QString Utils::Misc::htmlToMarkdown(QString text) {
659659
QRegularExpression::CaseInsensitiveOption |
660660
QRegularExpression::DotMatchesEverythingOption));
661661

662+
// Handle tables - must be done before other replacements
663+
// Convert HTML tables to Markdown tables
664+
QRegularExpression tableRe(QStringLiteral("<table[^>]*?>(.*?)<\\/table>"),
665+
QRegularExpression::CaseInsensitiveOption |
666+
QRegularExpression::DotMatchesEverythingOption);
667+
QRegularExpressionMatchIterator tableIt = tableRe.globalMatch(text);
668+
669+
while (tableIt.hasNext()) {
670+
QRegularExpressionMatch tableMatch = tableIt.next();
671+
QString tableHtml = tableMatch.captured(1);
672+
QString markdownTable;
673+
674+
// Extract rows (handle both thead/tbody and plain tr)
675+
QRegularExpression rowRe(QStringLiteral("<tr[^>]*?>(.*?)<\\/tr>"),
676+
QRegularExpression::CaseInsensitiveOption |
677+
QRegularExpression::DotMatchesEverythingOption);
678+
QRegularExpressionMatchIterator rowIt = rowRe.globalMatch(tableHtml);
679+
680+
QStringList rows;
681+
bool isFirstRow = true;
682+
int columnCount = 0;
683+
684+
while (rowIt.hasNext()) {
685+
QRegularExpressionMatch rowMatch = rowIt.next();
686+
QString rowHtml = rowMatch.captured(1);
687+
688+
// Extract cells (th or td)
689+
QRegularExpression cellRe(QStringLiteral("<(th|td)[^>]*?>(.*?)<\\/\\1>"),
690+
QRegularExpression::CaseInsensitiveOption |
691+
QRegularExpression::DotMatchesEverythingOption);
692+
QRegularExpressionMatchIterator cellIt = cellRe.globalMatch(rowHtml);
693+
694+
QStringList cells;
695+
while (cellIt.hasNext()) {
696+
QRegularExpressionMatch cellMatch = cellIt.next();
697+
QString cellContent = cellMatch.captured(2);
698+
699+
// Remove inner HTML tags from cell content
700+
cellContent.remove(QRegularExpression(QStringLiteral("<[^>]+>"),
701+
QRegularExpression::CaseInsensitiveOption));
702+
cellContent = cellContent.trimmed();
703+
704+
// Escape pipe characters in cell content
705+
cellContent.replace(QStringLiteral("|"), QStringLiteral("\\|"));
706+
707+
cells.append(cellContent);
708+
}
709+
710+
if (!cells.isEmpty()) {
711+
columnCount = qMax(columnCount, cells.count());
712+
rows.append(QStringLiteral("| ") + cells.join(QStringLiteral(" | ")) + QStringLiteral(" |"));
713+
714+
// Add separator row after first row (header)
715+
if (isFirstRow) {
716+
QStringList separators;
717+
for (int i = 0; i < cells.count(); ++i) {
718+
separators.append(QStringLiteral("---"));
719+
}
720+
rows.append(QStringLiteral("| ") + separators.join(QStringLiteral(" | ")) + QStringLiteral(" |"));
721+
isFirstRow = false;
722+
}
723+
}
724+
}
725+
726+
if (!rows.isEmpty()) {
727+
markdownTable = QStringLiteral("\n\n") + rows.join(QStringLiteral("\n")) + QStringLiteral("\n\n");
728+
text.replace(tableMatch.captured(0), markdownTable);
729+
}
730+
}
731+
732+
// Handle strikethrough
733+
text.replace(QRegularExpression(QStringLiteral("<(s|strike|del).*?>(.+?)<\\/\\1>"),
734+
QRegularExpression::CaseInsensitiveOption |
735+
QRegularExpression::DotMatchesEverythingOption),
736+
QStringLiteral("~~\\2~~"));
737+
738+
// Handle underline (convert to emphasis since Markdown doesn't have native underline)
739+
text.replace(QRegularExpression(QStringLiteral("<u.*?>(.+?)<\\/u>"),
740+
QRegularExpression::CaseInsensitiveOption |
741+
QRegularExpression::DotMatchesEverythingOption),
742+
QStringLiteral("_\\1_"));
743+
744+
// Handle mark/highlight
745+
text.replace(QRegularExpression(QStringLiteral("<mark.*?>(.+?)<\\/mark>"),
746+
QRegularExpression::CaseInsensitiveOption |
747+
QRegularExpression::DotMatchesEverythingOption),
748+
QStringLiteral("==\\1=="));
749+
750+
// Handle subscript and superscript
751+
text.replace(QRegularExpression(QStringLiteral("<sub.*?>(.+?)<\\/sub>"),
752+
QRegularExpression::CaseInsensitiveOption |
753+
QRegularExpression::DotMatchesEverythingOption),
754+
QStringLiteral("~\\1~"));
755+
text.replace(QRegularExpression(QStringLiteral("<sup.*?>(.+?)<\\/sup>"),
756+
QRegularExpression::CaseInsensitiveOption |
757+
QRegularExpression::DotMatchesEverythingOption),
758+
QStringLiteral("^\\1^"));
759+
760+
// Handle blockquote
761+
text.replace(QRegularExpression(QStringLiteral("<blockquote.*?>(.+?)<\\/blockquote>"),
762+
QRegularExpression::CaseInsensitiveOption |
763+
QRegularExpression::DotMatchesEverythingOption),
764+
QStringLiteral("\n\n> \\1\n\n"));
765+
766+
// Handle horizontal rule
767+
text.replace(QRegularExpression(QStringLiteral("<hr.*?/?>"),
768+
QRegularExpression::CaseInsensitiveOption),
769+
QStringLiteral("\n\n---\n\n"));
770+
771+
// Handle images - must be done before links
772+
text.replace(QRegularExpression(QStringLiteral("<img[^>]+src=\"([^\"]+)\"[^>]*alt=\"([^\"]+)\"[^>]*>"),
773+
QRegularExpression::CaseInsensitiveOption),
774+
QStringLiteral("![\\2](\\1)"));
775+
text.replace(QRegularExpression(QStringLiteral("<img[^>]+alt=\"([^\"]+)\"[^>]*src=\"([^\"]+)\"[^>]*>"),
776+
QRegularExpression::CaseInsensitiveOption),
777+
QStringLiteral("![\\1](\\2)"));
778+
text.replace(QRegularExpression(QStringLiteral("<img[^>]+src=\"([^\"]+)\"[^>]*>"),
779+
QRegularExpression::CaseInsensitiveOption),
780+
QStringLiteral("![](\\1)"));
781+
782+
// Handle code blocks with language first
783+
text.replace(QRegularExpression(QStringLiteral("<pre[^>]*><code[^>]+class=\"[^\"]*language-([^\"\\s]+)[^\"]*\"[^>]*>(.+?)<\\/code><\\/pre>"),
784+
QRegularExpression::CaseInsensitiveOption |
785+
QRegularExpression::DotMatchesEverythingOption),
786+
QStringLiteral("\n\n```\\1\n\\2\n```\n\n"));
787+
788+
// Handle pre blocks
789+
text.replace(QRegularExpression(QStringLiteral("<pre.*?>(.+?)<\\/pre>"),
790+
QRegularExpression::CaseInsensitiveOption |
791+
QRegularExpression::DotMatchesEverythingOption),
792+
QStringLiteral("\n\n```\n\\1\n```\n\n"));
793+
794+
// Handle remaining <code> elements as inline code spans (single backticks)
795+
text.replace(QRegularExpression(QStringLiteral("<code.*?>(.+?)<\\/code>"),
796+
QRegularExpression::CaseInsensitiveOption |
797+
QRegularExpression::DotMatchesEverythingOption),
798+
QStringLiteral("`\\1`"));
799+
662800
// replace some html tags with Markdown
663801
text.replace(QRegularExpression(QStringLiteral("<strong.*?>(.+?)<\\/strong>"),
664802
QRegularExpression::CaseInsensitiveOption |
@@ -676,56 +814,137 @@ QString Utils::Misc::htmlToMarkdown(QString text) {
676814
QRegularExpression::CaseInsensitiveOption |
677815
QRegularExpression::DotMatchesEverythingOption),
678816
QStringLiteral("*\\1*"));
679-
text.replace(QRegularExpression(QStringLiteral("<pre.*?>(.+?)<\\/pre>"),
680-
QRegularExpression::CaseInsensitiveOption |
681-
QRegularExpression::DotMatchesEverythingOption),
682-
QStringLiteral("\n```\n\\1\n```\n"));
683-
text.replace(QRegularExpression(QStringLiteral("<code.*?>(.+?)<\\/code>"),
684-
QRegularExpression::CaseInsensitiveOption |
685-
QRegularExpression::DotMatchesEverythingOption),
686-
QStringLiteral("\n```\n\\1\n```\n"));
687817
text.replace(QRegularExpression(QStringLiteral("<h1.*?>(.+?)<\\/h1>"),
688818
QRegularExpression::CaseInsensitiveOption |
689819
QRegularExpression::DotMatchesEverythingOption),
690-
QStringLiteral("\n# \\1\n"));
820+
QStringLiteral("\n\n# \\1\n\n"));
691821
text.replace(QRegularExpression(QStringLiteral("<h2.*?>(.+?)<\\/h2>"),
692822
QRegularExpression::CaseInsensitiveOption |
693823
QRegularExpression::DotMatchesEverythingOption),
694-
QStringLiteral("\n## \\1\n"));
824+
QStringLiteral("\n\n## \\1\n\n"));
695825
text.replace(QRegularExpression(QStringLiteral("<h3.*?>(.+?)<\\/h3>"),
696826
QRegularExpression::CaseInsensitiveOption |
697827
QRegularExpression::DotMatchesEverythingOption),
698-
QStringLiteral("\n### \\1\n"));
828+
QStringLiteral("\n\n### \\1\n\n"));
699829
text.replace(QRegularExpression(QStringLiteral("<h4.*?>(.+?)<\\/h4>"),
700830
QRegularExpression::CaseInsensitiveOption |
701831
QRegularExpression::DotMatchesEverythingOption),
702-
QStringLiteral("\n#### \\1\n"));
832+
QStringLiteral("\n\n#### \\1\n\n"));
703833
text.replace(QRegularExpression(QStringLiteral("<h5.*?>(.+?)<\\/h5>"),
704834
QRegularExpression::CaseInsensitiveOption |
705835
QRegularExpression::DotMatchesEverythingOption),
706-
QStringLiteral("\n##### \\1\n"));
836+
QStringLiteral("\n\n##### \\1\n\n"));
707837
text.replace(QRegularExpression(QStringLiteral("<h6.*?>(.+?)<\\/h6>"),
708838
QRegularExpression::CaseInsensitiveOption |
709839
QRegularExpression::DotMatchesEverythingOption),
710-
QStringLiteral("\n###### \\1\n"));
840+
QStringLiteral("\n\n###### \\1\n\n"));
841+
842+
// Handle ordered lists (only the <ol> wrapper is removed; items become "- " bullets via the <li> rule below)
843+
text.replace(QRegularExpression(QStringLiteral("<ol[^>]*?>(.*?)<\\/ol>"),
844+
QRegularExpression::CaseInsensitiveOption |
845+
QRegularExpression::DotMatchesEverythingOption),
846+
QStringLiteral("\\1"));
847+
848+
// Handle unordered lists
849+
text.replace(QRegularExpression(QStringLiteral("<ul[^>]*?>(.*?)<\\/ul>"),
850+
QRegularExpression::CaseInsensitiveOption |
851+
QRegularExpression::DotMatchesEverythingOption),
852+
QStringLiteral("\\1"));
853+
711854
text.replace(QRegularExpression(QStringLiteral("<li.*?>(.+?)<\\/li>"),
712855
QRegularExpression::CaseInsensitiveOption |
713856
QRegularExpression::DotMatchesEverythingOption),
714857
QStringLiteral("- \\1\n"));
858+
715859
text.replace(
716860
QRegularExpression(QStringLiteral("<br.*?>"), QRegularExpression::CaseInsensitiveOption),
717861
QStringLiteral("\n"));
862+
863+
// Handle links (must be after images)
718864
text.replace(QRegularExpression(QStringLiteral("<a[^>]+href=\"(.+?)\".*?>(.+?)<\\/a>"),
719865
QRegularExpression::CaseInsensitiveOption |
720866
QRegularExpression::DotMatchesEverythingOption),
721867
QStringLiteral("[\\2](\\1)"));
868+
869+
// Handle div and span (just extract content)
870+
text.replace(QRegularExpression(QStringLiteral("<div.*?>"), QRegularExpression::CaseInsensitiveOption),
871+
QStringLiteral("\n"));
872+
text.replace(QRegularExpression(QStringLiteral("<\\/div>"), QRegularExpression::CaseInsensitiveOption),
873+
QStringLiteral("\n"));
874+
text.replace(QRegularExpression(QStringLiteral("<span.*?>"), QRegularExpression::CaseInsensitiveOption),
875+
QStringLiteral(""));
876+
text.replace(QRegularExpression(QStringLiteral("<\\/span>"), QRegularExpression::CaseInsensitiveOption),
877+
QStringLiteral(""));
878+
879+
// Handle paragraphs
722880
text.replace(QRegularExpression(QStringLiteral("<p.*?>(.+?)</p>"),
723881
QRegularExpression::CaseInsensitiveOption |
724882
QRegularExpression::DotMatchesEverythingOption),
725883
QStringLiteral("\n\n\\1\n\n"));
726884

885+
// Remove any remaining HTML tags
886+
text.remove(QRegularExpression(QStringLiteral("<[^>]+>"),
887+
QRegularExpression::CaseInsensitiveOption));
888+
889+
// Decode HTML entities manually to preserve linebreaks
890+
// Common HTML entities
891+
text.replace(QStringLiteral("&amp;"), QStringLiteral("&"));
892+
text.replace(QStringLiteral("&lt;"), QStringLiteral("<"));
893+
text.replace(QStringLiteral("&gt;"), QStringLiteral(">"));
894+
text.replace(QStringLiteral("&quot;"), QStringLiteral("\""));
895+
text.replace(QStringLiteral("&apos;"), QStringLiteral("'"));
896+
text.replace(QStringLiteral("&#39;"), QStringLiteral("'"));
897+
text.replace(QStringLiteral("&nbsp;"), QStringLiteral(" "));
898+
899+
// Decode decimal numeric HTML entities (&#NNN;)
900+
QRegularExpression numericEntityRe(QStringLiteral("&#(\\d+);"));
901+
QRegularExpressionMatchIterator it = numericEntityRe.globalMatch(text);
902+
QList<QPair<QString, QString>> replacements;
903+
904+
while (it.hasNext()) {
905+
QRegularExpressionMatch match = it.next();
906+
bool ok;
907+
int charCode = match.captured(1).toInt(&ok);
908+
if (ok && charCode > 0) {
909+
QString entity = match.captured(0);
910+
QString replacement = QString(QChar(charCode));
911+
replacements.append(qMakePair(entity, replacement));
912+
}
913+
}
914+
915+
for (const auto &pair : replacements) {
916+
text.replace(pair.first, pair.second);
917+
}
918+
919+
// Decode hex numeric HTML entities (&#xHH;)
920+
QRegularExpression hexEntityRe(QStringLiteral("&#x([0-9A-Fa-f]+);"));
921+
it = hexEntityRe.globalMatch(text);
922+
replacements.clear();
923+
924+
while (it.hasNext()) {
925+
QRegularExpressionMatch match = it.next();
926+
bool ok;
927+
int charCode = match.captured(1).toInt(&ok, 16);
928+
if (ok && charCode > 0) {
929+
QString entity = match.captured(0);
930+
QString replacement = QString(QChar(charCode));
931+
replacements.append(qMakePair(entity, replacement));
932+
}
933+
}
934+
935+
for (const auto &pair : replacements) {
936+
text.replace(pair.first, pair.second);
937+
}
938+
939+
// Clean up excessive whitespace
940+
text.replace(QRegularExpression(QStringLiteral("[ \\t]+\n")), QStringLiteral("\n"));
941+
text.replace(QRegularExpression(QStringLiteral("\n[ \\t]+")), QStringLiteral("\n"));
942+
727943
// replace multiple line breaks
728-
text.replace(QRegularExpression(QStringLiteral("\n\n+")), QStringLiteral("\n\n"));
944+
text.replace(QRegularExpression(QStringLiteral("\n\n\n+")), QStringLiteral("\n\n"));
945+
946+
// Trim leading/trailing whitespace
947+
text = text.trimmed();
729948

730949
return text;
731950
}

tests/unit_tests/testcases/app/test_utilsmisc.cpp

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,20 +103,83 @@ void TestUtilsMisc::testHtmlToMarkdown() {
103103
html += "<h1>Heading <em>italic</em></h1>";
104104
html += "<h2>Heading <strong>italic</strong></h2>";
105105
html += "<h3>Heading <em>italic</em></h3>";
106-
html += "<h4>Heading <em>italic</em></h4";
106+
html += "<h4>Heading <em>italic</em></h4>";
107107
html += "<h5>Heading <em>italic</em></h5>";
108108
html += "<h6>Heading <em>italic</em></h6>";
109109
html += "<code>hello</code>";
110110
html += "<i>hello</i>";
111111

112112
QString result = htmlToMarkdown(html);
113-
QString expected =
114-
"\n# Heading *italic*\n\n## Heading **italic**\n\n### Heading "
115-
"*italic*\n<h4>Heading *italic*</h4\n##### Heading *italic*\n\n###### "
116-
"Heading *italic*\n\n```\nhello\n```\n*hello*";
117-
QVERIFY(result == expected);
113+
114+
// Check that all expected elements are present
115+
QVERIFY(result.contains("# Heading *italic*"));
116+
QVERIFY(result.contains("## Heading **italic**"));
117+
QVERIFY(result.contains("### Heading *italic*"));
118+
QVERIFY(result.contains("#### Heading *italic*"));
119+
QVERIFY(result.contains("##### Heading *italic*"));
120+
QVERIFY(result.contains("###### Heading *italic*"));
121+
QVERIFY(result.contains("hello"));
122+
QVERIFY(!result.contains("<script>"));
123+
QVERIFY(!result.contains("<style>"));
124+
QVERIFY(!result.contains("<head>"));
125+
}
126+
127+
// Checks that htmlToMarkdown() turns a simple three-column HTML table
// (one <th> header row followed by two <td> rows) into pipe-delimited
// Markdown rows with a "| --- |" separator row.
// Only substring presence is asserted; row order and surrounding whitespace
// are not verified.
void TestUtilsMisc::testHtmlToMarkdownTables() {
128+
QString html = R"(
129+
<table>
130+
<tr>
131+
<th>Name</th>
132+
<th>Age</th>
133+
<th>City</th>
134+
</tr>
135+
<tr>
136+
<td>John</td>
137+
<td>30</td>
138+
<td>New York</td>
139+
</tr>
140+
<tr>
141+
<td>Jane</td>
142+
<td>25</td>
143+
<td>London</td>
144+
</tr>
145+
</table>
146+
)";
147+
148+
// Run the conversion on the raw HTML fixture above
QString result = htmlToMarkdown(html);
149+
150+
// Check for table structure: header row, separator row, then one line per data row
151+
QVERIFY(result.contains("| Name | Age | City |"));
152+
QVERIFY(result.contains("| --- | --- | --- |"));
153+
QVERIFY(result.contains("| John | 30 | New York |"));
154+
QVERIFY(result.contains("| Jane | 25 | London |"));
155+
}
156+
157+
// Checks that htmlToMarkdown() handles table cells whose text contains
// literal pipe characters ("|" and "||"): the header row must still be
// rendered and the output must contain an escaped pipe.
void TestUtilsMisc::testHtmlToMarkdownTableSpecialChars() {
158+
QString html = R"(
159+
<table>
160+
<tr>
161+
<th>Operator</th>
162+
<th>Description</th>
163+
</tr>
164+
<tr>
165+
<td>|</td>
166+
<td>Pipe operator</td>
167+
</tr>
168+
<tr>
169+
<td>||</td>
170+
<td>Logical OR</td>
171+
</tr>
172+
</table>
173+
)";
174+
175+
// Run the conversion on the raw HTML fixture above
QString result = htmlToMarkdown(html);
176+
177+
// Pipes should be escaped in table cells
// NOTE: the second check only asserts that a backslash-escaped pipe appears
// somewhere in the output; it does not pin which cell produced it.
178+
QVERIFY(result.contains("| Operator | Description |"));
179+
QVERIFY(result.contains("\\|"));
118180
}
119181

182+
120183
void TestUtilsMisc::testParseTaskList() {
121184
const auto listTag = QStringLiteral("<li style=\"list-style-type:square\">");
122185
const QString &t1 = "<li> [ ] task 1</li>";

tests/unit_tests/testcases/app/test_utilsmisc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ class TestUtilsMisc : public QObject {
1515
void testShorten();
1616
void testCycleTextCase();
1717
void testHtmlToMarkdown();
18+
void testHtmlToMarkdownTables();
19+
void testHtmlToMarkdownTableSpecialChars();
1820
void testParseTaskList();
1921
void testUnescapeHtml();
2022
void testHtmlSpecialChars();

0 commit comments

Comments
 (0)