@@ -659,6 +659,144 @@ QString Utils::Misc::htmlToMarkdown(QString text) {
659659 QRegularExpression::CaseInsensitiveOption |
660660 QRegularExpression::DotMatchesEverythingOption));
661661
662+ // Handle tables - must be done before other replacements
663+ // Convert HTML tables to Markdown tables
664+ QRegularExpression tableRe (QStringLiteral (" <table[^>]*?>(.*?)<\\ /table>" ),
665+ QRegularExpression::CaseInsensitiveOption |
666+ QRegularExpression::DotMatchesEverythingOption);
667+ QRegularExpressionMatchIterator tableIt = tableRe.globalMatch (text);
668+
669+ while (tableIt.hasNext ()) {
670+ QRegularExpressionMatch tableMatch = tableIt.next ();
671+ QString tableHtml = tableMatch.captured (1 );
672+ QString markdownTable;
673+
674+ // Extract rows (handle both thead/tbody and plain tr)
675+ QRegularExpression rowRe (QStringLiteral (" <tr[^>]*?>(.*?)<\\ /tr>" ),
676+ QRegularExpression::CaseInsensitiveOption |
677+ QRegularExpression::DotMatchesEverythingOption);
678+ QRegularExpressionMatchIterator rowIt = rowRe.globalMatch (tableHtml);
679+
680+ QStringList rows;
681+ bool isFirstRow = true ;
682+ int columnCount = 0 ;
683+
684+ while (rowIt.hasNext ()) {
685+ QRegularExpressionMatch rowMatch = rowIt.next ();
686+ QString rowHtml = rowMatch.captured (1 );
687+
688+ // Extract cells (th or td)
689+ QRegularExpression cellRe (QStringLiteral (" <(th|td)[^>]*?>(.*?)<\\ /\\ 1>" ),
690+ QRegularExpression::CaseInsensitiveOption |
691+ QRegularExpression::DotMatchesEverythingOption);
692+ QRegularExpressionMatchIterator cellIt = cellRe.globalMatch (rowHtml);
693+
694+ QStringList cells;
695+ while (cellIt.hasNext ()) {
696+ QRegularExpressionMatch cellMatch = cellIt.next ();
697+ QString cellContent = cellMatch.captured (2 );
698+
699+ // Remove inner HTML tags from cell content
700+ cellContent.remove (QRegularExpression (QStringLiteral (" <[^>]+>" ),
701+ QRegularExpression::CaseInsensitiveOption));
702+ cellContent = cellContent.trimmed ();
703+
704+ // Escape pipe characters in cell content
705+ cellContent.replace (QStringLiteral (" |" ), QStringLiteral (" \\ |" ));
706+
707+ cells.append (cellContent);
708+ }
709+
710+ if (!cells.isEmpty ()) {
711+ columnCount = qMax (columnCount, cells.count ());
712+ rows.append (QStringLiteral (" | " ) + cells.join (QStringLiteral (" | " )) + QStringLiteral (" |" ));
713+
714+ // Add separator row after first row (header)
715+ if (isFirstRow) {
716+ QStringList separators;
717+ for (int i = 0 ; i < cells.count (); ++i) {
718+ separators.append (QStringLiteral (" ---" ));
719+ }
720+ rows.append (QStringLiteral (" | " ) + separators.join (QStringLiteral (" | " )) + QStringLiteral (" |" ));
721+ isFirstRow = false ;
722+ }
723+ }
724+ }
725+
726+ if (!rows.isEmpty ()) {
727+ markdownTable = QStringLiteral (" \n\n " ) + rows.join (QStringLiteral (" \n " )) + QStringLiteral (" \n\n " );
728+ text.replace (tableMatch.captured (0 ), markdownTable);
729+ }
730+ }
731+
732+ // Handle strikethrough
733+ text.replace (QRegularExpression (QStringLiteral (" <(s|strike|del).*?>(.+?)<\\ /\\ 1>" ),
734+ QRegularExpression::CaseInsensitiveOption |
735+ QRegularExpression::DotMatchesEverythingOption),
736+ QStringLiteral (" ~~\\ 2~~" ));
737+
738+ // Handle underline (convert to emphasis since Markdown doesn't have native underline)
739+ text.replace (QRegularExpression (QStringLiteral (" <u.*?>(.+?)<\\ /u>" ),
740+ QRegularExpression::CaseInsensitiveOption |
741+ QRegularExpression::DotMatchesEverythingOption),
742+ QStringLiteral (" _\\ 1_" ));
743+
744+ // Handle mark/highlight
745+ text.replace (QRegularExpression (QStringLiteral (" <mark.*?>(.+?)<\\ /mark>" ),
746+ QRegularExpression::CaseInsensitiveOption |
747+ QRegularExpression::DotMatchesEverythingOption),
748+ QStringLiteral (" ==\\ 1==" ));
749+
750+ // Handle subscript and superscript
751+ text.replace (QRegularExpression (QStringLiteral (" <sub.*?>(.+?)<\\ /sub>" ),
752+ QRegularExpression::CaseInsensitiveOption |
753+ QRegularExpression::DotMatchesEverythingOption),
754+ QStringLiteral (" ~\\ 1~" ));
755+ text.replace (QRegularExpression (QStringLiteral (" <sup.*?>(.+?)<\\ /sup>" ),
756+ QRegularExpression::CaseInsensitiveOption |
757+ QRegularExpression::DotMatchesEverythingOption),
758+ QStringLiteral (" ^\\ 1^" ));
759+
760+ // Handle blockquote
761+ text.replace (QRegularExpression (QStringLiteral (" <blockquote.*?>(.+?)<\\ /blockquote>" ),
762+ QRegularExpression::CaseInsensitiveOption |
763+ QRegularExpression::DotMatchesEverythingOption),
764+ QStringLiteral (" \n\n > \\ 1\n\n " ));
765+
766+ // Handle horizontal rule
767+ text.replace (QRegularExpression (QStringLiteral (" <hr.*?/?>" ),
768+ QRegularExpression::CaseInsensitiveOption),
769+ QStringLiteral (" \n\n ---\n\n " ));
770+
771+ // Handle images - must be done before links
772+ text.replace (QRegularExpression (QStringLiteral (" <img[^>]+src=\" ([^\" ]+)\" [^>]*alt=\" ([^\" ]+)\" [^>]*>" ),
773+ QRegularExpression::CaseInsensitiveOption),
774+ QStringLiteral (" " ));
775+ text.replace (QRegularExpression (QStringLiteral (" <img[^>]+alt=\" ([^\" ]+)\" [^>]*src=\" ([^\" ]+)\" [^>]*>" ),
776+ QRegularExpression::CaseInsensitiveOption),
777+ QStringLiteral (" " ));
778+ text.replace (QRegularExpression (QStringLiteral (" <img[^>]+src=\" ([^\" ]+)\" [^>]*>" ),
779+ QRegularExpression::CaseInsensitiveOption),
780+ QStringLiteral (" " ));
781+
782+ // Handle code blocks with language first
783+ text.replace (QRegularExpression (QStringLiteral (" <pre[^>]*><code[^>]+class=\" [^\" ]*language-([^\"\\ s]+)[^\" ]*\" [^>]*>(.+?)<\\ /code><\\ /pre>" ),
784+ QRegularExpression::CaseInsensitiveOption |
785+ QRegularExpression::DotMatchesEverythingOption),
786+ QStringLiteral (" \n\n ```\\ 1\n\\ 2\n ```\n\n " ));
787+
788+ // Handle pre blocks
789+ text.replace (QRegularExpression (QStringLiteral (" <pre.*?>(.+?)<\\ /pre>" ),
790+ QRegularExpression::CaseInsensitiveOption |
791+ QRegularExpression::DotMatchesEverythingOption),
792+ QStringLiteral (" \n\n ```\n\\ 1\n ```\n\n " ));
793+
794+ // Handle standalone code blocks
795+ text.replace (QRegularExpression (QStringLiteral (" <code.*?>(.+?)<\\ /code>" ),
796+ QRegularExpression::CaseInsensitiveOption |
797+ QRegularExpression::DotMatchesEverythingOption),
798+ QStringLiteral (" `\\ 1`" ));
799+
662800 // replace some html tags with Markdown
663801 text.replace (QRegularExpression (QStringLiteral (" <strong.*?>(.+?)<\\ /strong>" ),
664802 QRegularExpression::CaseInsensitiveOption |
@@ -676,56 +814,137 @@ QString Utils::Misc::htmlToMarkdown(QString text) {
676814 QRegularExpression::CaseInsensitiveOption |
677815 QRegularExpression::DotMatchesEverythingOption),
678816 QStringLiteral (" *\\ 1*" ));
679- text.replace (QRegularExpression (QStringLiteral (" <pre.*?>(.+?)<\\ /pre>" ),
680- QRegularExpression::CaseInsensitiveOption |
681- QRegularExpression::DotMatchesEverythingOption),
682- QStringLiteral (" \n ```\n\\ 1\n ```\n " ));
683- text.replace (QRegularExpression (QStringLiteral (" <code.*?>(.+?)<\\ /code>" ),
684- QRegularExpression::CaseInsensitiveOption |
685- QRegularExpression::DotMatchesEverythingOption),
686- QStringLiteral (" \n ```\n\\ 1\n ```\n " ));
687817 text.replace (QRegularExpression (QStringLiteral (" <h1.*?>(.+?)<\\ /h1>" ),
688818 QRegularExpression::CaseInsensitiveOption |
689819 QRegularExpression::DotMatchesEverythingOption),
690- QStringLiteral (" \n # \\ 1\n " ));
820+ QStringLiteral (" \n\n # \\ 1\n \n" ));
691821 text.replace (QRegularExpression (QStringLiteral (" <h2.*?>(.+?)<\\ /h2>" ),
692822 QRegularExpression::CaseInsensitiveOption |
693823 QRegularExpression::DotMatchesEverythingOption),
694- QStringLiteral (" \n ## \\ 1\n " ));
824+ QStringLiteral (" \n\n ## \\ 1\n \n" ));
695825 text.replace (QRegularExpression (QStringLiteral (" <h3.*?>(.+?)<\\ /h3>" ),
696826 QRegularExpression::CaseInsensitiveOption |
697827 QRegularExpression::DotMatchesEverythingOption),
698- QStringLiteral (" \n ### \\ 1\n " ));
828+ QStringLiteral (" \n\n ### \\ 1\n \n" ));
699829 text.replace (QRegularExpression (QStringLiteral (" <h4.*?>(.+?)<\\ /h4>" ),
700830 QRegularExpression::CaseInsensitiveOption |
701831 QRegularExpression::DotMatchesEverythingOption),
702- QStringLiteral (" \n #### \\ 1\n " ));
832+ QStringLiteral (" \n\n #### \\ 1\n \n" ));
703833 text.replace (QRegularExpression (QStringLiteral (" <h5.*?>(.+?)<\\ /h5>" ),
704834 QRegularExpression::CaseInsensitiveOption |
705835 QRegularExpression::DotMatchesEverythingOption),
706- QStringLiteral (" \n ##### \\ 1\n " ));
836+ QStringLiteral (" \n\n ##### \\ 1\n \n" ));
707837 text.replace (QRegularExpression (QStringLiteral (" <h6.*?>(.+?)<\\ /h6>" ),
708838 QRegularExpression::CaseInsensitiveOption |
709839 QRegularExpression::DotMatchesEverythingOption),
710- QStringLiteral (" \n ###### \\ 1\n " ));
840+ QStringLiteral (" \n\n ###### \\ 1\n\n " ));
841+
842+ // Handle ordered lists
843+ text.replace (QRegularExpression (QStringLiteral (" <ol[^>]*?>(.*?)<\\ /ol>" ),
844+ QRegularExpression::CaseInsensitiveOption |
845+ QRegularExpression::DotMatchesEverythingOption),
846+ QStringLiteral (" \\ 1" ));
847+
848+ // Handle unordered lists
849+ text.replace (QRegularExpression (QStringLiteral (" <ul[^>]*?>(.*?)<\\ /ul>" ),
850+ QRegularExpression::CaseInsensitiveOption |
851+ QRegularExpression::DotMatchesEverythingOption),
852+ QStringLiteral (" \\ 1" ));
853+
711854 text.replace (QRegularExpression (QStringLiteral (" <li.*?>(.+?)<\\ /li>" ),
712855 QRegularExpression::CaseInsensitiveOption |
713856 QRegularExpression::DotMatchesEverythingOption),
714857 QStringLiteral (" - \\ 1\n " ));
858+
715859 text.replace (
716860 QRegularExpression (QStringLiteral (" <br.*?>" ), QRegularExpression::CaseInsensitiveOption),
717861 QStringLiteral (" \n " ));
862+
863+ // Handle links (must be after images)
718864 text.replace (QRegularExpression (QStringLiteral (" <a[^>]+href=\" (.+?)\" .*?>(.+?)<\\ /a>" ),
719865 QRegularExpression::CaseInsensitiveOption |
720866 QRegularExpression::DotMatchesEverythingOption),
721867 QStringLiteral (" [\\ 2](\\ 1)" ));
868+
869+ // Handle div and span (just extract content)
870+ text.replace (QRegularExpression (QStringLiteral (" <div.*?>" ), QRegularExpression::CaseInsensitiveOption),
871+ QStringLiteral (" \n " ));
872+ text.replace (QRegularExpression (QStringLiteral (" <\\ /div>" ), QRegularExpression::CaseInsensitiveOption),
873+ QStringLiteral (" \n " ));
874+ text.replace (QRegularExpression (QStringLiteral (" <span.*?>" ), QRegularExpression::CaseInsensitiveOption),
875+ QStringLiteral (" " ));
876+ text.replace (QRegularExpression (QStringLiteral (" <\\ /span>" ), QRegularExpression::CaseInsensitiveOption),
877+ QStringLiteral (" " ));
878+
879+ // Handle paragraphs
722880 text.replace (QRegularExpression (QStringLiteral (" <p.*?>(.+?)</p>" ),
723881 QRegularExpression::CaseInsensitiveOption |
724882 QRegularExpression::DotMatchesEverythingOption),
725883 QStringLiteral (" \n\n\\ 1\n\n " ));
726884
885+ // Remove any remaining HTML tags
886+ text.remove (QRegularExpression (QStringLiteral (" <[^>]+>" ),
887+ QRegularExpression::CaseInsensitiveOption));
888+
889+ // Decode HTML entities manually to preserve linebreaks
890+ // Common HTML entities
891+ text.replace (QStringLiteral (" &" ), QStringLiteral (" &" ));
892+ text.replace (QStringLiteral (" <" ), QStringLiteral (" <" ));
893+ text.replace (QStringLiteral (" >" ), QStringLiteral (" >" ));
894+ text.replace (QStringLiteral (" "" ), QStringLiteral (" \" " ));
895+ text.replace (QStringLiteral (" '" ), QStringLiteral (" '" ));
896+ text.replace (QStringLiteral (" '" ), QStringLiteral (" '" ));
897+ text.replace (QStringLiteral (" " ), QStringLiteral (" " ));
898+
899+ // Decode decimal numeric HTML entities (&#DDD;); hex entities (&#xHH;) are decoded in a separate step
900+ QRegularExpression numericEntityRe (QStringLiteral (" &#(\\ d+);" ));
901+ QRegularExpressionMatchIterator it = numericEntityRe.globalMatch (text);
902+ QList<QPair<QString, QString>> replacements;
903+
904+ while (it.hasNext ()) {
905+ QRegularExpressionMatch match = it.next ();
906+ bool ok;
907+ int charCode = match.captured (1 ).toInt (&ok);
908+ if (ok && charCode > 0 ) {
909+ QString entity = match.captured (0 );
910+ QString replacement = QString (QChar (charCode));
911+ replacements.append (qMakePair (entity, replacement));
912+ }
913+ }
914+
915+ for (const auto &pair : replacements) {
916+ text.replace (pair.first , pair.second );
917+ }
918+
919+ // Decode hex numeric HTML entities (&#xHH;)
920+ QRegularExpression hexEntityRe (QStringLiteral (" &#x([0-9A-Fa-f]+);" ));
921+ it = hexEntityRe.globalMatch (text);
922+ replacements.clear ();
923+
924+ while (it.hasNext ()) {
925+ QRegularExpressionMatch match = it.next ();
926+ bool ok;
927+ int charCode = match.captured (1 ).toInt (&ok, 16 );
928+ if (ok && charCode > 0 ) {
929+ QString entity = match.captured (0 );
930+ QString replacement = QString (QChar (charCode));
931+ replacements.append (qMakePair (entity, replacement));
932+ }
933+ }
934+
935+ for (const auto &pair : replacements) {
936+ text.replace (pair.first , pair.second );
937+ }
938+
939+ // Clean up excessive whitespace
940+ text.replace (QRegularExpression (QStringLiteral (" [ \\ t]+\n " )), QStringLiteral (" \n " ));
941+ text.replace (QRegularExpression (QStringLiteral (" \n [ \\ t]+" )), QStringLiteral (" \n " ));
942+
727943 // replace multiple line breaks
728- text.replace (QRegularExpression (QStringLiteral (" \n\n +" )), QStringLiteral (" \n\n " ));
944+ text.replace (QRegularExpression (QStringLiteral (" \n\n\n +" )), QStringLiteral (" \n\n " ));
945+
946+ // Trim leading/trailing whitespace
947+ text = text.trimmed ();
729948
730949 return text;
731950}
0 commit comments