add markdown preprocessing to handle code blocks and code spans correctly

tcottin · tcottin · commit 078294bf08f6 · 2025-09-15T11:42:02.000Z
diff --git a/clang-tools-extra/clangd/SymbolDocumentation.cpp b/clang-tools-extra/clangd/SymbolDocumentation.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/CommentCommandTraits.h"
 #include "clang/AST/CommentVisitor.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace clang {
@@ -249,7 +250,53 @@ class BlockCommentToMarkupDocument
     }
   }
 
+  /// Emits a \\code verbatim block as a markup code block; the language is
+  /// taken from a leading "{lang}" or "{.lang}" line, if there is one.
+  void visitCodeCommand(const comments::VerbatimBlockComment *VB) {
+    std::string CodeLang;
+    auto *FirstLine = VB->child_begin();
+    // The \\code command has an optional language argument which is currently
+    // not parsed by the clang doxygen parser. Therefore we try to extract it
+    // from the first line of the verbatim block.
+    if (VB->getNumLines() > 0) {
+      // dyn_cast, not cast: cast<> asserts and never returns null.
+      if (const auto *Line =
+              dyn_cast<comments::VerbatimBlockLineComment>(*FirstLine)) {
+        llvm::StringRef Text = Line->getText();
+        // Language is a single word enclosed in {}.
+        if (llvm::none_of(Text, llvm::isSpace) && Text.consume_front("{") &&
+            Text.consume_back("}")) {
+          // drop a potential . since this is not supported in Markdown
+          // fenced code blocks.
+          Text.consume_front(".");
+          // Language is alphanumeric or '+'.
+          CodeLang = Text.take_while([](char C) {
+                           return llvm::isAlnum(C) || C == '+';
+                         })
+                         .str();
+          // Skip the first line for the verbatim text.
+          ++FirstLine;
+        }
+      }
+    }
+
+    std::string CodeBlockText;
+    for (const auto *LI = FirstLine; LI != VB->child_end(); ++LI) {
+      if (const auto *VL = dyn_cast<comments::VerbatimBlockLineComment>(*LI)) {
+        CodeBlockText += VL->getText().str() + "\n";
+      }
+    }
+    Out.addCodeBlock(CodeBlockText, CodeLang);
+  }
+
   void visitVerbatimBlockComment(const comments::VerbatimBlockComment *VB) {
+    // The \\code command is a special verbatim block command which we handle
+    // separately.
+    if (VB->getCommandID() == comments::CommandTraits::KCI_code) {
+      visitCodeCommand(VB);
+      return;
+    }
+
     commandToMarkup(Out.addParagraph(), VB->getCommandName(Traits),
                     VB->getCommandMarker(), "");
 
@@ -292,6 +339,110 @@ class BlockCommentToMarkupDocument
   }
 };
 
+void SymbolDocCommentVisitor::preprocessDocumentation(StringRef Doc) {
+  enum State {
+    Normal,
+    FencedCodeblock,
+  } State = Normal;
+  std::string CodeFence;
+
+  llvm::raw_string_ostream OS(CommentWithMarkers);
+
+  // The documentation string is processed line by line.
+  // The raw documentation string does not contain the comment markers
+  // (e.g. /// or /** */).
+  // But the comment lexer expects doxygen markers, so add them back.
+  // We need to use the /// style doxygen markers because the comment could
+  // contain the closing tag "*/" of a C Style "/** */" comment
+  // which would break the parsing if we would just enclose the comment text
+  // with "/** */".
+
+  // Escape doxygen commands inside markdown inline code spans.
+  // This is required to not let the doxygen parser interpret them as
+  // commands.
+  // Note: This is a heuristic which may fail in some cases.
+  bool InCodeSpan = false;
+
+  llvm::StringRef Line, Rest;
+  for (std::tie(Line, Rest) = Doc.split('\n'); !(Line.empty() && Rest.empty());
+       std::tie(Line, Rest) = Rest.split('\n')) {
+
+    // Detect code fence (``` or ~~~)
+    if (State == Normal) {
+      llvm::StringRef Trimmed = Line.ltrim();
+      if (Trimmed.starts_with("```") || Trimmed.starts_with("~~~")) {
+        // https://www.doxygen.nl/manual/markdown.html#md_fenced
+        CodeFence =
+            Trimmed.take_while([](char C) { return C == '`' || C == '~'; })
+                .str();
+        // Try to detect language: first word after fence. Could also be
+        // enclosed in {}
+        llvm::StringRef AfterFence =
+            Trimmed.drop_front(CodeFence.size()).ltrim();
+        // ignore '{' at the beginning of the language name to not duplicate it
+        // for the doxygen command
+        AfterFence.consume_front("{");
+        // The name is alphanumeric or '.' or '+'
+        StringRef CodeLang = AfterFence.take_while(
+            [](char C) { return llvm::isAlnum(C) || C == '.' || C == '+'; });
+
+        OS << "///@code";
+        if (!CodeLang.empty())
+          OS << "{" << CodeLang << "}";
+        OS << "\n";
+        State = FencedCodeblock;
+        continue;
+      }
+
+      // FIXME: handle indented code blocks too?
+      // In doxygen, the indentation which triggers a code block depends on the
+      // indentation of the previous paragraph.
+      // https://www.doxygen.nl/manual/markdown.html#mddox_code_blocks
+    } else if (State == FencedCodeblock) {
+      // End of code fence
+      if (Line.ltrim().starts_with(CodeFence)) {
+        OS << "///@endcode\n";
+        State = Normal;
+        continue;
+      }
+      // A leading '<' directly after the marker would be lexed as part of a
+      // '///<' doxygen marker and dropped from the code, so separate it with
+      // a space (same workaround as for normal lines below).
+      OS << "///" << (Line.starts_with("<") ? " " : "") << Line << "\n";
+      continue;
+    }
+
+    // Normal line preprocessing (add doxygen markers, handle escaping)
+    OS << "///";
+
+    if (Line.trim().empty()) {
+      OS << "\n";
+      // Empty lines reset the InCodeSpan state.
+      InCodeSpan = false;
+      continue;
+    }
+
+    if (Line.starts_with("<"))
+      // A comment line starting with '///<' is treated as a doxygen
+      // command. To avoid this, we add a space before the '<'.
+      OS << ' ';
+
+    for (char C : Line) {
+      if (C == '`')
+        InCodeSpan = !InCodeSpan;
+      else if (InCodeSpan && (C == '@' || C == '\\'))
+        OS << '\\';
+      OS << C;
+    }
+
+    OS << "\n";
+  }
+
+  // Close any unclosed code block
+  if (State == FencedCodeblock)
+    OS << "///@endcode\n";
+}
+
 void SymbolDocCommentVisitor::visitBlockCommandComment(
     const comments::BlockCommandComment *B) {
   switch (B->getCommandID()) {
diff --git a/clang-tools-extra/clangd/SymbolDocumentation.h b/clang-tools-extra/clangd/SymbolDocumentation.h
@@ -21,6 +21,7 @@
 #include "clang/AST/CommentSema.h"
 #include "clang/AST/CommentVisitor.h"
 #include "clang/Basic/SourceManager.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/raw_ostream.h"
 #include <string>
 
@@ -51,31 +52,8 @@ class SymbolDocCommentVisitor
     CommentWithMarkers.reserve(Documentation.size() +
                                Documentation.count('\n') * 3);
 
-    // The comment lexer expects doxygen markers, so add them back.
-    // We need to use the /// style doxygen markers because the comment could
-    // contain the closing the closing tag "*/" of a C Style "/** */" comment
-    // which would break the parsing if we would just enclose the comment text
-    // with "/** */".
-    CommentWithMarkers = "///";
-    bool NewLine = true;
-    for (char C : Documentation) {
-      if (C == '\n') {
-        CommentWithMarkers += "\n///";
-        NewLine = true;
-      } else {
-        if (NewLine && (C == '<')) {
-          // A comment line starting with '///<' is treated as a doxygen
-          // comment. Therefore add a space to separate the '<' from the comment
-          // marker. This allows to parse html tags at the beginning of a line
-          // and the escape marker prevents adding the artificial space in the
-          // markup documentation. The extra space will not be rendered, since
-          // we render it as markdown.
-          CommentWithMarkers += ' ';
-        }
-        CommentWithMarkers += C;
-        NewLine = false;
-      }
-    }
+    preprocessDocumentation(Documentation);
+
     SourceManagerForFile SourceMgrForFile("mock_file.cpp", CommentWithMarkers);
 
     SourceManager &SourceMgr = SourceMgrForFile.get();
@@ -149,6 +127,27 @@ class SymbolDocCommentVisitor
     TemplateParameters[TP->getParamNameAsWritten()] = std::move(TP);
   }
 
+  /// \brief Preprocesses the raw documentation string to prepare it for doxygen
+  /// parsing.
+  ///
+  /// This is a workaround to provide better support for markdown in
+  /// doxygen. Clang's doxygen parser e.g. does not handle markdown code blocks.
+  ///
+  /// The documentation string is preprocessed to replace some markdown
+  /// constructs with parsable doxygen commands. E.g. markdown code blocks are
+  /// replaced with doxygen \\code{.lang} ... \\endcode blocks.
+  ///
+  /// Additionally, potential doxygen commands inside markdown
+  /// inline code spans are escaped so that doxygen does not interpret
+  /// them as commands.
+  ///
+  /// \note Although this is a workaround, it is very similar to what
+  /// doxygen itself does for markdown. In doxygen, the first parsing step is
+  /// also a markdown preprocessing step.
+  /// See https://www.doxygen.nl/manual/markdown.html
+  /// \param Doc Raw documentation text, without the comment markers.
+  void preprocessDocumentation(StringRef Doc);
+
 private:
   comments::CommandTraits Traits;
   llvm::BumpPtrAllocator Allocator;
diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp
@@ -199,10 +199,16 @@ bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
   return needsLeadingEscapeMarkdown(C, After);
 }
 
-/// Escape a markdown text block.
+/// \brief Render text for markdown output.
+///
 /// If \p EscapeMarkdown is true it ensures the punctuation will not introduce
 /// any of the markdown constructs.
+///
 /// Else, markdown syntax is not escaped, only HTML tags and entities.
+/// HTML is escaped because usually clients do not support HTML rendering by
+/// default. Passing unescaped HTML will therefore often result in not showing
+/// the HTML at all.
+/// \note In markdown code spans, we do not escape anything.
 std::string renderText(llvm::StringRef Input, bool StartsLine,
                        bool EscapeMarkdown) {
   std::string R;
@@ -213,6 +219,10 @@ std::string renderText(llvm::StringRef Input, bool StartsLine,
 
   bool IsFirstLine = true;
 
+  // Inside markdown code spans, we do not escape anything when EscapeMarkdown
+  // is false.
+  bool InCodeSpan = false;
+
   for (std::tie(Line, Rest) = Input.split('\n');
        !(Line.empty() && Rest.empty());
        std::tie(Line, Rest) = Rest.split('\n')) {
@@ -226,12 +236,27 @@ std::string renderText(llvm::StringRef Input, bool StartsLine,
       R.append(LeadingSpaces);
     }
 
+    // Handle the case where the user escaped a character themselves.
+    // This is relevant for markdown code spans if EscapeMarkdown is false,
+    // because if the user escaped a backtick, we must treat the enclosed text
+    // as normal markdown text.
+    bool UserEscape = false;
     for (unsigned I = LeadingSpaces.size(); I < Line.size(); ++I) {
-      if (needsLeadingEscape(Line[I], Line.substr(LeadingSpaces.size(), I),
+
+      if (!EscapeMarkdown && !UserEscape && Line[I] == '`')
+        InCodeSpan = !InCodeSpan;
+
+      if (!InCodeSpan &&
+          needsLeadingEscape(Line[I], Line.substr(LeadingSpaces.size(), I),
                              Line.substr(I + 1), StartsLineIntern,
                              EscapeMarkdown))
         R.push_back('\\');
       R.push_back(Line[I]);
+
+      if (Line[I] == '\\')
+        UserEscape = !UserEscape;
+      else
+        UserEscape = false;
     }
 
     IsFirstLine = false;
diff --git a/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp b/clang-tools-extra/clangd/unittests/SymbolDocumentationTests.cpp