Skip to content
Merged
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 113 additions & 11 deletions src/Elastic.Markdown/Myst/MarkdownParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
// See the LICENSE file in the project root for more information

using System.IO.Abstractions;
using System.Text.RegularExpressions;

using Cysharp.IO;

using Elastic.Documentation.Diagnostics;
using Elastic.Markdown.Myst.CodeBlocks;
using Elastic.Markdown.Myst.Comments;
using Elastic.Markdown.Myst.Directives;
Expand All @@ -13,6 +17,7 @@
using Elastic.Markdown.Myst.Renderers;
using Elastic.Markdown.Myst.Roles;
using Elastic.Markdown.Myst.Roles.AppliesTo;

using Markdig;
using Markdig.Extensions.EmphasisExtras;
using Markdig.Parsers;
Expand All @@ -25,6 +30,94 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers)
private BuildContext Build { get; } = build;
private IParserResolvers Resolvers { get; } = resolvers;

// Irregular whitespace characters that may impair Markdown rendering.
// DetectIrregularWhitespace emits a warning for every occurrence of one of these code points,
// and GetCharacterName maps the same code points to display names, so keep both lists in sync.
private static readonly char[] IrregularWhitespaceChars =
[
'\u000B', // Line Tabulation (\v) - <VT>
'\u000C', // Form Feed (\f) - <FF>
'\u00A0', // No-Break Space - <NBSP>
'\u0085', // Next Line - <NEL>
'\u1680', // Ogham Space Mark
'\u180E', // Mongolian Vowel Separator - <MVS>
'\ufeff', // Zero Width No-Break Space - <BOM>
'\u2000', // En Quad
'\u2001', // Em Quad
'\u2002', // En Space - <ENSP>
'\u2003', // Em Space - <EMSP>
'\u2004', // Three-Per-Em Space
'\u2005', // Four-Per-Em Space
'\u2006', // Six-Per-Em Space
'\u2007', // Figure Space
'\u2008', // Punctuation Space - <PUNCSP>
'\u2009', // Thin Space
'\u200A', // Hair Space
'\u200B', // Zero Width Space - <ZWSP>
'\u2028', // Line Separator
'\u2029', // Paragraph Separator
'\u202F', // Narrow No-Break Space
'\u205F', // Medium Mathematical Space
'\u3000' // Ideographic Space
];

// Scans the markdown content line by line and writes one warning diagnostic to the
// build collector for every irregular whitespace character found.
//   content  - the raw markdown text to inspect
//   filePath - the path recorded on each diagnostic
private void DetectIrregularWhitespace(string content, string filePath)
{
    string[] lineBreaks = ["\r\n", "\n", "\r"];
    var row = 0;

    foreach (var text in content.Split(lineBreaks, StringSplitOptions.None))
    {
        row++; // diagnostics carry 1-based line numbers

        // Jump from one offending character to the next instead of testing every character.
        var hit = text.IndexOfAny(IrregularWhitespaceChars);
        while (hit >= 0)
        {
            var offender = text[hit];
            var charName = GetCharacterName(offender);
            var warning = new Diagnostic
            {
                Severity = Severity.Warning,
                File = filePath,
                Line = row,
                Column = hit + 1, // 1-based column number
                Length = 1,
                Message = $"Irregular whitespace character detected: U+{(int)offender:X4} ({charName}). This may impair Markdown rendering."
            };
            Build.Collector.Write(warning);

            // hit + 1 may equal text.Length, which IndexOfAny accepts and answers with -1.
            hit = text.IndexOfAny(IrregularWhitespaceChars, hit + 1);
        }
    }
}

// Returns a human-readable name for an irregular whitespace character, used in the
// diagnostic message. Covers the code points listed in IrregularWhitespaceChars;
// any other character yields "Unknown".
private static string GetCharacterName(char c) => c switch
{
    '\u000B' => "Line Tabulation (VT)",
    '\u000C' => "Form Feed (FF)",
    '\u00A0' => "No-Break Space (NBSP)",
    '\u0085' => "Next Line",
    '\u1680' => "Ogham Space Mark",
    '\u180E' => "Mongolian Vowel Separator (MVS)",
    '\ufeff' => "Zero Width No-Break Space (BOM)",
    '\u2000' => "En Quad",
    '\u2001' => "Em Quad",
    '\u2002' => "En Space (ENSP)",
    '\u2003' => "Em Space (EMSP)",
    // U+2004 is THREE-PER-EM SPACE; the previous label "Tree-Per-Em" was a typo.
    '\u2004' => "Three-Per-Em",
    '\u2005' => "Four-Per-Em",
    '\u2006' => "Six-Per-Em",
    '\u2007' => "Figure Space",
    '\u2008' => "Punctuation Space (PUNCSP)",
    '\u2009' => "Thin Space",
    '\u200A' => "Hair Space",
    '\u200B' => "Zero Width Space (ZWSP)",
    '\u2028' => "Line Separator",
    '\u2029' => "Paragraph Separator",
    '\u202F' => "Narrow No-Break Space",
    '\u205F' => "Medium Mathematical Space",
    '\u3000' => "Ideographic Space",
    _ => "Unknown"
};


public Task<MarkdownDocument> MinimalParseAsync(IFileInfo path, Cancel ctx)
{
var state = new ParserState(Build)
Expand Down Expand Up @@ -66,11 +159,17 @@ public Task<MarkdownDocument> ParseSnippetAsync(IFileInfo path, IFileInfo parent
return ParseAsync(path, context, Pipeline, ctx);
}

public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
// Reports irregular whitespace found in the markdown string (attributed to path.FullName),
// then parses the string with Pipeline.
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourceFile = path.FullName;
    DetectIrregularWhitespace(markdown, sourceFile);

    var document = ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
    return document;
}

public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
// Reports irregular whitespace found in the markdown string (attributed to path.FullName),
// then parses the string with MinimalPipeline.
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourceFile = path.FullName;
    DetectIrregularWhitespace(markdown, sourceFile);

    var document = ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
    return document;
}

private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter, MarkdownPipeline pipeline)
{
Expand All @@ -86,26 +185,29 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat
return markdownDocument;
}

// Reads the markdown file at path and parses it with Markdig using the supplied pipeline and context.
//   path     - the file to read; its FileSystem decides which read strategy is used
//   context  - parser context handed to Markdig.Markdown.Parse
//   pipeline - the Markdig pipeline to parse with
//   ctx      - cancellation token forwarded to the asynchronous reads
private static async Task<MarkdownDocument> ParseAsync(
private async Task<MarkdownDocument> ParseAsync(
IFileInfo path,
MarkdownParserContext context,
MarkdownPipeline pipeline,
Cancel ctx)
{
string inputMarkdown;
if (path.FileSystem is FileSystem)
{
// Real file system: read through Utf8StreamReader opened in throughput mode.
await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
}
else
{
// Any other IFileSystem implementation: read the whole file through the abstraction.
var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
}

// Report irregular whitespace characters as diagnostics before handing the text to Markdig.
DetectIrregularWhitespace(inputMarkdown, path.FullName);

var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
}

// ReSharper disable once InconsistentNaming
Expand Down
Loading