Skip to content

Commit 3050c5c

Browse files
committed
Add Irregular Whitespace detection
1 parent 0c267f9 commit 3050c5c

File tree

1 file changed

+108
-11
lines changed

1 file changed

+108
-11
lines changed

src/Elastic.Markdown/Myst/MarkdownParser.cs

Lines changed: 108 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
using System.IO.Abstractions;
66
using Cysharp.IO;
7+
using Elastic.Documentation.Diagnostics;
8+
using System.Text.RegularExpressions;
79
using Elastic.Markdown.Myst.CodeBlocks;
810
using Elastic.Markdown.Myst.Comments;
911
using Elastic.Markdown.Myst.Directives;
@@ -25,6 +27,92 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers)
2527
private BuildContext Build { get; } = build;
2628
private IParserResolvers Resolvers { get; } = resolvers;
2729

30+
// Collection of irregular whitespace characters that may impair Markdown rendering.
// DetectIrregularWhitespace looks every scanned character up in this table, and
// GetCharacterName supplies the friendly name reported for each entry, so the two
// lists are meant to stay in sync.
31+
private static readonly char[] IrregularWhitespaceChars = {
32+
'\u000B', // Line Tabulation (\v) - <VT>
33+
'\u000C', // Form Feed (\f) - <FF>
34+
'\u00A0', // No-Break Space - <NBSP>
35+
'\u0085', // Next Line
36+
'\u1680', // Ogham Space Mark
37+
'\u180E', // Mongolian Vowel Separator - <MVS>
38+
'\ufeff', // Zero Width No-Break Space - <BOM>
39+
'\u2000', // En Quad
40+
'\u2001', // Em Quad
41+
'\u2002', // En Space - <ENSP>
42+
'\u2003', // Em Space - <EMSP>
43+
'\u2004', // Three-Per-Em
44+
'\u2005', // Four-Per-Em
45+
'\u2006', // Six-Per-Em
46+
'\u2007', // Figure Space
47+
'\u2008', // Punctuation Space - <PUNCSP>
48+
'\u2009', // Thin Space
49+
'\u200A', // Hair Space
50+
'\u200B', // Zero Width Space - <ZWSP>
51+
'\u2028', // Line Separator
52+
'\u2029', // Paragraph Separator
53+
'\u202F', // Narrow No-Break Space
54+
'\u205F', // Medium Mathematical Space
55+
'\u3000' // Ideographic Space
56+
};
57+
58+
// Detects irregular whitespace in the markdown content and reports one warning
// diagnostic per offending character through Build.Collector.
//
// content:  the raw markdown text to scan; null or empty content is a no-op.
// filePath: the path recorded on every emitted diagnostic.
//
// Line and column numbers are 1-based. Lines are delimited by "\r\n", "\n" or a
// lone "\r"; the irregular characters themselves (including U+0085, U+2028 and
// U+2029) never start a new line for the purpose of these coordinates.
private void DetectIrregularWhitespace(string content, string filePath)
{
    if (string.IsNullOrEmpty(content))
    {
        return;
    }

    // Most documents are clean: IndexOfAny lets us leave without allocating a
    // per-line string array or probing the lookup table for every character.
    var hit = content.IndexOfAny(IrregularWhitespaceChars);
    if (hit < 0)
    {
        return;
    }

    var lineNumber = 1; // 1-based line of the character at 'cursor'
    var lineStart = 0;  // index of the first character of the current line
    var cursor = 0;     // everything before this index is already accounted for

    while (hit >= 0)
    {
        // Catch the line bookkeeping up to the offending character.
        for (; cursor < hit; cursor++)
        {
            var ch = content[cursor];

            // "\r\n" counts once (on the '\n'); a lone '\r' also ends a line.
            // cursor + 1 <= hit < content.Length, so the look-ahead is in range.
            var endsLine = ch == '\n' || (ch == '\r' && content[cursor + 1] != '\n');
            if (endsLine)
            {
                lineNumber++;
                lineStart = cursor + 1;
            }
        }

        var c = content[hit];
        var charName = GetCharacterName(c);
        Build.Collector.Write(new Diagnostic
        {
            Severity = Severity.Warning,
            File = filePath,
            Line = lineNumber,              // 1-based line number
            Column = hit - lineStart + 1,   // 1-based column number
            Length = 1,
            Message = $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering."
        });

        // Resume the search right after the reported character; a start index
        // equal to content.Length is valid and simply yields -1.
        cursor = hit + 1;
        hit = content.IndexOfAny(IrregularWhitespaceChars, cursor);
    }
}
85+
86+
// Helper to get a friendly name for an irregular whitespace character.
// The name is embedded in the diagnostic message shown to the user.
// Returns "Unknown" for any character that has no entry below.
private static string GetCharacterName(char c) => c switch
{
    '\u000B' => "Line Tabulation (VT)",
    '\u000C' => "Form Feed (FF)",
    '\u00A0' => "No-Break Space (NBSP)",
    '\u0085' => "Next Line",
    '\u1680' => "Ogham Space Mark",
    '\u180E' => "Mongolian Vowel Separator (MVS)",
    '\ufeff' => "Zero Width No-Break Space (BOM)",
    '\u2000' => "En Quad",
    '\u2001' => "Em Quad",
    '\u2002' => "En Space (ENSP)",
    '\u2003' => "Em Space (EMSP)",
    '\u2004' => "Three-Per-Em", // was misspelled "Tree-Per-Em"
    '\u2005' => "Four-Per-Em",
    '\u2006' => "Six-Per-Em",
    '\u2007' => "Figure Space",
    '\u2008' => "Punctuation Space (PUNCSP)",
    '\u2009' => "Thin Space",
    '\u200A' => "Hair Space",
    '\u200B' => "Zero Width Space (ZWSP)",
    '\u2028' => "Line Separator",
    '\u2029' => "Paragraph Separator",
    '\u202F' => "Narrow No-Break Space",
    '\u205F' => "Medium Mathematical Space",
    '\u3000' => "Ideographic Space",
    _ => "Unknown"
};
115+
28116
public Task<MarkdownDocument> MinimalParseAsync(IFileInfo path, Cancel ctx)
29117
{
30118
var state = new ParserState(Build)
@@ -66,11 +154,17 @@ public Task<MarkdownDocument> ParseSnippetAsync(IFileInfo path, IFileInfo parent
66154
return ParseAsync(path, context, Pipeline, ctx);
67155
}
68156

69-
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
70-
ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
157+
// Parses an in-memory markdown string with the full Pipeline.
// Before parsing, the text is scanned for irregular whitespace and any findings
// are reported against path.FullName.
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourcePath = path.FullName;
    DetectIrregularWhitespace(markdown, sourcePath);

    var document = ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
    return document;
}
71162

72-
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
73-
ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
163+
// Parses an in-memory markdown string with the MinimalPipeline.
// Before parsing, the text is scanned for irregular whitespace and any findings
// are reported against path.FullName.
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourcePath = path.FullName;
    DetectIrregularWhitespace(markdown, sourcePath);

    var document = ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
    return document;
}
74168

75169
private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter, MarkdownPipeline pipeline)
76170
{
@@ -86,26 +180,29 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat
86180
return markdownDocument;
87181
}
88182

89-
private static async Task<MarkdownDocument> ParseAsync(
183+
// Reads the whole file at path.FullName into a string, reports irregular
// whitespace found in it via DetectIrregularWhitespace, and then parses the text
// with Markdig using the supplied pipeline and parser context.
//
// path:     the file to read; its FileSystem decides which read strategy is used.
// context:  parser context handed to Markdig.Markdown.Parse.
// pipeline: the Markdig pipeline to parse with.
// ctx:      cancellation token passed to both read calls.
//
// This is an instance method because DetectIrregularWhitespace is an instance method.
private async Task<MarkdownDocument> ParseAsync(
90184
IFileInfo path,
91185
MarkdownParserContext context,
92186
MarkdownPipeline pipeline,
93187
Cancel ctx)
94188
{
189+
// Declared outside the branches so both read strategies feed the shared
// detection + parse code below.
string inputMarkdown;
95190
if (path.FileSystem is FileSystem)
96191
{
97192
// Real disk I/O: read through the UTF-8 stream reader.
98193
await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
99-
var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
100-
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
101-
return markdownDocument;
194+
inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
102195
}
103196
else
104197
{
105-
var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
106-
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
107-
return markdownDocument;
198+
// Any other IFileSystem implementation: read through the file system abstraction.
inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
108199
}
200+
201+
// Check for irregular whitespace characters before parsing; findings are
// reported against path.FullName.
202+
DetectIrregularWhitespace(inputMarkdown, path.FullName);
203+
204+
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
205+
return markdownDocument;
109206
}
110207

111208
// ReSharper disable once InconsistentNaming

0 commit comments

Comments
 (0)