44
55using System . IO . Abstractions ;
66using Cysharp . IO ;
7+ using Elastic . Documentation . Diagnostics ;
8+ using System . Text . RegularExpressions ;
79using Elastic . Markdown . Myst . CodeBlocks ;
810using Elastic . Markdown . Myst . Comments ;
911using Elastic . Markdown . Myst . Directives ;
@@ -25,6 +27,92 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers)
2527 private BuildContext Build { get ; } = build ;
2628 private IParserResolvers Resolvers { get ; } = resolvers ;
2729
// Whitespace-like code points other than the plain space, tab and line breaks.
// Every entry here is reported by DetectIrregularWhitespace and named by GetCharacterName.
private static readonly char[] IrregularWhitespaceChars =
[
    // Control characters and Latin-1
    '\u000B', // Line Tabulation (\v) - <VT>
    '\u000C', // Form Feed (\f) - <FF>
    '\u00A0', // No-Break Space - <NBSP>
    '\u0085', // Next Line

    // Script-specific separators and the byte order mark
    '\u1680', // Ogham Space Mark
    '\u180E', // Mongolian Vowel Separator - <MVS>
    '\ufeff', // Zero Width No-Break Space - <BOM>

    // General Punctuation block: fixed-width and zero-width spaces
    '\u2000', // En Quad
    '\u2001', // Em Quad
    '\u2002', // En Space - <ENSP>
    '\u2003', // Em Space - <EMSP>
    '\u2004', // Three-Per-Em
    '\u2005', // Four-Per-Em
    '\u2006', // Six-Per-Em
    '\u2007', // Figure Space
    '\u2008', // Punctuation Space - <PUNCSP>
    '\u2009', // Thin Space
    '\u200A', // Hair Space
    '\u200B', // Zero Width Space - <ZWSP>

    // Unicode line/paragraph separators and remaining wide or narrow spaces
    '\u2028', // Line Separator
    '\u2029', // Paragraph Separator
    '\u202F', // Narrow No-Break Space
    '\u205F', // Medium Mathematical Space
    '\u3000', // Ideographic Space
];
57+
/// <summary>
/// Scans markdown text for the characters listed in <c>IrregularWhitespaceChars</c> and writes one
/// warning diagnostic to the build collector for every occurrence.
/// </summary>
/// <param name="content">The markdown text to inspect.</param>
/// <param name="filePath">Value stored in the <c>File</c> property of each emitted diagnostic.</param>
private void DetectIrregularWhitespace(string content, string filePath)
{
    // Fast path: when the text holds none of the characters there is nothing to report,
    // so skip splitting the whole document into a line array.
    if (content.IndexOfAny(IrregularWhitespaceChars) < 0)
    {
        return;
    }

    // "\r\n" is listed first so a Windows line ending counts as a single line break.
    var lines = content.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);

    for (var lineIndex = 0; lineIndex < lines.Length; lineIndex++)
    {
        var line = lines[lineIndex];

        // Jump from one offending character to the next instead of probing every character.
        // A start index equal to line.Length is valid for IndexOfAny and simply yields -1.
        for (var columnIndex = line.IndexOfAny(IrregularWhitespaceChars);
             columnIndex >= 0;
             columnIndex = line.IndexOfAny(IrregularWhitespaceChars, columnIndex + 1))
        {
            var c = line[columnIndex];
            var charName = GetCharacterName(c);
            Build.Collector.Write(new Diagnostic
            {
                Severity = Severity.Warning,
                File = filePath,
                Line = lineIndex + 1, // 1-based line number
                Column = columnIndex + 1, // 1-based column number
                Length = 1,
                Message = $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering."
            });
        }
    }
}
85+
/// <summary>
/// Maps an irregular whitespace character to the human-readable name used in diagnostics.
/// </summary>
/// <param name="c">The character to describe.</param>
/// <returns>The friendly name, or <c>"Unknown"</c> when the character has no entry below.</returns>
private static string GetCharacterName(char c) => c switch
{
    '\u000B' => "Line Tabulation (VT)",
    '\u000C' => "Form Feed (FF)",
    '\u00A0' => "No-Break Space (NBSP)",
    '\u0085' => "Next Line",
    '\u1680' => "Ogham Space Mark",
    '\u180E' => "Mongolian Vowel Separator (MVS)",
    '\ufeff' => "Zero Width No-Break Space (BOM)",
    '\u2000' => "En Quad",
    '\u2001' => "Em Quad",
    '\u2002' => "En Space (ENSP)",
    '\u2003' => "Em Space (EMSP)",
    // U+2004 is THREE-PER-EM SPACE; the earlier spelling "Tree-Per-Em" was a typo.
    '\u2004' => "Three-Per-Em",
    '\u2005' => "Four-Per-Em",
    '\u2006' => "Six-Per-Em",
    '\u2007' => "Figure Space",
    '\u2008' => "Punctuation Space (PUNCSP)",
    '\u2009' => "Thin Space",
    '\u200A' => "Hair Space",
    '\u200B' => "Zero Width Space (ZWSP)",
    '\u2028' => "Line Separator",
    '\u2029' => "Paragraph Separator",
    '\u202F' => "Narrow No-Break Space",
    '\u205F' => "Medium Mathematical Space",
    '\u3000' => "Ideographic Space",
    _ => "Unknown"
};
115+
28116 public Task < MarkdownDocument > MinimalParseAsync ( IFileInfo path , Cancel ctx )
29117 {
30118 var state = new ParserState ( Build )
@@ -66,11 +154,17 @@ public Task<MarkdownDocument> ParseSnippetAsync(IFileInfo path, IFileInfo parent
66154 return ParseAsync ( path , context , Pipeline , ctx ) ;
67155 }
68156
/// <summary>
/// Checks in-memory markdown for irregular whitespace, then parses it with <c>Pipeline</c>.
/// The document is returned directly rather than wrapped in a task.
/// </summary>
/// <param name="markdown">The markdown text to check and parse.</param>
/// <param name="path">File the text belongs to; its full name is attached to whitespace diagnostics.</param>
/// <param name="matter">Optional front matter passed through to the parser.</param>
/// <returns>The parsed markdown document.</returns>
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourcePath = path.FullName;
    DetectIrregularWhitespace(markdown, sourcePath);

    var document = ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
    return document;
}
71162
/// <summary>
/// Checks in-memory markdown for irregular whitespace, then parses it with <c>MinimalPipeline</c>.
/// The document is returned directly rather than wrapped in a task.
/// </summary>
/// <param name="markdown">The markdown text to check and parse.</param>
/// <param name="path">File the text belongs to; its full name is attached to whitespace diagnostics.</param>
/// <param name="matter">Optional front matter passed through to the parser.</param>
/// <returns>The parsed markdown document.</returns>
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
{
    var sourcePath = path.FullName;
    DetectIrregularWhitespace(markdown, sourcePath);

    var document = ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
    return document;
}
74168
75169 private MarkdownDocument ParseMarkdownStringAsync ( string markdown , IFileInfo path , YamlFrontMatter ? matter , MarkdownPipeline pipeline )
76170 {
@@ -86,26 +180,29 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat
86180 return markdownDocument ;
87181 }
88182
/// <summary>
/// Reads the markdown file at <paramref name="path"/>, reports irregular whitespace found in it,
/// and parses the text with Markdig.
/// </summary>
/// <param name="path">The file to read and parse.</param>
/// <param name="context">Parser context handed to Markdig.</param>
/// <param name="pipeline">Markdig pipeline used for parsing.</param>
/// <param name="ctx">Cancellation token passed to the file read.</param>
/// <returns>The parsed markdown document.</returns>
// NOTE(review): if one file passes through more than one parse entry point, the whitespace
// warnings are presumably reported once per pass - confirm whether the collector de-duplicates.
private async Task<MarkdownDocument> ParseAsync(
    IFileInfo path,
    MarkdownParserContext context,
    MarkdownPipeline pipeline,
    Cancel ctx)
{
    var inputMarkdown = await ReadSourceAsync();

    // Report irregular whitespace before handing the text to the parser.
    DetectIrregularWhitespace(inputMarkdown, path.FullName);

    return Markdig.Markdown.Parse(inputMarkdown, pipeline, context);

    // Loads the file's full text; the reader, when used, is disposed before this returns.
    async Task<string> ReadSourceAsync()
    {
        if (path.FileSystem is not FileSystem)
        {
            return await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
        }

        // Real IO: optimize through the UTF8 stream reader.
        await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
        return await streamReader.AsTextReader().ReadToEndAsync(ctx);
    }
}
110207
111208 // ReSharper disable once InconsistentNaming
0 commit comments