Skip to content

Commit 806a147

Browse files
committed
Move detecting irregular whitespace to a markdown parser and add tests
1 parent 9b5793d commit 806a147

File tree

7 files changed

+177
-106
lines changed

7 files changed

+177
-106
lines changed
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Buffers;
6+
using Elastic.Markdown.Diagnostics;
7+
using Markdig;
8+
using Markdig.Helpers;
9+
using Markdig.Parsers;
10+
using Markdig.Parsers.Inlines;
11+
using Markdig.Renderers;
12+
using Markdig.Renderers.Html;
13+
using Markdig.Renderers.Html.Inlines;
14+
using Markdig.Syntax.Inlines;
15+
16+
namespace Elastic.Markdown.Myst.Linters;
17+
18+
/// <summary>
/// Extension methods for registering the irregular whitespace normalizer on a
/// <see cref="MarkdownPipelineBuilder"/>.
/// </summary>
public static class WhiteSpaceNormalizerBuilderExtensions
{
	/// <summary>
	/// Adds <see cref="WhiteSpaceNormalizerBuilderExtension"/> to the pipeline's extensions
	/// unless an extension of that type is already present.
	/// </summary>
	/// <param name="pipeline">The pipeline builder to configure.</param>
	/// <returns>The same <paramref name="pipeline"/> instance, so calls can be chained.</returns>
	public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
	{
		var registeredExtensions = pipeline.Extensions;
		registeredExtensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();
		return pipeline;
	}
}
26+
27+
/// <summary>
/// Markdig extension that wires the irregular whitespace parser and its HTML renderer
/// into a pipeline.
/// </summary>
public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
{
	/// <summary>
	/// Registers a <see cref="WhiteSpaceNormalizerParser"/> ahead of the
	/// <see cref="EmphasisInlineParser"/> in the inline parser list.
	/// </summary>
	/// <param name="pipeline">The pipeline builder whose inline parsers are extended.</param>
	public void Setup(MarkdownPipelineBuilder pipeline) =>
		pipeline.InlineParsers.InsertBefore<EmphasisInlineParser>(
			new WhiteSpaceNormalizerParser()
		);

	/// <summary>
	/// Registers a <see cref="WhiteSpaceNormalizerRenderer"/> right after the
	/// <see cref="EmphasisInlineRenderer"/> in the renderer's object renderer list.
	/// </summary>
	/// <param name="pipeline">The built pipeline (not used by this extension).</param>
	/// <param name="renderer">The renderer whose object renderers are extended.</param>
	public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) =>
		renderer.ObjectRenderers.InsertAfter<EmphasisInlineRenderer>(
			new WhiteSpaceNormalizerRenderer()
		);
}
35+
36+
/// <summary>
/// Inline parser that recognises a single irregular whitespace character, replaces it with an
/// <see cref="IrregularWhiteSpace"/> inline and emits a hint diagnostic describing the character.
/// </summary>
public class WhiteSpaceNormalizerParser : InlineParser
{
	// Collection of irregular whitespace characters that may impair Markdown rendering
	private static readonly char[] IrregularWhitespaceChars =
	[
		'\u000B', // Line Tabulation (\v) - <VT>
		'\u000C', // Form Feed (\f) - <FF>
		'\u00A0', // No-Break Space - <NBSP>
		'\u0085', // Next Line
		'\u1680', // Ogham Space Mark
		'\u180E', // Mongolian Vowel Separator - <MVS>
		'\ufeff', // Zero Width No-Break Space - <BOM>
		'\u2000', // En Quad
		'\u2001', // Em Quad
		'\u2002', // En Space - <ENSP>
		'\u2003', // Em Space - <EMSP>
		'\u2004', // Three-Per-Em
		'\u2005', // Four-Per-Em
		'\u2006', // Six-Per-Em
		'\u2007', // Figure Space
		'\u2008', // Punctuation Space - <PUNCSP>
		'\u2009', // Thin Space
		'\u200A', // Hair Space
		'\u200B', // Zero Width Space - <ZWSP>
		'\u2028', // Line Separator
		'\u2029', // Paragraph Separator
		'\u202F', // Narrow No-Break Space
		'\u205F', // Medium Mathematical Space
		'\u3000' // Ideographic Space
	];

	// Pre-built lookup over the same character set, used for the membership test in Match
	private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);

	/// <summary>
	/// Creates the parser and declares every irregular whitespace character as an opening character.
	/// </summary>
	public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;

	/// <summary>
	/// Consumes one irregular whitespace character at the start of <paramref name="slice"/>.
	/// </summary>
	/// <param name="processor">The inline processor that receives the new inline and the hint.</param>
	/// <param name="slice">The text being parsed; advanced by one character on a match.</param>
	/// <returns>
	/// <c>true</c> when the current character is an irregular whitespace character and was consumed;
	/// <c>false</c> otherwise (including when the slice is empty), leaving the slice untouched.
	/// </returns>
	public override bool Match(InlineProcessor processor, ref StringSlice slice)
	{
		// CurrentChar yields '\0' for an empty slice, which is not in the set, so no bounds check is needed.
		var c = slice.CurrentChar;
		if (!WhiteSpaceSearchValues.Contains(c))
			return false;

		// Create a node per occurrence: an inline is a node of the document tree, so one shared
		// instance cannot stand in for several occurrences, and it needs its own source position.
		var position = processor.GetSourcePosition(slice.Start, out var line, out var column);
		var inline = new IrregularWhiteSpace
		{
			Span = new Markdig.Syntax.SourceSpan(position, position),
			Line = line,
			Column = column
		};
		processor.Inline = inline;

		var charName = GetCharacterName(c);

		processor.EmitHint(inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");

		slice.SkipChar();
		return true;
	}

	// Helper to get a friendly name for the whitespace character
	private static string GetCharacterName(char c) => c switch
	{
		'\u000B' => "Line Tabulation (VT)",
		'\u000C' => "Form Feed (FF)",
		'\u00A0' => "No-Break Space (NBSP)",
		'\u0085' => "Next Line",
		'\u1680' => "Ogham Space Mark",
		'\u180E' => "Mongolian Vowel Separator (MVS)",
		'\ufeff' => "Zero Width No-Break Space (BOM)",
		'\u2000' => "En Quad",
		'\u2001' => "Em Quad",
		'\u2002' => "En Space (ENSP)",
		'\u2003' => "Em Space (EMSP)",
		'\u2004' => "Three-Per-Em",
		'\u2005' => "Four-Per-Em",
		'\u2006' => "Six-Per-Em",
		'\u2007' => "Figure Space",
		'\u2008' => "Punctuation Space (PUNCSP)",
		'\u2009' => "Thin Space",
		'\u200A' => "Hair Space",
		'\u200B' => "Zero Width Space (ZWSP)",
		'\u2028' => "Line Separator",
		'\u2029' => "Paragraph Separator",
		'\u202F' => "Narrow No-Break Space",
		'\u205F' => "Medium Mathematical Space",
		'\u3000' => "Ideographic Space",
		_ => "Unknown"
	};
}
117+
118+
/// <summary>
/// Leaf inline that marks an irregular whitespace character in the parsed document.
/// <see cref="WhiteSpaceNormalizerRenderer"/> writes it to HTML as a regular space.
/// </summary>
public class IrregularWhiteSpace : LeafInline
{
	/// <summary>A pre-allocated instance of this marker inline.</summary>
	public static readonly IrregularWhiteSpace Instance = new();
}
122+
123+
/// <summary>
/// HTML renderer for <see cref="IrregularWhiteSpace"/> inlines.
/// </summary>
public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
{
	/// <summary>
	/// Writes a single regular space character in place of the irregular whitespace inline.
	/// </summary>
	/// <param name="renderer">The HTML renderer to write to.</param>
	/// <param name="obj">The inline being rendered; its content is not inspected.</param>
	protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj) =>
		renderer.Write(' ');
}

src/Elastic.Markdown/Myst/MarkdownParser.cs

Lines changed: 7 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
using Elastic.Markdown.Myst.FrontMatter;
1515
using Elastic.Markdown.Myst.InlineParsers;
1616
using Elastic.Markdown.Myst.InlineParsers.Substitution;
17+
using Elastic.Markdown.Myst.Linters;
1718
using Elastic.Markdown.Myst.Renderers;
1819
using Elastic.Markdown.Myst.Roles;
1920
using Elastic.Markdown.Myst.Roles.AppliesTo;
@@ -30,94 +31,6 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers)
3031
private BuildContext Build { get; } = build;
3132
private IParserResolvers Resolvers { get; } = resolvers;
3233

33-
// Collection of irregular whitespace characters that may impair Markdown rendering
34-
private static readonly char[] IrregularWhitespaceChars =
35-
[
36-
'\u000B', // Line Tabulation (\v) - <VT>
37-
'\u000C', // Form Feed (\f) - <FF>
38-
'\u00A0', // No-Break Space - <NBSP>
39-
'\u0085', // Next Line
40-
'\u1680', // Ogham Space Mark
41-
'\u180E', // Mongolian Vowel Separator - <MVS>
42-
'\ufeff', // Zero Width No-Break Space - <BOM>
43-
'\u2000', // En Quad
44-
'\u2001', // Em Quad
45-
'\u2002', // En Space - <ENSP>
46-
'\u2003', // Em Space - <EMSP>
47-
'\u2004', // Tree-Per-Em
48-
'\u2005', // Four-Per-Em
49-
'\u2006', // Six-Per-Em
50-
'\u2007', // Figure Space
51-
'\u2008', // Punctuation Space - <PUNCSP>
52-
'\u2009', // Thin Space
53-
'\u200A', // Hair Space
54-
'\u200B', // Zero Width Space - <ZWSP>
55-
'\u2028', // Line Separator
56-
'\u2029', // Paragraph Separator
57-
'\u202F', // Narrow No-Break Space
58-
'\u205F', // Medium Mathematical Space
59-
'\u3000' // Ideographic Space
60-
];
61-
62-
// Detects irregular whitespace in the markdown content and reports diagnostics
63-
private void DetectIrregularWhitespace(string content, string filePath)
64-
{
65-
var lines = content.Split(["\r\n", "\n", "\r"], StringSplitOptions.None);
66-
67-
for (var lineIndex = 0; lineIndex < lines.Length; lineIndex++)
68-
{
69-
var line = lines[lineIndex];
70-
for (var columnIndex = 0; columnIndex < line.Length; columnIndex++)
71-
{
72-
var c = line[columnIndex];
73-
if (Array.IndexOf(IrregularWhitespaceChars, c) >= 0)
74-
{
75-
var charName = GetCharacterName(c);
76-
Build.Collector.Write(new Diagnostic
77-
{
78-
Severity = Severity.Warning,
79-
File = filePath,
80-
Line = lineIndex + 1, // 1-based line number
81-
Column = columnIndex + 1, // 1-based column number
82-
Length = 1,
83-
Message = $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering."
84-
});
85-
}
86-
}
87-
}
88-
}
89-
90-
// Helper to get a friendly name for the whitespace character
91-
private static string GetCharacterName(char c) => c switch
92-
{
93-
'\u000B' => "Line Tabulation (VT)",
94-
'\u000C' => "Form Feed (FF)",
95-
'\u00A0' => "No-Break Space (NBSP)",
96-
'\u0085' => "Next Line",
97-
'\u1680' => "Ogham Space Mark",
98-
'\u180E' => "Mongolian Vowel Separator (MVS)",
99-
'\ufeff' => "Zero Width No-Break Space (BOM)",
100-
'\u2000' => "En Quad",
101-
'\u2001' => "Em Quad",
102-
'\u2002' => "En Space (ENSP)",
103-
'\u2003' => "Em Space (EMSP)",
104-
'\u2004' => "Tree-Per-Em",
105-
'\u2005' => "Four-Per-Em",
106-
'\u2006' => "Six-Per-Em",
107-
'\u2007' => "Figure Space",
108-
'\u2008' => "Punctuation Space (PUNCSP)",
109-
'\u2009' => "Thin Space",
110-
'\u200A' => "Hair Space",
111-
'\u200B' => "Zero Width Space (ZWSP)",
112-
'\u2028' => "Line Separator",
113-
'\u2029' => "Paragraph Separator",
114-
'\u202F' => "Narrow No-Break Space",
115-
'\u205F' => "Medium Mathematical Space",
116-
'\u3000' => "Ideographic Space",
117-
_ => "Unknown"
118-
};
119-
120-
12134
public Task<MarkdownDocument> MinimalParseAsync(IFileInfo path, Cancel ctx)
12235
{
12336
var state = new ParserState(Build)
@@ -159,17 +72,11 @@ public Task<MarkdownDocument> ParseSnippetAsync(IFileInfo path, IFileInfo parent
15972
return ParseAsync(path, context, Pipeline, ctx);
16073
}
16174

162-
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
163-
{
164-
DetectIrregularWhitespace(markdown, path.FullName);
165-
return ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
166-
}
75+
public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
76+
ParseMarkdownStringAsync(markdown, path, matter, Pipeline);
16777

168-
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter)
169-
{
170-
DetectIrregularWhitespace(markdown, path.FullName);
171-
return ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
172-
}
78+
public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) =>
79+
ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline);
17380

17481
private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter, MarkdownPipeline pipeline)
17582
{
@@ -185,7 +92,7 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat
18592
return markdownDocument;
18693
}
18794

188-
private async Task<MarkdownDocument> ParseAsync(
95+
private static async Task<MarkdownDocument> ParseAsync(
18996
IFileInfo path,
19097
MarkdownParserContext context,
19198
MarkdownPipeline pipeline,
@@ -199,12 +106,7 @@ private async Task<MarkdownDocument> ParseAsync(
199106
inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
200107
}
201108
else
202-
{
203109
inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
204-
}
205-
206-
// Check for irregular whitespace characters
207-
DetectIrregularWhitespace(inputMarkdown, path.FullName);
208110

209111
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
210112
return markdownDocument;
@@ -258,6 +160,7 @@ public MarkdownPipeline Pipeline
258160
.UseEnhancedCodeBlocks()
259161
.UseHtmxLinkInlineRenderer()
260162
.DisableHtml()
163+
.UseWhiteSpaceNormalizer()
261164
.UseHardBreaks();
262165
_ = builder.BlockParsers.TryRemove<IndentedCodeBlockParser>();
263166
_pipelineCached = builder.Build();

tests/authoring/Framework/ErrorCollectorAssertions.fs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions =
5454
| Some e ->
5555
let message = e.Message
5656
test <@ message.Contains(expected) @>
57-
| None -> failwithf "Expected errors but no errors were logged"
57+
| None -> failwithf "Expected warnings but no warnings were logged"
58+
59+
[<DebuggerStepThrough>]
/// Asserts that at least one hint was collected and that the message of the first
/// hint-severity diagnostic contains `expected`.
let hasHint (expected: string) (actual: Lazy<GeneratorResults>) =
    let actual = actual.Value
    actual.Context.Collector.Hints |> shouldBeGreaterThan 0
    // Only the first diagnostic with Severity.Hint is inspected.
    let firstHint =
        actual.Context.Collector.Diagnostics
        |> Seq.filter (fun d -> d.Severity = Severity.Hint)
        |> Seq.tryHead
    match firstHint with
    | Some hint ->
        let message = hint.Message
        test <@ message.Contains(expected) @>
    | None -> failwithf "Expected hints but no hints were logged"

tests/authoring/Framework/TestValues.fs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ type TestDiagnosticsOutput() =
2626
match diagnostic.Severity with
2727
| Severity.Error ->
2828
output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})")
29-
| _ ->
29+
| Severity.Warning ->
3030
output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})")
31+
| _ ->
32+
output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})")
3133
| _ -> ()
3234

3335

tests/authoring/Inline/Comments.fs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ not a comment
1717
[<Fact>]
1818
let ``validate HTML: commented line should not be emitted`` () =
1919
markdown |> convertsToHtml """<p>not a comment</p>"""
20+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
module ``linters``.``white space normalizers``
2+
3+
open Xunit
4+
open authoring
5+
6+
7+
/// Tests for the irregular whitespace linter: the offending character must be rendered
/// as a regular space and a hint must be reported.
type ``white space detection`` () =

    // Fixture: a paragraph containing a Line Tabulation (U+000B) character between two words.
    static let markdown = Setup.Markdown $"""
not a{'\u000B'}space
"""

    [<Fact>]
    let ``validate HTML: should not contain bad space character`` () =
        let expectedHtml = """<p>not a space</p>"""
        markdown |> convertsToHtml expectedHtml

    [<Fact>]
    let ``emits a hint when a bad space is used`` () =
        let expectedHint = "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering."
        markdown |> hasHint expectedHint

tests/authoring/authoring.fsproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,8 @@
5858
<Compile Include="Directives\IncludeBlocks.fs" />
5959
</ItemGroup>
6060

61+
<ItemGroup>
62+
<Compile Include="Linters\WhiteSpaceNormalizers.fs" />
63+
</ItemGroup>
64+
6165
</Project>

0 commit comments

Comments
 (0)