Skip to content

Commit 9c1e5be

Browse files
theletterfMpdreamz
andauthored
Test for irregular whitespaces (#1262)
* Add Irregular Whitespace detection * Format * Add test * Formatting * Remove unused imports * Remove test * Move detecting irregular whitespace to a markdown parser and add tests * add license headers --------- Co-authored-by: Martijn Laarman <[email protected]>
1 parent e32bb23 commit 9c1e5be

File tree

7 files changed

+187
-10
lines changed

7 files changed

+187
-10
lines changed
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Buffers;
6+
using Elastic.Markdown.Diagnostics;
7+
using Markdig;
8+
using Markdig.Helpers;
9+
using Markdig.Parsers;
10+
using Markdig.Parsers.Inlines;
11+
using Markdig.Renderers;
12+
using Markdig.Renderers.Html;
13+
using Markdig.Renderers.Html.Inlines;
14+
using Markdig.Syntax.Inlines;
15+
16+
namespace Elastic.Markdown.Myst.Linters;
17+
18+
/// <summary>
/// Pipeline-builder extension methods for the irregular whitespace linter.
/// </summary>
public static class WhiteSpaceNormalizerBuilderExtensions
{
    /// <summary>
    /// Adds <see cref="WhiteSpaceNormalizerBuilderExtension"/> to the pipeline's extensions
    /// unless it is already present.
    /// </summary>
    /// <param name="pipeline">The pipeline builder to extend.</param>
    /// <returns>The same <paramref name="pipeline"/> instance, so calls can be chained.</returns>
    public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
    {
        var extensions = pipeline.Extensions;
        extensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();
        return pipeline;
    }
}
26+
27+
/// <summary>
/// Markdig extension that wires the irregular whitespace parser and its HTML renderer
/// into a pipeline.
/// </summary>
public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
{
    /// <summary>
    /// Registers a <see cref="WhiteSpaceNormalizerParser"/> ahead of the emphasis inline parser.
    /// </summary>
    /// <param name="pipeline">The pipeline builder being configured.</param>
    public void Setup(MarkdownPipelineBuilder pipeline)
    {
        var parser = new WhiteSpaceNormalizerParser();
        _ = pipeline.InlineParsers.InsertBefore<EmphasisInlineParser>(parser);
    }

    /// <summary>
    /// Registers a <see cref="WhiteSpaceNormalizerRenderer"/> right after the emphasis inline renderer.
    /// </summary>
    /// <param name="pipeline">The built pipeline (not used here).</param>
    /// <param name="renderer">The renderer whose object renderers are extended.</param>
    public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer)
    {
        var whiteSpaceRenderer = new WhiteSpaceNormalizerRenderer();
        _ = renderer.ObjectRenderers.InsertAfter<EmphasisInlineRenderer>(whiteSpaceRenderer);
    }
}
35+
36+
/// <summary>
/// Inline parser that detects irregular whitespace characters. For every occurrence it emits
/// a hint diagnostic and substitutes an <see cref="IrregularWhiteSpace"/> inline for the character.
/// </summary>
public class WhiteSpaceNormalizerParser : InlineParser
{
    // Collection of irregular whitespace characters that may impair Markdown rendering
    private static readonly char[] IrregularWhitespaceChars =
    [
        '\u000B', // Line Tabulation (\v) - <VT>
        '\u000C', // Form Feed (\f) - <FF>
        '\u00A0', // No-Break Space - <NBSP>
        '\u0085', // Next Line
        '\u1680', // Ogham Space Mark
        '\u180E', // Mongolian Vowel Separator - <MVS>
        '\ufeff', // Zero Width No-Break Space - <BOM>
        '\u2000', // En Quad
        '\u2001', // Em Quad
        '\u2002', // En Space - <ENSP>
        '\u2003', // Em Space - <EMSP>
        '\u2004', // Three-Per-Em
        '\u2005', // Four-Per-Em
        '\u2006', // Six-Per-Em
        '\u2007', // Figure Space
        '\u2008', // Punctuation Space - <PUNCSP>
        '\u2009', // Thin Space
        '\u200A', // Hair Space
        '\u200B', // Zero Width Space - <ZWSP>
        '\u2028', // Line Separator
        '\u2029', // Paragraph Separator
        '\u202F', // Narrow No-Break Space
        '\u205F', // Medium Mathematical Space
        '\u3000' // Ideographic Space
    ];

    private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);

    /// <summary>
    /// Creates the parser and registers every irregular whitespace character as an opening character.
    /// </summary>
    public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;

    /// <summary>
    /// Consumes a single irregular whitespace character at the start of <paramref name="slice"/>.
    /// </summary>
    /// <param name="processor">The inline processor that receives the new inline and the hint.</param>
    /// <param name="slice">The text being parsed; advanced by one character on a match.</param>
    /// <returns>
    /// <c>true</c> when the current character is an irregular whitespace character; otherwise <c>false</c>.
    /// </returns>
    public override bool Match(InlineProcessor processor, ref StringSlice slice)
    {
        // CurrentChar yields '\0' on an empty slice, so this cannot throw the way Slice(0, 1) would.
        var c = slice.CurrentChar;
        if (!WhiteSpaceSearchValues.Contains(c))
            return false;

        // One inline per occurrence: an inline is a node in the document tree and carries its own
        // parent/sibling links, so a single shared instance cannot represent several positions.
        var position = processor.GetSourcePosition(slice.Start, out var line, out var column);
        var inline = new IrregularWhiteSpace
        {
            Span = new global::Markdig.Syntax.SourceSpan(position, position),
            Line = line,
            Column = column
        };
        processor.Inline = inline;

        var charName = GetCharacterName(c);

        processor.EmitHint(inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");

        slice.SkipChar();
        return true;
    }

    // Helper to get a friendly name for the whitespace character
    private static string GetCharacterName(char c) => c switch
    {
        '\u000B' => "Line Tabulation (VT)",
        '\u000C' => "Form Feed (FF)",
        '\u00A0' => "No-Break Space (NBSP)",
        '\u0085' => "Next Line",
        '\u1680' => "Ogham Space Mark",
        '\u180E' => "Mongolian Vowel Separator (MVS)",
        '\ufeff' => "Zero Width No-Break Space (BOM)",
        '\u2000' => "En Quad",
        '\u2001' => "Em Quad",
        '\u2002' => "En Space (ENSP)",
        '\u2003' => "Em Space (EMSP)",
        '\u2004' => "Three-Per-Em",
        '\u2005' => "Four-Per-Em",
        '\u2006' => "Six-Per-Em",
        '\u2007' => "Figure Space",
        '\u2008' => "Punctuation Space (PUNCSP)",
        '\u2009' => "Thin Space",
        '\u200A' => "Hair Space",
        '\u200B' => "Zero Width Space (ZWSP)",
        '\u2028' => "Line Separator",
        '\u2029' => "Paragraph Separator",
        '\u202F' => "Narrow No-Break Space",
        '\u205F' => "Medium Mathematical Space",
        '\u3000' => "Ideographic Space",
        _ => "Unknown"
    };
}
117+
118+
/// <summary>
/// Leaf inline that stands in for an irregular whitespace character in the parsed document.
/// </summary>
public class IrregularWhiteSpace : LeafInline
{
    /// <summary>A pre-allocated, shared instance of this inline.</summary>
    // NOTE(review): an inline carries parent/sibling links, so a shared instance can presumably
    // sit at only one position in a document tree at a time — confirm how callers use it.
    public static readonly IrregularWhiteSpace Instance = new IrregularWhiteSpace();
}
122+
123+
/// <summary>
/// HTML renderer for <see cref="IrregularWhiteSpace"/> inlines.
/// </summary>
public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
{
    /// <summary>
    /// Writes a single regular space in place of the irregular whitespace inline.
    /// </summary>
    /// <param name="renderer">The HTML renderer to write to.</param>
    /// <param name="obj">The inline being rendered (its content is not inspected).</param>
    protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj)
    {
        const char regularSpace = ' ';
        _ = renderer.Write(regularSpace);
    }
}

src/Elastic.Markdown/Myst/MarkdownParser.cs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,22 @@
33
// See the LICENSE file in the project root for more information
44

55
using System.IO.Abstractions;
6+
using System.Text.RegularExpressions;
7+
68
using Cysharp.IO;
9+
10+
using Elastic.Documentation.Diagnostics;
711
using Elastic.Markdown.Myst.CodeBlocks;
812
using Elastic.Markdown.Myst.Comments;
913
using Elastic.Markdown.Myst.Directives;
1014
using Elastic.Markdown.Myst.FrontMatter;
1115
using Elastic.Markdown.Myst.InlineParsers;
1216
using Elastic.Markdown.Myst.InlineParsers.Substitution;
17+
using Elastic.Markdown.Myst.Linters;
1318
using Elastic.Markdown.Myst.Renderers;
1419
using Elastic.Markdown.Myst.Roles;
1520
using Elastic.Markdown.Myst.Roles.AppliesTo;
21+
1622
using Markdig;
1723
using Markdig.Extensions.EmphasisExtras;
1824
using Markdig.Parsers;
@@ -92,20 +98,18 @@ private static async Task<MarkdownDocument> ParseAsync(
9298
MarkdownPipeline pipeline,
9399
Cancel ctx)
94100
{
101+
string inputMarkdown;
95102
if (path.FileSystem is FileSystem)
96103
{
97104
//real IO optimize through UTF8 stream reader.
98105
await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
99-
var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
100-
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
101-
return markdownDocument;
106+
inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
102107
}
103108
else
104-
{
105-
var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
106-
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
107-
return markdownDocument;
108-
}
109+
inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
110+
111+
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
112+
return markdownDocument;
109113
}
110114

111115
// ReSharper disable once InconsistentNaming
@@ -156,6 +160,7 @@ public MarkdownPipeline Pipeline
156160
.UseEnhancedCodeBlocks()
157161
.UseHtmxLinkInlineRenderer()
158162
.DisableHtml()
163+
.UseWhiteSpaceNormalizer()
159164
.UseHardBreaks();
160165
_ = builder.BlockParsers.TryRemove<IndentedCodeBlockParser>();
161166
_pipelineCached = builder.Build();

tests/authoring/Framework/ErrorCollectorAssertions.fs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions =
5454
| Some e ->
5555
let message = e.Message
5656
test <@ message.Contains(expected) @>
57-
| None -> failwithf "Expected errors but no errors were logged"
57+
| None -> failwithf "Expected warnings but no warnings were logged"
58+
59+
/// Asserts that at least one hint was collected and that the message of the first
/// hint-severity diagnostic contains `expected`.
[<DebuggerStepThrough>]
let hasHint (expected: string) (actual: Lazy<GeneratorResults>) =
    let results = actual.Value
    results.Context.Collector.Hints |> shouldBeGreaterThan 0
    // Only the first diagnostic with Hint severity is inspected.
    let firstHint =
        results.Context.Collector.Diagnostics
        |> Seq.tryFind (fun d -> d.Severity = Severity.Hint)
    match firstHint with
    | None -> failwithf "Expected hints but no hints were logged"
    | Some hint ->
        let message = hint.Message
        test <@ message.Contains(expected) @>

tests/authoring/Framework/TestValues.fs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ type TestDiagnosticsOutput() =
2626
match diagnostic.Severity with
2727
| Severity.Error ->
2828
output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})")
29-
| _ ->
29+
| Severity.Warning ->
3030
output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})")
31+
| _ ->
32+
output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})")
3133
| _ -> ()
3234

3335

tests/authoring/Inline/Comments.fs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ not a comment
1717
[<Fact>]
1818
let ``validate HTML: commented line should not be emitted`` () =
1919
markdown |> convertsToHtml """<p>not a comment</p>"""
20+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
module ``linters``.``white space normalizers``
6+
7+
open Xunit
8+
open authoring
9+
10+
11+
/// Tests for a paragraph that contains a Line Tabulation (U+000B) character.
type ``white space detection`` () =

    // Fixture: the interpolated '\u000B' sits between "a" and "space".
    static let markdown = Setup.Markdown $"""
not a{'\u000B'}space
"""

    /// The rendered HTML has a regular space where the irregular character was.
    [<Fact>]
    let ``validate HTML: should not contain bad space character`` () =
        convertsToHtml """<p>not a space</p>""" markdown

    /// A hint naming the offending code point is reported.
    [<Fact>]
    let ``emits a hint when a bad space is used`` () =
        hasHint "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering." markdown

tests/authoring/authoring.fsproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,8 @@
5858
<Compile Include="Directives\IncludeBlocks.fs" />
5959
</ItemGroup>
6060

61+
<ItemGroup>
62+
<Compile Include="Linters\WhiteSpaceNormalizers.fs" />
63+
</ItemGroup>
64+
6165
</Project>

0 commit comments

Comments
 (0)