|
| 1 | +// Licensed to Elasticsearch B.V under one or more agreements. |
| 2 | +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. |
| 3 | +// See the LICENSE file in the project root for more information |
| 4 | + |
| 5 | +using System.Buffers; |
| 6 | +using Elastic.Markdown.Diagnostics; |
| 7 | +using Markdig; |
| 8 | +using Markdig.Helpers; |
| 9 | +using Markdig.Parsers; |
| 10 | +using Markdig.Parsers.Inlines; |
| 11 | +using Markdig.Renderers; |
| 12 | +using Markdig.Renderers.Html; |
| 13 | +using Markdig.Renderers.Html.Inlines; |
| 14 | +using Markdig.Syntax.Inlines; |
| 15 | + |
| 16 | +namespace Elastic.Markdown.Myst.Linters; |
| 17 | + |
/// <summary>
/// Pipeline-builder helpers for enabling the irregular whitespace normalizer.
/// </summary>
public static class WhiteSpaceNormalizerBuilderExtensions
{
    /// <summary>
    /// Adds <see cref="WhiteSpaceNormalizerBuilderExtension"/> to the pipeline's extension list
    /// unless it has already been added.
    /// </summary>
    /// <param name="pipeline">The pipeline builder to configure.</param>
    /// <returns>The same <paramref name="pipeline"/> instance, to allow call chaining.</returns>
    public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
    {
        var registeredExtensions = pipeline.Extensions;
        registeredExtensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();

        return pipeline;
    }
}
| 26 | + |
/// <summary>
/// Markdig extension that wires the irregular whitespace parser and renderer into a pipeline.
/// </summary>
public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
{
    /// <summary>
    /// Registers a <see cref="WhiteSpaceNormalizerParser"/> in the inline parser list,
    /// positioned before the <see cref="EmphasisInlineParser"/>.
    /// </summary>
    /// <param name="pipeline">The pipeline builder whose inline parsers are extended.</param>
    public void Setup(MarkdownPipelineBuilder pipeline)
    {
        var whiteSpaceParser = new WhiteSpaceNormalizerParser();
        _ = pipeline.InlineParsers.InsertBefore<EmphasisInlineParser>(whiteSpaceParser);
    }

    /// <summary>
    /// Registers a <see cref="WhiteSpaceNormalizerRenderer"/> in the renderer's object renderer list,
    /// positioned after the <see cref="EmphasisInlineRenderer"/>.
    /// </summary>
    /// <param name="pipeline">The built pipeline (not used here).</param>
    /// <param name="renderer">The renderer whose object renderers are extended.</param>
    public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer)
    {
        var whiteSpaceRenderer = new WhiteSpaceNormalizerRenderer();
        _ = renderer.ObjectRenderers.InsertAfter<EmphasisInlineRenderer>(whiteSpaceRenderer);
    }
}
| 35 | + |
/// <summary>
/// Inline parser that recognizes a single irregular whitespace character, replaces it with an
/// <see cref="IrregularWhiteSpace"/> inline and emits a hint diagnostic naming the character.
/// </summary>
public class WhiteSpaceNormalizerParser : InlineParser
{
    // Collection of irregular whitespace characters that may impair Markdown rendering
    private static readonly char[] IrregularWhitespaceChars =
    [
        '\u000B', // Line Tabulation (\v) - <VT>
        '\u000C', // Form Feed (\f) - <FF>
        '\u00A0', // No-Break Space - <NBSP>
        '\u0085', // Next Line
        '\u1680', // Ogham Space Mark
        '\u180E', // Mongolian Vowel Separator - <MVS>
        '\ufeff', // Zero Width No-Break Space - <BOM>
        '\u2000', // En Quad
        '\u2001', // Em Quad
        '\u2002', // En Space - <ENSP>
        '\u2003', // Em Space - <EMSP>
        '\u2004', // Three-Per-Em
        '\u2005', // Four-Per-Em
        '\u2006', // Six-Per-Em
        '\u2007', // Figure Space
        '\u2008', // Punctuation Space - <PUNCSP>
        '\u2009', // Thin Space
        '\u200A', // Hair Space
        '\u200B', // Zero Width Space - <ZWSP>
        '\u2028', // Line Separator
        '\u2029', // Paragraph Separator
        '\u202F', // Narrow No-Break Space
        '\u205F', // Medium Mathematical Space
        '\u3000' // Ideographic Space
    ];

    // Lookup structure built once from the same character list, used for the membership test in Match.
    private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);

    /// <summary>
    /// Creates the parser and declares every irregular whitespace character as an opening character.
    /// </summary>
    public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;

    /// <summary>
    /// Consumes one irregular whitespace character at the current slice position.
    /// </summary>
    /// <param name="processor">The inline processor; receives the new inline and the hint diagnostic.</param>
    /// <param name="slice">The text being parsed; advanced by one character on a match.</param>
    /// <returns>
    /// <c>true</c> when the current character is an irregular whitespace character and was consumed;
    /// otherwise <c>false</c>, leaving <paramref name="slice"/> untouched.
    /// </returns>
    public override bool Match(InlineProcessor processor, ref StringSlice slice)
    {
        var c = slice.CurrentChar;
        if (!WhiteSpaceSearchValues.Contains(c))
            return false;

        // Use a fresh node per occurrence instead of one shared static instance: each match gets
        // its own source position and can be attached to the inline tree independently of any
        // earlier occurrence (in the same document or in another one).
        var start = processor.GetSourcePosition(slice.Start, out var line, out var column);
        var inline = new IrregularWhiteSpace
        {
            Span = { Start = start, End = start },
            Line = line,
            Column = column
        };
        processor.Inline = inline;

        var charName = GetCharacterName(c);

        processor.EmitHint(inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");

        slice.SkipChar();
        return true;
    }

    // Helper to get a friendly name for the whitespace character
    private static string GetCharacterName(char c) => c switch
    {
        '\u000B' => "Line Tabulation (VT)",
        '\u000C' => "Form Feed (FF)",
        '\u00A0' => "No-Break Space (NBSP)",
        '\u0085' => "Next Line",
        '\u1680' => "Ogham Space Mark",
        '\u180E' => "Mongolian Vowel Separator (MVS)",
        '\ufeff' => "Zero Width No-Break Space (BOM)",
        '\u2000' => "En Quad",
        '\u2001' => "Em Quad",
        '\u2002' => "En Space (ENSP)",
        '\u2003' => "Em Space (EMSP)",
        '\u2004' => "Three-Per-Em",
        '\u2005' => "Four-Per-Em",
        '\u2006' => "Six-Per-Em",
        '\u2007' => "Figure Space",
        '\u2008' => "Punctuation Space (PUNCSP)",
        '\u2009' => "Thin Space",
        '\u200A' => "Hair Space",
        '\u200B' => "Zero Width Space (ZWSP)",
        '\u2028' => "Line Separator",
        '\u2029' => "Paragraph Separator",
        '\u202F' => "Narrow No-Break Space",
        '\u205F' => "Medium Mathematical Space",
        '\u3000' => "Ideographic Space",
        _ => "Unknown"
    };
}
| 117 | + |
/// <summary>
/// Leaf inline node standing in for one irregular whitespace character found in the source.
/// </summary>
public class IrregularWhiteSpace : LeafInline
{
    /// <summary>A pre-built, shared instance of this node type.</summary>
    public static readonly IrregularWhiteSpace Instance = new IrregularWhiteSpace();
}
| 122 | + |
/// <summary>
/// HTML renderer for <see cref="IrregularWhiteSpace"/> nodes: each node is written as one regular space.
/// </summary>
public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
{
    /// <summary>
    /// Writes a single U+0020 space character to the output in place of the node.
    /// </summary>
    /// <param name="renderer">The HTML renderer receiving the output.</param>
    /// <param name="obj">The irregular whitespace node being rendered (its content is not inspected).</param>
    protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj)
    {
        const char regularSpace = ' ';
        _ = renderer.Write(regularSpace);
    }
}
0 commit comments