Skip to content

Commit 3ab5230

Browse files
authored
Refine irregular spaces linting (#2099)
1 parent 45facb6 commit 3ab5230

File tree

3 files changed

+97
-31
lines changed

3 files changed

+97
-31
lines changed

docs/cli/docset/format.md

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,46 @@ Currently, it handles irregular space characters that may impair Markdown render
3232

3333
### Irregular Space Detection
3434

35-
The format command detects and replaces 24 types of irregular space characters with regular spaces, including:
35+
The format command intelligently handles irregular space characters by categorizing them into three groups:
3636

37-
- No-Break Space (U+00A0)
38-
- En Space (U+2002)
39-
- Em Space (U+2003)
37+
#### Characters removed entirely
38+
39+
These characters are removed completely as they serve no visual purpose and can cause rendering issues:
40+
41+
- Line Tabulation (U+000B)
42+
- Form Feed (U+000C)
43+
- Next Line (U+0085)
44+
- Ogham Space Mark (U+1680)
45+
- Mongolian Vowel Separator (U+180E)
46+
- Zero Width No-Break Space/BOM (U+FEFF)
4047
- Zero Width Space (U+200B)
4148
- Line Separator (U+2028)
4249
- Paragraph Separator (U+2029)
43-
- And 18 other irregular space variants
50+
51+
#### Characters preserved
52+
53+
These characters are preserved as they serve important typographic or functional purposes:
54+
55+
- No-Break Space (U+00A0) - Prevents line breaks
56+
- Figure Space (U+2007) - Aligns numbers in tables
57+
- Narrow No-Break Space (U+202F) - French typography
58+
- Medium Mathematical Space (U+205F) - Mathematical expressions
59+
60+
#### Characters replaced with regular spaces
61+
62+
These characters are replaced with standard spaces (U+0020) as they can cause inconsistent rendering:
63+
64+
- En Quad (U+2000)
65+
- Em Quad (U+2001)
66+
- En Space (U+2002)
67+
- Em Space (U+2003)
68+
- Three-Per-Em (U+2004)
69+
- Four-Per-Em (U+2005)
70+
- Six-Per-Em (U+2006)
71+
- Punctuation Space (U+2008)
72+
- Thin Space (U+2009)
73+
- Hair Space (U+200A)
74+
- Ideographic Space (U+3000)
4475

4576
These characters can cause unexpected rendering issues in Markdown and are often introduced accidentally through copy-paste operations from other applications.
4677

src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information
44

55
using System.Buffers;
6+
using System.Linq;
67
using Elastic.Markdown.Diagnostics;
78
using Markdig;
89
using Markdig.Helpers;
@@ -35,40 +36,44 @@ public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) =>
3536

3637
public class SpaceNormalizerParser : InlineParser
3738
{
38-
// Collection of irregular space characters that may impair Markdown rendering
39-
private static readonly char[] IrregularSpaceChars =
39+
// Characters that should be removed entirely (invisible/problematic)
40+
private static readonly char[] CharactersToRemove =
4041
[
4142
'\u000B', // Line Tabulation (\v) - <VT>
4243
'\u000C', // Form Feed (\f) - <FF>
43-
'\u00A0', // No-Break Space - <NBSP>
4444
'\u0085', // Next Line
4545
'\u1680', // Ogham Space Mark
4646
'\u180E', // Mongolian Vowel Separator - <MVS>
4747
'\ufeff', // Zero Width No-Break Space - <BOM>
48+
'\u200B', // Zero Width Space - <ZWSP>
49+
'\u2028', // Line Separator
50+
'\u2029' // Paragraph Separator
51+
];
52+
53+
// Characters to replace with regular spaces (visible but problematic)
54+
private static readonly char[] CharactersToReplace =
55+
[
4856
'\u2000', // En Quad
4957
'\u2001', // Em Quad
5058
'\u2002', // En Space - <ENSP>
5159
'\u2003', // Em Space - <EMSP>
5260
'\u2004', // Three-Per-Em
5361
'\u2005', // Four-Per-Em
5462
'\u2006', // Six-Per-Em
55-
'\u2007', // Figure Space
5663
'\u2008', // Punctuation Space - <PUNCSP>
5764
'\u2009', // Thin Space
5865
'\u200A', // Hair Space
59-
'\u200B', // Zero Width Space - <ZWSP>
60-
'\u2028', // Line Separator
61-
'\u2029', // Paragraph Separator
62-
'\u202F', // Narrow No-Break Space
63-
'\u205F', // Medium Mathematical Space
6466
'\u3000' // Ideographic Space
6567
];
66-
private static readonly SearchValues<char> SpaceSearchValues = SearchValues.Create(IrregularSpaceChars);
68+
69+
// Combined list of characters that need fixing (removed or replaced)
70+
private static readonly char[] CharactersToFix = CharactersToRemove.Concat(CharactersToReplace).ToArray();
71+
private static readonly SearchValues<char> SpaceSearchValues = SearchValues.Create(CharactersToFix);
6772

6873
// Track which files have already had the hint emitted to avoid duplicates
6974
private static readonly HashSet<string> FilesWithHintEmitted = [];
7075

71-
public SpaceNormalizerParser() => OpeningCharacters = IrregularSpaceChars;
76+
public SpaceNormalizerParser() => OpeningCharacters = CharactersToFix;
7277

7378
public override bool Match(InlineProcessor processor, ref StringSlice slice)
7479
{

src/authoring/Elastic.Documentation.Refactor/Formatters/IrregularSpaceFormatter.cs

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,62 +8,92 @@
88
namespace Elastic.Documentation.Refactor.Formatters;
99

1010
/// <summary>
11-
/// Formatter that replaces irregular space characters with regular spaces
11+
/// Formatter that handles irregular space characters appropriately:
12+
/// - Removes invisible characters entirely
13+
/// - Preserves semantically meaningful spaces
14+
/// - Replaces problematic spaces with regular spaces
1215
/// </summary>
1316
public class IrregularSpaceFormatter : IFormatter
1417
{
1518
public string Name => "irregular space";
1619

17-
// Collection of irregular space characters that may impair Markdown rendering
18-
private static readonly char[] IrregularSpaceChars =
20+
// Characters to remove entirely (invisible/problematic)
21+
private static readonly char[] CharactersToRemove =
1922
[
2023
'\u000B', // Line Tabulation (\v) - <VT>
2124
'\u000C', // Form Feed (\f) - <FF>
22-
'\u00A0', // No-Break Space - <NBSP>
2325
'\u0085', // Next Line
2426
'\u1680', // Ogham Space Mark
2527
'\u180E', // Mongolian Vowel Separator - <MVS>
2628
'\ufeff', // Zero Width No-Break Space - <BOM>
29+
'\u200B', // Zero Width Space - <ZWSP>
30+
'\u2028', // Line Separator
31+
'\u2029' // Paragraph Separator
32+
];
33+
34+
// Characters to preserve (semantically meaningful)
35+
private static readonly char[] CharactersToPreserve =
36+
[
37+
'\u00A0', // No-Break Space - <NBSP>
38+
'\u2007', // Figure Space
39+
'\u202F', // Narrow No-Break Space
40+
'\u205F' // Medium Mathematical Space
41+
];
42+
43+
// Characters to replace with regular spaces (visible but problematic)
44+
private static readonly char[] CharactersToReplace =
45+
[
2746
'\u2000', // En Quad
2847
'\u2001', // Em Quad
2948
'\u2002', // En Space - <ENSP>
3049
'\u2003', // Em Space - <EMSP>
3150
'\u2004', // Three-Per-Em
3251
'\u2005', // Four-Per-Em
3352
'\u2006', // Six-Per-Em
34-
'\u2007', // Figure Space
3553
'\u2008', // Punctuation Space - <PUNCSP>
3654
'\u2009', // Thin Space
3755
'\u200A', // Hair Space
38-
'\u200B', // Zero Width Space - <ZWSP>
39-
'\u2028', // Line Separator
40-
'\u2029', // Paragraph Separator
41-
'\u202F', // Narrow No-Break Space
42-
'\u205F', // Medium Mathematical Space
4356
'\u3000' // Ideographic Space
4457
];
4558

46-
private static readonly SearchValues<char> IrregularSpaceSearchValues = SearchValues.Create(IrregularSpaceChars);
59+
private static readonly SearchValues<char> CharactersToRemoveValues = SearchValues.Create(CharactersToRemove);
60+
private static readonly SearchValues<char> CharactersToPreserveValues = SearchValues.Create(CharactersToPreserve);
61+
private static readonly SearchValues<char> CharactersToReplaceValues = SearchValues.Create(CharactersToReplace);
4762

4863
public FormatResult Format(string content)
4964
{
50-
// Quick check - if no irregular space, return original
51-
if (content.AsSpan().IndexOfAny(IrregularSpaceSearchValues) == -1)
65+
// Quick check - if no irregular space characters, return original
66+
var span = content.AsSpan();
67+
if (span.IndexOfAny(CharactersToRemoveValues) == -1 &&
68+
span.IndexOfAny(CharactersToPreserveValues) == -1 &&
69+
span.IndexOfAny(CharactersToReplaceValues) == -1)
5270
return new FormatResult(content, 0);
5371

54-
// Replace irregular space with regular spaces
72+
// Process each character with appropriate handling
5573
var sb = new StringBuilder(content.Length);
5674
var replacements = 0;
5775

5876
foreach (var c in content)
5977
{
60-
if (IrregularSpaceSearchValues.Contains(c))
78+
if (CharactersToRemoveValues.Contains(c))
79+
{
80+
// Remove invisible/problematic characters entirely
81+
replacements++;
82+
}
83+
else if (CharactersToPreserveValues.Contains(c))
84+
{
85+
// Preserve semantically meaningful characters
86+
_ = sb.Append(c);
87+
}
88+
else if (CharactersToReplaceValues.Contains(c))
6189
{
90+
// Replace problematic visible characters with regular spaces
6291
_ = sb.Append(' ');
6392
replacements++;
6493
}
6594
else
6695
{
96+
// Keep regular characters as-is
6797
_ = sb.Append(c);
6898
}
6999
}

0 commit comments

Comments
 (0)