|
8 | 8 | namespace Elastic.Documentation.Refactor.Formatters; |
9 | 9 |
|
10 | 10 | /// <summary> |
11 | | -/// Formatter that replaces irregular space characters with regular spaces |
| 11 | +/// Formatter that handles irregular space characters appropriately: |
| 12 | +/// - Removes invisible characters entirely |
| 13 | +/// - Preserves semantically meaningful spaces |
| 14 | +/// - Replaces problematic spaces with regular spaces |
12 | 15 | /// </summary> |
13 | 16 | public class IrregularSpaceFormatter : IFormatter |
14 | 17 | { |
15 | 18 | public string Name => "irregular space"; |
16 | 19 |
|
17 | | - // Collection of irregular space characters that may impair Markdown rendering |
18 | | - private static readonly char[] IrregularSpaceChars = |
| 20 | + // Characters to remove entirely (invisible/problematic) |
| 21 | + private static readonly char[] CharactersToRemove = |
19 | 22 | [ |
20 | 23 | '\u000B', // Line Tabulation (\v) - <VT> |
21 | 24 | '\u000C', // Form Feed (\f) - <FF> |
22 | | - '\u00A0', // No-Break Space - <NBSP> |
23 | 25 | '\u0085', // Next Line |
24 | 26 | '\u1680', // Ogham Space Mark |
25 | 27 | '\u180E', // Mongolian Vowel Separator - <MVS> |
26 | 28 | '\ufeff', // Zero Width No-Break Space - <BOM> |
| 29 | + '\u200B', // Zero Width Space - <ZWSP> |
| 30 | + '\u2028', // Line Separator |
| 31 | + '\u2029' // Paragraph Separator |
| 32 | + ]; |
| 33 | + |
| 34 | + // Characters to preserve (semantically meaningful) |
| 35 | + private static readonly char[] CharactersToPreserve = |
| 36 | + [ |
| 37 | + '\u00A0', // No-Break Space - <NBSP> |
| 38 | + '\u2007', // Figure Space |
| 39 | + '\u202F', // Narrow No-Break Space |
| 40 | + '\u205F' // Medium Mathematical Space |
| 41 | + ]; |
| 42 | + |
| 43 | + // Characters to replace with regular spaces (visible but problematic) |
| 44 | + private static readonly char[] CharactersToReplace = |
| 45 | + [ |
27 | 46 | '\u2000', // En Quad |
28 | 47 | '\u2001', // Em Quad |
29 | 48 | '\u2002', // En Space - <ENSP> |
30 | 49 | '\u2003', // Em Space - <EMSP> |
31 | 50 | '\u2004', // Three-Per-Em |
32 | 51 | '\u2005', // Four-Per-Em |
33 | 52 | '\u2006', // Six-Per-Em |
34 | | - '\u2007', // Figure Space |
35 | 53 | '\u2008', // Punctuation Space - <PUNCSP> |
36 | 54 | '\u2009', // Thin Space |
37 | 55 | '\u200A', // Hair Space |
38 | | - '\u200B', // Zero Width Space - <ZWSP> |
39 | | - '\u2028', // Line Separator |
40 | | - '\u2029', // Paragraph Separator |
41 | | - '\u202F', // Narrow No-Break Space |
42 | | - '\u205F', // Medium Mathematical Space |
43 | 56 | '\u3000' // Ideographic Space |
44 | 57 | ]; |
45 | 58 |
|
46 | | - private static readonly SearchValues<char> IrregularSpaceSearchValues = SearchValues.Create(IrregularSpaceChars); |
| 59 | + private static readonly SearchValues<char> CharactersToRemoveValues = SearchValues.Create(CharactersToRemove); |
| 60 | + private static readonly SearchValues<char> CharactersToPreserveValues = SearchValues.Create(CharactersToPreserve); |
| 61 | + private static readonly SearchValues<char> CharactersToReplaceValues = SearchValues.Create(CharactersToReplace); |
47 | 62 |
|
48 | 63 | public FormatResult Format(string content) |
49 | 64 | { |
50 | | - // Quick check - if no irregular space, return original |
51 | | - if (content.AsSpan().IndexOfAny(IrregularSpaceSearchValues) == -1) |
| 65 | + // Quick check - if no irregular space characters, return original |
| 66 | + var span = content.AsSpan(); |
| 67 | + if (span.IndexOfAny(CharactersToRemoveValues) == -1 && |
| 68 | + span.IndexOfAny(CharactersToPreserveValues) == -1 && |
| 69 | + span.IndexOfAny(CharactersToReplaceValues) == -1) |
52 | 70 | return new FormatResult(content, 0); |
53 | 71 |
|
54 | | - // Replace irregular space with regular spaces |
| 72 | + // Process each character with appropriate handling |
55 | 73 | var sb = new StringBuilder(content.Length); |
56 | 74 | var replacements = 0; |
57 | 75 |
|
58 | 76 | foreach (var c in content) |
59 | 77 | { |
60 | | - if (IrregularSpaceSearchValues.Contains(c)) |
| 78 | + if (CharactersToRemoveValues.Contains(c)) |
| 79 | + { |
| 80 | + // Remove invisible/problematic characters entirely |
| 81 | + replacements++; |
| 82 | + } |
| 83 | + else if (CharactersToPreserveValues.Contains(c)) |
| 84 | + { |
| 85 | + // Preserve semantically meaningful characters |
| 86 | + _ = sb.Append(c); |
| 87 | + } |
| 88 | + else if (CharactersToReplaceValues.Contains(c)) |
61 | 89 | { |
| 90 | + // Replace problematic visible characters with regular spaces |
62 | 91 | _ = sb.Append(' '); |
63 | 92 | replacements++; |
64 | 93 | } |
65 | 94 | else |
66 | 95 | { |
| 96 | + // Keep regular characters as-is |
67 | 97 | _ = sb.Append(c); |
68 | 98 | } |
69 | 99 | } |
|
0 commit comments