@@ -16,6 +16,33 @@ export interface NormalizeWhitespaceOptions {
1616 preserveNewlines ?: boolean ;
1717}
1818
19+ // Pre-compiled regex patterns for better performance
20+ // Unicode whitespace characters to normalize:
21+ // \u00A0 - Non-breaking space
22+ // \u1680 - Ogham space mark
23+ // \u2000-\u200B - Various spaces (en space, em space, thin space, etc.), ending with the zero-width space \u200B
24+ // \u2028 - Line separator
25+ // \u2029 - Paragraph separator
26+ // \u202F - Narrow non-breaking space
27+ // \u205F - Medium mathematical space
28+ // \u3000 - Ideographic space
29+ // \uFEFF - Zero-width non-breaking space (BOM)
30+
31+ // Collapse pattern: matches each run of one or more whitespace characters (\s plus the Unicode spaces listed above) so a whole run can be replaced in one pass
32+ const COLLAPSE_ALL_WHITESPACE =
33+ / [ \s \u00A0 \u1680 \u2000 - \u200B \u2028 \u2029 \u202F \u205F \u3000 \uFEFF ] + / g;
34+
35+ // One-for-one pattern (no collapse): matches every single whitespace character, both \s (newlines included) and the Unicode spaces
36+ const UNICODE_SPACES =
37+ / [ \s \u00A0 \u1680 \u2000 - \u200B \u2028 \u2029 \u202F \u205F \u3000 \uFEFF ] / g;
38+
39+ // Preserve newlines: matches each run of non-newline whitespace ([^\S\n] reads as "whitespace, but not \n")
40+ const COLLAPSE_NON_NEWLINE = / [ ^ \S \n ] + / g;
41+
42+ // One-for-one pattern that leaves \n alone: matches the Unicode spaces plus tab, carriage return, form feed and vertical tab
43+ const UNICODE_SPACES_NO_NEWLINE =
44+ / [ \u00A0 \u1680 \u2000 - \u200B \u2028 \u2029 \u202F \u205F \u3000 \uFEFF \t \r \f \v ] / g;
45+
1946/**
2047 * Normalizes various Unicode whitespace characters to regular spaces
2148 * @param str - The string to normalize
@@ -38,41 +65,26 @@ export function normalizeWhitespace(
3865
3966 if ( ! str ) return str ;
4067
41- let result = str ;
42-
43- // Unicode whitespace characters to normalize:
44- // \u00A0 - Non-breaking space
45- // \u1680 - Ogham space mark
46- // \u2000-\u200A - Various spaces (en space, em space, thin space, etc.)
47- // \u2028 - Line separator
48- // \u2029 - Paragraph separator
49- // \u202F - Narrow non-breaking space
50- // \u205F - Medium mathematical space
51- // \u3000 - Ideographic space
52- // \uFEFF - Zero-width non-breaking space (BOM)
53- // \u200B - Zero-width space
68+ let result : string ;
5469
70+ // Optimize for common cases with single-pass regex
5571 if ( preserveNewlines ) {
56- // Replace all Unicode spaces except newlines with regular space
57- result = result . replace (
58- / [ \u00A0 \u1680 \u2000 - \u200B \u2028 \u2029 \u202F \u205F \u3000 \uFEFF \t \r \f \v ] / g,
59- " "
60- ) ;
61-
6272 if ( collapse ) {
63- // Collapse multiple spaces (but not newlines) into one
64- result = result . replace ( / [ ^ \S \n ] + / g, " " ) ;
73+ // Two passes: first replace Unicode spaces one-for-one, then collapse runs of non-newline whitespace
74+ result = str
75+ . replace ( UNICODE_SPACES_NO_NEWLINE , " " )
76+ . replace ( COLLAPSE_NON_NEWLINE , " " ) ;
77+ } else {
78+ // Just replace Unicode spaces, preserve spacing
79+ result = str . replace ( UNICODE_SPACES_NO_NEWLINE , " " ) ;
6580 }
6681 } else {
67- // Replace all whitespace characters including newlines with regular space
68- result = result . replace (
69- / [ \s \u00A0 \u1680 \u2000 - \u200B \u2028 \u2029 \u202F \u205F \u3000 \uFEFF ] / g,
70- " "
71- ) ;
72-
7382 if ( collapse ) {
74- // Collapse multiple spaces into one
75- result = result . replace ( / \s + / g, " " ) ;
83+ // Most common case: single-pass regex to collapse all whitespace
84+ result = str . replace ( COLLAPSE_ALL_WHITESPACE , " " ) ;
85+ } else {
86+ // Replace Unicode spaces without collapsing
87+ result = str . replace ( UNICODE_SPACES , " " ) ;
7688 }
7789 }
7890
0 commit comments