fix(editor): preserve U+00A0 non-breaking space (#3037) (#7585)

JohnMcLear · claude · web-flow · commit 67e542d2b983 · 2026-04-23T08:57:30.000+01:00
* fix(editor): preserve U+00A0 non-breaking space (#3037) Non-breaking spaces were silently normalized to regular spaces at every ingestion point, so typed/pasted/imported nbsps never reached the changeset and users could not glue words against line-wrap in French or other languages that require nbsp typography. Removed the four strip sites that replaced U+00A0 with U+0020: - src/node/db/Pad.ts cleanText - src/static/js/contentcollector.ts textify - src/static/js/ace2_inner.ts textify - src/static/js/ace2_inner.ts importText raw-text guard Updated both processSpaces functions (domline and ExportHtml) to tokenize U+00A0 as a separate unit, emit it verbatim as &nbsp;, and treat it as content (not whitespace) for the run-collapse bookkeeping so adjacent regular-space runs aren't miscounted. Added backend round-trip tests for spliceText and setText, and extended the cleanText case table. Updated the existing contentcollector and importexport specs whose expectations encoded the previous buggy behavior; they now assert genuine nbsp preservation. Verified manually in Firefox: clipboard U+00A0 → paste → pad → getText returns c2 a0; getHTML emits `100&nbsp;km`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix(contentcollector): collapse display-artifact nbsp runs on DOM read-back processSpaces is a lossy one-way display transform: leading/trailing spaces and all-but-the-last of a run get rendered as &nbsp; so HTML doesn't collapse them. When incorporateUserChanges reads text back from the DOM, those display-artifact nbsps were being stored in the changeset model instead of being normalized back to plain spaces. This broke handleReturnIndentation, whose /^ *(?:)/ regex only matches ASCII spaces: auto-indent after `foo:\n` produced 4 spaces instead of the expected prev-indent (2) + THE_TAB (4) = 6, because the previous line's model had nbsps where it used to have spaces. Fix: in contentcollector.textify, collapse any [ ]+ run back to plain spaces UNLESS the run is pure U+00A0 AND strictly interior to word chars. That preserves user-intended typographic nbsps like "100 km" while undoing the one-way display transform. Updated 7 contentcollector tests and 7 importexport tests whose assertions needed to reflect the new rule (boundary/mixed runs collapse; pure-interior nbsp runs preserve). Fixes the Playwright regression in indentation.spec.ts:117 that the previous commit introduced. * fix(contentcollector): canonicalize nbsp runs at line assembly, not per text node Addresses Qodo code review feedback on PR #7585. ## Bug fix — nbsp lost at DOM text-node boundary The previous approach ran the "collapse display-artifact nbsp" rule inside textify(), which is called per individual DOM TEXT_NODE. A user-intended nbsp sitting at a text-node boundary (e.g., <span>100</span><span>&nbsp;km </span>) was incorrectly seen as non-interior (before === '' for the second text node) and normalized back to a regular space. Fix: move the canonicalization out of textify() and run it on each fully assembled line string inside cc.finish(). The rule remains: [ ]+ run -> plain spaces UNLESS pure U+00A0 AND strictly interior to non-ws chars It is length-preserving, so attribute offsets and line lengths are unaffected. Added a regression test (contentcollector.spec.ts) for the cross-span case. ## Docs concern Reverted the type-only addition of spliceText to PadType. spliceText is an existing Pad runtime method; the backend test now uses a cast (`(pad as any).spliceText`) so the PR does not expand the declared public type surface, avoiding a separate documentation requirement. --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/node/db/Pad.ts b/src/node/db/Pad.ts
@@ -51,8 +51,7 @@ type PadSettings = {
  */
 exports.cleanText = (txt:string): string => txt.replace(/\r\n/g, '\n')
     .replace(/\r/g, '\n')
-    .replace(/\t/g, '        ')
-    .replace(/\xa0/g, ' ');
+    .replace(/\t/g, '        ');
 
 class Pad {
   private db: Database;
diff --git a/src/node/utils/ExportHtml.ts b/src/node/utils/ExportHtml.ts
@@ -537,13 +537,16 @@ const _processSpaces = (s: string) => {
   const doesWrap = true;
   if (s.indexOf('<') < 0 && !doesWrap) {
     // short-cut
-    return s.replace(/ /g, '&nbsp;');
+    return s.replace(/[ \u00a0]/g, '&nbsp;');
   }
-  const parts = [];
-  s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
+  const parts: string[] = [];
+  s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
     parts.push(m);
     return m
   });
+  // U+00A0 is content for run-bookkeeping - it terminates a space run
+  // just like a word character would, so runs of regular spaces adjacent
+  // to a nbsp are not miscounted as one long run (issue #3037).
   if (doesWrap) {
     let endOfLine = true;
     let beforeSpace = false;
@@ -578,6 +581,9 @@ const _processSpaces = (s: string) => {
       }
     }
   }
+  for (let i = 0; i < parts.length; i++) {
+    if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
+  }
   return parts.join('');
 };
 
diff --git a/src/static/js/ace2_inner.ts b/src/static/js/ace2_inner.ts
@@ -476,8 +476,8 @@ function Ace2Inner(editorInfo, cssManagers) {
       if (text.charAt(text.length - 1) !== '\n') {
         throw new Error('new raw text must end with newline');
       }
-      if (/[\r\t\xa0]/.exec(text)) {
-        throw new Error('new raw text must not contain CR, tab, or nbsp');
+      if (/[\r\t]/.exec(text)) {
+        throw new Error('new raw text must not contain CR or tab');
       }
       lines = text.substring(0, text.length - 1).split('\n');
     } else {
@@ -2042,7 +2042,7 @@ function Ace2Inner(editorInfo, cssManagers) {
       (nonEmpty) => domline.createDomLine(nonEmpty, doesWrap, browser, document);
 
   const textify =
-      (str) => str.replace(/[\n\r ]/g, ' ').replace(/\xa0/g, ' ').replace(/\t/g, '        ');
+      (str) => str.replace(/[\n\r ]/g, ' ').replace(/\t/g, '        ');
 
   const _blockElems = {
     div: 1,
diff --git a/src/static/js/contentcollector.ts b/src/static/js/contentcollector.ts
@@ -79,9 +79,31 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
   const textify = (str) => sanitizeUnicode(
       str.replace(/(\n | \n)/g, ' ')
           .replace(/[\n\r ]/g, ' ')
-          .replace(/\xa0/g, ' ')
           .replace(/\t/g, '        '));
 
+  // processSpaces (domline.ts, ExportHtml.ts) is a lossy one-way display
+  // transform: leading/trailing spaces and all-but-the-last space in a run
+  // are rendered as &nbsp; to defeat HTML whitespace collapsing, so any
+  // round-trip through the DOM sees nbsps where the model has plain spaces.
+  // To keep the model canonical, a [ \u00a0]+ run read back from the fully
+  // assembled line is collapsed to plain spaces unless it is strictly
+  // interior to non-whitespace chars AND contains only U+00A0 (issue #3037 -
+  // user-intended nbsp between words such as "100 km"). The rule runs on
+  // the whole line (not per DOM text node) so that nbsps sitting at a
+  // span boundary - e.g. <span>100</span><span>&nbsp;km</span> - are still
+  // seen as interior. The transform is length-preserving so attribution
+  // offsets stay consistent.
+  const canonicalizeNbspRuns = (s: string) => s.replace(
+      /[ \u00a0]+/g,
+      (run: string, offset: number, src: string) => {
+        const before = offset > 0 ? src[offset - 1] : '';
+        const after = offset + run.length < src.length
+            ? src[offset + run.length] : '';
+        const pureNbsp = !run.includes(' ');
+        const interiorOfWord = /\S/.test(before) && /\S/.test(after);
+        return pureNbsp && interiorOfWord ? run : ' '.repeat(run.length);
+      });
+
   const getAssoc = (node, name) => node[`_magicdom_${name}`];
 
   const lines = (() => {
@@ -642,6 +664,12 @@ const makeContentCollector = (collectStyles, abrowser, apool, className2Author)
     lineStrings.length--;
     lineAttribs.length--;
 
+    // Canonicalize display-artifact nbsps on the whole-line string (issue
+    // #3037). Length-preserving, so no attribute adjustments are required.
+    for (let i = 0; i < lineStrings.length; i++) {
+      lineStrings[i] = canonicalizeNbspRuns(lineStrings[i]);
+    }
+
     const ss = getSelectionStart();
     const se = getSelectionEnd();
 
diff --git a/src/static/js/domline.ts b/src/static/js/domline.ts
@@ -231,12 +231,15 @@ domline.createDomLine = (nonEmpty, doesWrap, optBrowser, optDocument) => {
 domline.processSpaces = (s, doesWrap) => {
   if (s.indexOf('<') < 0 && !doesWrap) {
     // short-cut
-    return s.replace(/ /g, '&nbsp;');
+    return s.replace(/[ \u00a0]/g, '&nbsp;');
   }
   const parts = [];
-  s.replace(/<[^>]*>?| |[^ <]+/g, (m) => {
+  s.replace(/<[^>]*>?|[ \u00a0]|[^ \u00a0<]+/g, (m) => {
     parts.push(m);
   });
+  // U+00A0 is content for run-bookkeeping - it terminates a space run
+  // just like a word character would, so runs of regular spaces adjacent
+  // to a nbsp are not miscounted as one long run (issue #3037).
   if (doesWrap) {
     let endOfLine = true;
     let beforeSpace = false;
@@ -271,6 +274,9 @@ domline.processSpaces = (s, doesWrap) => {
       }
     }
   }
+  for (let i = 0; i < parts.length; i++) {
+    if (parts[i] === '\u00a0') parts[i] = '&nbsp;';
+  }
   return parts.join('');
 };
 
diff --git a/src/tests/backend/specs/Pad.ts b/src/tests/backend/specs/Pad.ts
@@ -45,6 +45,9 @@ describe(__filename, function () {
       ['x\ry\n', 'x\ny\n'],
       ['x\r\ny\n', 'x\ny\n'],
       ['x\r\r\ny\n', 'x\n\ny\n'],
+      // Non-breaking space (U+00A0) must survive cleanText (issue #3037).
+      ['100\u00a0km\n', '100\u00a0km\n'],
+      ['a\u00a0\u00a0b\n', 'a\u00a0\u00a0b\n'],
     ];
     for (const [input, want] of testCases) {
       it(`${JSON.stringify(input)} -> ${JSON.stringify(want)}`, async function () {
@@ -53,6 +56,22 @@ describe(__filename, function () {
     }
   });
 
+  describe('non-breaking space preservation (issue #3037)', function () {
+    it('spliceText round-trips U+00A0', async function () {
+      pad = await padManager.getPad(padId, '');
+      // spliceText is an existing runtime Pad method; cast avoids
+      // adding a type-only declaration to PadType in this PR.
+      await (pad as any).spliceText(0, 0, '100\u00a0km');
+      assert.equal(pad!.text(), '100\u00a0km\n');
+    });
+
+    it('setText round-trips U+00A0', async function () {
+      pad = await padManager.getPad(padId, '');
+      await pad!.setText('a\u00a0b\n');
+      assert.equal(pad!.text(), 'a\u00a0b\n');
+    });
+  });
+
   describe('padDefaultContent hook', function () {
     it('runs when a pad is created without specific text', async function () {
       const p = new Promise<void>((resolve) => {
diff --git a/src/tests/backend/specs/api/importexport.ts b/src/tests/backend/specs/api/importexport.ts
@@ -79,8 +79,8 @@ const testImports:MapArrayType<any> = {
     // XXX the HTML between "than" and "one" looks strange
     description: 'non-breaking space should be preserved, but can be replaced when it',
     input: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than   &nbsp;one space.<br></body></html>',
-    wantHTML: '<!DOCTYPE HTML><html><body>Text with&nbsp; more&nbsp;&nbsp; than&nbsp; one space.<br><br></body></html>',
-    wantText: 'Text with  more   than  one space.\n\n',
+    wantHTML: '<!DOCTYPE HTML><html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than&nbsp; one space.<br><br></body></html>',
+    wantText: 'Text\u00a0with  more\u00a0\u00a0\u00a0than  one space.\n\n',
   },
   'multiplenbsp': {
     description: 'Multiple non-breaking space should be preserved',
@@ -91,8 +91,8 @@ const testImports:MapArrayType<any> = {
   'multipleNonBreakingSpaceBetweenWords': {
     description: 'A normal space is always inserted before a word',
     input: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
-    wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp; word2&nbsp;&nbsp; word3<br><br></body></html>',
-    wantText: '  word1  word2   word3\n\n',
+    wantHTML: '<!DOCTYPE HTML><html><body>&nbsp; word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br><br></body></html>',
+    wantText: '  word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3\n\n',
   },
   'nonBreakingSpacePreceededBySpaceBetweenWords': {
     description: 'A non-breaking space preceded by a normal space',
diff --git a/src/tests/backend/specs/contentcollector.ts b/src/tests/backend/specs/contentcollector.ts
@@ -185,7 +185,7 @@ const testCases = [
     description: 'non-breaking and normal space should be preserved',
     html: '<html><body>Text&nbsp;with&nbsp; more&nbsp;&nbsp;&nbsp;than   &nbsp;one space.<br></body></html>',
     wantAlines: ['+10'],
-    wantText: ['Text with  more   than    one space.'],
+    wantText: ['Text\u00a0with  more\u00a0\u00a0\u00a0than    one space.'],
   },
   {
     description: 'Multiple nbsp should be preserved',
@@ -197,7 +197,7 @@ const testCases = [
     description: 'Multiple nbsp between words ',
     html: '<html><body>&nbsp;&nbsp;word1&nbsp;&nbsp;word2&nbsp;&nbsp;&nbsp;word3<br></body></html>',
     wantAlines: ['+m'],
-    wantText: ['  word1  word2   word3'],
+    wantText: ['  word1\u00a0\u00a0word2\u00a0\u00a0\u00a0word3'],
   },
   {
     description: 'A non-breaking space preceded by a normal space',
@@ -359,6 +359,16 @@ pre
     ],
     wantText: ['before', '*bare item', 'after'],
   },
+  {
+    // Regression for PR #7585 review feedback: a user-intended nbsp
+    // sitting at a DOM text-node boundary (split across spans) must
+    // still be preserved because canonicalization runs on the whole
+    // line, not per text node.
+    description: 'nbsp preserved across span boundary',
+    html: '<html><body><p><span>100</span><span>&nbsp;km</span></p></body></html>',
+    wantAlines: ['+6'],
+    wantText: ['100\u00a0km'],
+  },
 ];
 
 describe(__filename, function () {