docs: add upstream differences section and Node.replace() tests

sciyoshi · claude · sciyoshi · commit 48891434fe02 · 2026-02-20T01:21:29.000-05:00
Document Python-specific adaptations from the TypeScript upstream
(naming conventions, DOM handling, UTF-16 semantics, etc.) and add a
test suite for Node.replace() and its error handling.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -78,6 +78,76 @@ assert tr.doc.to_json() == {
 }
 ```
 
+## Differences from Upstream
+
+While the translation follows the original TypeScript implementation as closely
+as possible, some adaptations were necessary for Python. These are documented
+here for reference.
+
+### Naming Conventions
+
+Python's snake_case naming is used throughout:
+
+- `camelCase` methods/properties become `snake_case` (e.g. `nodeSize` ->
+  `node_size`, `isBlock` -> `is_block`, `textBetween` -> `text_between`)
+- `from` (a Python keyword) becomes `from_` in parameter names and the
+  `Fragment.from_()` static method
+
+### DOM Handling
+
+The upstream uses browser DOM APIs. The Python port uses
+[lxml](https://lxml.de/) for parsing and a lightweight custom `Element` /
+`DocumentFragment` model for serialization:
+
+- **`DOMParser`**: Uses `lxml.html` for HTML parsing. Text nodes are wrapped in
+  `<lxmltext>` pseudo-elements since lxml doesn't represent text as separate
+  child nodes. CSS selector matching uses `lxml.cssselect`.
+- **`DOMSerializer`**: Outputs HTML strings via custom `Element` and
+  `DocumentFragment` classes rather than creating real DOM nodes.
+- **XML namespaces**: Not supported (raises `NotImplementedError`). This only
+  affects SVG or MathML node serialization.
+
+### String Length and Slicing (UTF-16 Semantics)
+
+JavaScript strings use UTF-16 encoding, so `string.length` counts UTF-16 code
+units (surrogate pairs count as 2). The Python port preserves these semantics
+using a `text_length()` helper and UTF-16 encode/decode for slicing in:
+
+- `Node.node_size` / `TextNode.node_size`
+- `TextNode.cut()`
+- `TextNode.text_between()`
+- `Fragment.find_index()` / `Fragment.cut()`
+- `diff.py` (character-by-character comparison)
+
+### Deep Comparison
+
+The upstream uses a custom `compareDeep` function for recursive comparison of
+arrays/objects. The Python port uses native `==`, which already performs deep
+comparison of dicts and lists.
+
+### Resolve Cache
+
+The upstream uses a `WeakMap<Node, ResolveCache>` for caching resolved
+positions. The Python port uses a `dict[int, _ResolveCache]` keyed by
+`id(doc)`, with a `weakref.ref` callback that removes the entry when the
+document node is garbage collected.
+
+### Type System
+
+- TypeScript interfaces (`NodeSpec`, `MarkSpec`, `ParseOptions`, etc.) are
+  translated as `TypedDict` or frozen `dataclass` types.
+- Union types use `X | Y` syntax (Python 3.10+).
+
+### Additional Conveniences
+
+These are Python-specific additions not present in the upstream:
+
+- `Fragment.from_json()` accepts a JSON `str` and parses it automatically.
+- `from_dom.py` includes a `from_html()` helper to parse an HTML string
+  directly into a ProseMirror document.
+- `DOMSerializer` output type is named `HTMLOutputSpec` (instead of
+  `DOMOutputSpec`) to reflect that it produces HTML strings.
+
 ## AI Disclosure
 
 The initial version of this translation was written manually in 2019. AI is now
diff --git a/tests/prosemirror_model/tests/test_replace.py b/tests/prosemirror_model/tests/test_replace.py
@@ -0,0 +1,193 @@
+import re
+
+import pytest
+
+from prosemirror.model import Slice
+from prosemirror.test_builder import eq, out
+
+doc = out["doc"]
+blockquote = out["blockquote"]
+h1 = out["h1"]
+p = out["p"]
+ul = out["ul"]
+li = out["li"]
+
+
+def rpl(doc_node, insert, expected):
+    if insert is not None:
+        slice = insert.slice(insert.tag["a"], insert.tag["b"])
+    else:
+        slice = Slice.empty
+    result = doc_node.replace(doc_node.tag["a"], doc_node.tag["b"], slice)
+    assert eq(result, expected), f"Expected {expected}, got {result}"
+
+
+def bad(doc_node, insert, pattern):
+    if insert is not None:
+        slice = insert.slice(insert.tag["a"], insert.tag["b"])
+    else:
+        slice = Slice.empty
+    with pytest.raises(Exception, match=re.compile(pattern, re.IGNORECASE)):
+        doc_node.replace(doc_node.tag["a"], doc_node.tag["b"], slice)
+
+
+class TestReplace:  # replace() cases that succeed; each goes through rpl()
+    def test_joins_on_delete(self):
+        rpl(doc(p("on<a>e"), p("t<b>wo")), None, doc(p("onwo")))
+
+    def test_merges_matching_blocks(self):
+        rpl(
+            doc(p("on<a>e"), p("t<b>wo")),
+            doc(p("xx<a>xx"), p("yy<b>yy")),
+            doc(p("onxx"), p("yywo")),
+        )
+
+    def test_merges_when_adding_text(self):
+        rpl(
+            doc(p("on<a>e"), p("t<b>wo")),
+            doc(p("<a>H<b>")),
+            doc(p("onHwo")),
+        )
+
+    def test_can_insert_text(self):
+        rpl(
+            doc(p("before"), p("on<a><b>e"), p("after")),
+            doc(p("<a>H<b>")),
+            doc(p("before"), p("onHe"), p("after")),
+        )
+
+    def test_doesnt_merge_non_matching_blocks(self):
+        rpl(
+            doc(p("on<a>e"), p("t<b>wo")),
+            doc(h1("<a>H<b>")),  # slice cut from an h1, yet the result stays a p
+            doc(p("onHwo")),
+        )
+
+    def test_can_merge_a_nested_node(self):
+        rpl(
+            doc(blockquote(blockquote(p("on<a>e"), p("t<b>wo")))),
+            doc(p("<a>H<b>")),
+            doc(blockquote(blockquote(p("onHwo")))),
+        )
+
+    def test_can_replace_within_a_block(self):
+        rpl(
+            doc(blockquote(p("a<a>bc<b>d"))),
+            doc(p("x<a>y<b>z")),
+            doc(blockquote(p("ayd"))),
+        )
+
+    def test_can_insert_a_lopsided_slice(self):
+        rpl(
+            doc(blockquote(blockquote(p("on<a>e"), p("two"), "<b>", p("three")))),
+            doc(blockquote(p("aa<a>aa"), p("bb"), p("cc"), "<b>", p("dd"))),
+            doc(blockquote(blockquote(p("onaa"), p("bb"), p("cc"), p("three")))),
+        )
+
+    def test_can_insert_a_deep_lopsided_slice(self):
+        rpl(
+            doc(
+                blockquote(blockquote(p("on<a>e"), p("two"), p("three")), "<b>", p("x"))
+            ),
+            doc(blockquote(p("aa<a>aa"), p("bb"), p("cc")), "<b>", p("dd")),
+            doc(blockquote(blockquote(p("onaa"), p("bb"), p("cc")), p("x"))),
+        )
+
+    def test_can_merge_multiple_levels(self):
+        rpl(
+            doc(
+                blockquote(blockquote(p("hell<a>o"))),
+                blockquote(blockquote(p("<b>a"))),
+            ),
+            None,  # no insert: rpl() replaces with the empty slice
+            doc(blockquote(blockquote(p("hella")))),
+        )
+
+    def test_can_merge_multiple_levels_while_inserting(self):
+        rpl(
+            doc(
+                blockquote(blockquote(p("hell<a>o"))),
+                blockquote(blockquote(p("<b>a"))),
+            ),
+            doc(p("<a>i<b>")),
+            doc(blockquote(blockquote(p("hellia")))),
+        )
+
+    def test_can_insert_a_split(self):
+        rpl(
+            doc(p("foo<a><b>bar")),
+            doc(p("<a>x"), p("y<b>")),
+            doc(p("foox"), p("ybar")),
+        )
+
+    def test_can_insert_a_deep_split(self):
+        rpl(
+            doc(blockquote(p("foo<a>x<b>bar"))),
+            doc(blockquote(p("<a>x")), blockquote(p("y<b>"))),
+            doc(blockquote(p("foox")), blockquote(p("ybar"))),
+        )
+
+    def test_can_add_a_split_one_level_up(self):
+        rpl(
+            doc(blockquote(p("foo<a>u"), p("v<b>bar"))),
+            doc(blockquote(p("<a>x")), blockquote(p("y<b>"))),
+            doc(blockquote(p("foox")), blockquote(p("ybar"))),
+        )
+
+    def test_keeps_the_node_type_of_the_left_node(self):
+        rpl(
+            doc(h1("foo<a>bar"), "<b>"),
+            doc(p("foo<a>baz"), "<b>"),
+            doc(h1("foobaz")),  # h1 from the target doc, not p from the insert
+        )
+
+    def test_keeps_the_node_type_even_when_empty(self):
+        rpl(
+            doc(h1("<a>bar"), "<b>"),
+            doc(p("foo<a>baz"), "<b>"),
+            doc(h1("baz")),
+        )
+
+
+class TestReplaceErrors:  # replace() cases that must raise; each goes through bad()
+    def test_doesnt_allow_the_left_side_to_be_too_deep(self):
+        bad(
+            doc(p("<a><b>")),
+            doc(blockquote(p("<a>")), "<b>"),
+            "deeper",  # bad() matches this against the error text, ignoring case
+        )
+
+    def test_doesnt_allow_a_depth_mismatch(self):
+        bad(
+            doc(p("<a><b>")),
+            doc("<a>", p("<b>")),
+            "inconsistent",
+        )
+
+    def test_rejects_a_bad_fit(self):
+        bad(
+            doc("<a><b>"),
+            doc(p("<a>foo<b>")),
+            "invalid content",
+        )
+
+    def test_rejects_unjoinable_content(self):
+        bad(
+            doc(ul(li(p("a")), "<a>"), "<b>"),
+            doc(p("foo", "<a>"), "<b>"),
+            "cannot join",
+        )
+
+    def test_rejects_an_unjoinable_delete(self):
+        bad(
+            doc(blockquote(p("a"), "<a>"), ul("<b>", li(p("b")))),
+            None,  # no insert: bad() replaces with the empty slice
+            "cannot join",
+        )
+
+    def test_check_content_validity(self):
+        bad(
+            doc(blockquote("<a>", p("hi")), "<b>"),
+            doc(blockquote("hi", "<a>"), "<b>"),
+            "invalid content",
+        )