Skip to content

Commit 4889143

Browse files
sciyoshiclaude
andcommitted
docs: add upstream differences section and replace tests
Document Python-specific adaptations from the TypeScript upstream (naming conventions, DOM handling, UTF-16 semantics, etc.) and add test suite for Node.replace() and replace error handling. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4195359 commit 4889143

File tree

2 files changed

+263
-0
lines changed

2 files changed

+263
-0
lines changed

README.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,76 @@ assert tr.doc.to_json() == {
7878
}
7979
```
8080

81+
## Differences from Upstream
82+
83+
While the translation follows the original TypeScript implementation as closely
84+
as possible, some adaptations were necessary for Python. These are documented
85+
here for reference.
86+
87+
### Naming Conventions
88+
89+
Python's snake_case naming is used throughout:
90+
91+
- `camelCase` methods/properties become `snake_case` (e.g. `nodeSize` ->
92+
`node_size`, `isBlock` -> `is_block`, `textBetween` -> `text_between`)
93+
- `from` (a Python keyword) becomes `from_` in parameter names and the
94+
`Fragment.from_()` static method
95+
96+
### DOM Handling
97+
98+
The upstream uses browser DOM APIs. The Python port uses
99+
[lxml](https://lxml.de/) for parsing and a lightweight custom `Element` /
100+
`DocumentFragment` model for serialization:
101+
102+
- **`DOMParser`**: Uses `lxml.html` for HTML parsing. Text nodes are wrapped in
103+
`<lxmltext>` pseudo-elements since lxml doesn't represent text nodes as
104+
separate child elements. CSS selector matching uses `lxml.cssselect`.
105+
- **`DOMSerializer`**: Outputs HTML strings via custom `Element` and
106+
`DocumentFragment` classes rather than creating real DOM nodes.
107+
- **XML namespaces**: Not supported (raises `NotImplementedError`). This only
108+
affects SVG or MathML node serialization.
109+
110+
### String Length and Slicing (UTF-16 Semantics)
111+
112+
JavaScript strings use UTF-16 encoding, so `string.length` counts UTF-16 code
113+
units (surrogate pairs count as 2). The Python port preserves these semantics
114+
using a `text_length()` helper and UTF-16 encode/decode for slicing in:
115+
116+
- `Node.node_size` / `TextNode.node_size`
117+
- `TextNode.cut()`
118+
- `TextNode.text_between()`
119+
- `Fragment.find_index()` / `Fragment.cut()`
120+
- `diff.py` (character-by-character comparison)
121+
122+
### Deep Comparison
123+
124+
The upstream uses a custom `compareDeep` function for recursive comparison of
125+
arrays/objects. The Python port uses native `==`, which already performs deep
126+
comparison of dicts and lists.
127+
128+
### Resolve Cache
129+
130+
The upstream uses a `WeakMap<Node, ResolveCache>` for caching resolved
131+
positions. Python uses a `dict[int, _ResolveCache]` keyed by `id(doc)` with a
132+
`weakref.ref` callback to clean up entries when the document node is garbage
133+
collected.
134+
135+
### Type System
136+
137+
- TypeScript interfaces (`NodeSpec`, `MarkSpec`, `ParseOptions`, etc.) are
138+
translated as `TypedDict` or frozen `dataclass` types.
139+
- Union types use `X | Y` syntax (Python 3.10+).
140+
141+
### Additional Conveniences
142+
143+
These are Python-specific additions not present in the upstream:
144+
145+
- `Fragment.from_json()` accepts a JSON `str` and parses it automatically.
146+
- `from_dom.py` includes a `from_html()` helper to parse an HTML string
147+
directly to a ProseMirror document.
148+
- `DOMSerializer` output type is named `HTMLOutputSpec` (instead of
149+
`DOMOutputSpec`) to reflect that it produces HTML strings.
150+
81151
## AI Disclosure
82152

83153
The initial version of this translation was written manually in 2019. AI is now
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import re
2+
3+
import pytest
4+
5+
from prosemirror.model import Slice
6+
from prosemirror.test_builder import eq, out
7+
8+
# Short module-level aliases for the entries of ``out`` that the tests below
# call to build their documents.
doc, blockquote, h1, p, ul, li = (
    out[name] for name in ("doc", "blockquote", "h1", "p", "ul", "li")
)
14+
15+
16+
def rpl(doc_node, insert, expected):
    """Replace the tagged range of ``doc_node`` and compare with ``expected``.

    The range between the ``a`` and ``b`` tags of ``doc_node`` is replaced.
    When ``insert`` is not ``None`` the replacement is the slice between its
    own ``a`` and ``b`` tags; otherwise ``Slice.empty`` is used, which makes
    the call a plain deletion of the range.
    """
    replacement = (
        Slice.empty
        if insert is None
        else insert.slice(insert.tag["a"], insert.tag["b"])
    )
    start, end = doc_node.tag["a"], doc_node.tag["b"]
    result = doc_node.replace(start, end, replacement)
    assert eq(result, expected), f"Expected {expected}, got {result}"
23+
24+
25+
def bad(doc_node, insert, pattern):
    """Assert that replacing the tagged range of ``doc_node`` raises.

    The replacement slice is built the same way as in ``rpl``: the span
    between the ``a`` and ``b`` tags of ``insert``, or ``Slice.empty`` when
    ``insert`` is ``None``. ``pattern`` is compiled case-insensitively and
    handed to ``pytest.raises`` as its ``match`` argument.
    """
    replacement = (
        Slice.empty
        if insert is None
        else insert.slice(insert.tag["a"], insert.tag["b"])
    )
    message_re = re.compile(pattern, re.IGNORECASE)
    # The tag lookups stay inside the ``with`` block so that anything they
    # raise is handled by ``pytest.raises`` exactly as before.
    with pytest.raises(Exception, match=message_re):
        doc_node.replace(doc_node.tag["a"], doc_node.tag["b"], replacement)
32+
33+
34+
class TestReplace:
    """Cases where ``Node.replace()`` succeeds, each checked through ``rpl``."""

    def test_joins_on_delete(self):
        """Deleting across a paragraph boundary joins the two paragraphs."""
        rpl(
            doc_node=doc(p("on<a>e"), p("t<b>wo")),
            insert=None,
            expected=doc(p("onwo")),
        )

    def test_merges_matching_blocks(self):
        """Open paragraph ends of the slice merge into the surrounding ones."""
        rpl(
            doc_node=doc(p("on<a>e"), p("t<b>wo")),
            insert=doc(p("xx<a>xx"), p("yy<b>yy")),
            expected=doc(p("onxx"), p("yywo")),
        )

    def test_merges_when_adding_text(self):
        """Inserting text across a paragraph boundary gives one paragraph."""
        rpl(
            doc_node=doc(p("on<a>e"), p("t<b>wo")),
            insert=doc(p("<a>H<b>")),
            expected=doc(p("onHwo")),
        )

    def test_can_insert_text(self):
        """Text is inserted at a collapsed range inside a paragraph."""
        rpl(
            doc_node=doc(p("before"), p("on<a><b>e"), p("after")),
            insert=doc(p("<a>H<b>")),
            expected=doc(p("before"), p("onHe"), p("after")),
        )

    def test_doesnt_merge_non_matching_blocks(self):
        """Text sliced out of a heading ends up in the target paragraph."""
        rpl(
            doc_node=doc(p("on<a>e"), p("t<b>wo")),
            insert=doc(h1("<a>H<b>")),
            expected=doc(p("onHwo")),
        )

    def test_can_merge_a_nested_node(self):
        """Paragraphs nested two blockquotes deep are joined as well."""
        rpl(
            doc_node=doc(blockquote(blockquote(p("on<a>e"), p("t<b>wo")))),
            insert=doc(p("<a>H<b>")),
            expected=doc(blockquote(blockquote(p("onHwo")))),
        )

    def test_can_replace_within_a_block(self):
        """A range inside one paragraph is replaced by text from the slice."""
        rpl(
            doc_node=doc(blockquote(p("a<a>bc<b>d"))),
            insert=doc(p("x<a>y<b>z")),
            expected=doc(blockquote(p("ayd"))),
        )

    def test_can_insert_a_lopsided_slice(self):
        """A slice whose ends are open to different depths can be inserted."""
        rpl(
            doc_node=doc(
                blockquote(blockquote(p("on<a>e"), p("two"), "<b>", p("three")))
            ),
            insert=doc(blockquote(p("aa<a>aa"), p("bb"), p("cc"), "<b>", p("dd"))),
            expected=doc(
                blockquote(blockquote(p("onaa"), p("bb"), p("cc"), p("three")))
            ),
        )

    def test_can_insert_a_deep_lopsided_slice(self):
        """A lopsided slice fits when the range ends outside the inner quote."""
        rpl(
            doc_node=doc(
                blockquote(
                    blockquote(p("on<a>e"), p("two"), p("three")), "<b>", p("x")
                )
            ),
            insert=doc(blockquote(p("aa<a>aa"), p("bb"), p("cc")), "<b>", p("dd")),
            expected=doc(blockquote(blockquote(p("onaa"), p("bb"), p("cc")), p("x"))),
        )

    def test_can_merge_multiple_levels(self):
        """Deleting across two doubly nested blockquotes joins every level."""
        rpl(
            doc_node=doc(
                blockquote(blockquote(p("hell<a>o"))),
                blockquote(blockquote(p("<b>a"))),
            ),
            insert=None,
            expected=doc(blockquote(blockquote(p("hella")))),
        )

    def test_can_merge_multiple_levels_while_inserting(self):
        """The multi-level join also works while text is being inserted."""
        rpl(
            doc_node=doc(
                blockquote(blockquote(p("hell<a>o"))),
                blockquote(blockquote(p("<b>a"))),
            ),
            insert=doc(p("<a>i<b>")),
            expected=doc(blockquote(blockquote(p("hellia")))),
        )

    def test_can_insert_a_split(self):
        """A slice spanning two paragraphs splits the target paragraph."""
        rpl(
            doc_node=doc(p("foo<a><b>bar")),
            insert=doc(p("<a>x"), p("y<b>")),
            expected=doc(p("foox"), p("ybar")),
        )

    def test_can_insert_a_deep_split(self):
        """A slice spanning two blockquotes also splits the enclosing quote."""
        rpl(
            doc_node=doc(blockquote(p("foo<a>x<b>bar"))),
            insert=doc(blockquote(p("<a>x")), blockquote(p("y<b>"))),
            expected=doc(blockquote(p("foox")), blockquote(p("ybar"))),
        )

    def test_can_add_a_split_one_level_up(self):
        """A range over two paragraphs in one quote becomes two quotes."""
        rpl(
            doc_node=doc(blockquote(p("foo<a>u"), p("v<b>bar"))),
            insert=doc(blockquote(p("<a>x")), blockquote(p("y<b>"))),
            expected=doc(blockquote(p("foox")), blockquote(p("ybar"))),
        )

    def test_keeps_the_node_type_of_the_left_node(self):
        """The heading on the left keeps its type when a paragraph is joined."""
        rpl(
            doc_node=doc(h1("foo<a>bar"), "<b>"),
            insert=doc(p("foo<a>baz"), "<b>"),
            expected=doc(h1("foobaz")),
        )

    def test_keeps_the_node_type_even_when_empty(self):
        """The heading type is kept even when the range starts at its start."""
        rpl(
            doc_node=doc(h1("<a>bar"), "<b>"),
            insert=doc(p("foo<a>baz"), "<b>"),
            expected=doc(h1("baz")),
        )
150+
151+
152+
class TestReplaceErrors:
    """Cases where ``Node.replace()`` must raise, each checked through ``bad``.

    The ``pattern`` of every case is matched case-insensitively against the
    message of the raised exception.
    """

    def test_doesnt_allow_the_left_side_to_be_too_deep(self):
        """A slice that starts deeper than the insertion point is rejected."""
        bad(
            doc_node=doc(p("<a><b>")),
            insert=doc(blockquote(p("<a>")), "<b>"),
            pattern="deeper",
        )

    def test_doesnt_allow_a_depth_mismatch(self):
        """A slice whose open depths do not fit the range is rejected."""
        bad(
            doc_node=doc(p("<a><b>")),
            insert=doc("<a>", p("<b>")),
            pattern="inconsistent",
        )

    def test_rejects_a_bad_fit(self):
        """Content that does not fit the target node is rejected."""
        bad(
            doc_node=doc("<a><b>"),
            insert=doc(p("<a>foo<b>")),
            pattern="invalid content",
        )

    def test_rejects_unjoinable_content(self):
        """Inserted content that cannot join its neighbour is rejected."""
        bad(
            doc_node=doc(ul(li(p("a")), "<a>"), "<b>"),
            insert=doc(p("foo", "<a>"), "<b>"),
            pattern="cannot join",
        )

    def test_rejects_an_unjoinable_delete(self):
        """A deletion that would join a blockquote with a list is rejected."""
        bad(
            doc_node=doc(blockquote(p("a"), "<a>"), ul("<b>", li(p("b")))),
            insert=None,
            pattern="cannot join",
        )

    def test_check_content_validity(self):
        """The result of a replace is checked for content validity."""
        bad(
            doc_node=doc(blockquote("<a>", p("hi")), "<b>"),
            insert=doc(blockquote("hi", "<a>"), "<b>"),
            pattern="invalid content",
        )

0 commit comments

Comments
 (0)