Skip to content

Commit 9b1fe20

Browse files
feat(xml): add XML module with streaming parser, DOM-style parser, and serialization
1 parent 6b93b78 commit 9b1fe20

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+18437
-2
lines changed

.github/workflows/title.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,5 @@ jobs:
7676
ulid(/unstable)?
7777
uuid(/unstable)?
7878
webgpu(/unstable)?
79+
xml(/unstable)?
7980
yaml(/unstable)?

browser-compat.tsconfig.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"./ulid",
4949
"./uuid",
5050
"./webgpu",
51+
"./xml",
5152
"./yaml"
5253
]
5354
}

deno.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@
3939
"_tools/node_test_runner",
4040
"http/testdata",
4141
"fs/testdata",
42-
"dotenv/testdata"
42+
"dotenv/testdata",
43+
"xml/testdata"
4344
],
4445
"lint": {
4546
"rules": {
@@ -94,6 +95,7 @@
9495
"./ulid",
9596
"./uuid",
9697
"./webgpu",
98+
"./xml",
9799
"./yaml"
98100
]
99101
}

import_map.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
"npm:/typescript": "npm:[email protected]",
66
"automation/": "https://raw.githubusercontent.com/denoland/automation/0.10.0/",
77
"graphviz": "npm:node-graphviz@^0.1.1",
8-
98
"@std/assert": "jsr:@std/assert@^1.0.16",
109
"@std/async": "jsr:@std/async@^1.0.16",
1110
"@std/bytes": "jsr:@std/bytes@^1.0.6",
@@ -46,6 +45,7 @@
4645
"@std/ulid": "jsr:@std/ulid@^1.0.0",
4746
"@std/uuid": "jsr:@std/uuid@^1.1.0",
4847
"@std/webgpu": "jsr:@std/webgpu@^0.224.9",
48+
"@std/xml": "jsr:@std/xml@^0.0.1",
4949
"@std/yaml": "jsr:@std/yaml@^1.0.10"
5050
}
5151
}

xml/_common.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Copyright 2018-2026 the Deno authors. MIT license.
2+
// This module is browser compatible.
3+
4+
/**
5+
* Internal shared utilities for the XML module.
6+
*
7+
* @module
8+
*/
9+
10+
import type { XmlName } from "./types.ts";
11+
12+
/**
 * Line ending normalization pattern per XML 1.0 §2.11.
 *
 * Matches `\r\n` pairs and standalone `\r` so both can be replaced by `\n`.
 * The pattern carries the global flag, so it is meant for
 * `String.prototype.replace`; calling `test`/`exec` on it advances its
 * `lastIndex`.
 */
export const LINE_ENDING_RE = /\r\n?/g;

/**
 * Whitespace-only test per XML 1.0 §2.3 (the empty string also matches).
 *
 * Uses explicit `[ \t\r\n]` instead of `\s` to match the XML spec exactly:
 * `S ::= (#x20 | #x9 | #xD | #xA)+`
 */
export const WHITESPACE_ONLY_RE = /^[ \t\r\n]*$/;

/**
 * XML declaration `version` pattern.
 *
 * The value is captured in group 1 when double quoted and in group 2 when
 * single quoted. Whitespace around `=` is limited to the XML `S` characters
 * (`[ \t\r\n]`) rather than `\s`, so non-XML spaces such as U+00A0 are not
 * accepted.
 */
export const VERSION_RE =
  /version[ \t\r\n]*=[ \t\r\n]*(?:"([^"]+)"|'([^']+)')/;

/**
 * XML declaration `encoding` pattern.
 *
 * The value is captured in group 1 when double quoted and in group 2 when
 * single quoted. Whitespace around `=` is limited to the XML `S` characters
 * (`[ \t\r\n]`) rather than `\s`.
 */
export const ENCODING_RE =
  /encoding[ \t\r\n]*=[ \t\r\n]*(?:"([^"]+)"|'([^']+)')/;

/**
 * XML declaration `standalone` pattern.
 *
 * Only the values `yes` and `no` match; the value is captured in group 1 when
 * double quoted and in group 2 when single quoted. Whitespace around `=` is
 * limited to the XML `S` characters (`[ \t\r\n]`) rather than `\s`.
 */
export const STANDALONE_RE =
  /standalone[ \t\r\n]*=[ \t\r\n]*(?:"(yes|no)"|'(yes|no)')/;
42+
43+
/**
 * Splits a qualified XML name at its first colon.
 *
 * Everything before the first `:` becomes `prefix` and everything after it
 * becomes `local`. A name without a colon is returned as `local` only, with
 * no `prefix` property.
 *
 * @example Usage
 * ```ts
 * import { parseName } from "./_common.ts";
 *
 * parseName("ns:element"); // { prefix: "ns", local: "element" }
 * parseName("element"); // { local: "element" }
 * ```
 *
 * @param name The raw name string (e.g., "ns:element" or "element")
 * @returns An XmlName object with local and optional prefix
 */
export function parseName(name: string): XmlName {
  const separatorAt = name.indexOf(":");
  return separatorAt < 0 ? { local: name } : {
    prefix: name.slice(0, separatorAt),
    local: name.slice(separatorAt + 1),
  };
}

xml/_entities.ts

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
// Copyright 2018-2026 the Deno authors. MIT license.
2+
// This module is browser compatible.
3+
4+
/**
5+
* Internal module for XML entity encoding and decoding.
6+
*
7+
* @module
8+
*/
9+
10+
/**
 * The five predefined XML entities per XML 1.0 §4.6, keyed by entity name
 * (without the surrounding `&` and `;`).
 * The const assertion keeps the keys and values as literal types.
 */
const NAMED_ENTITIES = {
  lt: "<",
  gt: ">",
  amp: "&",
  apos: "'",
  quot: '"',
} as const;

/**
 * Inverse of the predefined entities: maps each special character to the
 * full entity reference that replaces it when encoding.
 */
const CHAR_TO_ENTITY = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "'": "&apos;",
  '"': "&quot;",
} as const;

/**
 * Replacement table for attribute values: the five special characters plus
 * tab, line feed and carriage return, which are written as numeric character
 * references.
 */
const ATTR_CHAR_MAP: Record<string, string> = {
  ...CHAR_TO_ENTITY,
  "\t": "&#9;",
  "\n": "&#10;",
  "\r": "&#13;",
};

// Patterns are created once at module load instead of on every call.

/** Matches `&name;`, `&#digits;` and `&#xhex;`, capturing the part between `&` and `;`. */
const ENTITY_RE = /&([a-zA-Z]+|#[0-9]+|#x[0-9a-fA-F]+);/g;
/** Matches every character that has a predefined entity. */
const SPECIAL_CHARS_RE = /[<>&'"]/g;
/** Matches every character that is replaced inside attribute values. */
const ATTR_ENCODE_RE = /[<>&'"\t\n\r]/g;

/**
 * Pattern to detect a bare `&`, i.e. one that is not followed by something
 * shaped like a reference: `&name;`, `&#digits;` or `&#xhexdigits;`.
 */
const BARE_AMPERSAND_RE = /&(?![a-zA-Z][a-zA-Z0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)/;
57+
58+
/**
 * Reports whether a code point matches the XML 1.0 `Char` production (§2.2):
 *
 * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Everything outside those ranges is rejected, which covers:
 * - NULL (#x0)
 * - Control characters #x1-#x8, #xB-#xC, #xE-#x1F
 * - Surrogate code points #xD800-#xDFFF
 * - Non-characters #xFFFE-#xFFFF
 * - Values above #x10FFFF, and `NaN`
 *
 * @see {@link https://www.w3.org/TR/xml/#charsets | XML 1.0 §2.2 Characters}
 */
function isValidXmlChar(codePoint: number): boolean {
  // Tab, line feed and carriage return are the only permitted C0 controls.
  if (codePoint === 0x9 || codePoint === 0xA || codePoint === 0xD) {
    return true;
  }
  // Basic Multilingual Plane below the surrogate block.
  if (codePoint >= 0x20 && codePoint <= 0xD7FF) {
    return true;
  }
  // Rest of the BMP, excluding the two trailing non-characters.
  if (codePoint >= 0xE000 && codePoint <= 0xFFFD) {
    return true;
  }
  // Supplementary planes.
  return codePoint >= 0x10000 && codePoint <= 0x10FFFF;
}
82+
83+
/**
 * Options for entity decoding.
 */
export interface DecodeEntityOptions {
  /**
   * If true, throws an error when the text contains a bare `&`, i.e. one that
   * does not start something shaped like `&name;`, `&#digits;` or `&#xhex;`.
   * Per XML 1.0 §2.4, a literal `&` must be escaped (for example as `&amp;`)
   * unless it begins an entity or character reference.
   *
   * @default false
   */
  readonly strict?: boolean;
}

/**
 * Decodes XML entities in a string.
 *
 * Handles the five predefined entities (§4.6) and numeric character
 * references (§4.1) per the XML 1.0 specification. References that cannot be
 * decoded are left in the output unchanged: unknown entity names, and
 * numeric references whose code point is not a legal XML `Char`.
 *
 * @param text The text containing XML entities to decode.
 * @param options Decoding options.
 * @returns The text with entities decoded.
 * @throws {Error} If `options.strict` is true and the text contains a bare
 * `&`.
 */
export function decodeEntities(
  text: string,
  options?: DecodeEntityOptions,
): string {
  // Fast path: no ampersand means no entities to decode
  if (!text.includes("&")) return text;

  if (options?.strict) {
    const match = BARE_AMPERSAND_RE.exec(text);
    if (match) {
      throw new Error(
        `Invalid bare '&' at position ${match.index}: ` +
          `entity references must be &name; or &#num; or &#xHex;`,
      );
    }
  }

  return text.replace(ENTITY_RE, (match, entity: string) => {
    if (entity.startsWith("#")) {
      // Numeric character reference: "#x…" is hexadecimal, "#…" is decimal.
      const codePoint = entity.startsWith("#x")
        ? parseInt(entity.slice(2), 16)
        : parseInt(entity.slice(1), 10);
      // XML 1.0 §4.1 WFC: Legal Character - the referenced code point must
      // match the Char production; otherwise the reference is kept as-is.
      return isValidXmlChar(codePoint)
        ? String.fromCodePoint(codePoint)
        : match;
    }
    // Named entity. An own-property check is required here: the `in`
    // operator also finds inherited members such as "constructor" or
    // "toString", which would be looked up and stringified into the output.
    if (Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, entity)) {
      return NAMED_ENTITIES[entity as keyof typeof NAMED_ENTITIES];
    }
    // Unknown entity - return as-is
    return match;
  });
}
151+
152+
/**
 * Replaces `<`, `>`, `&`, `'` and `"` with their predefined XML entities.
 *
 * @param text The text to encode.
 * @returns The text with special characters encoded as entities; the input
 * is returned unchanged when it contains none of them.
 */
export function encodeEntities(text: string): string {
  // Cheap pre-check so strings without special characters skip the replace.
  if (text.search(/[<>&'"]/) === -1) return text;

  const toEntity = (char: string) =>
    CHAR_TO_ENTITY[char as keyof typeof CHAR_TO_ENTITY];
  return text.replace(SPECIAL_CHARS_RE, toEntity);
}
166+
167+
/**
 * Escapes a string for use as an XML attribute value.
 *
 * Besides the five special characters, tab, line feed and carriage return
 * are written as numeric character references, because literal whitespace in
 * attribute values is normalized per XML 1.0 §3.3.3.
 *
 * @param value The attribute value to encode.
 * @returns The encoded attribute value; the input is returned unchanged when
 * nothing needs encoding.
 */
export function encodeAttributeValue(value: string): string {
  // Cheap pre-check so values without special characters skip the replace.
  const needsEncoding = /[<>&'"\t\n\r]/.test(value);
  return needsEncoding
    ? value.replace(ATTR_ENCODE_RE, (char) => ATTR_CHAR_MAP[char]!)
    : value;
}

0 commit comments

Comments
 (0)