Skip to content

Commit 8e85ade

Browse files
committed
initial XML parsing support in DOMParser
1 parent dcde19d commit 8e85ade

File tree

6 files changed

+175
-37
lines changed

6 files changed

+175
-37
lines changed

src/browser/parser/Parser.zig

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,29 @@ pub fn parse(self: *Parser, html: []const u8) void {
9898
);
9999
}
100100

101+
/// Parses `xml` as an XML document through the `xml5ever_parse_document` binding.
///
/// The input bytes, a pointer to this parser's `container` and the parser
/// itself (as the callback context) are forwarded together with the
/// tree-building callbacks. Nothing is returned; callers inspect `self.err`
/// afterwards (presumably filled in by `parseErrorCallback` -- TODO confirm).
pub fn parseXML(self: *Parser, xml: []const u8) void {
    const input = xml;
    const document = &self.container;

    h5e.xml5ever_parse_document(
        // Input buffer.
        input.ptr,
        input.len,
        // Target document handle and callback context.
        document,
        self,
        // Tree-building callbacks; the order must match the extern declaration.
        createElementCallback,
        getDataCallback,
        appendCallback,
        parseErrorCallback,
        popCallback,
        createCommentCallback,
        createProcessingInstruction,
        appendDoctypeToDocument,
        addAttrsIfMissingCallback,
        getTemplateContentsCallback,
        removeFromParentCallback,
        reparentChildrenCallback,
        appendBeforeSiblingCallback,
        appendBasedOnParentNodeCallback,
    );
}
123+
101124
pub fn parseFragment(self: *Parser, html: []const u8) void {
102125
h5e.html5ever_parse_fragment(
103126
html.ptr,

src/browser/parser/html5ever.zig

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,24 @@ pub const NodeOrText = extern struct {
171171
text: []const u8,
172172
};
173173
};
174+
175+
/// Binding for the `xml5ever_parse_document` symbol exported by the Rust
/// wrapper library.
///
/// `xml` and `len` describe the input bytes; the Rust implementation returns
/// without parsing when the pointer is null or `len` is 0. `doc` is the opaque
/// document handle and `ctx` an opaque context pointer; most callbacks take a
/// `ctx` first argument (presumably this same pointer -- TODO confirm against
/// the sink implementation).
///
/// Arguments are passed positionally, so the parameter names below are
/// documentation only. They mirror the names used by the Zig call site and by
/// the Rust export.
pub extern "c" fn xml5ever_parse_document(
    xml: [*c]const u8,
    len: usize,
    doc: *anyopaque,
    ctx: *anyopaque,
    createElementCallback: *const fn (ctx: *anyopaque, data: *anyopaque, QualName, AttributeIterator) callconv(.c) ?*anyopaque,
    getDataCallback: *const fn (node_ref: *anyopaque) callconv(.c) *anyopaque,
    appendCallback: *const fn (ctx: *anyopaque, parent_ref: *anyopaque, NodeOrText) callconv(.c) void,
    parseErrorCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) void,
    popCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque) callconv(.c) void,
    createCommentCallback: *const fn (ctx: *anyopaque, StringSlice) callconv(.c) ?*anyopaque,
    createProcessingInstruction: *const fn (ctx: *anyopaque, StringSlice, StringSlice) callconv(.c) ?*anyopaque,
    appendDoctypeToDocument: *const fn (ctx: *anyopaque, StringSlice, StringSlice, StringSlice) callconv(.c) void,
    addAttrsIfMissingCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque, AttributeIterator) callconv(.c) void,
    getTemplateContentsCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) ?*anyopaque,
    removeFromParentCallback: *const fn (ctx: *anyopaque, target_ref: *anyopaque) callconv(.c) void,
    reparentChildrenCallback: *const fn (ctx: *anyopaque, node_ref: *anyopaque, new_parent_ref: *anyopaque) callconv(.c) void,
    appendBeforeSiblingCallback: *const fn (ctx: *anyopaque, sibling_ref: *anyopaque, NodeOrText) callconv(.c) void,
    appendBasedOnParentNodeCallback: *const fn (ctx: *anyopaque, element_ref: *anyopaque, prev_element_ref: *anyopaque, NodeOrText) callconv(.c) void,
) void;

src/browser/webapi/DOMParser.zig

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,41 +21,67 @@ const std = @import("std");
2121
const js = @import("../js/js.zig");
2222
const Page = @import("../Page.zig");
2323
const HTMLDocument = @import("HTMLDocument.zig");
24+
const XMLDocument = @import("XMLDocument.zig");
2425

2526
const DOMParser = @This();
2627

2728
/// Creates a DOMParser. The value is built from an empty literal, so no
/// state is set up here.
pub fn init() DOMParser {
2829
return .{};
3030
}
3031

31-
/// Result of `parseFromString`: which variant is active depends on the
/// parser that was selected by the MIME type.
pub const HTMLDocumentOrXMLDocument = union(enum) {
    html_document: *HTMLDocument,
    xml_document: *XMLDocument,
};

/// Parses `html` (the markup source, HTML or XML) into a freshly created
/// document, choosing the parser from `mime_type`.
///
/// - `"text/html"`: the input is trimmed of ASCII whitespace, an empty input
///   is replaced by `<html></html>`, and the result is an `HTMLDocument`.
/// - `"text/xml"`, `"application/xml"`, `"application/xhtml+xml"`,
///   `"image/svg+xml"`: the input is passed unmodified to the XML parser and
///   the result is an `XMLDocument`.
///
/// Returns `error.NotSupported` for any other MIME type, the error recorded
/// in `parser.err` if parsing reported one, or any error from the page's
/// document factory.
pub fn parseFromString(
    _: *const DOMParser,
    html: []const u8,
    mime_type: []const u8,
    page: *Page,
) !HTMLDocumentOrXMLDocument {
    const Parser = @import("../parser/Parser.zig");

    if (std.mem.eql(u8, mime_type, "text/html")) {
        // Create a new HTMLDocument.
        const doc = try page._factory.document(HTMLDocument{
            ._proto = undefined,
        });

        // Blank input is parsed as an empty <html> element.
        var normalized = std.mem.trim(u8, html, &std.ascii.whitespace);
        if (normalized.len == 0) {
            normalized = "<html></html>";
        }

        // Parse HTML into the document.
        var parser = Parser.init(page.arena, doc.asNode(), page);
        parser.parse(normalized);

        if (parser.err) |pe| {
            return pe.err;
        }

        return .{ .html_document = doc };
    }

    // MIME types that the DOMParser specification routes to the XML parser.
    const xml_mime_types = [_][]const u8{
        "text/xml",
        "application/xml",
        "application/xhtml+xml",
        "image/svg+xml",
    };
    const is_xml = for (xml_mime_types) |candidate| {
        if (std.mem.eql(u8, mime_type, candidate)) break true;
    } else false;

    if (is_xml) {
        // Create a new XMLDocument.
        const doc = try page._factory.document(XMLDocument{
            ._proto = undefined,
        });

        // Parse XML into the XMLDocument; the input is not normalized.
        var parser = Parser.init(page.arena, doc.asNode(), page);
        parser.parseXML(html);

        if (parser.err) |pe| {
            return pe.err;
        }

        return .{ .xml_document = doc };
    }

    return error.NotSupported;
}
6086

6187
pub const JsApi = struct {

src/html5ever/Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/html5ever/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ string_cache = "0.9.0"
1414
typed-arena = "2.0.2"
1515
tikv-jemallocator = {version = "0.6.0", features = ["stats"]}
1616
tikv-jemalloc-ctl = {version = "0.6.0", features = ["stats"]}
17+
xml5ever = "0.35.0"
1718

1819
[profile.release]
1920
lto = true

src/html5ever/lib.rs

Lines changed: 73 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,20 @@
1616
// You should have received a copy of the GNU Affero General Public License
1717
// along with this program. If not, see <https://www.gnu.org/licenses/>.
1818

19-
mod types;
2019
mod sink;
20+
mod types;
2121

2222
#[cfg(debug_assertions)]
2323
#[global_allocator]
2424
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
2525

26-
use types::*;
2726
use std::cell::Cell;
2827
use std::os::raw::{c_uchar, c_void};
28+
use types::*;
2929

30-
use html5ever::{parse_document, parse_fragment, QualName, LocalName, ns, ParseOpts, Parser};
31-
use html5ever::tendril::{TendrilSink, StrTendril};
3230
use html5ever::interface::tree_builder::QuirksMode;
31+
use html5ever::tendril::{StrTendril, TendrilSink};
32+
use html5ever::{ns, parse_document, parse_fragment, LocalName, ParseOpts, Parser, QualName};
3333

3434
#[no_mangle]
3535
pub extern "C" fn html5ever_parse_document(
@@ -135,13 +135,14 @@ pub extern "C" fn html5ever_parse_fragment(
135135

136136
let bytes = unsafe { std::slice::from_raw_parts(html, len) };
137137
parse_fragment(
138-
sink, Default::default(),
138+
sink,
139+
Default::default(),
139140
QualName::new(None, ns!(html), LocalName::from("body")),
140-
vec![], // attributes
141-
false, // context_element_allows_scripting
141+
vec![], // attributes
142+
false, // context_element_allows_scripting
142143
)
143-
.from_utf8()
144-
.one(bytes);
144+
.from_utf8()
145+
.one(bytes);
145146
}
146147

147148
#[no_mangle]
@@ -182,15 +183,15 @@ pub struct Memory {
182183
#[cfg(debug_assertions)]
183184
#[no_mangle]
184185
pub extern "C" fn html5ever_get_memory_usage() -> Memory {
185-
use tikv_jemalloc_ctl::{stats, epoch};
186+
use tikv_jemalloc_ctl::{epoch, stats};
186187

187188
// many statistics are cached and only updated when the epoch is advanced.
188189
epoch::advance().unwrap();
189190

190-
return Memory{
191+
return Memory {
191192
resident: stats::resident::read().unwrap(),
192193
allocated: stats::allocated::read().unwrap(),
193-
}
194+
};
194195
}
195196

196197
// Streaming parser API
@@ -225,9 +226,8 @@ pub extern "C" fn html5ever_streaming_parser_create(
225226
// SAFETY: We're creating a self-referential structure here.
226227
// The arena is stored in the StreamingParser and lives as long as the parser.
227228
// The sink contains a reference to the arena that's valid for the parser's lifetime.
228-
let arena_ref: &'static typed_arena::Arena<sink::ElementData> = unsafe {
229-
std::mem::transmute(arena.as_ref())
230-
};
229+
let arena_ref: &'static typed_arena::Arena<sink::ElementData> =
230+
unsafe { std::mem::transmute(arena.as_ref()) };
231231

232232
let sink = sink::Sink {
233233
ctx: ctx,
@@ -281,7 +281,8 @@ pub extern "C" fn html5ever_streaming_parser_feed(
281281

282282
// Feed the chunk to the parser
283283
// The Parser implements TendrilSink, so we can call process() on it
284-
let parser = streaming_parser.parser
284+
let parser = streaming_parser
285+
.parser
285286
.downcast_mut::<Parser<sink::Sink>>()
286287
.expect("Invalid parser type");
287288

@@ -304,7 +305,8 @@ pub extern "C" fn html5ever_streaming_parser_finish(parser_ptr: *mut c_void) {
304305
let streaming_parser = unsafe { Box::from_raw(parser_ptr as *mut StreamingParser) };
305306

306307
// Extract and finish the parser
307-
let parser = streaming_parser.parser
308+
let parser = streaming_parser
309+
.parser
308310
.downcast::<Parser<sink::Sink>>()
309311
.expect("Invalid parser type");
310312

@@ -326,3 +328,57 @@ pub extern "C" fn html5ever_streaming_parser_destroy(parser_ptr: *mut c_void) {
326328
let _ = Box::from_raw(parser_ptr as *mut StreamingParser);
327329
}
328330
}
331+
332+
#[no_mangle]
333+
pub extern "C" fn xml5ever_parse_document(
334+
xml: *mut c_uchar,
335+
len: usize,
336+
document: Ref,
337+
ctx: Ref,
338+
create_element_callback: CreateElementCallback,
339+
get_data_callback: GetDataCallback,
340+
append_callback: AppendCallback,
341+
parse_error_callback: ParseErrorCallback,
342+
pop_callback: PopCallback,
343+
create_comment_callback: CreateCommentCallback,
344+
create_processing_instruction: CreateProcessingInstruction,
345+
append_doctype_to_document: AppendDoctypeToDocumentCallback,
346+
add_attrs_if_missing_callback: AddAttrsIfMissingCallback,
347+
get_template_contents_callback: GetTemplateContentsCallback,
348+
remove_from_parent_callback: RemoveFromParentCallback,
349+
reparent_children_callback: ReparentChildrenCallback,
350+
append_before_sibling_callback: AppendBeforeSiblingCallback,
351+
append_based_on_parent_node_callback: AppendBasedOnParentNodeCallback,
352+
) -> () {
353+
if xml.is_null() || len == 0 {
354+
return ();
355+
}
356+
357+
let arena = typed_arena::Arena::new();
358+
359+
let sink = sink::Sink {
360+
ctx: ctx,
361+
arena: &arena,
362+
document: document,
363+
quirks_mode: Cell::new(QuirksMode::NoQuirks),
364+
pop_callback: pop_callback,
365+
append_callback: append_callback,
366+
get_data_callback: get_data_callback,
367+
parse_error_callback: parse_error_callback,
368+
create_element_callback: create_element_callback,
369+
create_comment_callback: create_comment_callback,
370+
create_processing_instruction: create_processing_instruction,
371+
append_doctype_to_document: append_doctype_to_document,
372+
add_attrs_if_missing_callback: add_attrs_if_missing_callback,
373+
get_template_contents_callback: get_template_contents_callback,
374+
remove_from_parent_callback: remove_from_parent_callback,
375+
reparent_children_callback: reparent_children_callback,
376+
append_before_sibling_callback: append_before_sibling_callback,
377+
append_based_on_parent_node_callback: append_based_on_parent_node_callback,
378+
};
379+
380+
let bytes = unsafe { std::slice::from_raw_parts(xml, len) };
381+
xml5ever::driver::parse_document(sink, xml5ever::driver::XmlParseOpts::default())
382+
.from_utf8()
383+
.one(bytes);
384+
}

0 commit comments

Comments
 (0)