Normalize document titles and thread an allocator through the HTML parse
entry points.

Review notes (this text precedes the first `diff --git` header and belongs to
no hunk):
- Restored from a copy whose newlines and indentation had been collapsed.
  Indentation and the position of blank context lines are reconstructed from
  the hunk line counts and zig fmt conventions -- TODO confirm against the
  tree, or apply with whitespace-tolerant options.
- normalizeWhitespace splits on the HTML ASCII-whitespace set (TAB, LF, FF, CR,
  SPACE) instead of std.ascii.whitespace, which also contains vertical tab.
- documentHTMLParse skips the title write when normalization changed nothing.
- testing Document.init releases its arena when parsing fails.
- `index` lines are carried over unchanged; for document.zig, netsurf.zig and
  testing.zig they no longer describe the post-image.

diff --git a/src/browser/css/libdom_test.zig b/src/browser/css/libdom_test.zig
index 4cd267e03..67c1fe693 100644
--- a/src/browser/css/libdom_test.zig
+++ b/src/browser/css/libdom_test.zig
@@ -161,7 +161,7 @@ test "matchFirst" {
     for (testcases) |tc| {
         matcher.reset();
 
-        const doc = try parser.documentHTMLParseFromStr(tc.html);
+        const doc = try parser.documentHTMLParseFromStr(alloc, tc.html);
         defer parser.documentHTMLClose(doc) catch {};
 
         const s = css.parse(alloc, tc.q, .{}) catch |e| {
diff --git a/src/browser/dump.zig b/src/browser/dump.zig
index 57084decd..a230271d0 100644
--- a/src/browser/dump.zig
+++ b/src/browser/dump.zig
@@ -196,7 +196,10 @@ fn testWriteFullHTML(comptime expected: []const u8, src: []const u8) !void {
     var buf = std.ArrayListUnmanaged(u8){};
     defer buf.deinit(testing.allocator);
 
-    const doc_html = try parser.documentHTMLParseFromStr(src);
+    var aa = std.heap.ArenaAllocator.init(testing.allocator);
+    defer aa.deinit();
+
+    const doc_html = try parser.documentHTMLParseFromStr(aa.allocator(), src);
     defer parser.documentHTMLClose(doc_html) catch {};
 
     const doc = parser.documentHTMLToDocument(doc_html);
diff --git a/src/browser/html/document.zig b/src/browser/html/document.zig
index 89370f507..0dda3ff51 100644
--- a/src/browser/html/document.zig
+++ b/src/browser/html/document.zig
@@ -29,6 +29,29 @@ const collection = @import("../dom/html_collection.zig");
 const Walker = @import("../dom/walker.zig").WalkerDepthFirst;
 const Cookie = @import("../storage/cookie.zig").Cookie;
 
+/// Strips leading/trailing ASCII whitespace from `title` and collapses each
+/// interior run of it into a single U+0020 SPACE.
+///
+/// "ASCII whitespace" is the HTML definition (TAB, LF, FF, CR, SPACE). Vertical
+/// tab is deliberately excluded, which is why `std.ascii.whitespace` is not used.
+///
+/// The result is allocated from `arena` and is never longer than `title`.
+/// Returns `error.OutOfMemory` if the buffer cannot be allocated.
+pub fn normalizeWhitespace(arena: std.mem.Allocator, title: []const u8) ![]const u8 {
+    // Every separator emitted replaces at least one whitespace byte of the
+    // input, so `title.len` bounds the output and the appends cannot overflow.
+    var normalized = try std.ArrayListUnmanaged(u8).initCapacity(arena, title.len);
+    var tokens = std.mem.tokenizeAny(u8, title, "\t\n\x0c\r ");
+
+    var prepend = false;
+    while (tokens.next()) |token| {
+        if (prepend) normalized.appendAssumeCapacity(' ') else prepend = true;
+        normalized.appendSliceAssumeCapacity(token);
+    }
+
+    return normalized.items;
+}
+
 // WEB IDL https://html.spec.whatwg.org/#the-document-object
 pub const HTMLDocument = struct {
     pub const Self = parser.DocumentHTML;
@@ -94,9 +117,10 @@ pub const HTMLDocument = struct {
         return try parser.documentHTMLGetTitle(self);
     }
 
-    pub fn set_title(self: *parser.DocumentHTML, v: []const u8) ![]const u8 {
-        try parser.documentHTMLSetTitle(self, v);
-        return v;
+    pub fn set_title(self: *parser.DocumentHTML, v: []const u8, state: *SessionState) ![]const u8 {
+        const normalized = try normalizeWhitespace(state.arena, v);
+        try parser.documentHTMLSetTitle(self, normalized);
+        return normalized;
     }
 
     pub fn _getElementsByName(self: *parser.DocumentHTML, name: []const u8, state: *SessionState) !NodeList {
diff --git a/src/browser/netsurf.zig b/src/browser/netsurf.zig
index 6e7dbeb62..6f318c542 100644
--- a/src/browser/netsurf.zig
+++ b/src/browser/netsurf.zig
@@ -29,6 +29,7 @@ const c = @cImport({
 });
 
 const mimalloc = @import("mimalloc.zig");
+const normalizeWhitespace = @import("html/document.zig").normalizeWhitespace;
 
 // init initializes netsurf lib.
 // init starts a mimalloc heap arena for the netsurf session. The caller must
@@ -2152,12 +2153,12 @@ fn parserErr(err: HubbubErr) ParserError!void {
 
 // documentHTMLParseFromStr parses the given HTML string.
 // The caller is responsible for closing the document.
-pub fn documentHTMLParseFromStr(str: []const u8) !*DocumentHTML {
+pub fn documentHTMLParseFromStr(arena: std.mem.Allocator, str: []const u8) !*DocumentHTML {
     var fbs = std.io.fixedBufferStream(str);
-    return try documentHTMLParse(fbs.reader(), "UTF-8");
+    return try documentHTMLParse(arena, fbs.reader(), "UTF-8");
 }
 
-pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
+pub fn documentHTMLParse(arena: std.mem.Allocator, reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
     var parser: ?*c.dom_hubbub_parser = undefined;
     var doc: ?*c.dom_document = undefined;
     var err: c.hubbub_error = undefined;
@@ -2169,7 +2170,15 @@ pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
 
     try parseData(parser.?, reader);
 
-    return @as(*DocumentHTML, @ptrCast(doc.?));
+    const html_doc: *DocumentHTML = @ptrCast(doc.?);
+
+    // Store the title in normalized form. The write is skipped when
+    // normalization changed nothing, so an already-clean (or empty) title
+    // does not trigger a DOM update.
+    const old_title = try documentHTMLGetTitle(html_doc);
+    const normalized = try normalizeWhitespace(arena, old_title);
+    if (!std.mem.eql(u8, old_title, normalized)) try documentHTMLSetTitle(html_doc, normalized);
+    return html_doc;
 }
 
 pub fn documentParseFragmentFromStr(self: *Document, str: []const u8) !*DocumentFragment {
diff --git a/src/browser/page.zig b/src/browser/page.zig
index ffed1cd23..4192db073 100644
--- a/src/browser/page.zig
+++ b/src/browser/page.zig
@@ -248,7 +248,7 @@ pub const Page = struct {
 
         const ccharset = try arena.dupeZ(u8, charset);
 
-        const html_doc = try parser.documentHTMLParse(reader, ccharset);
+        const html_doc = try parser.documentHTMLParse(arena, reader, ccharset);
         const doc = parser.documentHTMLToDocument(html_doc);
 
         // save a document's pointer in the page.
diff --git a/src/browser/xhr/xhr.zig b/src/browser/xhr/xhr.zig
index 4ea517cdb..78bfda2c9 100644
--- a/src/browser/xhr/xhr.zig
+++ b/src/browser/xhr/xhr.zig
@@ -703,7 +703,7 @@ pub const XMLHttpRequest = struct {
         }
 
         var fbs = std.io.fixedBufferStream(self.response_bytes.items);
-        const doc = parser.documentHTMLParse(fbs.reader(), ccharset) catch {
+        const doc = parser.documentHTMLParse(self.arena, fbs.reader(), ccharset) catch {
             self.response_obj = .{ .Failure = {} };
             return;
         };
diff --git a/src/testing.zig b/src/testing.zig
index f11c0ab65..ef20e1792 100644
--- a/src/testing.zig
+++ b/src/testing.zig
@@ -214,11 +214,14 @@ pub const Document = struct {
         parser.deinit();
         try parser.init();
 
+        var arena = std.heap.ArenaAllocator.init(allocator);
+        errdefer arena.deinit();
+
         var fbs = std.io.fixedBufferStream(html);
-        const html_doc = try parser.documentHTMLParse(fbs.reader(), "utf-8");
+        const html_doc = try parser.documentHTMLParse(arena.allocator(), fbs.reader(), "utf-8");
 
         return .{
-            .arena = std.heap.ArenaAllocator.init(allocator),
+            .arena = arena,
             .doc = parser.documentHTMLToDocument(html_doc),
         };
     }
@@ -410,7 +413,7 @@ pub const JsRunner = struct {
         errdefer self.loop.deinit();
 
         var html = std.io.fixedBufferStream(opts.html);
-        const document = try parser.documentHTMLParse(html.reader(), "UTF-8");
+        const document = try parser.documentHTMLParse(arena, html.reader(), "UTF-8");
 
         self.state = .{
             .arena = arena,