Skip to content

Commit bef3985

Browse files
committed
normalize html title whitespace
1 parent 6f9dd8d commit bef3985

File tree

7 files changed

+38
-14
lines changed

7 files changed

+38
-14
lines changed

src/browser/browser.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ pub const Page = struct {
415415

416416
const ccharset = try arena.dupeZ(u8, charset);
417417

418-
const html_doc = try parser.documentHTMLParse(reader, ccharset);
418+
const html_doc = try parser.documentHTMLParse(arena, reader, ccharset);
419419
const doc = parser.documentHTMLToDocument(html_doc);
420420

421421
// save a document's pointer in the page.

src/browser/css/libdom_test.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ test "matchFirst" {
161161
for (testcases) |tc| {
162162
matcher.reset();
163163

164-
const doc = try parser.documentHTMLParseFromStr(tc.html);
164+
const doc = try parser.documentHTMLParseFromStr(alloc, tc.html);
165165
defer parser.documentHTMLClose(doc) catch {};
166166

167167
const s = css.parse(alloc, tc.q, .{}) catch |e| {

src/browser/dump.zig

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,10 @@ fn testWriteFullHTML(comptime expected: []const u8, src: []const u8) !void {
196196
var buf = std.ArrayListUnmanaged(u8){};
197197
defer buf.deinit(testing.allocator);
198198

199-
const doc_html = try parser.documentHTMLParseFromStr(src);
199+
var aa = std.heap.ArenaAllocator.init(testing.allocator);
200+
defer aa.deinit();
201+
202+
const doc_html = try parser.documentHTMLParseFromStr(aa.allocator(), src);
200203
defer parser.documentHTMLClose(doc_html) catch {};
201204

202205
const doc = parser.documentHTMLToDocument(doc_html);

src/browser/html/document.zig

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@ const collection = @import("../dom/html_collection.zig");
2929
const Walker = @import("../dom/walker.zig").WalkerDepthFirst;
3030
const Cookie = @import("../storage/cookie.zig").Cookie;
3131

32+
/// Strips leading and trailing ASCII whitespace from `title` and collapses
/// every interior run of ASCII whitespace into a single U+0020 SPACE.
///
/// "ASCII whitespace" is the WHATWG definition: TAB, LF, FF, CR and SPACE.
/// `std.ascii.whitespace` is deliberately not used because it also contains
/// vertical tab (0x0B), which HTML does not treat as whitespace.
///
/// The returned slice is allocated from `arena`, is never longer than
/// `title`, and is not freed by this function. Fails only if `arena` cannot
/// satisfy the allocation.
pub fn normalizeWhitespace(arena: std.mem.Allocator, title: []const u8) ![]const u8 {
    const html_whitespace = [_]u8{ '\t', '\n', '\x0C', '\r', ' ' };

    // The output can only shrink relative to the input, so reserving
    // `title.len` bytes up front makes the *AssumeCapacity calls below safe.
    var normalized = try std.ArrayListUnmanaged(u8).initCapacity(arena, title.len);
    var tokens = std.mem.tokenizeAny(u8, title, &html_whitespace);

    // First token is emitted bare; every later token is preceded by one space.
    if (tokens.next()) |first| normalized.appendSliceAssumeCapacity(first);
    while (tokens.next()) |token| {
        normalized.appendAssumeCapacity(' ');
        normalized.appendSliceAssumeCapacity(token);
    }

    return normalized.items;
}
44+
3245
// WEB IDL https://html.spec.whatwg.org/#the-document-object
3346
pub const HTMLDocument = struct {
3447
pub const Self = parser.DocumentHTML;
@@ -94,9 +107,10 @@ pub const HTMLDocument = struct {
94107
return try parser.documentHTMLGetTitle(self);
95108
}
96109

97-
pub fn set_title(self: *parser.DocumentHTML, v: []const u8) ![]const u8 {
98-
try parser.documentHTMLSetTitle(self, v);
99-
return v;
110+
pub fn set_title(self: *parser.DocumentHTML, v: []const u8, state: *SessionState) ![]const u8 {
111+
const normalized = try normalizeWhitespace(state.arena, v);
112+
try parser.documentHTMLSetTitle(self, normalized);
113+
return normalized;
100114
}
101115

102116
pub fn _getElementsByName(self: *parser.DocumentHTML, name: []const u8, state: *SessionState) !NodeList {

src/browser/netsurf.zig

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ const c = @cImport({
2929
});
3030

3131
const mimalloc = @import("mimalloc.zig");
32+
const normalizeWhitespace = @import("html/document.zig").normalizeWhitespace;
3233

3334
// init initializes netsurf lib.
3435
// init starts a mimalloc heap arena for the netsurf session. The caller must
@@ -2152,12 +2153,12 @@ fn parserErr(err: HubbubErr) ParserError!void {
21522153

21532154
// documentHTMLParseFromStr parses the given HTML string.
21542155
// The caller is responsible for closing the document.
2155-
pub fn documentHTMLParseFromStr(str: []const u8) !*DocumentHTML {
2156+
pub fn documentHTMLParseFromStr(arena: std.mem.Allocator, str: []const u8) !*DocumentHTML {
21562157
var fbs = std.io.fixedBufferStream(str);
2157-
return try documentHTMLParse(fbs.reader(), "UTF-8");
2158+
return try documentHTMLParse(arena, fbs.reader(), "UTF-8");
21582159
}
21592160

2160-
pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
2161+
pub fn documentHTMLParse(arena: std.mem.Allocator, reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
21612162
var parser: ?*c.dom_hubbub_parser = undefined;
21622163
var doc: ?*c.dom_document = undefined;
21632164
var err: c.hubbub_error = undefined;
@@ -2169,7 +2170,11 @@ pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
21692170

21702171
try parseData(parser.?, reader);
21712172

2172-
return @as(*DocumentHTML, @ptrCast(doc.?));
2173+
const html_doc: *DocumentHTML = @ptrCast(doc.?);
2174+
const old_title = try documentHTMLGetTitle(html_doc);
2175+
const normalized = try normalizeWhitespace(arena, old_title);
2176+
try documentHTMLSetTitle(html_doc, normalized);
2177+
return html_doc;
21732178
}
21742179

21752180
pub fn documentParseFragmentFromStr(self: *Document, str: []const u8) !*DocumentFragment {

src/browser/xhr/xhr.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ pub const XMLHttpRequest = struct {
704704
}
705705

706706
var fbs = std.io.fixedBufferStream(self.response_bytes.items);
707-
const doc = parser.documentHTMLParse(fbs.reader(), ccharset) catch {
707+
const doc = parser.documentHTMLParse(self.arena, fbs.reader(), ccharset) catch {
708708
self.response_obj = .{ .Failure = {} };
709709
return;
710710
};

src/testing.zig

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,13 @@ pub const Document = struct {
214214
parser.deinit();
215215
try parser.init();
216216

217+
var arena = std.heap.ArenaAllocator.init(allocator);
218+
217219
var fbs = std.io.fixedBufferStream(html);
218-
const html_doc = try parser.documentHTMLParse(fbs.reader(), "utf-8");
220+
const html_doc = try parser.documentHTMLParse(arena.allocator(), fbs.reader(), "utf-8");
219221

220222
return .{
221-
.arena = std.heap.ArenaAllocator.init(allocator),
223+
.arena = arena,
222224
.doc = parser.documentHTMLToDocument(html_doc),
223225
};
224226
}
@@ -410,7 +412,7 @@ pub const JsRunner = struct {
410412
errdefer self.loop.deinit();
411413

412414
var html = std.io.fixedBufferStream(opts.html);
413-
const document = try parser.documentHTMLParse(html.reader(), "UTF-8");
415+
const document = try parser.documentHTMLParse(arena, html.reader(), "UTF-8");
414416

415417
self.state = .{
416418
.arena = arena,

0 commit comments

Comments
 (0)