Skip to content

Commit 57aa270

Browse files
authored
Merge pull request #1048 from lightpanda-io/nikneym/mime-changes
Mime: charset identification changes
2 parents 81ed4f3 + 90a96fd commit 57aa270

File tree

3 files changed

+66
-43
lines changed

3 files changed

+66
-43
lines changed

src/browser/mime.zig

Lines changed: 63 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator;
2222
pub const Mime = struct {
2323
content_type: ContentType,
2424
params: []const u8 = "",
25-
charset: ?[:0]const u8 = null,
25+
// IANA defines max. charset value length as 40.
26+
// We keep 41 bytes so the value can always be null-terminated, which is the format the HTML parser expects.
27+
charset: [41]u8 = default_charset,
2628

27-
pub const unknown = Mime{
28-
.params = "",
29-
.charset = null,
30-
.content_type = .{ .unknown = {} },
31-
};
29+
/// The string "UTF-8" padded with null characters to fill all 41 bytes.
30+
pub const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
31+
32+
/// Mime with unknown Content-Type, empty params and the default charset (UTF-8).
33+
pub const unknown = Mime{ .content_type = .{ .unknown = {} } };
3234

3335
pub const ContentTypeEnum = enum {
3436
text_xml,
@@ -52,6 +54,34 @@ pub const Mime = struct {
5254
other: struct { type: []const u8, sub_type: []const u8 },
5355
};
5456

57+
/// Returns the charset as a null-terminated string.
///
/// The slice ends at the first null byte of the `charset` buffer, so its
/// length is the length of the charset name; the zero padding that fills the
/// rest of the buffer is not part of it. The slice points into `mime` and is
/// only valid for as long as `mime` is.
pub inline fn charsetString(mime: *const Mime) [:0]const u8 {
    // Bounded search: never look past the end of the fixed-size buffer.
    // If no null byte is found, fall back to the last index so that the
    // sentinel check below still stays inside the buffer.
    const len = std.mem.indexOfScalar(u8, &mime.charset, 0) orelse (mime.charset.len - 1);

    // Slicing with `:0` asserts (in safe builds) that `charset[len]` is 0.
    return mime.charset[0..len :0];
}
61+
62+
/// Extracts the charset name from a `charset=` attribute value, removing one
/// pair of surrounding double quotes if the value is quoted.
///
/// The returned slice points into `value`; nothing is copied or allocated.
///
/// Errors:
/// - `error.Invalid`: `value` is empty, or it starts with a quote but is not
///   a non-empty name closed by a matching quote.
/// - `error.CharsetTooBig`: the charset name (quotes excluded) is longer than
///   40 bytes.
///
/// Currently we don't validate the charset.
/// See section 2.3 Naming Requirements:
/// https://datatracker.ietf.org/doc/rfc2978/
fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
    // Reading `value[0]` on an empty slice would be an out-of-bounds access.
    if (value.len == 0) return error.Invalid;

    // If the first char is a quote, look for a pair.
    const name = if (value[0] == '"') unquoted: {
        if (value.len < 3 or value[value.len - 1] != '"') {
            return error.Invalid;
        }

        break :unquoted value[1 .. value.len - 1];
    } else value;

    // Cannot be larger than 40. The limit applies to the charset name itself,
    // so it is checked after the quotes have been removed.
    // https://datatracker.ietf.org/doc/rfc2978/
    if (name.len > 40) return error.CharsetTooBig;

    return name;
}
84+
5585
pub fn parse(input: []u8) !Mime {
5686
if (input.len > 255) {
5787
return error.TooBig;
@@ -69,7 +99,7 @@ pub const Mime = struct {
6999

70100
const params = trimLeft(normalized[type_len..]);
71101

72-
var charset: ?[:0]const u8 = null;
102+
var charset: [41]u8 = undefined;
73103

74104
var it = std.mem.splitScalar(u8, params, ';');
75105
while (it.next()) |attr| {
@@ -87,35 +117,14 @@ pub const Mime = struct {
87117

88118
switch (attribute_name) {
89119
.charset => {
90-
// We used to have a proper value parser, but we currently
91-
// only care about the charset attribute, plus only about
92-
// the UTF-8 value. It's a lot easier to do it this way,
93-
// and it doesn't require an allocation to (a) unescape the
94-
// value or (b) ensure the correct lifetime.
95120
if (value.len == 0) {
96121
break;
97122
}
98-
var attribute_value = value;
99-
if (value[0] == '"') {
100-
if (value.len < 3 or value[value.len - 1] != '"') {
101-
return error.Invalid;
102-
}
103-
attribute_value = value[1 .. value.len - 1];
104-
}
105123

106-
if (std.ascii.eqlIgnoreCase(attribute_value, "utf-8")) {
107-
charset = "UTF-8";
108-
} else if (std.ascii.eqlIgnoreCase(attribute_value, "iso-8859-1")) {
109-
charset = "ISO-8859-1";
110-
} else {
111-
// we only care about null (which we default to UTF-8)
112-
// or UTF-8. If this is actually set (i.e. not null)
113-
// and isn't UTF-8, we'll just put a dummy value. If
114-
// we want to capture the actual value, we'll need to
115-
// dupe/allocate it. Since, for now, we don't need that
116-
// we can avoid the allocation.
117-
charset = "lightpanda:UNSUPPORTED";
118-
}
124+
const attribute_value = try parseCharset(value);
125+
@memcpy(charset[0..attribute_value.len], attribute_value);
126+
// Null-terminate right after attribute value.
127+
charset[attribute_value.len] = 0;
119128
},
120129
}
121130
}
@@ -363,21 +372,33 @@ test "Mime: parse charset" {
363372

364373
try expect(.{
365374
.content_type = .{ .text_xml = {} },
366-
.charset = "UTF-8",
375+
.charset = "utf-8",
367376
.params = "charset=utf-8",
368377
}, "text/xml; charset=utf-8");
369378

370379
try expect(.{
371380
.content_type = .{ .text_xml = {} },
372-
.charset = "UTF-8",
381+
.charset = "utf-8",
373382
.params = "charset=\"utf-8\"",
374-
}, "text/xml;charset=\"utf-8\"");
383+
}, "text/xml;charset=\"UTF-8\"");
384+
385+
try expect(.{
386+
.content_type = .{ .text_html = {} },
387+
.charset = "iso-8859-1",
388+
.params = "charset=\"iso-8859-1\"",
389+
}, "text/html; charset=\"iso-8859-1\"");
390+
391+
try expect(.{
392+
.content_type = .{ .text_html = {} },
393+
.charset = "iso-8859-1",
394+
.params = "charset=\"iso-8859-1\"",
395+
}, "text/html; charset=\"ISO-8859-1\"");
375396

376397
try expect(.{
377398
.content_type = .{ .text_xml = {} },
378-
.charset = "lightpanda:UNSUPPORTED",
379-
.params = "charset=\"\\\\ \\\" \"",
380-
}, "text/xml;charset=\"\\\\ \\\" \" ");
399+
.charset = "custom-non-standard-charset-value",
400+
.params = "charset=\"custom-non-standard-charset-value\"",
401+
}, "text/xml;charset=\"custom-non-standard-charset-value\"");
381402
}
382403

383404
test "Mime: isHTML" {
@@ -490,8 +511,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
490511
try testing.expectEqual(expected.params, actual.params);
491512

492513
if (expected.charset) |ec| {
493-
try testing.expectEqual(ec, actual.charset.?);
514+
// We remove the null characters for testing purposes here.
515+
try testing.expectEqual(ec, actual.charsetString()[0..ec.len]);
494516
} else {
495-
try testing.expectEqual(null, actual.charset);
517+
const m: Mime = .unknown;
518+
try testing.expectEqual(m.charsetString(), actual.charsetString());
496519
}
497520
}

src/browser/page.zig

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -672,14 +672,14 @@ pub const Page = struct {
672672
log.debug(.http, "navigate first chunk", .{ .content_type = mime.content_type, .len = data.len });
673673

674674
self.mode = switch (mime.content_type) {
675-
.text_html => .{ .html = try parser.Parser.init(mime.charset orelse "UTF-8") },
675+
.text_html => .{ .html = try parser.Parser.init(mime.charsetString()) },
676676

677677
.application_json,
678678
.text_javascript,
679679
.text_css,
680680
.text_plain,
681681
=> blk: {
682-
var p = try parser.Parser.init(mime.charset orelse "UTF-8");
682+
var p = try parser.Parser.init(mime.charsetString());
683683
try p.process("<html><head><meta charset=\"utf-8\"></head><body><pre>");
684684
break :blk .{ .text = p };
685685
},

src/browser/xhr/xhr.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {
679679
}
680680

681681
var fbs = std.io.fixedBufferStream(self.response_bytes.items);
682-
const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {
682+
const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {
683683
self.response_obj = .{ .Failure = {} };
684684
return;
685685
};

0 commit comments

Comments
 (0)