Merge pull request #1048 from lightpanda-io/nikneym/mime-changes

karlseguin · web-flow · commit 57aa270032b6 · 2025-09-16T16:13:46.000+08:00
Mime: charset identification changes
diff --git a/src/browser/mime.zig b/src/browser/mime.zig
@@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator;
 pub const Mime = struct {
     content_type: ContentType,
     params: []const u8 = "",
-    charset: ?[:0]const u8 = null,
+    // IANA limits charset names to 40 characters.
+    // We keep 41 bytes so there is room for a null terminator, which the HTML parser expects.
+    charset: [41]u8 = default_charset,
 
-    pub const unknown = Mime{
-        .params = "",
-        .charset = null,
-        .content_type = .{ .unknown = {} },
-    };
+    /// The string "UTF-8" padded with null characters to 41 bytes.
+    pub const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
+
+    /// Mime with unknown Content-Type, empty params and the default charset (UTF-8).
+    pub const unknown = Mime{ .content_type = .{ .unknown = {} } };
 
     pub const ContentTypeEnum = enum {
         text_xml,
@@ -52,6 +54,34 @@ pub const Mime = struct {
         other: struct { type: []const u8, sub_type: []const u8 },
     };
 
+    /// Returns the charset as a null-terminated string that points into `mime` (no copy).
+    pub inline fn charsetString(mime: *const Mime) [:0]const u8 {
+        return @ptrCast(&mime.charset);
+    }
+
+    /// Returns the charset name in `value`, with one pair of surrounding
+    /// double quotes removed when present; the result aliases `value`.
+    ///
+    /// The name is not validated against RFC 2978 section 2.3 (Naming
+    /// Requirements): https://datatracker.ietf.org/doc/rfc2978/
+    ///
+    /// Errors: `Invalid` for an empty value or a malformed quoted string;
+    /// `CharsetTooBig` when the unquoted name is longer than 40 bytes.
+    fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
+        // Guard the `value[0]` access below against an empty slice.
+        if (value.len == 0) return error.Invalid;
+
+        const name = if (value[0] == '"') blk: {
+            // An opening quote needs a closing one and a non-empty name.
+            if (value.len < 3 or value[value.len - 1] != '"') return error.Invalid;
+            break :blk value[1 .. value.len - 1];
+        } else value;
+
+        // RFC 2978 caps names at 40 bytes; the quotes do not count.
+        if (name.len > 40) return error.CharsetTooBig;
+        return name;
+    }
+
     pub fn parse(input: []u8) !Mime {
         if (input.len > 255) {
             return error.TooBig;
@@ -69,7 +99,7 @@ pub const Mime = struct {
 
         const params = trimLeft(normalized[type_len..]);
 
-        var charset: ?[:0]const u8 = null;
+        var charset: [41]u8 = undefined;
 
         var it = std.mem.splitScalar(u8, params, ';');
         while (it.next()) |attr| {
@@ -87,35 +117,14 @@ pub const Mime = struct {
 
             switch (attribute_name) {
                 .charset => {
-                    // We used to have a proper value parser, but we currently
-                    // only care about the charset attribute, plus only about
-                    // the UTF-8 value. It's a lot easier to do it this way,
-                    // and it doesn't require an allocation to (a) unescape the
-                    // value or (b) ensure the correct lifetime.
                     if (value.len == 0) {
                         break;
                     }
-                    var attribute_value = value;
-                    if (value[0] == '"') {
-                        if (value.len < 3 or value[value.len - 1] != '"') {
-                            return error.Invalid;
-                        }
-                        attribute_value = value[1 .. value.len - 1];
-                    }
 
-                    if (std.ascii.eqlIgnoreCase(attribute_value, "utf-8")) {
-                        charset = "UTF-8";
-                    } else if (std.ascii.eqlIgnoreCase(attribute_value, "iso-8859-1")) {
-                        charset = "ISO-8859-1";
-                    } else {
-                        // we only care about null (which we default to UTF-8)
-                        // or UTF-8. If this is actually set (i.e. not null)
-                        // and isn't UTF-8, we'll just put a dummy value. If
-                        // we want to capture the actual value, we'll need to
-                        // dupe/allocate it. Since, for now, we don't need that
-                        // we can avoid the allocation.
-                        charset = "lightpanda:UNSUPPORTED";
-                    }
+                    const attribute_value = try parseCharset(value);
+                    @memcpy(charset[0..attribute_value.len], attribute_value);
+                    // Null-terminate right after attribute value.
+                    charset[attribute_value.len] = 0;
                 },
             }
         }
@@ -363,21 +372,33 @@ test "Mime: parse charset" {
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "UTF-8",
+        .charset = "utf-8",
         .params = "charset=utf-8",
     }, "text/xml; charset=utf-8");
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "UTF-8",
+        .charset = "utf-8",
         .params = "charset=\"utf-8\"",
-    }, "text/xml;charset=\"utf-8\"");
+    }, "text/xml;charset=\"UTF-8\"");
+
+    try expect(.{
+        .content_type = .{ .text_html = {} },
+        .charset = "iso-8859-1",
+        .params = "charset=\"iso-8859-1\"",
+    }, "text/html; charset=\"iso-8859-1\"");
+
+    try expect(.{
+        .content_type = .{ .text_html = {} },
+        .charset = "iso-8859-1",
+        .params = "charset=\"iso-8859-1\"",
+    }, "text/html; charset=\"ISO-8859-1\"");
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "lightpanda:UNSUPPORTED",
-        .params = "charset=\"\\\\ \\\" \"",
-    }, "text/xml;charset=\"\\\\ \\\" \"   ");
+        .charset = "custom-non-standard-charset-value",
+        .params = "charset=\"custom-non-standard-charset-value\"",
+    }, "text/xml;charset=\"custom-non-standard-charset-value\"");
 }
 
 test "Mime: isHTML" {
@@ -490,8 +511,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
     try testing.expectEqual(expected.params, actual.params);
 
     if (expected.charset) |ec| {
-        try testing.expectEqual(ec, actual.charset.?);
+        // Compare only the first `ec.len` bytes, leaving out any trailing null characters.
+        try testing.expectEqual(ec, actual.charsetString()[0..ec.len]);
     } else {
-        try testing.expectEqual(null, actual.charset);
+        const m: Mime = .unknown;
+        try testing.expectEqual(m.charsetString(), actual.charsetString());
     }
 }
diff --git a/src/browser/page.zig b/src/browser/page.zig
@@ -672,14 +672,14 @@ pub const Page = struct {
             log.debug(.http, "navigate first chunk", .{ .content_type = mime.content_type, .len = data.len });
 
             self.mode = switch (mime.content_type) {
-                .text_html => .{ .html = try parser.Parser.init(mime.charset orelse "UTF-8") },
+                .text_html => .{ .html = try parser.Parser.init(mime.charsetString()) },
 
                 .application_json,
                 .text_javascript,
                 .text_css,
                 .text_plain,
                 => blk: {
-                    var p = try parser.Parser.init(mime.charset orelse "UTF-8");
+                    var p = try parser.Parser.init(mime.charsetString());
                     try p.process("<html><head><meta charset=\"utf-8\"></head><body><pre>");
                     break :blk .{ .text = p };
                 },
diff --git a/src/browser/xhr/xhr.zig b/src/browser/xhr/xhr.zig
@@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {
         }
 
         var fbs = std.io.fixedBufferStream(self.response_bytes.items);
-        const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {
+        const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {
             self.response_obj = .{ .Failure = {} };
             return;
         };

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {` |
| `679` | `679` | `}` |
| `680` | `680` | |
| `681` | `681` | `var fbs = std.io.fixedBufferStream(self.response_bytes.items);` |
| `682` | | `- const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {` |
| | `682` | `+ const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {` |
| `683` | `683` | `self.response_obj = .{ .Failure = {} };` |
| `684` | `684` | `return;` |
| `685` | `685` | `};` |