@@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator;
2222pub const Mime = struct {
2323 content_type : ContentType ,
2424 params : []const u8 = "" ,
25- charset : ? [:0 ]const u8 = null ,
25+ // IANA defines max. charset value length as 40.
26+ // We reserve 41 bytes so the value can always be null-terminated, which is the format the HTML parser expects.
27+ charset : [41 ]u8 = default_charset ,
2628
27- pub const unknown = Mime {
28- . params = "" ,
29- . charset = null ,
30- . content_type = .{ . unknown = {} },
31- };
29+ /// The string "UTF-8" padded with null characters to fill the 41-byte charset buffer.
30+ pub const default_charset = .{ 'U' , 'T' , 'F' , '-' , '8' } ++ .{ 0 } ** 36 ;
31+
32+ /// Mime with unknown Content-Type, empty params and the default charset (UTF-8).
33+ pub const unknown = Mime { . content_type = .{ . unknown = {} } };
3234
3335 pub const ContentTypeEnum = enum {
3436 text_xml ,
@@ -52,6 +54,34 @@ pub const Mime = struct {
5254 other : struct { type : []const u8 , sub_type : []const u8 },
5355 };
5456
/// Exposes the stored charset as a null-terminated string.
///
/// The result points directly into `mime.charset`; nothing is copied, so it
/// is only valid for as long as `mime` is.
pub inline fn charsetString(mime: *const Mime) [:0]const u8 {
    const buffer = &mime.charset;
    return @ptrCast(buffer);
}
61+
/// Extracts the charset name from the value of a `charset` parameter.
///
/// If `value` starts with a double quote it must also end with one and hold
/// at least one character in between; the surrounding quotes are dropped.
/// The returned slice is a sub-slice of `value` (nothing is copied).
///
/// Currently we don't validate the charset name itself.
/// See section 2.3 Naming Requirements:
/// https://datatracker.ietf.org/doc/rfc2978/
///
/// Errors:
/// - `error.Invalid`: `value` is empty, or it opens with a quote that is
///   not closed / encloses nothing.
/// - `error.CharsetTooBig`: the name (quotes excluded) is longer than 40 bytes.
fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
    // Guard the `value[0]` access below instead of relying on the caller.
    if (value.len == 0) return error.Invalid;

    // If the first char is a quote, look for its pair and strip both.
    const name = if (value[0] == '"') unquoted: {
        if (value.len < 3 or value[value.len - 1] != '"') {
            return error.Invalid;
        }
        break :unquoted value[1 .. value.len - 1];
    } else value;

    // Cannot be larger than 40 (https://datatracker.ietf.org/doc/rfc2978/).
    // The quotes are not part of the name, so measure after stripping them;
    // otherwise a quoted 40-character name would be rejected.
    if (name.len > 40) return error.CharsetTooBig;

    return name;
}
84+
5585 pub fn parse (input : []u8 ) ! Mime {
5686 if (input .len > 255 ) {
5787 return error .TooBig ;
@@ -69,7 +99,7 @@ pub const Mime = struct {
6999
70100 const params = trimLeft (normalized [type_len .. ]);
71101
72- var charset : ? [: 0 ] const u8 = null ;
102+ var charset : [ 41 ] u8 = undefined ;
73103
74104 var it = std .mem .splitScalar (u8 , params , ';' );
75105 while (it .next ()) | attr | {
@@ -87,35 +117,14 @@ pub const Mime = struct {
87117
88118 switch (attribute_name ) {
89119 .charset = > {
90- // We used to have a proper value parser, but we currently
91- // only care about the charset attribute, plus only about
92- // the UTF-8 value. It's a lot easier to do it this way,
93- // and it doesn't require an allocation to (a) unescape the
94- // value or (b) ensure the correct lifetime.
95120 if (value .len == 0 ) {
96121 break ;
97122 }
98- var attribute_value = value ;
99- if (value [0 ] == '"' ) {
100- if (value .len < 3 or value [value .len - 1 ] != '"' ) {
101- return error .Invalid ;
102- }
103- attribute_value = value [1 .. value .len - 1 ];
104- }
105123
106- if (std .ascii .eqlIgnoreCase (attribute_value , "utf-8" )) {
107- charset = "UTF-8" ;
108- } else if (std .ascii .eqlIgnoreCase (attribute_value , "iso-8859-1" )) {
109- charset = "ISO-8859-1" ;
110- } else {
111- // we only care about null (which we default to UTF-8)
112- // or UTF-8. If this is actually set (i.e. not null)
113- // and isn't UTF-8, we'll just put a dummy value. If
114- // we want to capture the actual value, we'll need to
115- // dupe/allocate it. Since, for now, we don't need that
116- // we can avoid the allocation.
117- charset = "lightpanda:UNSUPPORTED" ;
118- }
124+ const attribute_value = try parseCharset (value );
125+ @memcpy (charset [0.. attribute_value .len ], attribute_value );
126+ // Null-terminate right after attribute value.
127+ charset [attribute_value .len ] = 0 ;
119128 },
120129 }
121130 }
@@ -363,21 +372,33 @@ test "Mime: parse charset" {
363372
364373 try expect (.{
365374 .content_type = .{ .text_xml = {} },
366- .charset = "UTF -8" ,
375+ .charset = "utf -8" ,
367376 .params = "charset=utf-8" ,
368377 }, "text/xml; charset=utf-8" );
369378
370379 try expect (.{
371380 .content_type = .{ .text_xml = {} },
372- .charset = "UTF -8" ,
381+ .charset = "utf -8" ,
373382 .params = "charset=\" utf-8\" " ,
374- }, "text/xml;charset=\" utf-8\" " );
383+ }, "text/xml;charset=\" UTF-8\" " );
384+
385+ try expect (.{
386+ .content_type = .{ .text_html = {} },
387+ .charset = "iso-8859-1" ,
388+ .params = "charset=\" iso-8859-1\" " ,
389+ }, "text/html; charset=\" iso-8859-1\" " );
390+
391+ try expect (.{
392+ .content_type = .{ .text_html = {} },
393+ .charset = "iso-8859-1" ,
394+ .params = "charset=\" iso-8859-1\" " ,
395+ }, "text/html; charset=\" ISO-8859-1\" " );
375396
376397 try expect (.{
377398 .content_type = .{ .text_xml = {} },
378- .charset = "lightpanda:UNSUPPORTED " ,
379- .params = "charset=\" \\\\ \\ \" \" " ,
380- }, "text/xml;charset=\" \\\\ \\ \" \" " );
399+ .charset = "custom-non-standard-charset-value " ,
400+ .params = "charset=\" custom-non-standard-charset-value \" " ,
401+ }, "text/xml;charset=\" custom-non-standard-charset-value \" " );
381402}
382403
383404test "Mime: isHTML" {
@@ -490,8 +511,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
490511 try testing .expectEqual (expected .params , actual .params );
491512
492513 if (expected .charset ) | ec | {
493- try testing .expectEqual (ec , actual .charset .? );
514+ // We remove the null characters for testing purposes here.
515+ try testing .expectEqual (ec , actual .charsetString ()[0.. ec .len ]);
494516 } else {
495- try testing .expectEqual (null , actual .charset );
517+ const m : Mime = .unknown ;
518+ try testing .expectEqual (m .charsetString (), actual .charsetString ());
496519 }
497520}
0 commit comments