Skip to content

Commit f3d1475

Browse files
committed
add private module with struct Html to fix wrong html charset
1 parent 93cf67e commit f3d1475

File tree

3 files changed

+90
-0
lines changed

3 files changed

+90
-0
lines changed

src/core/html.rs

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
use std::borrow::Cow;
2+
3+
#[cfg(feature = "serde")]
4+
use serde::{Deserialize, Serialize};
5+
6+
/// An html body held as a utf-8 string.
///
/// The wrapped text may still contain a `<meta>` charset declaration; see
/// [`Html::potentially_wrong_charset`] and [`Html::strip_charset`].
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Html<'x>(Cow<'x, str>);

impl<'x> Html<'x> {
    /// Convert into an `Html` that owns its text and therefore has no borrow
    /// tied to `'x`.
    pub fn make_owned(self) -> Html<'static> {
        Html(Cow::Owned(self.0.into_owned()))
    }

    /// Wrap an html string without modifying it.
    pub fn new(html: Cow<'x, str>) -> Html<'x> {
        Html(html)
    }

    /// Access the raw html with a potentially wrong charset.
    ///
    /// `mail-parser` only returns utf-8 strings, so the only sensible charset
    /// for the html is utf-8. Because html can declare its charset in `<meta>`
    /// tags, in the process of transcoding to utf-8 these may be incorrect.
    /// Call [`Html::strip_charset`] before this method if the html will be
    /// given to a standard-conforming browser.
    pub fn potentially_wrong_charset(&self) -> &Cow<'x, str> {
        &self.0
    }

    /// Strip charset from html, making it utf-8 by default.
    ///
    /// Call this method if the result is given to a standard-conforming
    /// browser.
    ///
    /// Every `<meta ...>` tag whose text up to the next `>` contains the word
    /// `charset` is removed as a whole. Both the tag name and the word are
    /// matched ASCII case-insensitively, so `<META ... CHARSET=...>` is
    /// handled as well. This is a textual scan, not an html parser: a `>`
    /// inside an attribute value ends the tag early, and a `<meta` without a
    /// closing `>` stops the scan.
    ///
    /// When nothing has to be removed the underlying `Cow` is left untouched
    /// (a borrowed string stays borrowed).
    pub fn strip_charset(&mut self) {
        let ranges = charset_meta_ranges(self.0.as_bytes());
        if ranges.is_empty() {
            return;
        }
        let text = self.0.to_mut();
        // Delete back to front so the offsets of earlier ranges stay valid.
        for (start, end) in ranges.into_iter().rev() {
            text.replace_range(start..end, "");
        }
    }
}

/// Byte ranges (`start..end`) of every `<meta ...>` tag in `html` that
/// mentions `charset`, in order of appearance and non-overlapping.
///
/// Each range starts at an ASCII `<` and ends right after an ASCII `>`, so
/// both ends are valid `str` char boundaries when `html` is utf-8.
fn charset_meta_ranges(html: &[u8]) -> Vec<(usize, usize)> {
    const TAG: &[u8] = b"<meta";
    const NEEDLE: &[u8] = b"charset";

    let mut ranges = Vec::new();
    let mut pos = 0;
    while let Some(rel) = html[pos..]
        .windows(TAG.len())
        .position(|w| w.eq_ignore_ascii_case(TAG))
    {
        let start = pos + rel;
        let attrs_start = start + TAG.len();

        // The tag name must end here; otherwise this is some other element
        // whose name merely starts with "meta" (e.g. `<metadata>`).
        let name_ends = matches!(
            html.get(attrs_start),
            Some(b) if b.is_ascii_whitespace() || *b == b'/' || *b == b'>'
        );
        if !name_ends {
            pos = attrs_start;
            continue;
        }

        // An unterminated tag cannot be removed safely; stop scanning but
        // keep whatever was found before it.
        let Some(len) = html[attrs_start..].iter().position(|&b| b == b'>') else {
            break;
        };
        let end = attrs_start + len + 1;
        let attrs = &html[attrs_start..attrs_start + len];
        if attrs
            .windows(NEEDLE.len())
            .any(|w| w.eq_ignore_ascii_case(NEEDLE))
        {
            ranges.push((start, end));
        }
        pos = end;
    }
    ranges
}
52+
53+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `strip_charset` on `input` and hands back the resulting markup.
    fn strip(input: &str) -> Cow<'_, str> {
        let mut doc = Html(Cow::from(input));
        doc.strip_charset();
        doc.potentially_wrong_charset().clone()
    }

    /// Each case pairs an input document with the markup expected once the
    /// charset-declaring `<meta>` tag has been removed.
    #[test]
    fn strip_charset() {
        let cases = [
            // Bare `charset` attribute, unquoted and quoted.
            (
                "<head><meta cHarSet=Windows-1252></head>",
                "<head></head>",
            ),
            (
                "<head><meta cHarSet=\"Windows-1252\"></head>",
                "<head></head>",
            ),
            // `http-equiv` form, with and without entity-quoted value.
            (
                "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet=Windows-1252\"></head>",
                "<head></head>",
            ),
            (
                "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>",
                "<head></head>",
            ),
            // An unrelated `<meta>` before or after the charset one survives.
            (
                "<head><meta name=\"xxx\"><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;></head>",
                "<head><meta name=\"xxx\"></head>",
            ),
            (
                "<head><meta http-equiv=\"Content-Type\" content=\"text/html; cHarSet = &quot;Windows-1252&quot;><meta name=\"xxx\"></head>",
                "<head><meta name=\"xxx\"></head>",
            ),
        ];

        for (input, expected) in cases {
            assert_eq!(strip(input), expected);
        }
    }
}

src/core/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ pub mod address;
88
pub mod body;
99
pub mod builder;
1010
pub mod header;
11+
mod html;
1112
pub mod message;
1213
#[cfg(feature = "rkyv")]
1314
pub mod rkyv;
15+
16+
pub use html::Html;

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ pub mod decoders;
1111
pub mod mailbox;
1212
pub mod parsers;
1313

14+
15+
use core::Html;
1416
use parsers::MessageStream;
1517
use std::{borrow::Cow, collections::HashMap, hash::Hash, net::IpAddr};
1618

0 commit comments

Comments
 (0)