Skip to content

Commit 1943cec

Browse files
committed
find nodes by path, do more work on SVG embedding
1 parent dc78a40 commit 1943cec

File tree

5 files changed

+206
-141
lines changed

5 files changed

+206
-141
lines changed

src/core.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -511,9 +511,11 @@ pub fn retrieve_asset(
511511
let (media_type, charset, data) = parse_data_url(url);
512512
Ok((data, url.clone(), media_type, charset))
513513
} else if url.scheme() == "file" {
514+
let cache_key: String = clean_url(url.clone()).as_str().to_string();
515+
514516
// Check if parent_url is also a file: URL (if not, then we don't embed the asset)
515517
if parent_url.scheme() != "file" {
516-
print_error_message(&format!("{} (security error)", &url), options);
518+
print_error_message(&format!("{} (security error)", &cache_key), options);
517519

518520
// Provoke error
519521
client.get("").send()?;
@@ -523,12 +525,12 @@ pub fn retrieve_asset(
523525
let path: &Path = path_buf.as_path();
524526
if path.exists() {
525527
if path.is_dir() {
526-
print_error_message(&format!("{} (is a directory)", &url), options);
528+
print_error_message(&format!("{} (is a directory)", &cache_key), options);
527529

528530
// Provoke error
529531
Err(client.get("").send().unwrap_err())
530532
} else {
531-
print_info_message(&format!("{}", &url), options);
533+
print_info_message(&cache_key.to_string(), options);
532534

533535
let file_blob: Vec<u8> = fs::read(path).expect("unable to read file");
534536

src/html.rs

Lines changed: 148 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use html5ever::interface::{Attribute, QualName};
55
use html5ever::parse_document;
66
use html5ever::serialize::{serialize, SerializeOpts};
77
use html5ever::tendril::{format_tendril, TendrilSink};
8-
use html5ever::tree_builder::create_element;
8+
use html5ever::tree_builder::{create_element, TreeSink};
99
use html5ever::{namespace_url, ns, LocalName};
1010
use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
1111
use regex::Regex;
@@ -38,7 +38,6 @@ struct SrcSetItem<'a> {
3838
}
3939

4040
const FAVICON_VALUES: &[&str] = &["icon", "shortcut icon"];
41-
4241
const WHITESPACES: &[char] = &['\t', '\n', '\x0c', '\r', ' '];
4342

4443
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
@@ -67,6 +66,7 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
6766
},
6867
],
6968
);
69+
7070
// Insert favicon LINK tag into HEAD
7171
head.children.borrow_mut().push(favicon_node.clone());
7272
}
@@ -244,92 +244,61 @@ pub fn embed_srcset(
244244
result
245245
}
246246

247-
pub fn find_base_node(node: &Handle) -> Option<Handle> {
248-
match node.data {
249-
NodeData::Document => {
250-
// Dig deeper
251-
for child in node.children.borrow().iter() {
252-
if let Some(base_node) = find_base_node(child) {
253-
return Some(base_node);
254-
}
255-
}
256-
}
257-
NodeData::Element { ref name, .. } => {
258-
if name.local.as_ref() == "head" {
259-
return get_child_node_by_name(node, "base");
260-
}
247+
pub fn find_nodes(node: &Handle, mut path: Vec<&str>) -> Vec<Handle> {
248+
let mut result = vec![];
261249

262-
// Dig deeper
263-
for child in node.children.borrow().iter() {
264-
if let Some(base_node) = find_base_node(child) {
265-
return Some(base_node);
266-
}
267-
}
268-
}
269-
_ => {}
270-
}
271-
272-
None
273-
}
274-
275-
pub fn find_meta_charset_or_content_type_node(node: &Handle) -> Option<Handle> {
276-
match node.data {
277-
NodeData::Document => {
278-
// Dig deeper
279-
for child in node.children.borrow().iter() {
280-
if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(child) {
281-
return Some(meta_charset_node);
282-
}
283-
}
284-
}
285-
NodeData::Element { ref name, .. } => {
286-
if name.local.as_ref() == "head" {
287-
if let Some(meta_node) = get_child_node_by_name(node, "meta") {
288-
if get_node_attr(&meta_node, "charset").is_some() {
289-
return Some(meta_node);
290-
} else if let Some(meta_node_http_equiv_attr_value) =
291-
get_node_attr(&meta_node, "http-equiv")
250+
while !path.is_empty() {
251+
match node.data {
252+
NodeData::Document | NodeData::Element { .. } => {
253+
// Dig deeper
254+
for child in node.children.borrow().iter() {
255+
if get_node_name(child)
256+
.unwrap_or_default()
257+
.eq_ignore_ascii_case(path[0])
292258
{
293-
if meta_node_http_equiv_attr_value.eq_ignore_ascii_case("content-type") {
294-
return Some(meta_node);
259+
if path.len() == 1 {
260+
result.push(child.clone());
261+
} else {
262+
result.append(&mut find_nodes(child, path[1..].to_vec()));
295263
}
296264
}
297265
}
298266
}
299-
300-
// Dig deeper
301-
for child in node.children.borrow().iter() {
302-
if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(child) {
303-
return Some(meta_charset_node);
304-
}
305-
}
267+
_ => {}
306268
}
307-
_ => {}
269+
270+
path.remove(0);
308271
}
309272

310-
None
273+
result
311274
}
312275

313276
pub fn get_base_url(handle: &Handle) -> Option<String> {
314-
if let Some(base_node) = find_base_node(handle) {
315-
get_node_attr(&base_node, "href")
316-
} else {
317-
None
277+
for base_node in find_nodes(handle, vec!["html", "head", "base"]).iter() {
278+
// Only the first base tag matters (we ignore the rest, if there's any)
279+
return get_node_attr(base_node, "href");
318280
}
281+
282+
None
319283
}
320284

321285
pub fn get_charset(node: &Handle) -> Option<String> {
322-
if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(node) {
323-
if let Some(meta_charset_node_attr_value) = get_node_attr(&meta_charset_node, "charset") {
286+
for meta_node in find_nodes(node, vec!["html", "head", "meta"]).iter() {
287+
if let Some(meta_charset_node_attr_value) = get_node_attr(meta_node, "charset") {
324288
// Processing <meta charset="..." />
325289
return Some(meta_charset_node_attr_value);
326-
} else if let Some(meta_content_type_node_attr_value) =
327-
get_node_attr(&meta_charset_node, "content")
290+
}
291+
292+
if get_node_attr(meta_node, "http-equiv")
293+
.unwrap_or_default()
294+
.eq_ignore_ascii_case("content-type")
328295
{
329-
// Processing <meta http-equiv="content-type" content="text/html; charset=..." />
330-
let (_media_type, charset, _is_base64) =
331-
parse_content_type(&meta_content_type_node_attr_value);
332-
return Some(charset);
296+
if let Some(meta_content_type_node_attr_value) = get_node_attr(meta_node, "content") {
297+
// Processing <meta http-equiv="content-type" content="text/html; charset=..." />
298+
let (_media_type, charset, _is_base64) =
299+
parse_content_type(&meta_content_type_node_attr_value);
300+
return Some(charset);
301+
}
333302
}
334303
}
335304

@@ -374,36 +343,13 @@ pub fn get_parent_node(child: &Handle) -> Handle {
374343
pub fn has_favicon(handle: &Handle) -> bool {
375344
let mut found_favicon: bool = false;
376345

377-
match handle.data {
378-
NodeData::Document => {
379-
// Dig deeper
380-
for child in handle.children.borrow().iter() {
381-
if has_favicon(child) {
382-
found_favicon = true;
383-
break;
384-
}
385-
}
386-
}
387-
NodeData::Element { ref name, .. } => {
388-
if name.local.as_ref() == "link" {
389-
if let Some(attr_value) = get_node_attr(handle, "rel") {
390-
if is_favicon(attr_value.trim()) {
391-
found_favicon = true;
392-
}
393-
}
394-
}
395-
396-
if !found_favicon {
397-
// Dig deeper
398-
for child in handle.children.borrow().iter() {
399-
if has_favicon(child) {
400-
found_favicon = true;
401-
break;
402-
}
403-
}
346+
for link_node in find_nodes(handle, vec!["html", "head", "link"]).iter() {
347+
if let Some(attr_value) = get_node_attr(link_node, "rel") {
348+
if is_favicon(attr_value.trim()) {
349+
found_favicon = true;
350+
break;
404351
}
405352
}
406-
_ => {}
407353
}
408354

409355
found_favicon
@@ -486,17 +432,28 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
486432
}
487433

488434
pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
489-
if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(&dom.document) {
490-
if get_node_attr(&meta_charset_node, "charset").is_some() {
491-
set_node_attr(&meta_charset_node, "charset", Some(desired_charset));
492-
} else if get_node_attr(&meta_charset_node, "content").is_some() {
435+
for meta_node in find_nodes(&dom.document, vec!["html", "head", "meta"]).iter() {
436+
if get_node_attr(meta_node, "charset").is_some() {
437+
set_node_attr(meta_node, "charset", Some(desired_charset));
438+
return dom;
439+
}
440+
441+
if get_node_attr(meta_node, "http-equiv")
442+
.unwrap_or_default()
443+
.eq_ignore_ascii_case("content-type")
444+
&& get_node_attr(meta_node, "content").is_some()
445+
{
493446
set_node_attr(
494-
&meta_charset_node,
447+
meta_node,
495448
"content",
496449
Some(format!("text/html;charset={}", desired_charset)),
497450
);
451+
return dom;
498452
}
499-
} else {
453+
}
454+
455+
// Manually append charset META node to HEAD
456+
{
500457
let meta_charset_node: Handle = create_element(
501458
&dom,
502459
QualName::new(None, ns!(), LocalName::from("meta")),
@@ -507,13 +464,11 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
507464
);
508465

509466
// Insert newly created META charset node into HEAD
510-
if let Some(html_node) = get_child_node_by_name(&dom.document, "html") {
511-
if let Some(head_node) = get_child_node_by_name(&html_node, "head") {
512-
head_node
513-
.children
514-
.borrow_mut()
515-
.push(meta_charset_node.clone());
516-
}
467+
for head_node in find_nodes(&dom.document, vec!["html", "head"]).iter() {
468+
head_node
469+
.children
470+
.borrow_mut()
471+
.push(meta_charset_node.clone());
517472
}
518473
}
519474

@@ -924,35 +879,90 @@ pub fn walk_and_embed_assets(
924879
}
925880
}
926881
"image" | "use" => {
927-
if let Some(image_attr_href_value) = get_node_attr(node, "href") {
928-
if options.no_images {
929-
set_node_attr(node, "href", None);
930-
} else {
931-
retrieve_and_embed_asset(
932-
cache,
933-
client,
934-
document_url,
935-
node,
936-
"href",
937-
&image_attr_href_value,
938-
options,
939-
);
940-
}
941-
}
882+
let attr_names: [&str; 2] = ["href", "xlink:href"];
942883

943-
if let Some(image_attr_xlink_href_value) = get_node_attr(node, "xlink:href") {
944-
if options.no_images {
945-
set_node_attr(node, "xlink:href", None);
946-
} else {
947-
retrieve_and_embed_asset(
948-
cache,
949-
client,
950-
document_url,
951-
node,
952-
"xlink:href",
953-
&image_attr_xlink_href_value,
954-
options,
955-
);
884+
for attr_name in attr_names.into_iter() {
885+
if let Some(image_attr_href_value) = get_node_attr(node, attr_name) {
886+
if options.no_images {
887+
set_node_attr(node, attr_name, None);
888+
} else {
889+
let image_asset_url: Url =
890+
resolve_url(document_url, &image_attr_href_value);
891+
892+
match retrieve_asset(
893+
cache,
894+
client,
895+
document_url,
896+
&image_asset_url,
897+
options,
898+
) {
899+
Ok((data, final_url, media_type, charset)) => {
900+
if media_type == "image/svg+xml" {
901+
// Parse SVG
902+
let svg_dom: RcDom = parse_document(
903+
RcDom::default(),
904+
Default::default(),
905+
)
906+
.from_utf8()
907+
.read_from(&mut data.as_slice())
908+
.unwrap();
909+
910+
if image_asset_url.fragment().is_some() {
911+
// Take only that one #fragment symbol from SVG and replace this image|use with that node
912+
for symbol_node in find_nodes(
913+
&svg_dom.document,
914+
vec!["html", "body", "svg", "defs", "symbol"],
915+
)
916+
.iter()
917+
{
918+
if get_node_attr(symbol_node, "id")
919+
.unwrap_or_default()
920+
== image_asset_url.fragment().unwrap()
921+
{
922+
// node.remove_from_parent(node);
923+
svg_dom
924+
.reparent_children(symbol_node, node);
925+
// node.set
926+
break;
927+
}
928+
}
929+
} else {
930+
// TODO: decide if we resort to using data URL here or stick with embedding the DOM
931+
// Replace this image|use with whole DOM of that SVG file
932+
for svg_node in find_nodes(
933+
&svg_dom.document,
934+
vec!["html", "body", "svg"],
935+
)
936+
.iter()
937+
{
938+
svg_dom.reparent_children(svg_node, node);
939+
break;
940+
}
941+
}
942+
} else {
943+
// It's likely a raster image; embed it as data URL
944+
let image_asset_data: Url = create_data_url(
945+
&media_type,
946+
&charset,
947+
&data,
948+
&final_url,
949+
);
950+
set_node_attr(
951+
node,
952+
attr_name,
953+
Some(image_asset_data.to_string()),
954+
);
955+
}
956+
}
957+
Err(_) => {
958+
set_node_attr(
959+
node,
960+
attr_name,
961+
Some(image_asset_url.to_string()),
962+
);
963+
}
964+
}
965+
}
956966
}
957967
}
958968
}

0 commit comments

Comments
 (0)