@@ -5,7 +5,7 @@ use html5ever::interface::{Attribute, QualName};
55use html5ever:: parse_document;
66use html5ever:: serialize:: { serialize, SerializeOpts } ;
77use html5ever:: tendril:: { format_tendril, TendrilSink } ;
8- use html5ever:: tree_builder:: create_element;
8+ use html5ever:: tree_builder:: { create_element, TreeSink } ;
99use html5ever:: { namespace_url, ns, LocalName } ;
1010use markup5ever_rcdom:: { Handle , NodeData , RcDom , SerializableHandle } ;
1111use regex:: Regex ;
@@ -38,7 +38,6 @@ struct SrcSetItem<'a> {
3838}
3939
4040const FAVICON_VALUES : & [ & str ] = & [ "icon" , "shortcut icon" ] ;
41-
4241const WHITESPACES : & [ char ] = & [ '\t' , '\n' , '\x0c' , '\r' , ' ' ] ;
4342
4443pub fn add_favicon ( document : & Handle , favicon_data_url : String ) -> RcDom {
@@ -67,6 +66,7 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
6766 } ,
6867 ] ,
6968 ) ;
69+
7070 // Insert favicon LINK tag into HEAD
7171 head. children . borrow_mut ( ) . push ( favicon_node. clone ( ) ) ;
7272 }
@@ -244,92 +244,61 @@ pub fn embed_srcset(
244244 result
245245}
246246
247- pub fn find_base_node ( node : & Handle ) -> Option < Handle > {
248- match node. data {
249- NodeData :: Document => {
250- // Dig deeper
251- for child in node. children . borrow ( ) . iter ( ) {
252- if let Some ( base_node) = find_base_node ( child) {
253- return Some ( base_node) ;
254- }
255- }
256- }
257- NodeData :: Element { ref name, .. } => {
258- if name. local . as_ref ( ) == "head" {
259- return get_child_node_by_name ( node, "base" ) ;
260- }
247+ pub fn find_nodes ( node : & Handle , mut path : Vec < & str > ) -> Vec < Handle > {
248+ let mut result = vec ! [ ] ;
261249
262- // Dig deeper
263- for child in node. children . borrow ( ) . iter ( ) {
264- if let Some ( base_node) = find_base_node ( child) {
265- return Some ( base_node) ;
266- }
267- }
268- }
269- _ => { }
270- }
271-
272- None
273- }
274-
275- pub fn find_meta_charset_or_content_type_node ( node : & Handle ) -> Option < Handle > {
276- match node. data {
277- NodeData :: Document => {
278- // Dig deeper
279- for child in node. children . borrow ( ) . iter ( ) {
280- if let Some ( meta_charset_node) = find_meta_charset_or_content_type_node ( child) {
281- return Some ( meta_charset_node) ;
282- }
283- }
284- }
285- NodeData :: Element { ref name, .. } => {
286- if name. local . as_ref ( ) == "head" {
287- if let Some ( meta_node) = get_child_node_by_name ( node, "meta" ) {
288- if get_node_attr ( & meta_node, "charset" ) . is_some ( ) {
289- return Some ( meta_node) ;
290- } else if let Some ( meta_node_http_equiv_attr_value) =
291- get_node_attr ( & meta_node, "http-equiv" )
250+ while !path. is_empty ( ) {
251+ match node. data {
252+ NodeData :: Document | NodeData :: Element { .. } => {
253+ // Dig deeper
254+ for child in node. children . borrow ( ) . iter ( ) {
255+ if get_node_name ( child)
256+ . unwrap_or_default ( )
257+ . eq_ignore_ascii_case ( path[ 0 ] )
292258 {
293- if meta_node_http_equiv_attr_value. eq_ignore_ascii_case ( "content-type" ) {
294- return Some ( meta_node) ;
259+ if path. len ( ) == 1 {
260+ result. push ( child. clone ( ) ) ;
261+ } else {
262+ result. append ( & mut find_nodes ( child, path[ 1 ..] . to_vec ( ) ) ) ;
295263 }
296264 }
297265 }
298266 }
299-
300- // Dig deeper
301- for child in node. children . borrow ( ) . iter ( ) {
302- if let Some ( meta_charset_node) = find_meta_charset_or_content_type_node ( child) {
303- return Some ( meta_charset_node) ;
304- }
305- }
267+ _ => { }
306268 }
307- _ => { }
269+
270+ path. remove ( 0 ) ;
308271 }
309272
310- None
273+ result
311274}
312275
313276pub fn get_base_url ( handle : & Handle ) -> Option < String > {
314- if let Some ( base_node) = find_base_node ( handle) {
315- get_node_attr ( & base_node, "href" )
316- } else {
317- None
277+ for base_node in find_nodes ( handle, vec ! [ "html" , "head" , "base" ] ) . iter ( ) {
278+ // Only the first base tag matters (we ignore the rest, if there's any)
279+ return get_node_attr ( base_node, "href" ) ;
318280 }
281+
282+ None
319283}
320284
321285pub fn get_charset ( node : & Handle ) -> Option < String > {
322- if let Some ( meta_charset_node ) = find_meta_charset_or_content_type_node ( node ) {
323- if let Some ( meta_charset_node_attr_value) = get_node_attr ( & meta_charset_node , "charset" ) {
286+ for meta_node in find_nodes ( node , vec ! [ "html" , "head" , "meta" ] ) . iter ( ) {
287+ if let Some ( meta_charset_node_attr_value) = get_node_attr ( meta_node , "charset" ) {
324288 // Processing <meta charset="..." />
325289 return Some ( meta_charset_node_attr_value) ;
326- } else if let Some ( meta_content_type_node_attr_value) =
327- get_node_attr ( & meta_charset_node, "content" )
290+ }
291+
292+ if get_node_attr ( meta_node, "http-equiv" )
293+ . unwrap_or_default ( )
294+ . eq_ignore_ascii_case ( "content-type" )
328295 {
329- // Processing <meta http-equiv="content-type" content="text/html; charset=..." />
330- let ( _media_type, charset, _is_base64) =
331- parse_content_type ( & meta_content_type_node_attr_value) ;
332- return Some ( charset) ;
296+ if let Some ( meta_content_type_node_attr_value) = get_node_attr ( meta_node, "content" ) {
297+ // Processing <meta http-equiv="content-type" content="text/html; charset=..." />
298+ let ( _media_type, charset, _is_base64) =
299+ parse_content_type ( & meta_content_type_node_attr_value) ;
300+ return Some ( charset) ;
301+ }
333302 }
334303 }
335304
@@ -374,36 +343,13 @@ pub fn get_parent_node(child: &Handle) -> Handle {
374343pub fn has_favicon ( handle : & Handle ) -> bool {
375344 let mut found_favicon: bool = false ;
376345
377- match handle. data {
378- NodeData :: Document => {
379- // Dig deeper
380- for child in handle. children . borrow ( ) . iter ( ) {
381- if has_favicon ( child) {
382- found_favicon = true ;
383- break ;
384- }
385- }
386- }
387- NodeData :: Element { ref name, .. } => {
388- if name. local . as_ref ( ) == "link" {
389- if let Some ( attr_value) = get_node_attr ( handle, "rel" ) {
390- if is_favicon ( attr_value. trim ( ) ) {
391- found_favicon = true ;
392- }
393- }
394- }
395-
396- if !found_favicon {
397- // Dig deeper
398- for child in handle. children . borrow ( ) . iter ( ) {
399- if has_favicon ( child) {
400- found_favicon = true ;
401- break ;
402- }
403- }
346+ for link_node in find_nodes ( handle, vec ! [ "html" , "head" , "link" ] ) . iter ( ) {
347+ if let Some ( attr_value) = get_node_attr ( link_node, "rel" ) {
348+ if is_favicon ( attr_value. trim ( ) ) {
349+ found_favicon = true ;
350+ break ;
404351 }
405352 }
406- _ => { }
407353 }
408354
409355 found_favicon
@@ -486,17 +432,28 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
486432}
487433
488434pub fn set_charset ( dom : RcDom , desired_charset : String ) -> RcDom {
489- if let Some ( meta_charset_node) = find_meta_charset_or_content_type_node ( & dom. document ) {
490- if get_node_attr ( & meta_charset_node, "charset" ) . is_some ( ) {
491- set_node_attr ( & meta_charset_node, "charset" , Some ( desired_charset) ) ;
492- } else if get_node_attr ( & meta_charset_node, "content" ) . is_some ( ) {
435+ for meta_node in find_nodes ( & dom. document , vec ! [ "html" , "head" , "meta" ] ) . iter ( ) {
436+ if get_node_attr ( meta_node, "charset" ) . is_some ( ) {
437+ set_node_attr ( meta_node, "charset" , Some ( desired_charset) ) ;
438+ return dom;
439+ }
440+
441+ if get_node_attr ( meta_node, "http-equiv" )
442+ . unwrap_or_default ( )
443+ . eq_ignore_ascii_case ( "content-type" )
444+ && get_node_attr ( meta_node, "content" ) . is_some ( )
445+ {
493446 set_node_attr (
494- & meta_charset_node ,
447+ meta_node ,
495448 "content" ,
496449 Some ( format ! ( "text/html;charset={}" , desired_charset) ) ,
497450 ) ;
451+ return dom;
498452 }
499- } else {
453+ }
454+
455+ // Manually append charset META node to HEAD
456+ {
500457 let meta_charset_node: Handle = create_element (
501458 & dom,
502459 QualName :: new ( None , ns ! ( ) , LocalName :: from ( "meta" ) ) ,
@@ -507,13 +464,11 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
507464 ) ;
508465
509466 // Insert newly created META charset node into HEAD
510- if let Some ( html_node) = get_child_node_by_name ( & dom. document , "html" ) {
511- if let Some ( head_node) = get_child_node_by_name ( & html_node, "head" ) {
512- head_node
513- . children
514- . borrow_mut ( )
515- . push ( meta_charset_node. clone ( ) ) ;
516- }
467+ for head_node in find_nodes ( & dom. document , vec ! [ "html" , "head" ] ) . iter ( ) {
468+ head_node
469+ . children
470+ . borrow_mut ( )
471+ . push ( meta_charset_node. clone ( ) ) ;
517472 }
518473 }
519474
@@ -924,35 +879,90 @@ pub fn walk_and_embed_assets(
924879 }
925880 }
926881 "image" | "use" => {
927- if let Some ( image_attr_href_value) = get_node_attr ( node, "href" ) {
928- if options. no_images {
929- set_node_attr ( node, "href" , None ) ;
930- } else {
931- retrieve_and_embed_asset (
932- cache,
933- client,
934- document_url,
935- node,
936- "href" ,
937- & image_attr_href_value,
938- options,
939- ) ;
940- }
941- }
882+ let attr_names: [ & str ; 2 ] = [ "href" , "xlink:href" ] ;
942883
943- if let Some ( image_attr_xlink_href_value) = get_node_attr ( node, "xlink:href" ) {
944- if options. no_images {
945- set_node_attr ( node, "xlink:href" , None ) ;
946- } else {
947- retrieve_and_embed_asset (
948- cache,
949- client,
950- document_url,
951- node,
952- "xlink:href" ,
953- & image_attr_xlink_href_value,
954- options,
955- ) ;
884+ for attr_name in attr_names. into_iter ( ) {
885+ if let Some ( image_attr_href_value) = get_node_attr ( node, attr_name) {
886+ if options. no_images {
887+ set_node_attr ( node, attr_name, None ) ;
888+ } else {
889+ let image_asset_url: Url =
890+ resolve_url ( document_url, & image_attr_href_value) ;
891+
892+ match retrieve_asset (
893+ cache,
894+ client,
895+ document_url,
896+ & image_asset_url,
897+ options,
898+ ) {
899+ Ok ( ( data, final_url, media_type, charset) ) => {
900+ if media_type == "image/svg+xml" {
901+ // Parse SVG
902+ let svg_dom: RcDom = parse_document (
903+ RcDom :: default ( ) ,
904+ Default :: default ( ) ,
905+ )
906+ . from_utf8 ( )
907+ . read_from ( & mut data. as_slice ( ) )
908+ . unwrap ( ) ;
909+
910+ if image_asset_url. fragment ( ) . is_some ( ) {
911+ // Take only that one #fragment symbol from SVG and replace this image|use with that node
912+ for symbol_node in find_nodes (
913+ & svg_dom. document ,
914+ vec ! [ "html" , "body" , "svg" , "defs" , "symbol" ] ,
915+ )
916+ . iter ( )
917+ {
918+ if get_node_attr ( symbol_node, "id" )
919+ . unwrap_or_default ( )
920+ == image_asset_url. fragment ( ) . unwrap ( )
921+ {
922+ // node.remove_from_parent(node);
923+ svg_dom
924+ . reparent_children ( symbol_node, node) ;
925+ // node.set
926+ break ;
927+ }
928+ }
929+ } else {
930+ // TODO: decide if we resort to using data URL here or stick with embedding the DOM
931+ // Replace this image|use with whole DOM of that SVG file
932+ for svg_node in find_nodes (
933+ & svg_dom. document ,
934+ vec ! [ "html" , "body" , "svg" ] ,
935+ )
936+ . iter ( )
937+ {
938+ svg_dom. reparent_children ( svg_node, node) ;
939+ break ;
940+ }
941+ }
942+ } else {
943+ // It's likely a raster image; embed it as data URL
944+ let image_asset_data: Url = create_data_url (
945+ & media_type,
946+ & charset,
947+ & data,
948+ & final_url,
949+ ) ;
950+ set_node_attr (
951+ node,
952+ attr_name,
953+ Some ( image_asset_data. to_string ( ) ) ,
954+ ) ;
955+ }
956+ }
957+ Err ( _) => {
958+ set_node_attr (
959+ node,
960+ attr_name,
961+ Some ( image_asset_url. to_string ( ) ) ,
962+ ) ;
963+ }
964+ }
965+ }
956966 }
957967 }
958968 }
0 commit comments