@@ -694,30 +694,48 @@ def process_page_requisites(file_path, parent_remote_info)
694694 return unless File . exist? ( file_path )
695695
696696 content = File . read ( file_path )
697- # handle encoding slightly roughly for extraction
698697 content = content . force_encoding ( 'UTF-8' ) . scrub
699698
700699 assets = PageRequisites . extract ( content )
701700
702- parent_url = parent_remote_info [ :file_url ]
701+ # FIX 1: Construct a valid URI object including the scheme (http://)
702+ # parent_remote_info[:file_url] usually has no scheme (e.g. "www.iana.org/path"),
703+ # so we prepend one ("http://www.iana.org/path") to resolve relative paths correctly.
704+ parent_raw = parent_remote_info [ :file_url ]
705+ parent_raw = "http://#{ parent_raw } " unless parent_raw . match? ( /^https?:\/ \/ / )
706+
707+ begin
708+ base_uri = URI ( parent_raw )
709+ rescue URI ::InvalidURIError
710+ return
711+ end
712+
703713 parent_timestamp = parent_remote_info [ :timestamp ]
704714
705715 assets . each do |asset_rel_url |
706- # resolve absolute URL
707716 begin
708- # assume relative to the parent file URL
709- # We need a fake base URI to resolve /paths and ../paths
710- base_uri = URI ( "http://base.example.com/" + parent_url )
717+ # resolve the relative asset URL against the parent page URL
718+ # e.g. parent: http://www.iana.org/help/ex
719+ # asset: /static/style.css
720+ # result: http://www.iana.org/static/style.css
711721 resolved_uri = base_uri + asset_rel_url
712722
713- # we only want the path part + query, not the host
714- asset_final_url = resolved_uri . path
715- asset_final_url = asset_final_url [ 1 ..-1 ] if asset_final_url . start_with? ( '/' ) # strip leading slash
723+ # filter out navigation links
724+ # If the path has no extension (like /domains) or a page extension such as .html, it is likely a navigation link, not a page requisite.
725+ # Skipping these prevents spidering the whole site.
726+ path = resolved_uri . path
727+ ext = File . extname ( path ) . downcase
716728
717- # re-attach query string if present (as some assets use ?v=123)
729+ # skip empty extensions and standard page extensions
730+ if ext . empty? || [ '.html' , '.htm' , '.php' , '.asp' , '.aspx' ] . include? ( ext )
731+ next
732+ end
733+
734+ # reconstruct the ID expected by Wayback Machine
735+ asset_final_url = resolved_uri . host + resolved_uri . path
718736 asset_final_url += "?#{ resolved_uri . query } " if resolved_uri . query
719737
720- rescue URI ::InvalidURIError
738+ rescue URI ::InvalidURIError , StandardError
721739 next
722740 end
723741
0 commit comments