Skip to content

Commit 8632050

Browse files
page requisites fix
1 parent 2aa694e commit 8632050

File tree

1 file changed

+29
-11
lines changed

1 file changed

+29
-11
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -694,30 +694,48 @@ def process_page_requisites(file_path, parent_remote_info)
694694
return unless File.exist?(file_path)
695695

696696
content = File.read(file_path)
697-
# handle encoding slightly roughly for extraction
698697
content = content.force_encoding('UTF-8').scrub
699698

700699
assets = PageRequisites.extract(content)
701700

702-
parent_url = parent_remote_info[:file_url]
701+
# FIX 1: Construct a valid URI object including the scheme (http://)
702+
# parent_remote_info[:file_url] is usually "www.iana.org/path",
703+
# we need "http://www.iana.org/path" to resolve relative paths correctly.
704+
parent_raw = parent_remote_info[:file_url]
705+
parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
706+
707+
begin
708+
base_uri = URI(parent_raw)
709+
rescue URI::InvalidURIError
710+
return
711+
end
712+
703713
parent_timestamp = parent_remote_info[:timestamp]
704714

705715
assets.each do |asset_rel_url|
706-
# resolve absolute URL
707716
begin
708-
# assume relative to the parent file URL
709-
# We need a fake base URI to resolve /paths and ../paths
710-
base_uri = URI("http://base.example.com/" + parent_url)
717+
# resolve the relative asset URL against the parent page URL
718+
# e.g. parent: http://www.iana.org/help/ex
719+
# asset: /static/style.css
720+
# result: http://www.iana.org/static/style.css
711721
resolved_uri = base_uri + asset_rel_url
712722

713-
# we only want the path part + query, not the host
714-
asset_final_url = resolved_uri.path
715-
asset_final_url = asset_final_url[1..-1] if asset_final_url.start_with?('/') # strip leading slash
723+
# filter out navigation links
724+
# If the path has no extension (like /domains) or has a page extension (.html, .htm, .php, .asp, .aspx), it is most likely a navigation link rather than a page requisite
725+
# this prevents spidering the whole site
726+
path = resolved_uri.path
727+
ext = File.extname(path).downcase
716728

717-
# re-attach query string if present (as some assets use ?v=123)
729+
# skip empty extensions and standard page extensions
730+
if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
731+
next
732+
end
733+
734+
# reconstruct the ID expected by Wayback Machine
735+
asset_final_url = resolved_uri.host + resolved_uri.path
718736
asset_final_url += "?#{resolved_uri.query}" if resolved_uri.query
719737

720-
rescue URI::InvalidURIError
738+
rescue URI::InvalidURIError, StandardError
721739
next
722740
end
723741

0 commit comments

Comments
 (0)