Skip to content

Commit f7c0f1a

Browse files
Better support for .php, .asp, and other files when using --local
see #37
1 parent 99da3ca commit f7c0f1a

File tree

2 files changed

+76
-63
lines changed

2 files changed

+76
-63
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class WaybackMachineDownloader
127127

128128
include ArchiveAPI
129129
include SubdomainProcessor
130+
include URLRewrite
130131

131132
VERSION = "2.4.4"
132133
DEFAULT_TIMEOUT = 30
@@ -648,7 +649,8 @@ def rewrite_urls_to_relative(file_path)
648649
begin
649650
content = File.binread(file_path)
650651

651-
if file_ext == '.html' || file_ext == '.htm'
652+
# detect encoding for HTML files
653+
if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
652654
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
653655
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
654656
else
@@ -664,13 +666,13 @@ def rewrite_urls_to_relative(file_path)
664666
# URLs in JavaScript
665667
content = rewrite_js_urls(content)
666668

667-
# for URLs in HTML attributes that start with a single slash
669+
# for URLs that start with a single slash, make them relative
668670
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
669671
prefix, path, suffix = $1, $2, $3
670672
"#{prefix}./#{path}#{suffix}"
671673
end
672674

673-
# for URLs in CSS that start with a single slash
675+
# for URLs in CSS that start with a single slash, make them relative
674676
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
675677
path = $1
676678
"url(\"./#{path}\")"
Lines changed: 71 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,85 @@
11
# frozen_string_literal: true
22

3-
# URLs in HTML attributes
4-
def rewrite_html_attr_urls(content)
5-
6-
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
7-
prefix, url, suffix = $1, $2, $3
8-
9-
if url.start_with?('http')
10-
begin
11-
uri = URI.parse(url)
12-
path = uri.path
13-
path = path[1..-1] if path.start_with?('/')
14-
"#{prefix}#{path}#{suffix}"
15-
rescue
16-
"#{prefix}#{url}#{suffix}"
17-
end
18-
elsif url.start_with?('/')
19-
"#{prefix}./#{url[1..-1]}#{suffix}"
20-
else
21-
"#{prefix}#{url}#{suffix}"
3+
module URLRewrite
4+
# server-side extensions that should work locally
5+
SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
6+
7+
def rewrite_html_attr_urls(content)
8+
# rewrite URLs to relative paths
9+
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
10+
prefix, path, suffix = $1, $2, $3
11+
path = normalize_path_for_local(path)
12+
"#{prefix}#{path}#{suffix}"
13+
end
14+
15+
# rewrite absolute URLs to same domain as relative
16+
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
17+
prefix, path, suffix = $1, $2, $3
18+
path = normalize_path_for_local(path)
19+
"#{prefix}#{path}#{suffix}"
2220
end
21+
22+
content
2323
end
24-
content
25-
end
2624

27-
# URLs in CSS
28-
def rewrite_css_urls(content)
25+
def rewrite_css_urls(content)
26+
# rewrite URLs in CSS
27+
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
28+
path = normalize_path_for_local($1)
29+
"url(\"#{path}\")"
30+
end
2931

30-
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
31-
url = $1
32-
33-
if url.start_with?('http')
34-
begin
35-
uri = URI.parse(url)
36-
path = uri.path
37-
path = path[1..-1] if path.start_with?('/')
38-
"url(\"#{path}\")"
39-
rescue
40-
"url(\"#{url}\")"
41-
end
42-
elsif url.start_with?('/')
43-
"url(\"./#{url[1..-1]}\")"
44-
else
45-
"url(\"#{url}\")"
32+
# rewrite absolute URLs in CSS
33+
content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
34+
path = normalize_path_for_local($1)
35+
"url(\"#{path}\")"
4636
end
37+
38+
content
4739
end
48-
content
49-
end
5040

51-
# URLs in JavaScript
52-
def rewrite_js_urls(content)
53-
54-
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
55-
quote_start, url, quote_end = $1, $2, $3
41+
def rewrite_js_urls(content)
42+
# rewrite archive.org URLs in JavaScript strings
43+
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
44+
quote_start, path, quote_end = $1, $2, $3
45+
path = normalize_path_for_local(path)
46+
"#{quote_start}#{path}#{quote_end}"
47+
end
48+
49+
# rewrite absolute URLs in JavaScript
50+
content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
51+
quote_start, path, quote_end = $1, $2, $3
52+
next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
53+
path = normalize_path_for_local(path)
54+
"#{quote_start}#{path}#{quote_end}"
55+
end
56+
57+
content
58+
end
59+
60+
private
61+
62+
def normalize_path_for_local(path)
63+
return "./index.html" if path.empty? || path == "/"
5664

57-
if url.start_with?('http')
58-
begin
59-
uri = URI.parse(url)
60-
path = uri.path
61-
path = path[1..-1] if path.start_with?('/')
62-
"#{quote_start}#{path}#{quote_end}"
63-
rescue
64-
"#{quote_start}#{url}#{quote_end}"
65-
end
66-
elsif url.start_with?('/')
67-
"#{quote_start}./#{url[1..-1]}#{quote_end}"
65+
# handle query strings - they're already part of the filename
66+
path = path.split('?').first if path.include?('?')
67+
68+
# check if this is a server-side script
69+
ext = File.extname(path).downcase
70+
if SERVER_SIDE_EXTS.include?(ext)
71+
# keep the path as-is but ensure it starts with ./
72+
path = "./#{path}" unless path.start_with?('./', '/')
6873
else
69-
"#{quote_start}#{url}#{quote_end}"
74+
# regular file handling
75+
path = "./#{path}" unless path.start_with?('./', '/')
76+
77+
# if it looks like a directory, add index.html
78+
if path.end_with?('/') || !path.include?('.')
79+
path = "#{path.chomp('/')}/index.html"
80+
end
7081
end
82+
83+
path
7184
end
72-
73-
content
7485
end

0 commit comments

Comments
 (0)