11# frozen_string_literal: true
22
# Rewrites Wayback Machine URLs and absolute URLs found in archived HTML,
# CSS and JavaScript so that they point at paths inside the local site copy.
#
# The public methods mutate the given string in place (via +gsub!+) and also
# return it, so callers may rely on either.
module URLRewrite
  # server-side extensions that should work locally
  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze

  # "https://web.archive.org/web/<timestamp>[<modifier>_]/" - the modifier is
  # an optional two-letter flag such as id_, im_, cs_, js_ or if_.
  ARCHIVE_PREFIX = %r{https?://web\.archive\.org/web/\d+(?:[a-z]{2}_)?/}i

  # Scheme plus authority of a URL. The authority must stop at quotes,
  # whitespace and parentheses; otherwise a URL without a path would let the
  # match run on into unrelated markup or code until the next "/".
  ORIGIN = %r{https?://[^/"'\s()]+}i

  # Opening part of an HTML attribute that carries a URL.
  ATTR_OPEN = /\s(?:href|src|action|data-src|data-url)=["']/i

  # For each content type: the archive.org form first, then plain absolute URLs.
  HTML_PATTERNS = [
    /(#{ATTR_OPEN})#{ARCHIVE_PREFIX}#{ORIGIN}([^"']*)(["'])/,
    /(#{ATTR_OPEN})#{ORIGIN}([^"']*)(["'])/
  ].freeze

  CSS_PATTERNS = [
    /url\(\s*["']?#{ARCHIVE_PREFIX}#{ORIGIN}([^"')]*?)["']?\s*\)/i,
    /url\(\s*["']?#{ORIGIN}([^"')]*?)["']?\s*\)/i
  ].freeze

  JS_PATTERNS = [
    /(["'])#{ARCHIVE_PREFIX}#{ORIGIN}([^"']*)(["'])/,
    /(["'])#{ORIGIN}([^"']*)(["'])/
  ].freeze

  private_constant :ARCHIVE_PREFIX, :ORIGIN, :ATTR_OPEN,
                   :HTML_PATTERNS, :CSS_PATTERNS, :JS_PATTERNS

  # Rewrites URLs in href/src/action/data-src/data-url attributes.
  #
  # @param content [String] HTML text; modified in place
  # @return [String] the same +content+ object
  def rewrite_html_attr_urls(content)
    HTML_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        match = Regexp.last_match
        "#{match[1]}#{normalize_path_for_local(match[2])}#{match[3]}"
      end
    end

    content
  end

  # Rewrites URLs inside CSS +url(...)+ references. The replacement is always
  # emitted with double quotes.
  #
  # @param content [String] CSS text; modified in place
  # @return [String] the same +content+ object
  def rewrite_css_urls(content)
    CSS_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        "url(\"#{normalize_path_for_local(Regexp.last_match(1))}\")"
      end
    end

    content
  end

  # Rewrites URLs that appear as quoted string literals in JavaScript.
  #
  # @param content [String] JavaScript text; modified in place
  # @return [String] the same +content+ object
  def rewrite_js_urls(content)
    JS_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        match = Regexp.last_match
        "#{match[1]}#{normalize_path_for_local(match[2])}#{match[3]}"
      end
    end

    content
  end

  private

  # Converts the path portion of a URL (everything after the host) into the
  # path used in the local copy.
  #
  # The query string is dropped; a "#fragment" is kept and re-attached after
  # the path has been normalised, so in-page anchors keep working.
  #
  # @param path [String] path, optionally followed by "?query" and "#fragment"
  # @return [String] local path, followed by the original fragment if any
  def normalize_path_for_local(path)
    # Split the fragment off first: a "?" after "#" belongs to the fragment.
    base, hash_mark, fragment = path.partition('#')
    base = base.partition('?').first

    "#{localize_base_path(base)}#{hash_mark}#{fragment}"
  end

  # Normalises a path that has no query string or fragment.
  def localize_base_path(base)
    return './index.html' if base.empty? || base == '/'

    base = "./#{base}" unless base.start_with?('./', '/')

    # Server-side scripts keep their path untouched.
    return base if SERVER_SIDE_EXTS.include?(File.extname(base).downcase)

    # Only the last segment decides whether this is a file: a dot in a parent
    # directory (or in the "./" prefix) must not count.
    directory_like = base.end_with?('/') || !File.basename(base).include?('.')
    directory_like ? "#{base.chomp('/')}/index.html" : base
  end
end
0 commit comments