Skip to content

Commit cd29f79

Browse files
committed
Switch to the JSON output format for easier parsing
1 parent afab72c commit cd29f79

File tree

2 files changed

+17
-12
lines changed

2 files changed

+17
-12
lines changed

lib/wayback_machine_downloader.rb

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_all_snapshots_to_consider
8484
# Note: Passing a page index parameter allows us to get more snapshots,
8585
# but from a less fresh index
8686
print "Getting snapshot pages"
87-
snapshot_list_to_consider = ""
87+
snapshot_list_to_consider = []
8888
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
8989
print "."
9090
unless @exact_url
@@ -95,17 +95,15 @@ def get_all_snapshots_to_consider
9595
print "."
9696
end
9797
end
98-
puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98+
puts " found #{snapshot_list_to_consider.length} snaphots to consider."
9999
puts
100100
snapshot_list_to_consider
101101
end
102102

103103
def get_file_list_curated
104104
file_list_curated = Hash.new
105-
get_all_snapshots_to_consider.each_line do |line|
106-
next unless line.include?('/')
107-
file_timestamp = line[0..13].to_i
108-
file_url = line[15..-2]
105+
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106+
next unless file_url.include?('/')
109107
file_id = file_url.split('/')[3..-1].join('/')
110108
file_id = CGI::unescape file_id
111109
file_id = file_id.tidy_bytes unless file_id == ""
@@ -130,10 +128,8 @@ def get_file_list_curated
130128

131129
def get_file_list_all_timestamps
132130
file_list_curated = Hash.new
133-
get_all_snapshots_to_consider.each_line do |line|
134-
next unless line.include?('/')
135-
file_timestamp = line[0..13].to_i
136-
file_url = line[15..-2]
131+
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132+
next unless file_url.include?('/')
137133
file_id = file_url.split('/')[3..-1].join('/')
138134
file_id_and_timestamp = [file_timestamp, file_id].join('/')
139135
file_id_and_timestamp = CGI::unescape file_id_and_timestamp

lib/wayback_machine_downloader/archive_api.rb

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,23 @@
1+
require 'json'
12
require 'uri'
23

34
module ArchiveAPI
45

56
def get_raw_list_from_api url, page_index
67
request_url = URI("https://web.archive.org/cdx/search/xd")
7-
params = [["url", url]]
8+
params = [["output", "json"], ["url", url]]
89
params += parameters_for_api page_index
910
request_url.query = URI.encode_www_form(params)
1011

11-
URI.open(request_url).read
12+
begin
13+
json = JSON.parse(URI.open(request_url).read)
14+
if (json[0] <=> ["timestamp","original"]) == 0
15+
json.shift
16+
end
17+
json
18+
rescue JSON::ParserError
19+
[]
20+
end
1221
end
1322

1423
def parameters_for_api page_index

0 commit comments

Comments
 (0)