Skip to content

Commit ee33830

Browse files
committed
Improves performance
1 parent 9247049 commit ee33830

File tree

1 file changed

+30
-24
lines changed

1 file changed

+30
-24
lines changed

tools/dev/detect_dead_reference_links.rb

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,20 @@
44
#
55
##
66

7-
require 'net/http'
8-
require 'uri'
7+
require 'faraday'
98
require 'json'
109
require 'csv'
1110
require 'concurrent'
1211
require 'logger'
1312
require 'fileutils'
1413
require 'optparse'
1514
require 'benchmark'
15+
require 'oj' # Optimized JSON
1616

1717
class UrlChecker
1818
WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
1919
MAX_REDIRECTS = 5
20-
THREAD_POOL_SIZE = 5
20+
THREAD_POOL_SIZE = 20 # Increased thread pool size for better concurrency
2121
CHECKED_URLS_FILE = 'checked_urls.jsonl'
2222
BATCH_SIZE = 1000
2323
MAX_RETRIES = 3
@@ -45,7 +45,7 @@ def check_urls
4545
result = check_url(url_with_path)
4646
@results << result
4747
@checked_urls << url_with_path[:url]
48-
save_progress(result)
48+
save_progress_batch([result])
4949

5050
# Update the progress bar after each URL is processed
5151
update_progress
@@ -87,15 +87,17 @@ def check_url(url_with_path)
8787
return url_result
8888
end
8989

90-
uri = URI.parse(cleaned_url)
91-
http = Net::HTTP.new(uri.host, uri.port)
92-
http.use_ssl = uri.scheme == 'https'
90+
# Use Faraday for HTTP requests (persistent connection with pooling)
91+
conn = Faraday.new(url: cleaned_url) do |faraday|
92+
faraday.adapter Faraday.default_adapter # Net::HTTP
93+
faraday.options.timeout = 10 # seconds
94+
faraday.options.open_timeout = 5 # seconds
95+
end
9396

9497
start_time = Time.now
95-
9698
begin
97-
response = get_response(http, uri)
98-
follow_redirects(http, uri, response)
99+
response = get_response(conn, cleaned_url)
100+
follow_redirects(conn, response)
99101
rescue StandardError => e
100102
handle_error(url_result, e)
101103
end
@@ -113,28 +115,27 @@ def valid_url?(url)
113115
URI.parse(url).is_a?(URI::HTTP) rescue false
114116
end
115117

116-
def get_response(http, uri)
117-
http.get(uri.request_uri)
118+
def get_response(conn, url)
119+
conn.get(url)
118120
end
119121

120-
def follow_redirects(http, uri, response)
122+
def follow_redirects(conn, response)
121123
redirect_count = 0
122-
while response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
123-
location = response['location']
124+
while response.status.to_i == 3 && redirect_count < MAX_REDIRECTS # HTTP 3xx redirects
125+
location = response.headers['Location']
124126
@logger.info("Redirecting to: #{location}")
125-
uri = URI.parse(location)
126-
response = http.get(uri.request_uri)
127+
response = conn.get(location)
127128
redirect_count += 1
128129
end
129130
end
130131

131132
def process_response(response, url_result)
132133
if response.nil?
133134
url_result[:status] = 'Error: No response received'
134-
elsif response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPRedirection)
135+
elsif response.status.to_i.between?(200, 299) || response.status.to_i.between?(300, 399)
135136
url_result[:status] = 'Alive'
136137
else
137-
url_result[:status] = "Not Alive (Status Code: #{response.code})"
138+
url_result[:status] = "Not Alive (Status Code: #{response.status})"
138139
fetch_wayback_snapshot(url_result)
139140
end
140141
end
@@ -149,8 +150,7 @@ def fetch_wayback_snapshot(url_result)
149150
retries = 0
150151

151152
begin
152-
uri = URI.parse(wayback_url)
153-
response = Net::HTTP.get_response(uri)
153+
response = Faraday.get(wayback_url)
154154
handle_wayback_response(response, url_result)
155155
rescue StandardError => e
156156
retries += 1
@@ -165,8 +165,8 @@ def fetch_wayback_snapshot(url_result)
165165
end
166166

167167
def handle_wayback_response(response, url_result)
168-
if response.is_a?(Net::HTTPSuccess)
169-
data = JSON.parse(response.body)
168+
if response.status.to_i == 200
169+
data = Oj.load(response.body)
170170
snapshot = data.dig('archived_snapshots', 'closest', 'url')
171171
url_result[:archived_snapshot] = snapshot || 'No archived version found'
172172
else
@@ -183,6 +183,12 @@ def save_progress(result)
183183
File.open(CHECKED_URLS_FILE, 'a') { |file| file.puts JSON.generate(result) }
184184
end
185185

186+
def save_progress_batch(results)
187+
File.open(CHECKED_URLS_FILE, 'a') do |file|
188+
results.each { |result| file.puts JSON.generate(result) }
189+
end
190+
end
191+
186192
def load_checked_urls
187193
return [] unless File.exist?(CHECKED_URLS_FILE)
188194

@@ -226,7 +232,7 @@ def update_progress
226232
log_level = options[:log_level] || 'INFO'
227233
log_level = Logger.const_get(log_level)
228234

229-
urls_with_paths = JSON.parse(File.read(options[:file]))
235+
urls_with_paths = Oj.load(File.read(options[:file]))
230236

231237
mapped_data = urls_with_paths.flat_map do |_path, metadata|
232238
metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }

0 commit comments

Comments (0)