Skip to content

Commit 435f728

Browse files
committed
Reverts performance changes
1 parent ee33830 commit 435f728

File tree

1 file changed

+103
-46
lines changed

1 file changed

+103
-46
lines changed
Lines changed: 103 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,34 @@
##
# This script checks the status of URLs from a provided JSON file.
# It validates if URLs are alive, handles redirects, and fetches Wayback Machine snapshots for URLs that are down.
# It logs the status of each URL, including errors, redirects, and archived snapshots.
#
# Usage: ruby tools/dev/detect_dead_reference_links.rb -f db/modules_metadata_base.json -l WARN
#
68

7-
require 'faraday'
9+
require 'net/http'
10+
require 'uri'
811
require 'json'
912
require 'csv'
1013
require 'concurrent'
1114
require 'logger'
1215
require 'fileutils'
1316
require 'optparse'
1417
require 'benchmark'
15-
require 'oj' # Optimized JSON
1618

1719
class UrlChecker
1820
WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='.freeze # Wayback Machine availability API endpoint
MAX_REDIRECTS = 5 # Maximum number of redirects to follow for each URL
THREAD_POOL_SIZE = 5 # Number of threads in the pool to process URLs concurrently
CHECKED_URLS_FILE = 'checked_urls.jsonl'.freeze # File to save URLs that have been checked
BATCH_SIZE = 1000 # Number of URLs to process in each batch
MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
RETRY_DELAY = 5 # Delay in seconds between retries
27+
28+
# Initializes the URL checker with the given URLs and logging options
29+
#
30+
# @param urls_with_paths [Array<Hash>] An array of hashes containing URL and path information.
31+
# @param log_level [Logger::Severity] The desired logging level (default: Logger::INFO)
2632
def initialize(urls_with_paths, log_level: Logger::INFO)
2733
@urls_with_paths = urls_with_paths
2834
@results = []
@@ -34,6 +40,8 @@ def initialize(urls_with_paths, log_level: Logger::INFO)
3440
@processed_urls = 0
3541
end
3642

43+
# Checks the provided URLs for status and fetches Wayback Machine snapshots if needed.
44+
# URLs are processed in batches, and results are saved to a file.
3745
def check_urls
3846
pool = Concurrent::FixedThreadPool.new(THREAD_POOL_SIZE)
3947
at_exit { shutdown_thread_pool(pool) }
@@ -45,19 +53,20 @@ def check_urls
4553
result = check_url(url_with_path)
4654
@results << result
4755
@checked_urls << url_with_path[:url]
48-
save_progress_batch([result])
56+
save_progress(result)
4957

5058
# Update the progress bar after each URL is processed
5159
update_progress
5260
end
5361
end
5462

55-
# Wait for batch to finish
63+
# Wait for all futures in the batch to finish
5664
Concurrent::Promises.zip(*futures).wait!
5765
# Sleep between batches to reduce resource consumption
5866
sleep 5
5967
end
6068

69+
# Save the results to a file after all URLs are processed
6170
save_results_to_file
6271
ensure
6372
pool.shutdown
@@ -67,37 +76,46 @@ def check_urls
6776

6877
private
6978

79+
# Returns the URLs that haven't been checked yet
80+
#
81+
# @return [Array<Hash>] An array of URLs with paths that haven't been checked
7082
def unchecked_urls
7183
@urls_with_paths.reject { |url_with_path| @checked_urls.include?(url_with_path[:url]) }
7284
end
7385

86+
# Checks the status of a single URL.
87+
#
88+
# @param url_with_path [Hash] The URL and associated path to check
89+
# @return [Hash] A hash with the URL, path, status, and archived snapshot (if any)
7490
def check_url(url_with_path)
7591
url_result = { url: url_with_path[:url], path: url_with_path[:path], status: nil, archived_snapshot: nil }
7692

77-
# Check if the URL is already a Wayback link
78-
if url_with_path[:url].start_with?('http://web.archive.org/web')
93+
# Skip non-URL references or Wayback links
94+
if !url_with_path[:url].start_with?('URL-')
95+
url_result[:status] = 'Skipped (not a URL- reference)'
96+
return url_result
97+
elsif url_with_path[:url].start_with?('http://web.archive.org/web')
7998
url_result[:status] = 'Wayback link (skipped)'
8099
return url_result
81100
end
82101

83-
# Remove "URL-" prefix
102+
# Clean the URL and validate it
84103
cleaned_url = url_with_path[:url].sub(/^URL-/, '')
85104
if !valid_url?(cleaned_url)
86105
url_result[:status] = "Invalid URL"
87106
return url_result
88107
end
89108

90-
# Use Faraday for HTTP requests (persistent connection with pooling)
91-
conn = Faraday.new(url: cleaned_url) do |faraday|
92-
faraday.adapter Faraday.default_adapter # Net::HTTP
93-
faraday.options.timeout = 10 # seconds
94-
faraday.options.open_timeout = 5 # seconds
95-
end
109+
# Prepare the HTTP request
110+
uri = URI.parse(cleaned_url)
111+
http = Net::HTTP.new(uri.host, uri.port)
112+
http.use_ssl = uri.scheme == 'https'
96113

97114
start_time = Time.now
115+
98116
begin
99-
response = get_response(conn, cleaned_url)
100-
follow_redirects(conn, response)
117+
response = get_response(http, uri)
118+
follow_redirects(http, uri, response)
101119
rescue StandardError => e
102120
handle_error(url_result, e)
103121
end
@@ -111,46 +129,73 @@ def check_url(url_with_path)
111129
save_progress(url_result)
112130
end
113131

# Reports whether the given string parses as an HTTP(S) URL.
#
# @param url [String] the candidate URL
# @return [Boolean] true for URI::HTTP/URI::HTTPS, false for anything
#   else (including unparseable input, which is swallowed here)
def valid_url?(url)
  URI.parse(url).is_a?(URI::HTTP)
rescue StandardError
  false
end
117139

# Issues a GET request for the given URI over the supplied client.
#
# @param http [Net::HTTP] the HTTP client to use
# @param uri [URI] the URI to request
# @return [Net::HTTPResponse] the server's response
def get_response(http, uri)
  http.request(Net::HTTP::Get.new(uri.request_uri))
end
121148

# Follows HTTP 3xx redirects until a non-redirect response is seen or
# MAX_REDIRECTS is reached, and returns the final response.
#
# A fresh Net::HTTP client is built for every hop because a redirect may
# point at a different host, port, or scheme than the original request —
# the previous implementation reused the original connection, so
# cross-host redirects were sent to the wrong server. Relative Location
# headers are resolved against the current URI via URI.join.
#
# @param http [Net::HTTP] client used for the initial request (unused after the first hop)
# @param uri [URI] URI the initial request was sent to
# @param response [Net::HTTPResponse] response from the initial request
# @return [Net::HTTPResponse] the last response received
def follow_redirects(http, uri, response)
  redirect_count = 0
  while response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
    location = response['location']
    @logger.info("Redirecting to: #{location}")
    uri = URI.join(uri.to_s, location) # handles relative Location values too
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'
    response = http.get(uri.request_uri)
    redirect_count += 1
  end
  response
end
131164

# Translates an HTTP response into a status on the URL result. A nil
# response is recorded as an error; 2xx/3xx count as alive; anything
# else is marked dead and a Wayback Machine snapshot lookup is triggered.
#
# @param response [Net::HTTPResponse, nil] the HTTP response to classify
# @param url_result [Hash] result hash to annotate with :status
def process_response(response, url_result)
  case response
  when nil
    url_result[:status] = 'Error: No response received'
  when Net::HTTPSuccess, Net::HTTPRedirection
    url_result[:status] = 'Alive'
  else
    url_result[:status] = "Not Alive (Status Code: #{response.code})"
    fetch_wayback_snapshot(url_result)
  end
end
142179

# Records a failed check on the result hash: the exception message
# becomes the status and any archived snapshot is cleared. (No logging
# happens here, despite what the old comment claimed — the caller logs.)
#
# @param url_result [Hash] result hash to annotate
# @param error [StandardError] the exception raised while checking the URL
def handle_error(url_result, error)
  failure = "Error: #{error.message}"
  url_result[:status] = failure
  url_result[:archived_snapshot] = nil
end
147188

189+
# Fetches the archived snapshot of a URL from the Wayback Machine.
190+
#
191+
# @param url_result [Hash] The result hash for the URL
148192
def fetch_wayback_snapshot(url_result)
149193
wayback_url = "#{WAYBACK_MACHINE_API_URL}#{url_result[:url]}"
150194
retries = 0
151195

152196
begin
153-
response = Faraday.get(wayback_url)
197+
uri = URI.parse(wayback_url)
198+
response = Net::HTTP.get_response(uri)
154199
handle_wayback_response(response, url_result)
155200
rescue StandardError => e
156201
retries += 1
@@ -164,43 +209,52 @@ def fetch_wayback_snapshot(url_result)
164209
end
165210
end
166211

# Interprets the Wayback Machine availability response and records the
# closest archived snapshot (or an explanatory message) on the result.
#
# @param response [Net::HTTPResponse] response from the availability API
# @param url_result [Hash] result hash to annotate with :archived_snapshot
def handle_wayback_response(response, url_result)
  unless response.is_a?(Net::HTTPSuccess)
    url_result[:archived_snapshot] = 'Error fetching Wayback Machine data'
    return
  end

  payload = JSON.parse(response.body)
  closest = payload.dig('archived_snapshots', 'closest', 'url')
  url_result[:archived_snapshot] = closest || 'No archived version found'
end
176225

# Writes all accumulated URL-check results to 'url_check_results.json'
# as pretty-printed JSON and logs where they were stored.
def save_results_to_file
  File.write('url_check_results.json', JSON.pretty_generate(@results))
  @logger.info('Results have been saved to "url_check_results.json".')
end
181231

# Appends one URL-check result, as a single JSON line, to the
# checked-URLs file so progress survives interruption.
#
# @param result [Hash] the result of the URL check to persist
def save_progress(result)
  File.write(CHECKED_URLS_FILE, "#{JSON.generate(result)}\n", mode: 'a')
end
185238

# Loads the URLs that were already checked in a previous run, reading
# the JSON-lines progress file one line at a time.
#
# @return [Array<String>] URLs that have been checked previously
def load_checked_urls
  return [] unless File.exist?(CHECKED_URLS_FILE)

  checked = []
  File.foreach(CHECKED_URLS_FILE) { |row| checked << JSON.parse(row)['url'] }
  checked
end
197247

# Gracefully stops the given thread pool: requests shutdown, blocks
# until all queued work has finished, then logs completion.
#
# @param thread_pool [Concurrent::FixedThreadPool] the pool to stop
def shutdown_thread_pool(thread_pool)
  thread_pool.shutdown
  thread_pool.wait_for_termination
  @logger.info('Thread pool shut down successfully.')
end
203256

257+
# Updates the progress bar and prints the progress to the console.
204258
def update_progress
205259
@processed_urls += 1
206260
percentage = (@processed_urls.to_f / @total_urls * 100).round
@@ -224,42 +278,45 @@ def update_progress
224278
end
225279
end.parse!
226280

281+
# Validate input file
227282
unless options[:file] && File.exist?(options[:file])
228283
puts "Please provide a valid JSON file with URLs and paths."
229284
exit 1
230285
end
231286

287+
# Set the desired log level
232288
log_level = options[:log_level] || 'INFO'
233289
log_level = Logger.const_get(log_level)
234290

235-
urls_with_paths = Oj.load(File.read(options[:file]))
291+
# Parse the JSON file containing URLs and paths
292+
urls_with_paths = JSON.parse(File.read(options[:file]))
236293

294+
# Map the data to the format required by the checker
237295
mapped_data = urls_with_paths.flat_map do |_path, metadata|
238296
metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }
239297
end
240298

299+
# Validate the structure of the mapped data
241300
unless mapped_data.is_a?(Array) && mapped_data.all? { |entry| entry['ref'] && entry['path'] }
242301
puts "Invalid JSON structure. The file should contain an array of objects with 'ref' and 'path' keys."
243302
exit 1
244303
end
245304

305+
# Create the final list of URLs and paths
246306
urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }
247307

248-
# Record the start time
308+
# Record the start time for performance tracking
249309
start_time = Time.now
250310

251-
# Create and run the UrlChecker
311+
# Create and run the UrlChecker instance
252312
url_checker = UrlChecker.new(urls_with_paths_final, log_level: log_level)
253313
url_checker.check_urls
254314

255315
# Calculate and display the total time taken
256316
end_time = Time.now
257317
elapsed_time = end_time - start_time
258-
259-
# Convert seconds into minutes and seconds
260318
minutes = (elapsed_time / 60).to_i
261319
seconds = (elapsed_time % 60).to_i
262320

263-
# Output the time in minutes and seconds
264321
puts "\nTotal time taken: #{minutes} minutes and #{seconds} seconds"
265322
end

0 commit comments

Comments
 (0)