Skip to content

Commit d87c28e

Browse files
committed
Final tidy up
1 parent 435f728 commit d87c28e

File tree

2 files changed

+88
-77
lines changed

2 files changed

+88
-77
lines changed

tools/dev/detect_dead_reference_links.rb

Lines changed: 62 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -25,28 +25,27 @@ class UrlChecker
2525
MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
2626
RETRY_DELAY = 5 # Delay in seconds between retries
2727

28-
# Initializes the URL checker with the given URLs and logging options
29-
#
30-
# @param urls_with_paths [Array<Hash>] An array of hashes containing URL and path information.
31-
# @param log_level [Logger::Severity] The desired logging level (default: Logger::INFO)
28+
# Initializes the UrlChecker instance with given URLs and configuration options
29+
# @param [Array<Hash>] urls_with_paths A list of URLs with associated paths to check
30+
# @param [Logger::Severity] log_level The logging level, one of the Logger severity constants (defaults to Logger::INFO)
3231
def initialize(urls_with_paths, log_level: Logger::INFO)
3332
@urls_with_paths = urls_with_paths
3433
@results = []
3534
@checked_urls = load_checked_urls
3635
@url_times = []
37-
@logger = Logger.new(STDOUT)
36+
@logger = Logger.new($stdout)
3837
@logger.level = log_level
3938
@total_urls = urls_with_paths.size
4039
@processed_urls = 0
4140
end
4241

43-
# Checks the provided URLs for status and fetches Wayback Machine snapshots if needed.
44-
# URLs are processed in batches, and results are saved to a file.
42+
# Starts the process of checking all URLs in batches, logging results and saving progress
43+
# in a thread-safe manner.
4544
def check_urls
4645
pool = Concurrent::FixedThreadPool.new(THREAD_POOL_SIZE)
4746
at_exit { shutdown_thread_pool(pool) }
4847

49-
# Process URLs in batches to prevent excessive memory usage
48+
# Process URLs in batches to avoid overwhelming the system
5049
@urls_with_paths.each_slice(BATCH_SIZE) do |batch|
5150
futures = batch.map do |url_with_path|
5251
Concurrent::Promises.future(pool) do
@@ -55,18 +54,17 @@ def check_urls
5554
@checked_urls << url_with_path[:url]
5655
save_progress(result)
5756

58-
# Update the progress bar after each URL is processed
5957
update_progress
6058
end
6159
end
6260

63-
# Wait for all futures in the batch to finish
61+
# Wait for all futures in the current batch to finish before proceeding
6462
Concurrent::Promises.zip(*futures).wait!
65-
# Sleep between batches to reduce resource consumption
63+
64+
# Sleep between batches to avoid overloading the server
6665
sleep 5
6766
end
6867

69-
# Save the results to a file after all URLs are processed
7068
save_results_to_file
7169
ensure
7270
pool.shutdown
@@ -76,21 +74,19 @@ def check_urls
7674

7775
private
7876

79-
# Returns the URLs that haven't been checked yet
80-
#
81-
# @return [Array<Hash>] An array of URLs with paths that haven't been checked
77+
# Filters out URLs that have already been checked.
78+
# @return [Array<Hash>] List of URLs and paths that have not been checked yet
8279
def unchecked_urls
8380
@urls_with_paths.reject { |url_with_path| @checked_urls.include?(url_with_path[:url]) }
8481
end
8582

86-
# Checks the status of a single URL.
87-
#
88-
# @param url_with_path [Hash] The URL and associated path to check
89-
# @return [Hash] A hash with the URL, path, status, and archived snapshot (if any)
83+
# Checks a single URL and processes its response.
84+
# @param [Hash] url_with_path The URL and its associated path to check
85+
# @return [Hash] A result containing the URL, path, status, and archived snapshot (if available)
9086
def check_url(url_with_path)
9187
url_result = { url: url_with_path[:url], path: url_with_path[:path], status: nil, archived_snapshot: nil }
9288

93-
# Skip non-URL references or Wayback links
89+
# Skip non-URL references and Wayback links
9490
if !url_with_path[:url].start_with?('URL-')
9591
url_result[:status] = 'Skipped (not a URL- reference)'
9692
return url_result
@@ -101,8 +97,10 @@ def check_url(url_with_path)
10197

10298
# Clean the URL and validate it
10399
cleaned_url = url_with_path[:url].sub(/^URL-/, '')
100+
101+
# Check if the URL is valid
104102
if !valid_url?(cleaned_url)
105-
url_result[:status] = "Invalid URL"
103+
url_result[:status] = 'Invalid URL'
106104
return url_result
107105
end
108106

@@ -114,12 +112,14 @@ def check_url(url_with_path)
114112
start_time = Time.now
115113

116114
begin
115+
# Get the HTTP response and handle redirects
117116
response = get_response(http, uri)
118117
follow_redirects(http, uri, response)
119118
rescue StandardError => e
120119
handle_error(url_result, e)
121120
end
122121

122+
# Process the response (check for success, failure, or error)
123123
process_response(response, url_result)
124124
elapsed_time = Time.now - start_time
125125
@url_times << elapsed_time
@@ -129,28 +129,27 @@ def check_url(url_with_path)
129129
save_progress(url_result)
130130
end
131131

132-
# Checks if a given URL is valid.
133-
#
134-
# @param url [String] The URL to check
132+
# Validates if a URL is properly formatted
133+
# @param [String] url The URL to check
135134
# @return [Boolean] True if the URL is valid, false otherwise
136135
def valid_url?(url)
137-
URI.parse(url).is_a?(URI::HTTP) rescue false
136+
URI.parse(url).is_a?(URI::HTTP)
137+
rescue StandardError
138+
false
138139
end
139140

140-
# Sends an HTTP GET request to the specified URI.
141-
#
142-
# @param http [Net::HTTP] The HTTP client to use
143-
# @param uri [URI] The URI to send the GET request to
144-
# @return [Net::HTTPResponse] The response from the HTTP request
141+
# Sends an HTTP GET request to the specified URI
142+
# @param [Net::HTTP] http The HTTP client
143+
# @param [URI] uri The URI to send the GET request to
144+
# @return [Net::HTTPResponse] The HTTP response
145145
def get_response(http, uri)
146146
http.get(uri.request_uri)
147147
end
148148

149-
# Follows HTTP redirects until the maximum redirect limit is reached.
150-
#
151-
# @param http [Net::HTTP] The HTTP client to use
152-
# @param uri [URI] The current URI to check
153-
# @param response [Net::HTTPResponse] The current response from the server
149+
# Follows HTTP redirects up to a maximum limit (MAX_REDIRECTS)
150+
# @param [Net::HTTP] http The HTTP client
151+
# @param [URI] uri The original URI
152+
# @param [Net::HTTPResponse] response The HTTP response to process
154153
def follow_redirects(http, uri, response)
155154
redirect_count = 0
156155
while response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
@@ -162,10 +161,9 @@ def follow_redirects(http, uri, response)
162161
end
163162
end
164163

165-
# Processes the HTTP response and updates the URL result with its status.
166-
#
167-
# @param response [Net::HTTPResponse] The HTTP response to process
168-
# @param url_result [Hash] The result hash for the URL
164+
# Processes the HTTP response to determine the URL status
165+
# @param [Net::HTTPResponse] response The HTTP response to process
166+
# @param [Hash] url_result The result hash to update with the status
169167
def process_response(response, url_result)
170168
if response.nil?
171169
url_result[:status] = 'Error: No response received'
@@ -177,18 +175,16 @@ def process_response(response, url_result)
177175
end
178176
end
179177

180-
# Handles errors by logging them and setting the error status on the URL result.
181-
#
182-
# @param url_result [Hash] The result hash for the URL
183-
# @param error [StandardError] The error encountered
178+
# Handles errors encountered during URL checking (e.g., network errors)
179+
# @param [Hash] url_result The result hash to update with error information
180+
# @param [StandardError] error The error that was raised
184181
def handle_error(url_result, error)
185182
url_result[:status] = "Error: #{error.message}"
186183
url_result[:archived_snapshot] = nil
187184
end
188185

189-
# Fetches the archived snapshot of a URL from the Wayback Machine.
190-
#
191-
# @param url_result [Hash] The result hash for the URL
186+
# Attempts to fetch the Wayback Machine snapshot for the URL
187+
# @param [Hash] url_result The result hash to update with the Wayback snapshot information
192188
def fetch_wayback_snapshot(url_result)
193189
wayback_url = "#{WAYBACK_MACHINE_API_URL}#{url_result[:url]}"
194190
retries = 0
@@ -209,10 +205,9 @@ def fetch_wayback_snapshot(url_result)
209205
end
210206
end
211207

212-
# Handles the Wayback Machine response and updates the URL result with the archived snapshot.
213-
#
214-
# @param response [Net::HTTPResponse] The Wayback Machine response
215-
# @param url_result [Hash] The result hash for the URL
208+
# Processes the response from the Wayback Machine API
209+
# @param [Net::HTTPResponse] response The response from the Wayback Machine
210+
# @param [Hash] url_result The result hash to update with the archived snapshot URL
216211
def handle_wayback_response(response, url_result)
217212
if response.is_a?(Net::HTTPSuccess)
218213
data = JSON.parse(response.body)
@@ -223,38 +218,35 @@ def handle_wayback_response(response, url_result)
223218
end
224219
end
225220

226-
# Saves the results of the URL checks to a file.
221+
# Saves the final results of the URL checks to a JSON file
227222
def save_results_to_file
228223
File.open('url_check_results.json', 'w') { |file| file.write(JSON.pretty_generate(@results)) }
229224
@logger.info('Results have been saved to "url_check_results.json".')
230225
end
231226

232-
# Appends the progress of a URL check to the checked URLs file.
233-
#
234-
# @param result [Hash] The result of the URL check to save
227+
# Appends the result of a single URL check to the progress file as one JSON line
228+
# @param [Hash] result The result of a single URL check
235229
def save_progress(result)
236230
File.open(CHECKED_URLS_FILE, 'a') { |file| file.puts JSON.generate(result) }
237231
end
238232

239-
# Loads the URLs that have already been checked from the file.
240-
#
241-
# @return [Array<String>] An array of URLs that have been checked previously
233+
# Loads the list of already checked URLs from the progress file
234+
# @return [Array<String>] A list of checked URLs
242235
def load_checked_urls
243236
return [] unless File.exist?(CHECKED_URLS_FILE)
244237

245238
File.readlines(CHECKED_URLS_FILE).map { |row| JSON.parse(row)['url'] }
246239
end
247240

248-
# Shuts down the thread pool and waits for all threads to finish.
249-
#
250-
# @param pool [Concurrent::FixedThreadPool] The thread pool to shut down
241+
# Shuts down the thread pool, waits for it to terminate, and logs the shutdown
242+
# @param [Concurrent::FixedThreadPool] pool The thread pool to shut down
251243
def shutdown_thread_pool(pool)
252244
pool.shutdown
253245
pool.wait_for_termination
254246
@logger.info('Thread pool shut down successfully.')
255247
end
256248

257-
# Updates the progress bar and prints the progress to the console.
249+
# Updates the progress bar based on the number of URLs processed
258250
def update_progress
259251
@processed_urls += 1
260252
percentage = (@processed_urls.to_f / @total_urls * 100).round
@@ -264,27 +256,28 @@ def update_progress
264256
end
265257
end
266258

267-
if __FILE__ == $0
259+
# Main entry point to run the URL checking process
260+
if __FILE__ == $PROGRAM_NAME
268261
options = {}
269262
OptionParser.new do |opts|
270-
opts.banner = "Usage: ruby url_checker.rb [options]"
263+
opts.banner = 'Usage: ruby url_checker.rb [options]'
271264

272-
opts.on("-f", "--file FILE", "JSON file containing URLs and paths") do |file|
265+
opts.on('-f', '--file FILE', 'JSON file containing URLs and paths') do |file|
273266
options[:file] = file
274267
end
275268

276-
opts.on("-l", "--log-level LEVEL", "Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)") do |log_level|
269+
opts.on('-l', '--log-level LEVEL', 'Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)') do |log_level|
277270
options[:log_level] = log_level.upcase.to_sym
278271
end
279272
end.parse!
280273

281274
# Validate input file
282275
unless options[:file] && File.exist?(options[:file])
283-
puts "Please provide a valid JSON file with URLs and paths."
276+
puts 'Please provide a valid JSON file with URLs and paths.'
284277
exit 1
285278
end
286279

287-
# Set the desired log level
280+
# Resolve the requested log level name to its Logger constant (defaults to INFO)
288281
log_level = options[:log_level] || 'INFO'
289282
log_level = Logger.const_get(log_level)
290283

@@ -305,15 +298,15 @@ def update_progress
305298
# Create the final list of URLs and paths
306299
urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }
307300

308-
# Record the start time for performance tracking
309301
start_time = Time.now
310302

311303
# Create and run the UrlChecker instance
312304
url_checker = UrlChecker.new(urls_with_paths_final, log_level: log_level)
313305
url_checker.check_urls
314306

315-
# Calculate and display the total time taken
307+
316308
end_time = Time.now
309+
# Calculate and display the total time taken
317310
elapsed_time = end_time - start_time
318311
minutes = (elapsed_time / 60).to_i
319312
seconds = (elapsed_time % 60).to_i

tools/dev/find_and_replace_dead_reference_links.rb

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,46 @@
1010
require 'json'
1111
require 'fileutils'
1212

13+
# Loads JSON data from the specified file.
14+
# @param file_path [String] the path to the JSON file to load.
15+
# @return [Array] parsed JSON data.
16+
# @raise [Errno::ENOENT] if the file cannot be found.
17+
# @raise [JSON::ParserError] if the JSON is malformed.
1318
def load_json(file_path)
1419
JSON.parse(File.read(file_path))
1520
end
1621

22+
# Replaces the original URLs with archived snapshots in the content of files.
23+
# This method processes each entry in the provided data, and if a valid
24+
# archived snapshot is available, it replaces the URL in the corresponding file.
25+
# @param data [Array] the array of data containing URL and archived_snapshot pairs.
26+
# @return [void]
1727
def replace_links_in_files(data)
1828
data.each_with_index do |entry, index|
1929
puts "Processing entry #{index + 1}: #{entry['url']} -> #{entry['archived_snapshot']}"
2030

21-
url = entry["url"].sub(/^URL-/, '')
22-
path = entry["path"]
23-
archived_snapshot = entry["archived_snapshot"]
31+
url = entry['url'].sub(/^URL-/, '')
32+
path = entry['path']
33+
archived_snapshot = entry['archived_snapshot']
2434

25-
if archived_snapshot == "No archived version found" || archived_snapshot.nil?
26-
puts "Skipping entry #{index + 1} because no archived version is available."
35+
# Skip entries with no archived version or errors fetching the snapshot
36+
if archived_snapshot == 'No archived version found' || archived_snapshot.nil? || archived_snapshot.start_with?('Error fetching Wayback')
37+
puts "Skipping entry #{index + 1} because no archived version is available or there was an error fetching it."
2738
next
2839
end
2940

41+
# Construct full file path and check if file exists
3042
full_path = File.join(Dir.pwd, path)
3143

3244
if File.exist?(full_path)
3345
file_content = File.read(full_path)
3446

47+
# Replace the original URL with the archived snapshot
3548
updated_content = file_content.gsub(url, archived_snapshot)
3649

50+
# Write changes back to the file if any replacements were made
3751
if file_content != updated_content
38-
File.open(full_path, "w") { |file| file.write(updated_content) }
52+
File.open(full_path, 'w') { |file| file.write(updated_content) }
3953
puts "Replaced URL in file: #{full_path}"
4054
else
4155
puts "No change needed for file: #{full_path}"
@@ -47,8 +61,12 @@ def replace_links_in_files(data)
4761
end
4862

4963
begin
50-
json_data = load_json('url_check_results.json') # Change this to the actual JSON file path
64+
# Load the JSON data from the file 'url_check_results.json'
65+
json_data = load_json('url_check_results.json')
66+
67+
# Replace the URLs in files based on the loaded data
5168
replace_links_in_files(json_data)
52-
rescue => e
69+
rescue StandardError => e
70+
# Handle errors gracefully and provide meaningful feedback
5371
puts "An error occurred: #{e.message}"
5472
end

0 commit comments

Comments
 (0)