##
# This script checks the status of URLs from a provided JSON file.
# It validates if URLs are alive, handles redirects, and fetches Wayback Machine snapshots for URLs that are down.
# It logs the status of each URL, including errors, redirects, and archived snapshots.
#
# Usage: ruby tools/dev/detect_dead_reference_links.rb -f db/modules_metadata_base.json -l WARN
#
68
7- require 'faraday'
9+ require 'net/http'
10+ require 'uri'
811require 'json'
912require 'csv'
1013require 'concurrent'
1114require 'logger'
1215require 'fileutils'
1316require 'optparse'
1417require 'benchmark'
15- require 'oj' # Optimized JSON
1618
1719class UrlChecker
1820 WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
19- MAX_REDIRECTS = 5
20- THREAD_POOL_SIZE = 20 # Increased thread pool size for better concurrency
21- CHECKED_URLS_FILE = 'checked_urls.jsonl'
22- BATCH_SIZE = 1000
23- MAX_RETRIES = 3
24- RETRY_DELAY = 5
25-
21+ MAX_REDIRECTS = 5 # Maximum number of redirects to follow for each URL
22+ THREAD_POOL_SIZE = 5 # Number of threads in the pool to process URLs concurrently
23+ CHECKED_URLS_FILE = 'checked_urls.jsonl' # File to save URLs that have been checked
24+ BATCH_SIZE = 1000 # Number of URLs to process in each batch
25+ MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
26+ RETRY_DELAY = 5 # Delay in seconds between retries
27+
28+ # Initializes the URL checker with the given URLs and logging options
29+ #
30+ # @param urls_with_paths [Array<Hash>] An array of hashes containing URL and path information.
31+ # @param log_level [Logger::Severity] The desired logging level (default: Logger::INFO)
2632 def initialize ( urls_with_paths , log_level : Logger ::INFO )
2733 @urls_with_paths = urls_with_paths
2834 @results = [ ]
@@ -34,6 +40,8 @@ def initialize(urls_with_paths, log_level: Logger::INFO)
3440 @processed_urls = 0
3541 end
3642
43+ # Checks the provided URLs for status and fetches Wayback Machine snapshots if needed.
44+ # URLs are processed in batches, and results are saved to a file.
3745 def check_urls
3846 pool = Concurrent ::FixedThreadPool . new ( THREAD_POOL_SIZE )
3947 at_exit { shutdown_thread_pool ( pool ) }
@@ -45,19 +53,20 @@ def check_urls
4553 result = check_url ( url_with_path )
4654 @results << result
4755 @checked_urls << url_with_path [ :url ]
48- save_progress_batch ( [ result ] )
56+ save_progress ( result )
4957
5058 # Update the progress bar after each URL is processed
5159 update_progress
5260 end
5361 end
5462
55- # Wait for batch to finish
63+ # Wait for all futures in the batch to finish
5664 Concurrent ::Promises . zip ( *futures ) . wait!
5765 # Sleep between batches to reduce resource consumption
5866 sleep 5
5967 end
6068
69+ # Save the results to a file after all URLs are processed
6170 save_results_to_file
6271 ensure
6372 pool . shutdown
@@ -67,37 +76,46 @@ def check_urls
6776
6877 private
6978
79+ # Returns the URLs that haven't been checked yet
80+ #
81+ # @return [Array<Hash>] An array of URLs with paths that haven't been checked
7082 def unchecked_urls
7183 @urls_with_paths . reject { |url_with_path | @checked_urls . include? ( url_with_path [ :url ] ) }
7284 end
7385
86+ # Checks the status of a single URL.
87+ #
88+ # @param url_with_path [Hash] The URL and associated path to check
89+ # @return [Hash] A hash with the URL, path, status, and archived snapshot (if any)
7490 def check_url ( url_with_path )
7591 url_result = { url : url_with_path [ :url ] , path : url_with_path [ :path ] , status : nil , archived_snapshot : nil }
7692
77- # Check if the URL is already a Wayback link
78- if url_with_path [ :url ] . start_with? ( 'http://web.archive.org/web' )
93+ # Skip non-URL references or Wayback links
94+ if !url_with_path [ :url ] . start_with? ( 'URL-' )
95+ url_result [ :status ] = 'Skipped (not a URL- reference)'
96+ return url_result
97+ elsif url_with_path [ :url ] . start_with? ( 'http://web.archive.org/web' )
7998 url_result [ :status ] = 'Wayback link (skipped)'
8099 return url_result
81100 end
82101
83- # Remove " URL-" prefix
102+ # Clean the URL and validate it
84103 cleaned_url = url_with_path [ :url ] . sub ( /^URL-/ , '' )
85104 if !valid_url? ( cleaned_url )
86105 url_result [ :status ] = "Invalid URL"
87106 return url_result
88107 end
89108
90- # Use Faraday for HTTP requests (persistent connection with pooling)
91- conn = Faraday . new ( url : cleaned_url ) do |faraday |
92- faraday . adapter Faraday . default_adapter # Net::HTTP
93- faraday . options . timeout = 10 # seconds
94- faraday . options . open_timeout = 5 # seconds
95- end
109+ # Prepare the HTTP request
110+ uri = URI . parse ( cleaned_url )
111+ http = Net ::HTTP . new ( uri . host , uri . port )
112+ http . use_ssl = uri . scheme == 'https'
96113
97114 start_time = Time . now
115+
98116 begin
99- response = get_response ( conn , cleaned_url )
100- follow_redirects ( conn , response )
117+ response = get_response ( http , uri )
118+ follow_redirects ( http , uri , response )
101119 rescue StandardError => e
102120 handle_error ( url_result , e )
103121 end
@@ -111,46 +129,73 @@ def check_url(url_with_path)
111129 save_progress ( url_result )
112130 end
113131
132+ # Checks if a given URL is valid.
133+ #
134+ # @param url [String] The URL to check
135+ # @return [Boolean] True if the URL is valid, false otherwise
114136 def valid_url? ( url )
115137 URI . parse ( url ) . is_a? ( URI ::HTTP ) rescue false
116138 end
117139
118- def get_response ( conn , url )
119- conn . get ( url )
140+ # Sends an HTTP GET request to the specified URI.
141+ #
142+ # @param http [Net::HTTP] The HTTP client to use
143+ # @param uri [URI] The URI to send the GET request to
144+ # @return [Net::HTTPResponse] The response from the HTTP request
145+ def get_response ( http , uri )
146+ http . get ( uri . request_uri )
120147 end
121148
122- def follow_redirects ( conn , response )
149+ # Follows HTTP redirects until the maximum redirect limit is reached.
150+ #
151+ # @param http [Net::HTTP] The HTTP client to use
152+ # @param uri [URI] The current URI to check
153+ # @param response [Net::HTTPResponse] The current response from the server
154+ def follow_redirects ( http , uri , response )
123155 redirect_count = 0
124- while response . status . to_i == 3 && redirect_count < MAX_REDIRECTS # HTTP 3xx redirects
125- location = response . headers [ 'Location ']
156+ while response . is_a? ( Net :: HTTPRedirection ) && redirect_count < MAX_REDIRECTS
157+ location = response [ 'location ']
126158 @logger . info ( "Redirecting to: #{ location } " )
127- response = conn . get ( location )
159+ uri = URI . parse ( location )
160+ response = http . get ( uri . request_uri )
128161 redirect_count += 1
129162 end
130163 end
131164
165+ # Processes the HTTP response and updates the URL result with its status.
166+ #
167+ # @param response [Net::HTTPResponse] The HTTP response to process
168+ # @param url_result [Hash] The result hash for the URL
132169 def process_response ( response , url_result )
133170 if response . nil?
134171 url_result [ :status ] = 'Error: No response received'
135- elsif response . status . to_i . between? ( 200 , 299 ) || response . status . to_i . between? ( 300 , 399 )
172+ elsif response . is_a? ( Net :: HTTPSuccess ) || response . is_a? ( Net :: HTTPRedirection )
136173 url_result [ :status ] = 'Alive'
137174 else
138- url_result [ :status ] = "Not Alive (Status Code: #{ response . status } )"
175+ url_result [ :status ] = "Not Alive (Status Code: #{ response . code } )"
139176 fetch_wayback_snapshot ( url_result )
140177 end
141178 end
142179
180+ # Handles errors by logging them and setting the error status on the URL result.
181+ #
182+ # @param url_result [Hash] The result hash for the URL
183+ # @param error [StandardError] The error encountered
143184 def handle_error ( url_result , error )
144185 url_result [ :status ] = "Error: #{ error . message } "
145186 url_result [ :archived_snapshot ] = nil
146187 end
147188
189+ # Fetches the archived snapshot of a URL from the Wayback Machine.
190+ #
191+ # @param url_result [Hash] The result hash for the URL
148192 def fetch_wayback_snapshot ( url_result )
149193 wayback_url = "#{ WAYBACK_MACHINE_API_URL } #{ url_result [ :url ] } "
150194 retries = 0
151195
152196 begin
153- response = Faraday . get ( wayback_url )
197+ uri = URI . parse ( wayback_url )
198+ response = Net ::HTTP . get_response ( uri )
154199 handle_wayback_response ( response , url_result )
155200 rescue StandardError => e
156201 retries += 1
@@ -164,43 +209,52 @@ def fetch_wayback_snapshot(url_result)
164209 end
165210 end
166211
212+ # Handles the Wayback Machine response and updates the URL result with the archived snapshot.
213+ #
214+ # @param response [Net::HTTPResponse] The Wayback Machine response
215+ # @param url_result [Hash] The result hash for the URL
167216 def handle_wayback_response ( response , url_result )
168- if response . status . to_i == 200
169- data = Oj . load ( response . body )
217+ if response . is_a? ( Net :: HTTPSuccess )
218+ data = JSON . parse ( response . body )
170219 snapshot = data . dig ( 'archived_snapshots' , 'closest' , 'url' )
171220 url_result [ :archived_snapshot ] = snapshot || 'No archived version found'
172221 else
173222 url_result [ :archived_snapshot ] = 'Error fetching Wayback Machine data'
174223 end
175224 end
176225
226+ # Saves the results of the URL checks to a file.
177227 def save_results_to_file
178228 File . open ( 'url_check_results.json' , 'w' ) { |file | file . write ( JSON . pretty_generate ( @results ) ) }
179229 @logger . info ( 'Results have been saved to "url_check_results.json".' )
180230 end
181231
232+ # Appends the progress of a URL check to the checked URLs file.
233+ #
234+ # @param result [Hash] The result of the URL check to save
182235 def save_progress ( result )
183236 File . open ( CHECKED_URLS_FILE , 'a' ) { |file | file . puts JSON . generate ( result ) }
184237 end
185238
186- def save_progress_batch ( results )
187- File . open ( CHECKED_URLS_FILE , 'a' ) do |file |
188- results . each { |result | file . puts JSON . generate ( result ) }
189- end
190- end
191-
239+ # Loads the URLs that have already been checked from the file.
240+ #
241+ # @return [Array<String>] An array of URLs that have been checked previously
192242 def load_checked_urls
193243 return [ ] unless File . exist? ( CHECKED_URLS_FILE )
194244
195245 File . readlines ( CHECKED_URLS_FILE ) . map { |row | JSON . parse ( row ) [ 'url' ] }
196246 end
197247
248+ # Shuts down the thread pool and waits for all threads to finish.
249+ #
250+ # @param pool [Concurrent::FixedThreadPool] The thread pool to shut down
198251 def shutdown_thread_pool ( pool )
199252 pool . shutdown
200253 pool . wait_for_termination
201254 @logger . info ( 'Thread pool shut down successfully.' )
202255 end
203256
257+ # Updates the progress bar and prints the progress to the console.
204258 def update_progress
205259 @processed_urls += 1
206260 percentage = ( @processed_urls . to_f / @total_urls * 100 ) . round
@@ -224,42 +278,45 @@ def update_progress
224278 end
225279 end . parse!
226280
281+ # Validate input file
227282 unless options [ :file ] && File . exist? ( options [ :file ] )
228283 puts "Please provide a valid JSON file with URLs and paths."
229284 exit 1
230285 end
231286
287+ # Set the desired log level
232288 log_level = options [ :log_level ] || 'INFO'
233289 log_level = Logger . const_get ( log_level )
234290
235- urls_with_paths = Oj . load ( File . read ( options [ :file ] ) )
291+ # Parse the JSON file containing URLs and paths
292+ urls_with_paths = JSON . parse ( File . read ( options [ :file ] ) )
236293
294+ # Map the data to the format required by the checker
237295 mapped_data = urls_with_paths . flat_map do |_path , metadata |
238296 metadata [ 'references' ] . map { |ref | { 'path' => metadata [ 'path' ] , 'ref' => ref } }
239297 end
240298
299+ # Validate the structure of the mapped data
241300 unless mapped_data . is_a? ( Array ) && mapped_data . all? { |entry | entry [ 'ref' ] && entry [ 'path' ] }
242301 puts "Invalid JSON structure. The file should contain an array of objects with 'ref' and 'path' keys."
243302 exit 1
244303 end
245304
305+ # Create the final list of URLs and paths
246306 urls_with_paths_final = mapped_data . map { |entry | { url : entry [ 'ref' ] , path : entry [ 'path' ] } }
247307
248- # Record the start time
308+ # Record the start time for performance tracking
249309 start_time = Time . now
250310
251- # Create and run the UrlChecker
311+ # Create and run the UrlChecker instance
252312 url_checker = UrlChecker . new ( urls_with_paths_final , log_level : log_level )
253313 url_checker . check_urls
254314
255315 # Calculate and display the total time taken
256316 end_time = Time . now
257317 elapsed_time = end_time - start_time
258-
259- # Convert seconds into minutes and seconds
260318 minutes = ( elapsed_time / 60 ) . to_i
261319 seconds = ( elapsed_time % 60 ) . to_i
262320
263- # Output the time in minutes and seconds
264321 puts "\n Total time taken: #{ minutes } minutes and #{ seconds } seconds"
265322end
0 commit comments