@@ -25,28 +25,27 @@ class UrlChecker
2525 MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
2626 RETRY_DELAY = 5 # Delay in seconds between retries
2727
28- # Initializes the URL checker with the given URLs and logging options
29- #
30- # @param urls_with_paths [Array<Hash>] An array of hashes containing URL and path information.
31- # @param log_level [Logger::Severity] The desired logging level (default: Logger::INFO)
28+ # Initializes the UrlChecker instance with given URLs and configuration options
29+ # @param [Array<Hash>] urls_with_paths A list of URLs with associated paths to check
30+ # @param [Logger::Level] log_level The logging level (defaults to Logger::INFO)
3231 def initialize ( urls_with_paths , log_level : Logger ::INFO )
3332 @urls_with_paths = urls_with_paths
3433 @results = [ ]
3534 @checked_urls = load_checked_urls
3635 @url_times = [ ]
37- @logger = Logger . new ( STDOUT )
36+ @logger = Logger . new ( $stdout )
3837 @logger . level = log_level
3938 @total_urls = urls_with_paths . size
4039 @processed_urls = 0
4140 end
4241
43- # Checks the provided URLs for status and fetches Wayback Machine snapshots if needed.
44- # URLs are processed in batches, and results are saved to a file .
42+ # Starts the process of checking all URLs in batches, logging results and saving progress
43+ # in a thread-safe manner .
4544 def check_urls
4645 pool = Concurrent ::FixedThreadPool . new ( THREAD_POOL_SIZE )
4746 at_exit { shutdown_thread_pool ( pool ) }
4847
49- # Process URLs in batches to prevent excessive memory usage
48+ # Process URLs in batches to avoid overwhelming the system
5049 @urls_with_paths . each_slice ( BATCH_SIZE ) do |batch |
5150 futures = batch . map do |url_with_path |
5251 Concurrent ::Promises . future ( pool ) do
@@ -55,18 +54,17 @@ def check_urls
5554 @checked_urls << url_with_path [ :url ]
5655 save_progress ( result )
5756
58- # Update the progress bar after each URL is processed
5957 update_progress
6058 end
6159 end
6260
63- # Wait for all futures in the batch to finish
61+ # Wait for all futures in the current batch to finish before proceeding
6462 Concurrent ::Promises . zip ( *futures ) . wait!
65- # Sleep between batches to reduce resource consumption
63+
64+ # Sleep between batches to avoid overloading the server
6665 sleep 5
6766 end
6867
69- # Save the results to a file after all URLs are processed
7068 save_results_to_file
7169 ensure
7270 pool . shutdown
@@ -76,21 +74,19 @@ def check_urls
7674
7775 private
7876
79- # Returns the URLs that haven't been checked yet
80- #
81- # @return [Array<Hash>] An array of URLs with paths that haven't been checked
# Returns the subset of the input list whose URLs have no entry in the
# already-checked set loaded at startup.
# @return [Array<Hash>] URL/path hashes that still need checking
def unchecked_urls
  already_seen = @checked_urls
  @urls_with_paths.reject { |entry| already_seen.include?(entry[:url]) }
end
8582
86- # Checks the status of a single URL.
87- #
88- # @param url_with_path [Hash] The URL and associated path to check
89- # @return [Hash] A hash with the URL, path, status, and archived snapshot (if any)
83+ # Checks a single URL and processes its response.
84+ # @param [Hash] url_with_path The URL and its associated path to check
85+ # @return [Hash] A result containing the URL, path, status, and archived snapshot (if available)
9086 def check_url ( url_with_path )
9187 url_result = { url : url_with_path [ :url ] , path : url_with_path [ :path ] , status : nil , archived_snapshot : nil }
9288
93- # Skip non-URL references or Wayback links
89+ # Skip non-URL references and Wayback links
9490 if !url_with_path [ :url ] . start_with? ( 'URL-' )
9591 url_result [ :status ] = 'Skipped (not a URL- reference)'
9692 return url_result
@@ -101,8 +97,10 @@ def check_url(url_with_path)
10197
10298 # Clean the URL and validate it
10399 cleaned_url = url_with_path [ :url ] . sub ( /^URL-/ , '' )
100+
101+ # Check if the URL is valid
104102 if !valid_url? ( cleaned_url )
105- url_result [ :status ] = " Invalid URL"
103+ url_result [ :status ] = ' Invalid URL'
106104 return url_result
107105 end
108106
@@ -114,12 +112,14 @@ def check_url(url_with_path)
114112 start_time = Time . now
115113
116114 begin
115+ # Get the HTTP response and handle redirects
117116 response = get_response ( http , uri )
118117 follow_redirects ( http , uri , response )
119118 rescue StandardError => e
120119 handle_error ( url_result , e )
121120 end
122121
122+ # Process the response (check for success, failure, or error)
123123 process_response ( response , url_result )
124124 elapsed_time = Time . now - start_time
125125 @url_times << elapsed_time
@@ -129,28 +129,27 @@ def check_url(url_with_path)
129129 save_progress ( url_result )
130130 end
131131
132- # Checks if a given URL is valid.
133- #
134- # @param url [String] The URL to check
# Validates that a string parses as an HTTP(S) URL.
# Only URI::InvalidURIError is rescued so unexpected failures
# are not silently masked; nil is tolerated via to_s.
# @param [String] url The URL to check
# @return [Boolean] True if the URL is a valid HTTP(S) URL, false otherwise
def valid_url?(url)
  # URI::HTTPS is a subclass of URI::HTTP, so both schemes pass.
  URI.parse(url.to_s).is_a?(URI::HTTP)
rescue URI::InvalidURIError
  false
end
139140
140- # Sends an HTTP GET request to the specified URI.
141- #
142- # @param http [Net::HTTP] The HTTP client to use
143- # @param uri [URI] The URI to send the GET request to
144- # @return [Net::HTTPResponse] The response from the HTTP request
# Issues an HTTP GET for the given URI over an already-open connection.
# @param [Net::HTTP] http Open HTTP client to send the request on
# @param [URI] uri Target URI; only its path + query portion is used
# @return [Net::HTTPResponse] The server's response
def get_response(http, uri)
  request_target = uri.request_uri
  http.get(request_target)
end
148148
149- # Follows HTTP redirects until the maximum redirect limit is reached.
150- #
151- # @param http [Net::HTTP] The HTTP client to use
152- # @param uri [URI] The current URI to check
153- # @param response [Net::HTTPResponse] The current response from the server
149+ # Follows HTTP redirects up to a maximum limit (MAX_REDIRECTS)
150+ # @param [Net::HTTP] http The HTTP client
151+ # @param [URI] uri The original URI
152+ # @param [Net::HTTPResponse] response The HTTP response to process
154153 def follow_redirects ( http , uri , response )
155154 redirect_count = 0
156155 while response . is_a? ( Net ::HTTPRedirection ) && redirect_count < MAX_REDIRECTS
@@ -162,10 +161,9 @@ def follow_redirects(http, uri, response)
162161 end
163162 end
164163
165- # Processes the HTTP response and updates the URL result with its status.
166- #
167- # @param response [Net::HTTPResponse] The HTTP response to process
168- # @param url_result [Hash] The result hash for the URL
164+ # Processes the HTTP response to determine the URL status
165+ # @param [Net::HTTPResponse] response The HTTP response to process
166+ # @param [Hash] url_result The result hash to update with the status
169167 def process_response ( response , url_result )
170168 if response . nil?
171169 url_result [ :status ] = 'Error: No response received'
@@ -177,18 +175,16 @@ def process_response(response, url_result)
177175 end
178176 end
179177
180- # Handles errors by logging them and setting the error status on the URL result.
181- #
182- # @param url_result [Hash] The result hash for the URL
183- # @param error [StandardError] The error encountered
# Records a failure on the result hash for a URL that raised during checking.
# @param [Hash] url_result Result hash mutated in place with the error status
# @param [StandardError] error The exception that was raised
def handle_error(url_result, error)
  failure_status = "Error: #{error.message}"
  url_result[:status] = failure_status
  # Any previously fetched snapshot is invalidated on error.
  url_result[:archived_snapshot] = nil
end
188185
189- # Fetches the archived snapshot of a URL from the Wayback Machine.
190- #
191- # @param url_result [Hash] The result hash for the URL
186+ # Attempts to fetch the Wayback Machine snapshot for the URL
187+ # @param [Hash] url_result The result hash to update with the Wayback snapshot information
192188 def fetch_wayback_snapshot ( url_result )
193189 wayback_url = "#{ WAYBACK_MACHINE_API_URL } #{ url_result [ :url ] } "
194190 retries = 0
@@ -209,10 +205,9 @@ def fetch_wayback_snapshot(url_result)
209205 end
210206 end
211207
212- # Handles the Wayback Machine response and updates the URL result with the archived snapshot.
213- #
214- # @param response [Net::HTTPResponse] The Wayback Machine response
215- # @param url_result [Hash] The result hash for the URL
208+ # Processes the response from the Wayback Machine API
209+ # @param [Net::HTTPResponse] response The response from the Wayback Machine
210+ # @param [Hash] url_result The result hash to update with the archived snapshot URL
216211 def handle_wayback_response ( response , url_result )
217212 if response . is_a? ( Net ::HTTPSuccess )
218213 data = JSON . parse ( response . body )
@@ -223,38 +218,35 @@ def handle_wayback_response(response, url_result)
223218 end
224219 end
225220
226- # Saves the results of the URL checks to a file.
# Writes the accumulated check results to 'url_check_results.json'
# as pretty-printed JSON and logs completion.
def save_results_to_file
  # File.write opens, truncates, writes, and closes in a single call —
  # idiomatic replacement for the manual File.open('w') { write } form.
  File.write('url_check_results.json', JSON.pretty_generate(@results))
  @logger.info('Results have been saved to "url_check_results.json".')
end
231226
232- # Appends the progress of a URL check to the checked URLs file.
233- #
234- # @param result [Hash] The result of the URL check to save
# Appends one URL-check result as a single JSON line to the progress file.
# NOTE(review): invoked from worker threads; confirm that concurrent
# line-sized appends cannot interleave on the target platform.
# @param [Hash] result The result of a single URL check
def save_progress(result)
  File.open(CHECKED_URLS_FILE, 'a') do |file|
    file.puts(result.to_json)
  end
end
238232
239- # Loads the URLs that have already been checked from the file.
240- #
241- # @return [Array<String>] An array of URLs that have been checked previously
# Reads the progress file (one JSON object per line) and extracts the
# URL recorded on each line.
# @return [Array<String>] URLs already checked in a prior run; empty when
#   no progress file exists yet
def load_checked_urls
  return [] unless File.exist?(CHECKED_URLS_FILE)

  File.foreach(CHECKED_URLS_FILE).map do |line|
    JSON.parse(line)['url']
  end
end
247240
248- # Shuts down the thread pool and waits for all threads to finish.
249- #
250- # @param pool [Concurrent::FixedThreadPool] The thread pool to shut down
# Drains and stops the worker pool, blocking until every thread exits,
# then logs completion.
# @param [Concurrent::FixedThreadPool] pool The thread pool to shut down
def shutdown_thread_pool(pool)
  # tap returns the pool, so shutdown is requested before we block on it.
  pool.tap(&:shutdown).wait_for_termination
  @logger.info('Thread pool shut down successfully.')
end
256248
257- # Updates the progress bar and prints the progress to the console.
249+ # Updates the progress bar based on the number of URLs processed
258250 def update_progress
259251 @processed_urls += 1
260252 percentage = ( @processed_urls . to_f / @total_urls * 100 ) . round
@@ -264,27 +256,28 @@ def update_progress
264256 end
265257end
266258
267- if __FILE__ == $0
259+ # Main entry point to run the URL checking process
260+ if __FILE__ == $PROGRAM_NAME
268261 options = { }
269262 OptionParser . new do |opts |
270- opts . banner = " Usage: ruby url_checker.rb [options]"
263+ opts . banner = ' Usage: ruby url_checker.rb [options]'
271264
272- opts . on ( "-f" , " --file FILE" , " JSON file containing URLs and paths" ) do |file |
265+ opts . on ( '-f' , ' --file FILE' , ' JSON file containing URLs and paths' ) do |file |
273266 options [ :file ] = file
274267 end
275268
276- opts . on ( "-l" , " --log-level LEVEL" , " Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)" ) do |log_level |
269+ opts . on ( '-l' , ' --log-level LEVEL' , ' Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)' ) do |log_level |
277270 options [ :log_level ] = log_level . upcase . to_sym
278271 end
279272 end . parse!
280273
281274 # Validate input file
282275 unless options [ :file ] && File . exist? ( options [ :file ] )
283- puts " Please provide a valid JSON file with URLs and paths."
276+ puts ' Please provide a valid JSON file with URLs and paths.'
284277 exit 1
285278 end
286279
287- # Set the desired log level
280+ # Handling for log level
288281 log_level = options [ :log_level ] || 'INFO'
289282 log_level = Logger . const_get ( log_level )
290283
@@ -305,15 +298,15 @@ def update_progress
305298 # Create the final list of URLs and paths
306299 urls_with_paths_final = mapped_data . map { |entry | { url : entry [ 'ref' ] , path : entry [ 'path' ] } }
307300
308- # Record the start time for performance tracking
309301 start_time = Time . now
310302
311303 # Create and run the UrlChecker instance
312304 url_checker = UrlChecker . new ( urls_with_paths_final , log_level : log_level )
313305 url_checker . check_urls
314306
315- # Calculate and display the total time taken
307+
316308 end_time = Time . now
309+ # Calculate and display the total time taken
317310 elapsed_time = end_time - start_time
318311 minutes = ( elapsed_time / 60 ) . to_i
319312 seconds = ( elapsed_time % 60 ) . to_i
0 commit comments