##
# This script checks the status of URLs from a provided JSON file.
# It validates if URLs are alive, handles redirects, and fetches Wayback Machine snapshots for URLs that are down.
# It logs the status of each URL, including errors, redirects, and archived snapshots.
#
# Usage: ruby tools/dev/detect_dead_reference_links.rb -f db/modules_metadata_base.json -l WARN
#
68
7- require 'faraday'
9+ require 'net/http'
10+ require 'uri'
811require 'json'
912require 'csv'
1013require 'concurrent'
1114require 'logger'
1215require 'fileutils'
1316require 'optparse'
1417require 'benchmark'
15- require 'oj' # Optimized JSON
1618
1719class UrlChecker
1820 WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
19- MAX_REDIRECTS = 5
20- THREAD_POOL_SIZE = 20 # Increased thread pool size for better concurrency
21- CHECKED_URLS_FILE = 'checked_urls.jsonl'
22- BATCH_SIZE = 1000
23- MAX_RETRIES = 3
24- RETRY_DELAY = 5
25-
21+ MAX_REDIRECTS = 5 # Maximum number of redirects to follow for each URL
22+ THREAD_POOL_SIZE = 5 # Number of threads in the pool to process URLs concurrently
23+ CHECKED_URLS_FILE = 'checked_urls.jsonl' # File to save URLs that have been checked
24+ BATCH_SIZE = 1000 # Number of URLs to process in each batch
25+ MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
26+ RETRY_DELAY = 5 # Delay in seconds between retries
27+
28+ # Initializes the URL checker with the given URLs and logging options
29+ #
30+ # @param urls_with_paths [Array<Hash>] An array of hashes containing URL and path information.
31+ # @param log_level [Logger::Severity] The desired logging level (default: Logger::INFO)
2632 def initialize ( urls_with_paths , log_level : Logger ::INFO )
2733 @urls_with_paths = urls_with_paths
2834 @results = [ ]
@@ -34,6 +40,8 @@ def initialize(urls_with_paths, log_level: Logger::INFO)
3440 @processed_urls = 0
3541 end
3642
43+ # Checks the provided URLs for status and fetches Wayback Machine snapshots if needed.
44+ # URLs are processed in batches, and results are saved to a file.
3745 def check_urls
3846 pool = Concurrent ::FixedThreadPool . new ( THREAD_POOL_SIZE )
3947 at_exit { shutdown_thread_pool ( pool ) }
@@ -45,19 +53,20 @@ def check_urls
4553 result = check_url ( url_with_path )
4654 @results << result
4755 @checked_urls << url_with_path [ :url ]
48- save_progress_batch ( [ result ] )
56+ save_progress ( result )
4957
5058 # Update the progress bar after each URL is processed
5159 update_progress
5260 end
5361 end
5462
55- # Wait for batch to finish
63+ # Wait for all futures in the batch to finish
5664 Concurrent ::Promises . zip ( *futures ) . wait!
5765 # Sleep between batches to reduce resource consumption
5866 sleep 5
5967 end
6068
69+ # Save the results to a file after all URLs are processed
6170 save_results_to_file
6271 ensure
6372 pool . shutdown
@@ -67,37 +76,46 @@ def check_urls
6776
6877 private
6978
79+ # Returns the URLs that haven't been checked yet
80+ #
81+ # @return [Array<Hash>] An array of URLs with paths that haven't been checked
7082 def unchecked_urls
7183 @urls_with_paths . reject { |url_with_path | @checked_urls . include? ( url_with_path [ :url ] ) }
7284 end
7385
86+ # Checks the status of a single URL.
87+ #
88+ # @param url_with_path [Hash] The URL and associated path to check
89+ # @return [Hash] A hash with the URL, path, status, and archived snapshot (if any)
7490 def check_url ( url_with_path )
7591 url_result = { url : url_with_path [ :url ] , path : url_with_path [ :path ] , status : nil , archived_snapshot : nil }
7692
77- # Check if the URL is already a Wayback link
78- if url_with_path [ :url ] . start_with? ( 'http://web.archive.org/web' )
93+ # Skip non-URL references or Wayback links
94+ if !url_with_path [ :url ] . start_with? ( 'URL-' )
95+ url_result [ :status ] = 'Skipped (not a URL- reference)'
96+ return url_result
97+ elsif url_with_path [ :url ] . start_with? ( 'http://web.archive.org/web' )
7998 url_result [ :status ] = 'Wayback link (skipped)'
8099 return url_result
81100 end
82101
83- # Remove " URL-" prefix
102+ # Clean the URL and validate it
84103 cleaned_url = url_with_path [ :url ] . sub ( /^URL-/ , '' )
85104 if !valid_url? ( cleaned_url )
86105 url_result [ :status ] = "Invalid URL"
87106 return url_result
88107 end
89108
90- # Use Faraday for HTTP requests (persistent connection with pooling)
91- conn = Faraday . new ( url : cleaned_url ) do |faraday |
92- faraday . adapter Faraday . default_adapter # Net::HTTP
93- faraday . options . timeout = 10 # seconds
94- faraday . options . open_timeout = 5 # seconds
95- end
109+ # Prepare the HTTP request
110+ uri = URI . parse ( cleaned_url )
111+ http = Net ::HTTP . new ( uri . host , uri . port )
112+ http . use_ssl = uri . scheme == 'https'
96113
97114 start_time = Time . now
115+
98116 begin
99- response = get_response ( conn , cleaned_url )
100- follow_redirects ( conn , response )
117+ response = get_response ( http , uri )
118+ follow_redirects ( http , uri , response )
101119 rescue StandardError => e
102120 handle_error ( url_result , e )
103121 end
@@ -111,46 +129,73 @@ def check_url(url_with_path)
111129 save_progress ( url_result )
112130 end
113131
132+ # Checks if a given URL is valid.
133+ #
134+ # @param url [String] The URL to check
135+ # @return [Boolean] True if the URL is valid, false otherwise
114136 def valid_url? ( url )
115137 URI . parse ( url ) . is_a? ( URI ::HTTP ) rescue false
116138 end
117139
118- def get_response ( conn , url )
119- conn . get ( url )
140+ # Sends an HTTP GET request to the specified URI.
141+ #
142+ # @param http [Net::HTTP] The HTTP client to use
143+ # @param uri [URI] The URI to send the GET request to
144+ # @return [Net::HTTPResponse] The response from the HTTP request
145+ def get_response ( http , uri )
146+ http . get ( uri . request_uri )
120147 end
121148
122- def follow_redirects ( conn , response )
149+ # Follows HTTP redirects until the maximum redirect limit is reached.
150+ #
151+ # @param http [Net::HTTP] The HTTP client to use
152+ # @param uri [URI] The current URI to check
153+ # @param response [Net::HTTPResponse] The current response from the server
154+ def follow_redirects ( http , uri , response )
123155 redirect_count = 0
124- while response . status . to_i == 3 && redirect_count < MAX_REDIRECTS # HTTP 3xx redirects
125- location = response . headers [ 'Location ']
156+ while response . is_a? ( Net :: HTTPRedirection ) && redirect_count < MAX_REDIRECTS
157+ location = response [ 'location ']
126158 @logger . info ( "Redirecting to: #{ location } " )
127- response = conn . get ( location )
159+ uri = URI . parse ( location )
160+ response = http . get ( uri . request_uri )
128161 redirect_count += 1
129162 end
130163 end
131164
165+ # Processes the HTTP response and updates the URL result with its status.
166+ #
167+ # @param response [Net::HTTPResponse] The HTTP response to process
168+ # @param url_result [Hash] The result hash for the URL
132169 def process_response ( response , url_result )
133170 if response . nil?
134171 url_result [ :status ] = 'Error: No response received'
135- elsif response . status . to_i . between? ( 200 , 299 ) || response . status . to_i . between? ( 300 , 399 )
172+ elsif response . is_a? ( Net :: HTTPSuccess ) || response . is_a? ( Net :: HTTPRedirection )
136173 url_result [ :status ] = 'Alive'
137174 else
138- url_result [ :status ] = "Not Alive (Status Code: #{ response . status } )"
175+ url_result [ :status ] = "Not Alive (Status Code: #{ response . code } )"
139176 fetch_wayback_snapshot ( url_result )
140177 end
141178 end
142179
180+ # Handles errors by logging them and setting the error status on the URL result.
181+ #
182+ # @param url_result [Hash] The result hash for the URL
183+ # @param error [StandardError] The error encountered
143184 def handle_error ( url_result , error )
144185 url_result [ :status ] = "Error: #{ error . message } "
145186 url_result [ :archived_snapshot ] = nil
146187 end
147188
189+ # Fetches the archived snapshot of a URL from the Wayback Machine.
190+ #
191+ # @param url_result [Hash] The result hash for the URL
148192 def fetch_wayback_snapshot ( url_result )
149193 wayback_url = "#{ WAYBACK_MACHINE_API_URL } #{ url_result [ :url ] } "
150194 retries = 0
151195
152196 begin
153- response = Faraday . get ( wayback_url )
197+ uri = URI . parse ( wayback_url )
198+ response = Net ::HTTP . get_response ( uri )
154199 handle_wayback_response ( response , url_result )
155200 rescue StandardError => e
156201 retries += 1
@@ -164,43 +209,52 @@ def fetch_wayback_snapshot(url_result)
164209 end
165210 end
166211
212+ # Handles the Wayback Machine response and updates the URL result with the archived snapshot.
213+ #
214+ # @param response [Net::HTTPResponse] The Wayback Machine response
215+ # @param url_result [Hash] The result hash for the URL
167216 def handle_wayback_response ( response , url_result )
168- if response . status . to_i == 200
169- data = Oj . load ( response . body )
217+ if response . is_a? ( Net :: HTTPSuccess )
218+ data = JSON . parse ( response . body )
170219 snapshot = data . dig ( 'archived_snapshots' , 'closest' , 'url' )
171220 url_result [ :archived_snapshot ] = snapshot || 'No archived version found'
172221 else
173222 url_result [ :archived_snapshot ] = 'Error fetching Wayback Machine data'
174223 end
175224 end
176225
226+ # Saves the results of the URL checks to a file.
177227 def save_results_to_file
178228 File . open ( 'url_check_results.json' , 'w' ) { |file | file . write ( JSON . pretty_generate ( @results ) ) }
179229 @logger . info ( 'Results have been saved to "url_check_results.json".' )
180230 end
181231
232+ # Appends the progress of a URL check to the checked URLs file.
233+ #
234+ # @param result [Hash] The result of the URL check to save
182235 def save_progress ( result )
183236 File . open ( CHECKED_URLS_FILE , 'a' ) { |file | file . puts JSON . generate ( result ) }
184237 end
185238
186- def save_progress_batch ( results )
187- File . open ( CHECKED_URLS_FILE , 'a' ) do |file |
188- results . each { |result | file . puts JSON . generate ( result ) }
189- end
190- end
191-
239+ # Loads the URLs that have already been checked from the file.
240+ #
241+ # @return [Array<String>] An array of URLs that have been checked previously
192242 def load_checked_urls
193243 return [ ] unless File . exist? ( CHECKED_URLS_FILE )
194244
195245 File . readlines ( CHECKED_URLS_FILE ) . map { |row | JSON . parse ( row ) [ 'url' ] }
196246 end
197247
248+ # Shuts down the thread pool and waits for all threads to finish.
249+ #
250+ # @param pool [Concurrent::FixedThreadPool] The thread pool to shut down
198251 def shutdown_thread_pool ( pool )
199252 pool . shutdown
200253 pool . wait_for_termination
201254 @logger . info ( 'Thread pool shut down successfully.' )
202255 end
203256
257+ # Updates the progress bar and prints the progress to the console.
204258 def update_progress
205259 @processed_urls += 1
206260 percentage = ( @processed_urls . to_f / @total_urls * 100 ) . round
@@ -224,42 +278,45 @@ def update_progress
224278 end
225279 end . parse!
226280
281+ # Validate input file
227282 unless options [ :file ] && File . exist? ( options [ :file ] )
228283 puts "Please provide a valid JSON file with URLs and paths."
229284 exit 1
230285 end
231286
287+ # Set the desired log level
232288 log_level = options [ :log_level ] || 'INFO'
233289 log_level = Logger . const_get ( log_level )
234290
235- urls_with_paths = Oj . load ( File . read ( options [ :file ] ) )
291+ # Parse the JSON file containing URLs and paths
292+ urls_with_paths = JSON . parse ( File . read ( options [ :file ] ) )
236293
294+ # Map the data to the format required by the checker
237295 mapped_data = urls_with_paths . flat_map do |_path , metadata |
238296 metadata [ 'references' ] . map { |ref | { 'path' => metadata [ 'path' ] , 'ref' => ref } }
239297 end
240298
299+ # Validate the structure of the mapped data
241300 unless mapped_data . is_a? ( Array ) && mapped_data . all? { |entry | entry [ 'ref' ] && entry [ 'path' ] }
242301 puts "Invalid JSON structure. The file should contain an array of objects with 'ref' and 'path' keys."
243302 exit 1
244303 end
245304
305+ # Create the final list of URLs and paths
246306 urls_with_paths_final = mapped_data . map { |entry | { url : entry [ 'ref' ] , path : entry [ 'path' ] } }
247307
248- # Record the start time
308+ # Record the start time for performance tracking
249309 start_time = Time . now
250310
251- # Create and run the UrlChecker
311+ # Create and run the UrlChecker instance
252312 url_checker = UrlChecker . new ( urls_with_paths_final , log_level : log_level )
253313 url_checker . check_urls
254314
255315 # Calculate and display the total time taken
256316 end_time = Time . now
257317 elapsed_time = end_time - start_time
258-
259- # Convert seconds into minutes and seconds
260318 minutes = ( elapsed_time / 60 ) . to_i
261319 seconds = ( elapsed_time % 60 ) . to_i
262320
263- # Output the time in minutes and seconds
264321 puts "\n Total time taken: #{ minutes } minutes and #{ seconds } seconds"
265322end
0 commit comments