44#
55##
66
7- require 'net/http'
8- require 'uri'
7+ require 'faraday'
98require 'json'
109require 'csv'
1110require 'concurrent'
1211require 'logger'
1312require 'fileutils'
1413require 'optparse'
1514require 'benchmark'
15+ require 'oj' # Optimized JSON
1616
1717class UrlChecker
1818 WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
1919 MAX_REDIRECTS = 5
20- THREAD_POOL_SIZE = 5
20+ THREAD_POOL_SIZE = 20 # Increased thread pool size for better concurrency
2121 CHECKED_URLS_FILE = 'checked_urls.jsonl'
2222 BATCH_SIZE = 1000
2323 MAX_RETRIES = 3
@@ -45,7 +45,7 @@ def check_urls
4545 result = check_url ( url_with_path )
4646 @results << result
4747 @checked_urls << url_with_path [ :url ]
48- save_progress ( result )
48+ save_progress_batch ( [ result ] )
4949
5050 # Update the progress bar after each URL is processed
5151 update_progress
@@ -87,15 +87,17 @@ def check_url(url_with_path)
8787 return url_result
8888 end
8989
90- uri = URI . parse ( cleaned_url )
91- http = Net ::HTTP . new ( uri . host , uri . port )
92- http . use_ssl = uri . scheme == 'https'
90+ # Use Faraday for HTTP requests (persistent connection with pooling)
91+ conn = Faraday . new ( url : cleaned_url ) do |faraday |
92+ faraday . adapter Faraday . default_adapter # Net::HTTP
93+ faraday . options . timeout = 10 # seconds
94+ faraday . options . open_timeout = 5 # seconds
95+ end
9396
9497 start_time = Time . now
95-
9698 begin
97- response = get_response ( http , uri )
98- follow_redirects ( http , uri , response )
99+ response = get_response ( conn , cleaned_url )
100+ follow_redirects ( conn , response )
99101 rescue StandardError => e
100102 handle_error ( url_result , e )
101103 end
@@ -113,28 +115,27 @@ def valid_url?(url)
113115 URI . parse ( url ) . is_a? ( URI ::HTTP ) rescue false
114116 end
115117
# Performs a single GET request for +url+ over the given Faraday
# connection and returns the Faraday::Response (status, headers, body).
def get_response(conn, url)
  conn.get(url)
end
119121
# Follows HTTP 3xx redirects, up to MAX_REDIRECTS hops, and returns the
# final response so callers can inspect the post-redirect status.
#
# conn     - a Faraday connection (anything responding to #get).
# response - the initial Faraday::Response.
#
# Returns the last response received (the original one unchanged when it
# is not a redirect). Previously the loop's value (nil) was returned.
def follow_redirects(conn, response)
  redirect_count = 0
  # A redirect is any 3xx status code. The previous check compared the
  # full status (301, 302, ...) against the literal 3, so it never fired.
  while response.status.to_i.between?(300, 399) && redirect_count < MAX_REDIRECTS
    # Faraday's headers hash is case-insensitive; use the canonical
    # lowercase key (was 'Location ' with a stray trailing space).
    location = response.headers['location']
    @logger.info("Redirecting to: #{location}")
    response = conn.get(location)
    redirect_count += 1
  end
  response
end
130131
# Records the outcome of an HTTP check on +url_result+.
#
# nil response      -> error marker (request never completed).
# 2xx / 3xx status  -> 'Alive'.
# anything else     -> 'Not Alive (...)' plus a Wayback Machine lookup.
def process_response(response, url_result)
  if response.nil?
    url_result[:status] = 'Error: No response received'
    return
  end

  code = response.status.to_i
  if (200..399).cover?(code)
    url_result[:status] = 'Alive'
  else
    url_result[:status] = "Not Alive (Status Code: #{response.status})"
    fetch_wayback_snapshot(url_result)
  end
end
@@ -149,8 +150,7 @@ def fetch_wayback_snapshot(url_result)
149150 retries = 0
150151
151152 begin
152- uri = URI . parse ( wayback_url )
153- response = Net ::HTTP . get_response ( uri )
153+ response = Faraday . get ( wayback_url )
154154 handle_wayback_response ( response , url_result )
155155 rescue StandardError => e
156156 retries += 1
@@ -165,8 +165,8 @@ def fetch_wayback_snapshot(url_result)
165165 end
166166
167167 def handle_wayback_response ( response , url_result )
168- if response . is_a? ( Net :: HTTPSuccess )
169- data = JSON . parse ( response . body )
168+ if response . status . to_i == 200
169+ data = Oj . load ( response . body )
170170 snapshot = data . dig ( 'archived_snapshots' , 'closest' , 'url' )
171171 url_result [ :archived_snapshot ] = snapshot || 'No archived version found'
172172 else
@@ -183,6 +183,12 @@ def save_progress(result)
183183 File . open ( CHECKED_URLS_FILE , 'a' ) { |file | file . puts JSON . generate ( result ) }
184184 end
185185
# Appends each result hash to CHECKED_URLS_FILE as one JSON document per
# line (JSONL), so progress survives interruption and can be reloaded.
def save_progress_batch(results)
  encoded = results.map { |result| JSON.generate(result) }
  File.open(CHECKED_URLS_FILE, 'a') do |file|
    encoded.each { |line| file.puts(line) }
  end
end
191+
186192 def load_checked_urls
187193 return [ ] unless File . exist? ( CHECKED_URLS_FILE )
188194
@@ -226,7 +232,7 @@ def update_progress
226232 log_level = options [ :log_level ] || 'INFO'
227233 log_level = Logger . const_get ( log_level )
228234
229- urls_with_paths = JSON . parse ( File . read ( options [ :file ] ) )
235+ urls_with_paths = Oj . load ( File . read ( options [ :file ] ) )
230236
231237 mapped_data = urls_with_paths . flat_map do |_path , metadata |
232238 metadata [ 'references' ] . map { |ref | { 'path' => metadata [ 'path' ] , 'ref' => ref } }
0 commit comments