44import requests
55import pandas as pd
66import re
7+ import time
78from urllib .parse import urlparse
9+ from requests .adapters import HTTPAdapter
10+ from urllib3 .util .retry import Retry
811
912class SimpleSiteURLExtractor :
10- def __init__ (self ):
13+ def __init__ (self , requests_per_second = 1 ):
1114 self .base_url = "https://catalog.civicdataecosystem.org"
1215 self .api_base = f"{ self .base_url } /api/3/action"
16+ self .min_interval = 1.0 / requests_per_second
17+ self .last_request_time = 0
18+
19+ # Setup session with retry strategy
20+ self .session = requests .Session ()
21+
22+ # Handle different urllib3 versions
23+ try :
24+ retry_strategy = Retry (
25+ total = 3 ,
26+ status_forcelist = [429 , 500 , 502 , 503 , 504 ],
27+ allowed_methods = ["HEAD" , "GET" , "OPTIONS" ], # New parameter name
28+ backoff_factor = 2
29+ )
30+ except TypeError :
31+ # Fallback for older urllib3 versions
32+ retry_strategy = Retry (
33+ total = 3 ,
34+ status_forcelist = [429 , 500 , 502 , 503 , 504 ],
35+ method_whitelist = ["HEAD" , "GET" , "OPTIONS" ], # Old parameter name
36+ backoff_factor = 2
37+ )
38+ adapter = HTTPAdapter (max_retries = retry_strategy )
39+ self .session .mount ("http://" , adapter )
40+ self .session .mount ("https://" , adapter )
41+
42+ # Set proper headers to identify ourselves
43+ self .session .headers .update ({
44+ 'User-Agent' : 'CKAN-Metadata-Workflow/1.0 (Educational/Research Purpose)' ,
45+ 'Accept' : 'application/json' ,
46+ 'Accept-Encoding' : 'gzip, deflate' ,
47+ 'Connection' : 'keep-alive'
48+ })
49+
50+ def _rate_limit (self ):
51+ """Enforce rate limiting between requests"""
52+ current_time = time .time ()
53+ time_since_last = current_time - self .last_request_time
54+ if time_since_last < self .min_interval :
55+ sleep_time = self .min_interval - time_since_last
56+ print (f"Rate limiting: waiting { sleep_time :.2f} s" )
57+ time .sleep (sleep_time )
58+ self .last_request_time = time .time ()
59+
60+ def _make_request (self , url , params = None , max_retries = 3 ):
61+ """Make a rate-limited request with error handling"""
62+ self ._rate_limit ()
63+
64+ for attempt in range (max_retries ):
65+ try :
66+ response = self .session .get (url , params = params , timeout = 30 )
67+
68+ if response .status_code == 403 :
69+ wait_time = (attempt + 1 ) * 30
70+ print (f"HTTP 403 - Rate limited or forbidden. Waiting { wait_time } s (attempt { attempt + 1 } /{ max_retries } )" )
71+ time .sleep (wait_time )
72+ continue
73+
74+ elif response .status_code == 429 :
75+ wait_time = (attempt + 1 ) * 60
76+ print (f"HTTP 429 - Rate limit exceeded. Waiting { wait_time } s (attempt { attempt + 1 } /{ max_retries } )" )
77+ time .sleep (wait_time )
78+ continue
79+
80+ response .raise_for_status ()
81+ return response
82+
83+ except requests .exceptions .RequestException as e :
84+ if attempt == max_retries - 1 : # Last attempt
85+ print (f"Request failed after { max_retries } attempts: { e } " )
86+ return None
87+ print (f"Request failed (attempt { attempt + 1 } /{ max_retries } ): { e } " )
88+ time .sleep ((attempt + 1 ) * 10 )
89+
90+ return None
1391
1492 def clean_url (self , url ):
1593 """Clean and validate URL"""
@@ -81,12 +159,40 @@ def get_all_sites(self):
81159
82160 all_packages = []
83161 start = 0
84- rows = 1000
162+ rows = 100 # Reduced batch size to be more conservative
163+
164+ # First, try to get a smaller test batch to verify API access
165+ print ("Testing API access with small batch..." )
166+ test_response = self ._make_request (
167+ f"{ self .api_base } /package_search" ,
168+ params = {
169+ 'q' : 'type:site' ,
170+ 'start' : 0 ,
171+ 'rows' : 10 ,
172+ 'include_private' : False
173+ }
174+ )
85175
86- while True :
176+ if not test_response :
177+ print ("Failed to access API." )
178+ return []
179+
180+ test_data = test_response .json ()
181+ if not test_data .get ('success' ):
182+ print ("API test failed." )
183+ return []
184+
185+ total_count = test_data ['result' ].get ('count' , 0 )
186+ print (f"API test successful. Total sites available: { total_count } " )
187+
188+ # Add a reasonable limit to avoid overwhelming the API
189+ max_sites = min (total_count , 1000 ) # Limit to 1000 sites max
190+ print (f"Will fetch up to { max_sites } sites" )
191+
192+ while len (all_packages ) < max_sites :
87193 print (f"Fetching batch starting at { start } ..." )
88194
89- response = requests . get (
195+ response = self . _make_request (
90196 f"{ self .api_base } /package_search" ,
91197 params = {
92198 'q' : 'type:site' ,
@@ -96,37 +202,49 @@ def get_all_sites(self):
96202 }
97203 )
98204
99- if response . status_code != 200 :
100- print (f"API failed with status { response . status_code } " )
205+ if not response :
206+ print ("Failed to fetch batch, stopping... " )
101207 break
102208
103209 data = response .json ()
104210 if not data .get ('success' ):
105- print ("API returned error" )
211+ print ("API returned error, stopping... " )
106212 break
107213
108214 result = data ['result' ]
109215 batch_packages = result .get ('results' , [])
110- total_count = result .get ('count' , 0 )
111216
217+ if not batch_packages :
218+ print ("No more results available" )
219+ break
220+
112221 all_packages .extend (batch_packages )
113222
114- print (f"Fetched { len (all_packages )} /{ total_count } sites" )
223+ print (f"Fetched { len (all_packages )} /{ min ( total_count , max_sites ) } sites" )
115224
116- # Stop if we got fewer results than requested or reached the total
117- if len (batch_packages ) < rows or len ( all_packages ) >= total_count :
225+ # Stop if we got fewer results than requested
226+ if len (batch_packages ) < rows :
118227 break
119228
120229 start += rows
230+
231+ # Extra delay between batches to be respectful
232+ print ("Waiting between batches..." )
233+ time .sleep (5 )
121234
122235 print (f"Total sites found: { len (all_packages )} " )
123-
236+ return self ._process_packages (all_packages )
237+
238+ def _process_packages (self , all_packages ):
239+ """Process packages to extract URLs"""
124240 results = []
125241
126242 for i , pkg in enumerate (all_packages , 1 ):
127243 site_name = pkg .get ('name' , '' )
128244 site_title = pkg .get ('title' , '' )
129- print (f"Processing { i } /{ len (all_packages )} : { site_name } " )
245+
246+ if i % 50 == 0 : # Progress update every 50 items
247+ print (f"Processing { i } /{ len (all_packages )} : { site_name } " )
130248
131249 # Extract the visit URL
132250 visit_url = self .extract_visit_url (pkg )
@@ -139,6 +257,11 @@ def get_all_sites(self):
139257 'name' : site_name ,
140258 'url' : visit_url
141259 })
260+
261+ # Small delay between processing items if extracting from notes
262+ # to avoid overwhelming regex processing
263+ if i % 100 == 0 :
264+ time .sleep (1 )
142265
143266 return results
144267
@@ -147,7 +270,14 @@ def save_to_csv(self, results, filename):
147270 # Filter out empty URLs
148271 filtered_results = [r for r in results if r ['url' ]]
149272
150- df = pd .DataFrame (filtered_results )
273+ if not filtered_results :
274+ # If no URLs found, create empty CSV
275+ print ("No URLs found, creating empty CSV file..." )
276+ # Create empty DataFrame with proper columns
277+ df = pd .DataFrame (columns = ['name' , 'url' ])
278+ else :
279+ df = pd .DataFrame (filtered_results )
280+
151281 df .to_csv (filename , index = False , encoding = 'utf-8' )
152282
153283 # Print summary
@@ -157,26 +287,36 @@ def save_to_csv(self, results, filename):
157287 print (f"\n Results saved to { filename } " )
158288 print (f"Total sites processed: { total } " )
159289 print (f"Sites with URLs: { with_urls } " )
160- print (f"Success rate: { with_urls / total * 100 :.1f} %" )
161290
162- # Show some examples
163- print (f"\n First 5 sites with URLs:" )
164- for i , result in enumerate (filtered_results [:5 ], 1 ):
165- print (f" { i } . { result ['name' ]} : { result ['url' ]} " )
291+ if total > 0 and with_urls > 0 :
292+ print (f"Success rate: { with_urls / total * 100 :.1f} %" )
293+
294+ # Show some examples if there are results
295+ if with_urls > 0 :
296+ print (f"\n First 5 sites with URLs:" )
297+ for i , (_ , row ) in enumerate (df .head ().iterrows (), 1 ):
298+ print (f" { i } . { row ['name' ]} : { row ['url' ]} " )
299+ else :
300+ print ("\n No sites with URLs found." )
166301
def main():
    """Entry point: fetch site URLs from the catalog and save them to CSV."""
    print("Simple CKAN Site URL Extractor")
    print("=" * 40)

    output_file = "site_urls.csv"

    # Conservative rate limiting: one request per second.
    extractor = SimpleSiteURLExtractor(requests_per_second=1)
    results = extractor.get_all_sites()

    if not results:
        print("No results found!")
        # Still emit a CSV with the expected headers for downstream steps.
        pd.DataFrame(columns=['name', 'url']).to_csv(
            output_file, index=False, encoding='utf-8')
        print(f"Created empty {output_file}")
        return

    extractor.save_to_csv(results, output_file)
180320
if __name__ == "__main__":
    main()