
Commit 2a17314

edit API script with rate limiting param
1 parent 24a712a commit 2a17314

File tree

1 file changed: +160 −20 lines changed

sites-workflow/1getSitesURL.py

Lines changed: 160 additions & 20 deletions
@@ -4,12 +4,90 @@
 import requests
 import pandas as pd
 import re
+import time
 from urllib.parse import urlparse
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

 class SimpleSiteURLExtractor:
-    def __init__(self):
+    def __init__(self, requests_per_second=1):
         self.base_url = "https://catalog.civicdataecosystem.org"
         self.api_base = f"{self.base_url}/api/3/action"
+        self.min_interval = 1.0 / requests_per_second
+        self.last_request_time = 0
+
+        # Setup session with retry strategy
+        self.session = requests.Session()
+
+        # Handle different urllib3 versions
+        try:
+            retry_strategy = Retry(
+                total=3,
+                status_forcelist=[429, 500, 502, 503, 504],
+                allowed_methods=["HEAD", "GET", "OPTIONS"],  # New parameter name
+                backoff_factor=2
+            )
+        except TypeError:
+            # Fallback for older urllib3 versions
+            retry_strategy = Retry(
+                total=3,
+                status_forcelist=[429, 500, 502, 503, 504],
+                method_whitelist=["HEAD", "GET", "OPTIONS"],  # Old parameter name
+                backoff_factor=2
+            )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+
+        # Set proper headers to identify ourselves
+        self.session.headers.update({
+            'User-Agent': 'CKAN-Metadata-Workflow/1.0 (Educational/Research Purpose)',
+            'Accept': 'application/json',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive'
+        })
+
+    def _rate_limit(self):
+        """Enforce rate limiting between requests"""
+        current_time = time.time()
+        time_since_last = current_time - self.last_request_time
+        if time_since_last < self.min_interval:
+            sleep_time = self.min_interval - time_since_last
+            print(f"Rate limiting: waiting {sleep_time:.2f}s")
+            time.sleep(sleep_time)
+        self.last_request_time = time.time()
+
+    def _make_request(self, url, params=None, max_retries=3):
+        """Make a rate-limited request with error handling"""
+        self._rate_limit()
+
+        for attempt in range(max_retries):
+            try:
+                response = self.session.get(url, params=params, timeout=30)
+
+                if response.status_code == 403:
+                    wait_time = (attempt + 1) * 30
+                    print(f"HTTP 403 - Rate limited or forbidden. Waiting {wait_time}s (attempt {attempt + 1}/{max_retries})")
+                    time.sleep(wait_time)
+                    continue
+
+                elif response.status_code == 429:
+                    wait_time = (attempt + 1) * 60
+                    print(f"HTTP 429 - Rate limit exceeded. Waiting {wait_time}s (attempt {attempt + 1}/{max_retries})")
+                    time.sleep(wait_time)
+                    continue
+
+                response.raise_for_status()
+                return response
+
+            except requests.exceptions.RequestException as e:
+                if attempt == max_retries - 1:  # Last attempt
+                    print(f"Request failed after {max_retries} attempts: {e}")
+                    return None
+                print(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
+                time.sleep((attempt + 1) * 10)
+
+        return None

     def clean_url(self, url):
         """Clean and validate URL"""
@@ -81,12 +159,40 @@ def get_all_sites(self):

         all_packages = []
         start = 0
-        rows = 1000
+        rows = 100  # Reduced batch size to be more conservative
+
+        # First, try to get a smaller test batch to verify API access
+        print("Testing API access with small batch...")
+        test_response = self._make_request(
+            f"{self.api_base}/package_search",
+            params={
+                'q': 'type:site',
+                'start': 0,
+                'rows': 10,
+                'include_private': False
+            }
+        )

-        while True:
+        if not test_response:
+            print("Failed to access API.")
+            return []
+
+        test_data = test_response.json()
+        if not test_data.get('success'):
+            print("API test failed.")
+            return []
+
+        total_count = test_data['result'].get('count', 0)
+        print(f"API test successful. Total sites available: {total_count}")
+
+        # Add a reasonable limit to avoid overwhelming the API
+        max_sites = min(total_count, 1000)  # Limit to 1000 sites max
+        print(f"Will fetch up to {max_sites} sites")
+
+        while len(all_packages) < max_sites:
             print(f"Fetching batch starting at {start}...")

-            response = requests.get(
+            response = self._make_request(
                 f"{self.api_base}/package_search",
                 params={
                     'q': 'type:site',
@@ -96,37 +202,49 @@ def get_all_sites(self):
                 }
             )

-            if response.status_code != 200:
-                print(f"API failed with status {response.status_code}")
+            if not response:
+                print("Failed to fetch batch, stopping...")
                 break

             data = response.json()
             if not data.get('success'):
-                print("API returned error")
+                print("API returned error, stopping...")
                 break

             result = data['result']
             batch_packages = result.get('results', [])
-            total_count = result.get('count', 0)

+            if not batch_packages:
+                print("No more results available")
+                break
+
             all_packages.extend(batch_packages)

-            print(f"Fetched {len(all_packages)}/{total_count} sites")
+            print(f"Fetched {len(all_packages)}/{min(total_count, max_sites)} sites")

-            # Stop if we got fewer results than requested or reached the total
-            if len(batch_packages) < rows or len(all_packages) >= total_count:
+            # Stop if we got fewer results than requested
+            if len(batch_packages) < rows:
                 break

             start += rows
+
+            # Extra delay between batches to be respectful
+            print("Waiting between batches...")
+            time.sleep(5)

         print(f"Total sites found: {len(all_packages)}")
-
+        return self._process_packages(all_packages)
+
+    def _process_packages(self, all_packages):
+        """Process packages to extract URLs"""
         results = []

         for i, pkg in enumerate(all_packages, 1):
             site_name = pkg.get('name', '')
             site_title = pkg.get('title', '')
-            print(f"Processing {i}/{len(all_packages)}: {site_name}")
+
+            if i % 50 == 0:  # Progress update every 50 items
+                print(f"Processing {i}/{len(all_packages)}: {site_name}")

             # Extract the visit URL
             visit_url = self.extract_visit_url(pkg)
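
The smaller batch size, the 1000-site cap, and the 5 s pause between batches bound how much traffic a full run generates; a rough, illustrative calculation using only the constants from this diff (retries and the per-request rate limit ignored):

    rows = 100        # batch size after this commit (was 1000)
    max_sites = 1000  # cap applied once the test batch reports the total count
    batches = max_sites // rows   # at most 10 package_search calls in the main loop
    print(f"{batches} batches, at least {batches * 5} s of inter-batch sleeping")
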
@@ -139,6 +257,11 @@ def get_all_sites(self):
                     'name': site_name,
                     'url': visit_url
                 })
+
+            # Small delay between processing items if extracting from notes
+            # to avoid overwhelming regex processing
+            if i % 100 == 0:
+                time.sleep(1)

         return results

@@ -147,7 +270,14 @@ def save_to_csv(self, results, filename):
         # Filter out empty URLs
         filtered_results = [r for r in results if r['url']]

-        df = pd.DataFrame(filtered_results)
+        if not filtered_results:
+            # If no URLs found, create empty CSV
+            print("No URLs found, creating empty CSV file...")
+            # Create empty DataFrame with proper columns
+            df = pd.DataFrame(columns=['name', 'url'])
+        else:
+            df = pd.DataFrame(filtered_results)
+
         df.to_csv(filename, index=False, encoding='utf-8')

         # Print summary
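
When the URL filter leaves nothing behind, the new branch still writes a header-only file; a quick illustrative check of what pandas produces in that case (filename reused from the script):

    import pandas as pd
    pd.DataFrame(columns=['name', 'url']).to_csv('site_urls.csv', index=False, encoding='utf-8')
    # site_urls.csv now holds a single header line: name,url
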
@@ -157,26 +287,36 @@ def save_to_csv(self, results, filename):
         print(f"\nResults saved to {filename}")
         print(f"Total sites processed: {total}")
         print(f"Sites with URLs: {with_urls}")
-        print(f"Success rate: {with_urls/total*100:.1f}%")

-        # Show some examples
-        print(f"\nFirst 5 sites with URLs:")
-        for i, result in enumerate(filtered_results[:5], 1):
-            print(f" {i}. {result['name']}: {result['url']}")
+        if total > 0 and with_urls > 0:
+            print(f"Success rate: {with_urls/total*100:.1f}%")
+
+        # Show some examples if there are results
+        if with_urls > 0:
+            print(f"\nFirst 5 sites with URLs:")
+            for i, (_, row) in enumerate(df.head().iterrows(), 1):
+                print(f" {i}. {row['name']}: {row['url']}")
+        else:
+            print("\nNo sites with URLs found.")

 def main():
     print("Simple CKAN Site URL Extractor")
     print("=" * 40)

     output_file = "site_urls.csv"

-    extractor = SimpleSiteURLExtractor()
+    # Use conservative rate limiting (1 request per second)
+    extractor = SimpleSiteURLExtractor(requests_per_second=1)
     results = extractor.get_all_sites()

     if results:
         extractor.save_to_csv(results, output_file)
     else:
         print("No results found!")
+        # Create empty CSV with proper headers
+        df = pd.DataFrame(columns=['name', 'url'])
+        df.to_csv(output_file, index=False, encoding='utf-8')
+        print(f"Created empty {output_file}")

 if __name__ == "__main__":
     main()
