scrape.py
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from collections import deque
import datetime
import pickle
# Configuration
START_URL = "https://civilization.fandom.com/wiki/Civilization_V"
OUTPUT_DIR = "./data"
MAX_ARTICLES = 10000
DELAY = 1  # Delay between requests in seconds to be respectful
USER_AGENT = "CivVWikiCrawler/1.0 (Educational Project)"
# Debug URL: extra diagnostics are printed whenever this article is encountered
KASBAH_URL = "https://civilization.fandom.com/wiki/Kasbah_(Civ5)"
# How often to save state (in seconds)
SAVE_STATE_INTERVAL = 60
# State file path
STATE_FILE = os.path.join(OUTPUT_DIR, "crawler_state.pkl")
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Track visited URLs to avoid duplicates
visited_urls = set()
# Track saved articles count
saved_count = 0
# Track total URLs processed
processed_count = 0
# Queue for URLs to process
queue = deque()
def is_valid_civ5_article(url):
    """Check if the URL is a valid Civilization V article."""
    if url == START_URL:
        return True
    # Must be from the same domain
    if "civilization.fandom.com" not in url:
        if KASBAH_URL in url:
            print(" - FAILED: Not from civilization.fandom.com domain")
        return False
    # Parse the URL to get the path
    parsed_url = urlparse(url)
    path = parsed_url.path
    # Skip non-English language editions of the wiki
    if any(f"/{lang}/" in path for lang in
           ("fi", "de", "fr", "es", "it", "ja", "ko", "pl", "pt", "ru", "zh")):
        if KASBAH_URL in url:
            print(" - FAILED: Contains language prefix")
        return False
    # Skip category, talk and category talk pages
    if any(x in path for x in ["Category:", "Talk:", "Category_talk:"]):
        if KASBAH_URL in url:
            print(" - FAILED: Is a category, talk, or category talk page")
        return False
    # Must end with (Civ5)
    if not path.endswith("(Civ5)"):
        if KASBAH_URL in url:
            print(f" - FAILED: Path does not end with (Civ5). Path: {path}")
        return False
    # Must not be a Civilopedia page
    if "/Civilopedia" in path or path.endswith("(Civ5)/Civilopedia"):
        if KASBAH_URL in url:
            print(" - FAILED: Is a Civilopedia page")
        return False
    if KASBAH_URL in url:
        print(" + PASSED ALL CHECKS: This URL is valid!")
    return True
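# Illustrative checks (the non-Kasbah URLs below are assumptions about the wiki's
# URL scheme, shown only to document the filter's behavior):
#   is_valid_civ5_article("https://civilization.fandom.com/wiki/Kasbah_(Civ5)")  -> True
#   is_valid_civ5_article("https://civilization.fandom.com/wiki/Kasbah_(Civ6)")  -> False (path does not end with "(Civ5)")
#   is_valid_civ5_article("https://civilization.fandom.com/wiki/Category:Tile_improvements_(Civ5)")  -> False (category page)
#   is_valid_civ5_article("https://example.com/wiki/Kasbah_(Civ5)")              -> False (wrong domain)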
def save_article(url, html_content):
    """Save the HTML content to a file."""
    global saved_count
    # Derive a filename from the article name in the URL
    article_name = url.split('/')[-1]  # Get text after last '/'
    article_name = article_name.replace('_(Civ5)', '')  # Remove the Civ5 suffix
    filename = article_name + '.html'
    filepath = os.path.join(OUTPUT_DIR, filename)
    # Save the HTML content
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html_content)
    # Append to a mapping file that records which filename corresponds to which URL
    with open(os.path.join(OUTPUT_DIR, "url_mapping.txt"), "a", encoding="utf-8") as f:
        f.write(f"{filename}\t{url}\n")
    saved_count += 1
    print(f"Saved article {saved_count}/{MAX_ARTICLES}: {url}")
def print_progress():
    """Print progress information."""
    global processed_count, saved_count
    print(f"Processed: {processed_count} URLs | Saved: {saved_count}/{MAX_ARTICLES} articles | Queue: {len(queue)} URLs")
def save_state():
    """Save the current state of the crawler to a file."""
    global visited_urls, saved_count, processed_count, queue
    state = {
        'visited_urls': list(visited_urls),
        'saved_count': saved_count,
        'processed_count': processed_count,
        'queue': list(queue)
    }
    # Write to a temporary file first so the state file is never left half-written
    temp_file = STATE_FILE + '.tmp'
    try:
        with open(temp_file, 'wb') as f:
            pickle.dump(state, f)
        # Atomically replace the previous state file with the new one
        os.replace(temp_file, STATE_FILE)
        print(f"State saved: {processed_count} URLs processed, {saved_count} articles saved, {len(queue)} URLs in queue")
    except Exception as e:
        print(f"Error saving state: {e}")
def load_state():
    """Load the crawler state from a file if it exists."""
    global visited_urls, saved_count, processed_count, queue
    if not os.path.exists(STATE_FILE):
        print("No previous state found. Starting fresh crawl.")
        return False
    try:
        with open(STATE_FILE, 'rb') as f:
            state = pickle.load(f)
        visited_urls = set(state['visited_urls'])
        saved_count = state['saved_count']
        processed_count = state['processed_count']
        queue = deque(state['queue'])
        print(f"Loaded previous state: {processed_count} URLs processed, {saved_count} articles saved, {len(queue)} URLs in queue")
        return True
    except Exception as e:
        print(f"Error loading state: {e}")
        print("Starting fresh crawl.")
        return False
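# Sketch (assumption; for manual inspection only, never called by the crawler):
# print a summary of a saved crawler_state.pkl without resuming the crawl.
def print_saved_state_summary(state_path=STATE_FILE):
    """Print the counters stored in a previously saved state file, if one exists."""
    if not os.path.exists(state_path):
        print(f"No saved state at {state_path}")
        return
    with open(state_path, 'rb') as f:
        state = pickle.load(f)
    print(f"Saved state: {state['processed_count']} URLs processed, "
          f"{state['saved_count']} articles saved, "
          f"{len(state['queue'])} URLs in queue")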
def crawl():
    """Crawl the website starting from the START_URL using a queue-based approach."""
    global saved_count, processed_count, queue, visited_urls
    # Kasbah URL for debugging
    kasbah_url = KASBAH_URL
    # Check if we have a saved state to resume from
    state_loaded = load_state()
    # If no state was loaded, initialize the queue with the starting URL
    if not state_loaded:
        queue = deque([START_URL])
    # Add Kasbah URL directly to the queue for testing
    if kasbah_url not in queue:
        queue.append(kasbah_url)
        print(f"\n!!! MANUALLY ADDED KASBAH URL TO QUEUE !!!")
    # Set up headers for requests
    headers = {
        "User-Agent": USER_AGENT
    }
    # Record start time
    start_time = time.time()
    last_progress_time = start_time
    last_save_time = start_time
    # Create or append to log file
    log_file = os.path.join(OUTPUT_DIR, "crawl_log.txt")
    log_mode = "a" if state_loaded else "w"
    with open(log_file, log_mode, encoding="utf-8") as f:
        f.write(f"\nCrawl {'resumed' if state_loaded else 'started'} at: {datetime.datetime.now()}\n")
        if not state_loaded:
            f.write(f"Starting URL: {START_URL}\n")
            f.write(f"Max articles: {MAX_ARTICLES}\n")
        f.write(f"Current progress: {saved_count}/{MAX_ARTICLES} articles saved\n\n")
    # Check if Kasbah URL is in visited_urls
    if kasbah_url in visited_urls:
        print(f"\n!!! KASBAH URL ALREADY VISITED: {kasbah_url} !!!")
    # Check if Kasbah URL is in queue
    if any(kasbah_url in queued_url for queued_url in queue):
        print(f"\n!!! KASBAH URL ALREADY IN QUEUE !!!")
    while queue and saved_count < MAX_ARTICLES:
        # Get the next URL from the queue
        url = queue.popleft()
        # Remove any URL fragments (#) and query parameters (?)
        url = url.split('#')[0]  # Remove fragment
        url = url.split('?')[0]  # Remove query parameters
        # Skip if we've already visited this URL
        if url in visited_urls:
            if kasbah_url in url:
                print(f"\n!!! SKIPPING KASBAH URL (ALREADY VISITED): {url} !!!")
            continue
        # Mark as visited
        visited_urls.add(url)
        processed_count += 1
        # Print progress every 10 seconds
        current_time = time.time()
        if current_time - last_progress_time > 10:
            print_progress()
            last_progress_time = current_time
        # Save state periodically
        if current_time - last_save_time > SAVE_STATE_INTERVAL:
            save_state()
            last_save_time = current_time
        try:
            print(f"Crawling: {url}")
            # Only fetch and save valid Civ5 articles
            if not is_valid_civ5_article(url):
                continue
            # Make the request
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            # Parse the HTML
            soup = BeautifulSoup(response.text, "html.parser")
            save_article(url, response.text)
            # Log the saved article
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(f"Saved: {url}\n")
            if saved_count >= MAX_ARTICLES:
                print(f"Reached maximum number of articles ({MAX_ARTICLES})")
                break
            # Find all links on the page
            links = soup.find_all("a", href=True)
            # Process each link
            for link in links:
                href = link["href"]
                # Skip empty links, anchors, and non-HTTP links
                if not href or href.startswith("#") or href.startswith("javascript:"):
                    continue
                # Convert relative URLs to absolute URLs
                absolute_url = urljoin(url, href)
                # Check for Kasbah URL
                if kasbah_url in absolute_url:
                    print(f"\n!!! FOUND KASBAH LINK: {absolute_url} !!!")
                    print(f"!!! ORIGINAL HREF: {href} !!!")
                    print(f"!!! FOUND ON PAGE: {url} !!!")
                    print(f"!!! IS IN VISITED_URLS: {absolute_url in visited_urls} !!!")
                # Only follow links to the same domain
                if "civilization.fandom.com" in absolute_url and absolute_url not in visited_urls:
                    # Add to the queue
                    queue.append(absolute_url)
                    if kasbah_url in absolute_url:
                        print(f"\n!!! ADDED KASBAH URL TO QUEUE: {absolute_url} !!!")
            # Be respectful and don't hammer the server
            time.sleep(DELAY)
        except requests.exceptions.RequestException as e:
            print(f"Request error crawling {url}: {e}")
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(f"Error: {url} - {str(e)}\n")
        except Exception as e:
            print(f"Error crawling {url}: {e}")
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(f"Error: {url} - {str(e)}\n")
    # Save final state
    save_state()
    # Record end time and calculate duration
    end_time = time.time()
    duration = end_time - start_time
    # Log completion
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"\nCrawl completed at: {datetime.datetime.now()}\n")
        f.write(f"Duration: {duration:.2f} seconds\n")
        f.write(f"Processed URLs: {processed_count}\n")
        f.write(f"Saved articles: {saved_count}\n")
if __name__ == "__main__":
    print(f"Starting crawler at {START_URL}")
    print(f"Saving articles to {OUTPUT_DIR}")
    print(f"Maximum articles: {MAX_ARTICLES}")
    # Test Kasbah URL validation
    kasbah_url = KASBAH_URL
    print("\n=== TESTING KASBAH URL VALIDATION ===")
    print(f"URL: {kasbah_url}")
    parsed_url = urlparse(kasbah_url)
    print(f"Parsed URL: {parsed_url}")
    print(f"Path: {parsed_url.path}")
    print(f"Path ends with (Civ5): {parsed_url.path.endswith('(Civ5)')}")
    print(f"Is valid: {is_valid_civ5_article(kasbah_url)}")
    print("=====================================\n")
    try:
        # Start the crawl
        crawl()
    except KeyboardInterrupt:
        print("\nCrawling interrupted by user.")
        # Save state on keyboard interrupt
        save_state()
    except Exception as e:
        print(f"Unexpected error: {e}")
        # Save state on unexpected error
        save_state()
    print(f"Crawling complete. Saved {saved_count} articles.")
    print(f"Processed {processed_count} URLs in total.")