Commit cc03ba1

Fix CodeQL security vulnerability and remove page limit
- Fix incomplete URL substring sanitization by using proper domain validation for Twitter/X links
- Remove 200 page limit to crawl ALL pages as requested
- Change max_pages default to None for unlimited crawling
- Update workflow to use unlimited page crawling
- Use uv pip instead of pip for consistency

Co-Authored-By: Alek <[email protected]>
1 parent 8ec4796 commit cc03ba1

2 files changed (+7, -8 lines)

.github/workflows/deploy-dev.yml

Lines changed: 2 additions & 2 deletions
@@ -46,10 +46,10 @@ jobs:
       - name: Check for dead links
         run: |
           # Install dependencies for dead link checker
-          pip install -r scripts/requirements.txt
+          uv pip install -r scripts/requirements.txt
 
           # Wait a moment for deployment to be fully ready
           sleep 30
 
           # Run dead link checker
-          python scripts/check_dead_links.py https://pcweb-gray-orca.rxc.app --max-pages 200 --timeout 15 --delay 1
+          python scripts/check_dead_links.py https://pcweb-gray-orca.rxc.app --timeout 15 --delay 1
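The workflow step above no longer passes --max-pages, so the script's CLI has to treat the flag as optional. The argument-parsing code is not part of this commit; a minimal sketch of how it could be wired, assuming argparse and the flag names used in the workflow:

```python
import argparse

# Hypothetical CLI wiring; the real argument parsing in check_dead_links.py
# is not shown in this diff. DeadLinkChecker is the class defined in that
# script (see the hunks below).
parser = argparse.ArgumentParser(description="Check a site for dead links")
parser.add_argument("base_url", help="Site root to start crawling from")
parser.add_argument("--max-pages", type=int, default=None,
                    help="Stop after this many pages; omit to crawl all pages")
parser.add_argument("--timeout", type=int, default=10,
                    help="Per-request timeout in seconds")
parser.add_argument("--delay", type=float, default=0.5,
                    help="Delay between requests in seconds")
args = parser.parse_args()

# max_pages=None matches the new constructor default and means "no limit".
checker = DeadLinkChecker(args.base_url, max_pages=args.max_pages,
                          timeout=args.timeout, delay=args.delay)
checker.run()
```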

scripts/check_dead_links.py

Lines changed: 5 additions & 6 deletions
@@ -10,14 +10,13 @@
 import time
 from collections import deque
 from urllib.parse import urljoin, urlparse
-from urllib.robotparser import RobotFileParser
 
 import requests
 from bs4 import BeautifulSoup
 
 
 class DeadLinkChecker:
-    def __init__(self, base_url, max_pages=500, timeout=10, delay=0.5):
+    def __init__(self, base_url, max_pages=None, timeout=10, delay=0.5):
         self.base_url = base_url.rstrip('/')
         self.domain = urlparse(base_url).netloc
         self.max_pages = max_pages
@@ -64,8 +63,8 @@ def check_link(self, url, source_page):
         if response.status_code == 405:
             response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
 
-        if response.status_code == 403 and 'twitter.com' in url:
-            print(f"Warning: Twitter link may be blocked by bot detection: {url}")
+        if response.status_code == 403 and parsed.netloc in ['twitter.com', 'www.twitter.com', 'x.com', 'www.x.com']:
+            print(f"Warning: Twitter/X link may be blocked by bot detection: {url}")
             return True
 
         if response.status_code >= 400:
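This hunk is the actual CodeQL fix: 'twitter.com' in url is flagged as incomplete URL substring sanitization because the substring also appears in unrelated hosts, while the replacement compares the parsed hostname against an explicit allow-list (the parsed value is presumably a urlparse(url) result computed earlier in check_link, outside this hunk). A short illustration of the difference, using a made-up URL:

```python
from urllib.parse import urlparse

# Hypothetical URL, used only to illustrate the two checks.
url = "https://twitter.com.evil.example/some-page"

# Old check: substring match, so an unrelated host that merely contains
# "twitter.com" is treated as Twitter.
print('twitter.com' in url)  # True

# New check: compare the parsed network location against an allow-list.
parsed = urlparse(url)
print(parsed.netloc in ['twitter.com', 'www.twitter.com', 'x.com', 'www.x.com'])  # False
```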
@@ -113,7 +112,7 @@ def extract_links(self, html, page_url):
 
     def crawl_page(self, url):
         """Crawl a single page and extract links."""
-        if url in self.visited_pages or len(self.visited_pages) >= self.max_pages:
+        if url in self.visited_pages or (self.max_pages and len(self.visited_pages) >= self.max_pages):
             return []
 
         self.visited_pages.add(url)
@@ -149,7 +148,7 @@ def run(self):
         print(f"Starting dead link check for {self.base_url}")
         print(f"Max pages: {self.max_pages}, Timeout: {self.timeout}s")
 
-        while self.pages_to_visit and len(self.visited_pages) < self.max_pages:
+        while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages):
             url = self.pages_to_visit.popleft()
             self.crawl_page(url)
 
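Both guards in crawl_page and run use the same pattern: a falsy max_pages (the new None default) disables the limit, while a positive value still caps the crawl. A standalone sketch of that check, with a hypothetical helper name:

```python
# Hypothetical helper; the real code inlines this truthiness check.
def reached_limit(visited_count, max_pages=None):
    """Return True only when a page limit is set and has been reached."""
    return bool(max_pages) and visited_count >= max_pages

print(reached_limit(250, max_pages=200))   # True  -> stop crawling
print(reached_limit(250, max_pages=None))  # False -> no limit, keep crawling
```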
