Commit cc03ba1

Fix CodeQL security vulnerability and remove page limit
- Fix incomplete URL substring sanitization by using proper domain validation for Twitter/X links
- Remove 200 page limit to crawl ALL pages as requested
- Change max_pages default to None for unlimited crawling
- Update workflow to use unlimited page crawling
- Use uv pip instead of pip for consistency

Co-Authored-By: Alek <[email protected]>
1 parent 8ec4796 commit cc03ba1

2 files changed (+7, -8 lines)

.github/workflows/deploy-dev.yml

Lines changed: 2 additions & 2 deletions
@@ -46,10 +46,10 @@ jobs:
       - name: Check for dead links
         run: |
           # Install dependencies for dead link checker
-          pip install -r scripts/requirements.txt
+          uv pip install -r scripts/requirements.txt
 
           # Wait a moment for deployment to be fully ready
           sleep 30
 
           # Run dead link checker
-          python scripts/check_dead_links.py https://pcweb-gray-orca.rxc.app --max-pages 200 --timeout 15 --delay 1
+          python scripts/check_dead_links.py https://pcweb-gray-orca.rxc.app --timeout 15 --delay 1
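The workflow step above no longer passes --max-pages, so the script's CLI has to treat the flag as optional. The argument-parsing code is not part of this commit; a minimal sketch of how it could be wired, assuming argparse and the flag names used in the workflow:

```python
import argparse

# Hypothetical CLI wiring; the real argument parsing in check_dead_links.py
# is not shown in this diff. DeadLinkChecker is the class defined in that
# script (see the hunks below).
parser = argparse.ArgumentParser(description="Check a site for dead links")
parser.add_argument("base_url", help="Site root to start crawling from")
parser.add_argument("--max-pages", type=int, default=None,
                    help="Stop after this many pages; omit to crawl all pages")
parser.add_argument("--timeout", type=int, default=10,
                    help="Per-request timeout in seconds")
parser.add_argument("--delay", type=float, default=0.5,
                    help="Delay between requests in seconds")
args = parser.parse_args()

# max_pages=None matches the new constructor default and means "no limit".
checker = DeadLinkChecker(args.base_url, max_pages=args.max_pages,
                          timeout=args.timeout, delay=args.delay)
checker.run()
```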

scripts/check_dead_links.py

Lines changed: 5 additions & 6 deletions
@@ -10,14 +10,13 @@
 import time
 from collections import deque
 from urllib.parse import urljoin, urlparse
-from urllib.robotparser import RobotFileParser
 
 import requests
 from bs4 import BeautifulSoup
 
 
 class DeadLinkChecker:
-    def __init__(self, base_url, max_pages=500, timeout=10, delay=0.5):
+    def __init__(self, base_url, max_pages=None, timeout=10, delay=0.5):
         self.base_url = base_url.rstrip('/')
         self.domain = urlparse(base_url).netloc
         self.max_pages = max_pages
@@ -64,8 +63,8 @@ def check_link(self, url, source_page):
         if response.status_code == 405:
             response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
 
-        if response.status_code == 403 and 'twitter.com' in url:
-            print(f"Warning: Twitter link may be blocked by bot detection: {url}")
+        if response.status_code == 403 and parsed.netloc in ['twitter.com', 'www.twitter.com', 'x.com', 'www.x.com']:
+            print(f"Warning: Twitter/X link may be blocked by bot detection: {url}")
             return True
 
         if response.status_code >= 400:
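This hunk is the actual CodeQL fix: 'twitter.com' in url is flagged as incomplete URL substring sanitization because the substring also appears in unrelated hosts, while the replacement compares the parsed hostname against an explicit allow-list (the parsed value is presumably a urlparse(url) result computed earlier in check_link, outside this hunk). A short illustration of the difference, using a made-up URL:

```python
from urllib.parse import urlparse

# Hypothetical URL, used only to illustrate the two checks.
url = "https://twitter.com.evil.example/some-page"

# Old check: substring match, so an unrelated host that merely contains
# "twitter.com" is treated as Twitter.
print('twitter.com' in url)  # True

# New check: compare the parsed network location against an allow-list.
parsed = urlparse(url)
print(parsed.netloc in ['twitter.com', 'www.twitter.com', 'x.com', 'www.x.com'])  # False
```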
@@ -113,7 +112,7 @@ def extract_links(self, html, page_url):
 
     def crawl_page(self, url):
         """Crawl a single page and extract links."""
-        if url in self.visited_pages or len(self.visited_pages) >= self.max_pages:
+        if url in self.visited_pages or (self.max_pages and len(self.visited_pages) >= self.max_pages):
             return []
 
         self.visited_pages.add(url)
@@ -149,7 +148,7 @@ def run(self):
         print(f"Starting dead link check for {self.base_url}")
         print(f"Max pages: {self.max_pages}, Timeout: {self.timeout}s")
 
-        while self.pages_to_visit and len(self.visited_pages) < self.max_pages:
+        while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages):
             url = self.pages_to_visit.popleft()
             self.crawl_page(url)
 
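Both guards in crawl_page and run use the same pattern: a falsy max_pages (the new None default) disables the limit, while a positive value still caps the crawl. A standalone sketch of that check, with a hypothetical helper name:

```python
# Hypothetical helper; the real code inlines this truthiness check.
def reached_limit(visited_count, max_pages=None):
    """Return True only when a page limit is set and has been reached."""
    return bool(max_pages) and visited_count >= max_pages

print(reached_limit(250, max_pages=200))   # True  -> stop crawling
print(reached_limit(250, max_pages=None))  # False -> no limit, keep crawling
```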
