-
-
Notifications
You must be signed in to change notification settings - Fork 6k
Open
Labels
π BugSomething isn't workingSomething isn't workingπ Root causedidentified the root cause of bugidentified the root cause of bug
Description
crawl4ai version
0.7.4
Expected Behavior
When crawling a URL that contains an explicit port number, with external-link filtering enabled, the crawler should still visit all same-site links, i.e. links whose URLs contain that host and port number.
Current Behavior
The crawler compares each URL, which still includes the port number, against the base_domain value it gets from get_base_domain(), which removes the port number.
As a result the comparison between the two never matches, and the URL is discarded instead of being accepted as a valid internal URL.
The crawler then stops after scraping the first page.
Is this reproducible?
Yes
Inputs Causing the Bug
- starting URL: http://localhost:8000 (if using fastapi)
- minimal working config: default BFSDeepCrawlStrategy and CrawlerRunConfig(exclude_external_links=True)
Steps to Reproduce
Have a FastAPI/Flask server running on localhost, with several pages reachable by the crawler from the home page
Start the crawler
Code snippets
fastapi server used:
# pyright: basic, reportMissingTypeArgument=error
import datetime
import os
from fastapi import Depends
from fastapi import FastAPI
from fastapi import Form
from fastapi import HTTPException
from fastapi import Request
from fastapi import status
from fastapi.responses import HTMLResponse
from fastapi.responses import JSONResponse
from fastapi.responses import RedirectResponse
from fastapi.security import HTTPBasic
from fastapi.security import HTTPBasicCredentials
from starlette.middleware.sessions import SessionMiddleware
app = FastAPI(title="Authentication Demo Server", version="1.0.0")

# Add session middleware.  The key is used by SessionMiddleware for the
# session cookie, so allow it to be supplied through the environment; the
# placeholder value remains only as a local-demo fallback.
app.add_middleware(
    SessionMiddleware,
    secret_key=os.environ.get("SESSION_SECRET_KEY", "your-secret-key-change-this"),
)

# Simple in-memory user database mapping username -> plaintext password
# (use a proper database in production).
USERS = {"admin": "password", "user": "123456"}

# HTTP Basic Auth setup; passed to Depends() by get_current_user_basic.
security = HTTPBasic()
def verify_credentials(username: str, password: str) -> bool:
    """Check whether a username/password pair matches the ``USERS`` table.

    Args:
        username: Account name to look up in ``USERS``.
        password: Candidate password supplied by the client.

    Returns:
        True when ``username`` is a key of ``USERS`` and ``password`` equals
        the stored password, otherwise False.
    """
    import secrets

    expected_password = USERS.get(username)
    if expected_password is None:
        return False
    # Constant-time comparison so response timing does not reveal how much of
    # a guessed password was correct.  Compare bytes because compare_digest()
    # only accepts ASCII-only str arguments.
    return secrets.compare_digest(
        password.encode("utf-8"), expected_password.encode("utf-8")
    )
def get_current_user_basic(credentials: HTTPBasicCredentials = Depends(security)) -> str:
    """Resolve the current user from HTTP Basic credentials.

    Args:
        credentials: Username/password pair injected through the ``security``
            dependency.

    Returns:
        The username carried by ``credentials`` when it verifies.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Basic`` header when
            ``verify_credentials`` rejects the pair.
    """
    is_valid = verify_credentials(credentials.username, credentials.password)
    if is_valid:
        return credentials.username

    # Rejected: answer with a Basic challenge header.
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid authentication credentials",
        headers={"WWW-Authenticate": "Basic"},
    )
def get_current_user_session(request: Request) -> str | None:
    """Look up the logged-in username stored in the request's session.

    Returns:
        The value stored under the ``"user"`` session key, or None when the
        key is absent.
    """
    session_data = request.session
    return session_data.get("user")
def get_current_user_either(request: Request, basic_user: str | None = None) -> str:
    """Return the authenticated username from the session or HTTP Basic auth.

    The session is consulted first; if it holds no user, the
    ``Authorization`` header is parsed as HTTP Basic credentials.

    Args:
        request: Incoming request whose session and headers are inspected.
        basic_user: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The authenticated username.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Basic`` header when
            neither mechanism yields valid credentials.
    """
    import base64

    # Check session first
    session_user = get_current_user_session(request)
    if session_user:
        return session_user

    # Try basic auth
    auth_header = request.headers.get("authorization", "")
    scheme, _, encoded_credentials = auth_header.partition(" ")
    # Authentication scheme names are case-insensitive, so accept "basic",
    # "BASIC", etc. as well as "Basic".
    if scheme.lower() == "basic" and encoded_credentials:
        try:
            decoded_credentials = base64.b64decode(encoded_credentials.strip()).decode("utf-8")
            username, password = decoded_credentials.split(":", 1)
        except ValueError:
            # Malformed credentials simply mean "not authenticated".  Invalid
            # base64 (binascii.Error), undecodable bytes (UnicodeDecodeError)
            # and a payload without ":" are all ValueError subclasses, so
            # unrelated programming errors are no longer swallowed here.
            pass
        else:
            if verify_credentials(username, password):
                return username

    # No valid authentication
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Authentication required",
        headers={"WWW-Authenticate": "Basic"},
    )
def require_auth(request: Request) -> str:
    """Dependency that requires either form of authentication.

    Delegates to ``get_current_user_either`` and returns the username it
    resolves; any exception it raises propagates unchanged.
    """
    authenticated_user = get_current_user_either(request)
    return authenticated_user
# HTML template for the form
FORM_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>Secure Form Authentication</title>
<style>
body {{ font-family: Arial, sans-serif; max-width: 400px; margin: 100px auto; padding: 20px; }}
.form-group {{ margin-bottom: 15px; }}
label {{ display: block; margin-bottom: 5px; }}
input[type="text"], input[type="password"] {{ width: 100%; padding: 8px; border: 1px solid #ddd; }}
input[type="submit"] {{ background: #007cba; color: white; padding: 10px 20px; border: none; cursor: pointer; }}
.error {{ color: red; margin-top: 10px; }}
.success {{ color: green; margin-top: 10px; }}
.secure-badge {{ background: #28a745; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; }}
</style>
</head>
<body>
<h2>π Secure Sign In</h2>
<div class="secure-badge">HTTPS Secured Connection</div>
{error_message}
{success_message}
<form method="POST" action="/form_auth">
<div class="form-group">
<label for="username">Username:</label>
<input type="text" id="username" name="username" required>
</div>
<div class="form-group">
<label for="password">Password:</label>
<input type="password" id="password" name="password" required>
</div>
<input id="submit" type="submit" value="Sign In">
</form>
<p style="margin-top: 30px; font-size: 12px; color: #666;">
Demo credentials:<br>
Username: admin, Password: password<br>
Username: user, Password: 123456
</p>
</body>
</html>
"""
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
print(request.headers)
current_user = get_current_user_session(request)
auth_status = f"Logged in as: {current_user}" if current_user else "Not logged in"
return f"""
<h1>FastAPI HTTPS Authentication Demo Server</h1>
<p><strong>Status:</strong> {auth_status}</p>
<p><strong>π Secure Connection:</strong> This server is running over HTTPS</p>
<h2>Authentication Methods</h2>
<p><a href="/http_auth">HTTP Basic Authentication Only</a></p>
<p><a href="/form_auth">Form Authentication</a></p>
<h2>Protected Routes (Require Either Auth Method)</h2>
<p><a href="/dashboard">Dashboard</a></p>
<p><a href="/profile">User Profile</a></p>
<p><a href="/settings">Settings</a></p>
<p><a href="/files">File Manager</a></p>
<p><a href="/admin">Admin Panel</a></p>
<p><a href="/reports">Reports</a></p>
<p><a href="/api/user-info">API: User Info (JSON)</a></p>
<p><a href="/api/data">API: Sample Data (JSON)</a></p>
<p><a href="/secure-content">Secure Content</a></p>
<p><a href="/members-only">Members Only Area</a></p>
<h2>Actions</h2>
<p><a href="/logout">Logout (clears form session)</a></p>
"""
@app.get("/http_auth", response_class=HTMLResponse)
async def http_auth(current_user: str = Depends(get_current_user_basic)):
    """Render the success page for HTTP Basic authentication.

    ``current_user`` is supplied by the ``get_current_user_basic``
    dependency, so this body only runs once that dependency has returned a
    username.
    """
    success_page = f"""
<h1>HTTP Basic Authentication Success!</h1>
<p>Welcome, <strong>{current_user}</strong>!</p>
<p>You have successfully authenticated using HTTP Basic Auth.</p>
<p><a href="/">Back to home</a></p>
"""
    return success_page
@app.get("/form_auth", response_class=HTMLResponse)
async def form_auth_get(request: Request):
    """Serve the login form, with a welcome banner if already signed in.

    Returns ``FORM_TEMPLATE`` with an empty error slot; the success slot
    holds a welcome message when the session already has a user and is
    empty otherwise.
    """
    user = get_current_user_session(request)
    if not user:
        # Nobody is signed in yet: plain form, no messages.
        return FORM_TEMPLATE.format(error_message="", success_message="")

    # Already logged in: show the form together with the welcome banner.
    success_html = f'<div class="success">Welcome, {user}! You have successfully signed in.</div><p><a href="/">Go back to home</a></p>'
    return FORM_TEMPLATE.format(error_message="", success_message=success_html)
@app.post("/form_auth", response_class=HTMLResponse)
async def form_auth_post(request: Request, username: str = Form(...), password: str = Form(...)):
    """Handle a login form submission.

    On valid credentials the username is stored under the ``"user"`` session
    key and the form is rendered with a welcome message; otherwise the form
    is rendered with an error message and the session is left untouched.
    """
    if not verify_credentials(username, password):
        # Rejected: re-render the form with the error banner.
        error_html = '<div class="error">Invalid username or password</div>'
        return FORM_TEMPLATE.format(error_message=error_html, success_message="")

    # Accepted: remember the user in the session for later requests.
    request.session["user"] = username
    success_html = f'<div class="success">Welcome, {username}! You have successfully signed in.</div><p><a href="/">Go back to home</a></p>'
    return FORM_TEMPLATE.format(error_message="", success_message=success_html)
@app.get("/logout")
async def logout(request: Request):
    """Remove the session login, if any, and redirect to the home page."""
    # pop() with a default so logging out while not logged in is a no-op.
    request.session.pop("user", None)
    home_redirect = RedirectResponse(url="/", status_code=302)
    return home_redirect
# Protected routes that accept either authentication method
@app.get("/dashboard", response_class=HTMLResponse)
async def dashboard(request: Request, current_user: str = Depends(require_auth)):
auth_method = "Form Session" if get_current_user_session(request) else "HTTP Basic"
return f"""
<h1>Dashboard</h1>
<p>Welcome to your dashboard, <strong>{current_user}</strong>!</p>
<p>This page is protected and accessible via both HTTP Basic Auth and form authentication.</p>
<h3>Quick Stats</h3>
<ul>
<li>Login time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</li>
<li>Auth method: {auth_method}</li>
<li>User role: {"Administrator" if current_user == "admin" else "User"}</li>
</ul>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/profile", response_class=HTMLResponse)
async def profile(current_user: str = Depends(require_auth)):
return f"""
<h1>User Profile</h1>
<p><strong>Username:</strong> {current_user}</p>
<p><strong>Account Type:</strong> {"Administrator" if current_user == "admin" else "Standard User"}</p>
<p><strong>Member Since:</strong> January 2024</p>
<p><strong>Last Login:</strong> {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
<h3>Profile Settings</h3>
<p>Email: {current_user}@example.com</p>
<p>Status: Active</p>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/settings", response_class=HTMLResponse)
async def settings(current_user: str = Depends(require_auth)):
return f"""
<h1>Settings</h1>
<p>Account settings for <strong>{current_user}</strong></p>
<h3>General Settings</h3>
<div style="background: #f8f9fa; padding: 15px; border-radius: 5px; margin: 10px 0;">
<p><strong>Theme:</strong> Light Mode</p>
<p><strong>Language:</strong> English</p>
<p><strong>Timezone:</strong> UTC</p>
<p><strong>Notifications:</strong> {"Enabled" if current_user == "admin" else "Limited"}</p>
</div>
<h3>Security Settings</h3>
<div style="background: #fff3cd; padding: 15px; border-radius: 5px; margin: 10px 0;">
<p><strong>Two-Factor Auth:</strong> {"Enabled" if current_user == "admin" else "Disabled"}</p>
<p><strong>Session Timeout:</strong> 30 minutes</p>
<p><strong>Last Password Change:</strong> 2024-01-15</p>
</div>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/files", response_class=HTMLResponse)
async def files(current_user: str = Depends(require_auth)):
return f"""
<h1>File Manager</h1>
<p>File access for <strong>{current_user}</strong></p>
<h3>Recent Files</h3>
<div style="background: #e9ecef; padding: 15px; border-radius: 5px; margin: 10px 0;">
<ul style="list-style-type: none; padding: 0;">
<li style="padding: 5px 0; border-bottom: 1px solid #ccc;">π document1.pdf - 2.3MB</li>
<li style="padding: 5px 0; border-bottom: 1px solid #ccc;">π spreadsheet.xlsx - 1.8MB</li>
<li style="padding: 5px 0; border-bottom: 1px solid #ccc;">πΌοΈ presentation.pptx - 4.2MB</li>
{"<li style='padding: 5px 0;'>π§ admin_config.json - 0.5MB</li>" if current_user == "admin" else ""}
</ul>
</div>
<h3>Storage Info</h3>
<p><strong>Used:</strong> {"8.8GB" if current_user == "admin" else "4.2GB"} of 10GB</p>
<p><strong>Available:</strong> {"1.2GB" if current_user == "admin" else "5.8GB"}</p>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/admin", response_class=HTMLResponse)
async def admin(current_user: str = Depends(require_auth)):
if current_user != "admin":
return f"""
<h1>Access Denied</h1>
<p style="color: red;">Administrator privileges required to access this page.</p>
<p>Current user: <strong>{current_user}</strong></p>
<p><a href="/">β Back to Home</a></p>
"""
return f"""
<h1>Admin Panel</h1>
<p>Administrative dashboard for <strong>{current_user}</strong></p>
<h3>System Status</h3>
<div style="background: #d4edda; padding: 15px; border-radius: 5px; margin: 10px 0; border: 1px solid #c3e6cb;">
<p><strong>Server Status:</strong> β
Online</p>
<p><strong>Database:</strong> β
Connected</p>
<p><strong>Cache:</strong> β
Active</p>
<p><strong>Uptime:</strong> 3 days, 14 hours</p>
</div>
<h3>User Management</h3>
<div style="background: #f8f9fa; padding: 15px; border-radius: 5px; margin: 10px 0;">
<p><strong>Total Users:</strong> 2</p>
<p><strong>Active Sessions:</strong> 1</p>
<p><strong>Failed Login Attempts:</strong> 0</p>
</div>
<h3>Recent Admin Actions</h3>
<ul>
<li>User 'admin' logged in - {datetime.datetime.now().strftime("%H:%M:%S")}</li>
<li>System backup completed - 12:00:00</li>
<li>Configuration updated - Yesterday</li>
</ul>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/reports", response_class=HTMLResponse)
async def reports(current_user: str = Depends(require_auth)):
return f"""
<h1>Reports</h1>
<p>Analytics and reports for <strong>{current_user}</strong></p>
<h3>Available Reports</h3>
<div style="background: #f8f9fa; padding: 15px; border-radius: 5px; margin: 10px 0;">
<div style="margin: 10px 0; padding: 10px; border-left: 4px solid #007cba;">
<strong>π Usage Report</strong><br>
<small>Last 30 days activity summary</small><br>
<span style="color: #666;">Generated: {datetime.datetime.now().strftime("%Y-%m-%d")}</span>
</div>
<div style="margin: 10px 0; padding: 10px; border-left: 4px solid #28a745;">
<strong>π Security Report</strong><br>
<small>Login attempts and security events</small><br>
<span style="color: #666;">{"Available for admin users" if current_user == "admin" else "Limited access"}</span>
</div>
<div style="margin: 10px 0; padding: 10px; border-left: 4px solid #ffc107;">
<strong>πΎ Storage Report</strong><br>
<small>File usage and quota information</small><br>
<span style="color: #666;">Real-time data</span>
</div>
</div>
<h3>Quick Stats</h3>
<p><strong>Total Logins Today:</strong> {"15" if current_user == "admin" else "3"}</p>
<p><strong>Files Processed:</strong> {"127" if current_user == "admin" else "8"}</p>
<p><strong>Errors:</strong> {"2" if current_user == "admin" else "0"}</p>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/secure-content", response_class=HTMLResponse)
async def secure_content(current_user: str = Depends(require_auth)):
return f"""
<h1>π Secure Content Area</h1>
<p>Protected content for authenticated user: <strong>{current_user}</strong></p>
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 15px 0;">
<h3>π― Confidential Information</h3>
<p>This area contains sensitive information that requires authentication to access.</p>
<ul>
<li>API Keys: β’β’β’β’β’β’β’β’β’β’β’β’abcd</li>
<li>Database Connection: secure-db.example.com</li>
<li>{"Admin Token: β’β’β’β’β’β’β’β’β’β’β’β’xyz9" if current_user == "admin" else "Limited access granted"}</li>
</ul>
</div>
<h3>π Secure Documents</h3>
<div style="background: #fff; padding: 15px; border: 2px dashed #007cba; border-radius: 5px; margin: 10px 0;">
<p><strong>Classification Level:</strong> {"TOP SECRET" if current_user == "admin" else "CONFIDENTIAL"}</p>
<p><strong>Access Level:</strong> {current_user.upper()}</p>
<p><strong>Document Count:</strong> {"47" if current_user == "admin" else "12"}</p>
</div>
<p style="color: #dc3545; font-weight: bold;">β οΈ All access to this area is logged and monitored.</p>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/members-only", response_class=HTMLResponse)
async def members_only(current_user: str = Depends(require_auth)):
return f"""
<h1>π₯ Members Only Area</h1>
<p>Exclusive content for member: <strong>{current_user}</strong></p>
<div style="background: #ffd700; color: #000; padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
<h2>π VIP ACCESS GRANTED</h2>
<p>Welcome to the exclusive members area!</p>
</div>
<h3>π Member Benefits</h3>
<div style="background: #f0f8ff; padding: 15px; border-radius: 5px; margin: 10px 0; border-left: 5px solid #007cba;">
<ul>
<li>β¨ Priority support access</li>
<li>π± Mobile app premium features</li>
<li>π° {"Advanced analytics dashboard" if current_user == "admin" else "Basic analytics"}</li>
<li>π― {"Full API access" if current_user == "admin" else "Limited API access"}</li>
<li>π Exclusive content library</li>
</ul>
</div>
<h3>π Member Status</h3>
<div style="background: #e8f5e8; padding: 15px; border-radius: 5px; margin: 10px 0;">
<p><strong>Membership Level:</strong> {"Premium Elite" if current_user == "admin" else "Standard"}</p>
<p><strong>Member Since:</strong> January 2024</p>
<p><strong>Points Balance:</strong> {"2,847" if current_user == "admin" else "156"}</p>
<p><strong>Next Reward:</strong> {"Unlocked!" if current_user == "admin" else "89 points away"}</p>
</div>
<p><a href="/">β Back to Home</a></p>
"""
@app.get("/api/user-info")
async def api_user_info(request: Request, current_user: str = Depends(require_auth)):
    """Return a JSON description of the authenticated user.

    The payload reports the username, whether the login came from the
    session or from HTTP Basic auth, the admin flag, the current local time
    and the permission list for the user.
    """
    is_admin = current_user == "admin"
    # A user present in the session means the form login was used.
    auth_method = "session" if get_current_user_session(request) else "basic"
    payload = {
        "username": current_user,
        "auth_method": auth_method,
        "is_admin": is_admin,
        "login_time": datetime.datetime.now().isoformat(),
        "permissions": ["read", "write"] if is_admin else ["read"],
    }
    return JSONResponse(payload)
@app.get("/api/data")
async def api_data(current_user: str = Depends(require_auth)):
    """Return sample JSON data whose contents depend on the user's role.

    The ``"admin"`` user receives three items, a count of 47 and the
    ``"admin"`` access level; any other user receives one item, a count of
    12 and the ``"user"`` access level.
    """
    is_admin = current_user == "admin"
    metadata = {
        "version": "1.0",
        "access_level": "admin" if is_admin else "user",
    }
    data = {
        "items": ["item1", "item2", "item3"] if is_admin else ["item1"],
        "count": 47 if is_admin else 12,
        "status": "active",
        "metadata": metadata,
    }
    payload = {
        "message": "Sample data endpoint",
        "user": current_user,
        "timestamp": datetime.datetime.now().isoformat(),
        "data": data,
    }
    return JSONResponse(payload)
if __name__ == "__main__":
import uvicorn
# Check if certificates exist
cert_file = "server.crt"
key_file = "server.key"
if not os.path.exists(cert_file) or not os.path.exists(key_file):
print("SSL certificates not found!")
print("Please generate them using OpenSSL:")
print("1. openssl genrsa -out server.key 2048")
print(
"2. openssl req -new -key server.key -out server.csr -subj '/C=US/ST=Dev/L=Local/O=Demo/CN=localhost'"
)
print("3. openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt")
print("4. rm server.csr")
print("\nOr run without HTTPS:")
print("uvicorn main:app --host 127.0.0.1 --port 8000")
exit(1)
print("Starting FastAPI server with HTTPS...")
print("Server will be available at: https://localhost:8000")
print("\nDemo credentials:")
print("- Username: admin, Password: password")
print("- Username: user, Password: 123456")
uvicorn.run(app, host="127.0.0.1", port=8000, ssl_keyfile=key_file, ssl_certfile=cert_file)

The issue happens here, in is_external_url(url: str, base_domain: str):
# Excerpt from the body of is_external_url(); the clause that closes this
# `try` is not part of the excerpt.
try:
    parsed = urlparse(url)
    if not parsed.netloc:  # Relative URL (no host part): reported as not external
        return False
    # Strip 'www.' from both domains for comparison
    # <- parsed.netloc keeps the port number, e.g. "localhost:8000"
    url_domain = parsed.netloc.lower().replace("www.", "")
    # <- assuming base_domain comes from get_base_domain(), it does not have
    #    the port number, e.g. "localhost"
    base = base_domain.lower().replace("www.", "")
    # Check if URL domain ends with base domain
    # <- "localhost:8000".endswith("localhost") is False, so this line returns
    #    True and the same-host URL is classified as external
    # NOTE(review): comparing the port-free parsed.hostname instead of
    # parsed.netloc would presumably avoid the mismatch -- confirm against
    # what get_base_domain() actually returns.
    return not url_domain.endswith(base)
OS
Linux
Python version
Python 3.12.11
Browser
No response
Browser version
No response
Error logs & Screenshots (if applicable)

Metadata
Metadata
Assignees
Labels
π BugSomething isn't workingSomething isn't workingπ Root causedidentified the root cause of bugidentified the root cause of bug