#!/usr/bin/env python3
"""A Red Hat documentation downloader.

Downloads HTML pages from a given starting URL, preserving the directory structure.
"""

import argparse
import asyncio
import logging
import sqlite3
import time
from pathlib import Path
from typing import Optional, Union
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# --- URL Handling ---

def normalize_url(url: str) -> str:
    """Normalize a URL by removing fragment identifiers and query parameters."""
    parsed = urlparse(url)
    return parsed._replace(fragment="", query="").geturl()

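# For example (illustrative URL):
#   normalize_url("https://docs.example.com/guide/ch01.html?lang=en#sect-1")
#   -> "https://docs.example.com/guide/ch01.html"
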
def is_in_scope(url: str, base_url: str) -> bool:
    """Check if a URL is within the same domain and path as the base URL."""
    parsed_url = urlparse(url)
    parsed_base_url = urlparse(base_url)
    if parsed_url.netloc != parsed_base_url.netloc:
        return False
    return parsed_url.path.startswith(parsed_base_url.path.rsplit('/', 1)[0])

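# For example (illustrative URLs): with base_url
# "https://docs.example.com/en/product/1.0/html/guide/index", links under
# "https://docs.example.com/en/product/1.0/html/guide/" are in scope, while links on
# other hosts or outside that directory are not.
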
def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -> Path:
    """Convert a URL to a local file path."""
    parsed_url = urlparse(url)
    path = parsed_url.path

    if base_url is not None:
        parsed_base_url = urlparse(base_url)
        base_path = parsed_base_url.path
        # If base_url is a directory-like path, ensure it ends with a slash for clean prefix removal
        if not base_path.endswith('/') and '.' not in base_path.split('/')[-1]:
            base_path += '/'

        if path.startswith(base_path):
            path = path[len(base_path):]

    path = path.lstrip("/")

    if path == "" or path.endswith('/'):
        path = path + "index.html"

    local_path = output_dir / path
    local_path.parent.mkdir(parents=True, exist_ok=True)
    return local_path

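# For example (illustrative): with base_url "https://docs.example.com/en/1.0/html/guide"
# and output_dir Path("docs"), the page ".../en/1.0/html/guide/ch01.html" is saved to
# "docs/ch01.html", and a trailing-slash URL maps to an "index.html" inside that directory.
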
# --- Database Functions ---

def init_database(db_path: str) -> str:
    """Initialize SQLite database."""
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS downloads (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                local_path TEXT NOT NULL,
                status TEXT NOT NULL,
                etag TEXT,
                last_modified TEXT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
            )
            """
        )
        conn.commit()
    logger.info("Database initialized at %s", db_path)
    return db_path

def record_download(
    db_path: str,
    url: str,
    local_path: str,
    status: str = "success",
    etag: Optional[str] = None,
    last_modified: Optional[str] = None,
):
    """Record a download in the database."""
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT OR REPLACE INTO downloads (url, local_path, status, etag, last_modified, timestamp) VALUES (?, ?, ?, ?, ?, datetime('now'))",
            (url, str(local_path), status, etag, last_modified),
        )
        conn.commit()

def get_download_status(db_path: str, url: str) -> tuple[Optional[str], Optional[str]]:
    """Return the stored (etag, last_modified) pair for a successfully downloaded URL."""
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT etag, last_modified FROM downloads WHERE url = ? AND status = 'success'",
            (url,),
        )
        result = cursor.fetchone()
        return result or (None, None)

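# Usage sketch (values illustrative): the stored validators feed conditional requests.
#   etag, last_modified = get_download_status(db_path, url)
#   # -> ('"abc123"', 'Tue, 01 Oct 2024 00:00:00 GMT'), or (None, None) if never downloaded
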
# --- Network Functions ---

async def fetch_page(session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    """Fetch a single page."""
    try:
        async with semaphore:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status == 200:
                    return await response.text()
                logger.warning("Failed to fetch %s: HTTP %s", url, response.status)
                return None
    except Exception as e:
        logger.error("Error fetching %s: %s", url, e)
        return None

async def download_page(
    session: aiohttp.ClientSession,
    url: str,
    output_dir: Path,
    db_path: str,
    semaphore: asyncio.Semaphore,
    force: bool,
    max_retries: int,
    base_url: str,
) -> tuple[str, bool]:
    """Download a single page and save it, reusing cached validators unless forced."""
    local_path = get_local_path(url, output_dir, base_url)

    # Unless a re-download is forced, replay the stored ETag/Last-Modified values as
    # conditional request headers so unchanged pages are not fetched again.
    headers = {}
    if not force and local_path.exists():
        etag, last_modified = get_download_status(db_path, url)
        if etag:
            headers["If-None-Match"] = etag
        if last_modified:
            headers["If-Modified-Since"] = last_modified

    for attempt in range(max_retries):
        try:
            async with semaphore:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status == 304:
                        logger.info("Not modified, skipping %s", url)
                        return url, True
                    if response.status == 200:
                        content = await response.text()
                        with open(local_path, "w", encoding="utf-8") as f:
                            f.write(content)
                        record_download(
                            db_path,
                            url,
                            str(local_path),
                            etag=response.headers.get("ETag"),
                            last_modified=response.headers.get("Last-Modified"),
                        )
                        logger.info("Downloaded %s -> %s", url, local_path)
                        return url, True
                    logger.warning("Failed to download %s: HTTP %s", url, response.status)
                    if response.status == 404:
                        break  # Don't retry on 404
        except Exception as e:
            logger.error("Error downloading %s: %s", url, e)

        if attempt < max_retries - 1:
            await asyncio.sleep(2 ** attempt)

    record_download(db_path, url, str(local_path), status="failed")
    return url, False

async def extract_links(
    session: aiohttp.ClientSession,
    url: str,
    base_url: str,
    visited_urls: set,
    semaphore: asyncio.Semaphore,
) -> set:
    """Extract all in-scope links from a page."""
    content = await fetch_page(session, url, semaphore)
    if not content:
        return set()

    soup = BeautifulSoup(content, "html.parser")
    new_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        absolute_url = normalize_url(urljoin(url, href))

        if absolute_url not in visited_urls and is_in_scope(absolute_url, base_url):
            new_links.add(absolute_url)

    return new_links

async def crawl(session: aiohttp.ClientSession, start_url: str, semaphore: asyncio.Semaphore) -> set:
    """Crawl a website to discover all pages."""
    base_url = normalize_url(start_url)
    to_visit = {base_url}
    visited_urls = set()

    while to_visit:
        url = to_visit.pop()
        if url in visited_urls:
            continue

        visited_urls.add(url)
        logger.debug("Crawling %s", url)

        new_links = await extract_links(session, url, base_url, visited_urls, semaphore)
        to_visit.update(new_links)

    # Heuristic: if the start_url doesn't look like a document page, don't include it for download.
    # Document pages usually end in .html or are in a /html-single/ directory.
    parsed_start_url = urlparse(base_url)
    if not parsed_start_url.path.endswith('.html') and '/html-single/' not in parsed_start_url.path:
        if base_url in visited_urls:
            logger.info("Excluding dispatch page from download list: %s", base_url)
            visited_urls.remove(base_url)

    logger.info("Crawling completed. Found %s pages.", len(visited_urls))
    return visited_urls

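# Note: the frontier is a set, so visit order is unordered; the crawl stays within the
# scope defined by is_in_scope(). With a non-document start URL such as a product landing
# page (illustrative), the landing page itself is dropped from the returned download list.
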
# --- Main Execution ---

async def run_downloader(
    base_url: str,
    output_dir: Union[str, Path],
    concurrency: int,
    force: bool,
    max_retries: int,
) -> tuple[bool, bool, float]:
    """Run the complete download process."""
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)
    db_path = str(output_dir_path / "download_database.sqlite")
    init_database(db_path)

    semaphore = asyncio.Semaphore(concurrency)
    start_time = time.time()

    async with aiohttp.ClientSession(trust_env=True) as session:
        discovered_urls = await crawl(session, base_url, semaphore)

        tasks = [
            download_page(session, url, output_dir_path, db_path, semaphore, force, max_retries, base_url)
            for url in discovered_urls
        ]
        results = await asyncio.gather(*tasks)

    successful_downloads = sum(1 for _, success in results if success)
    elapsed_time = time.time() - start_time

    logger.info(
        "Download process completed in %.2f seconds. %s/%s pages downloaded successfully.",
        elapsed_time,
        successful_downloads,
        len(discovered_urls),
    )

    return successful_downloads > 0, True, elapsed_time

def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(description="Download documentation from a URL.")
    parser.add_argument("--doc-url", required=True, help="The starting URL to crawl.")
    parser.add_argument("--output-dir", required=True, help="Directory to save files.")
    parser.add_argument("--concurrency", type=int, default=10, help="Concurrency level.")
    parser.add_argument("--force", action="store_true", help="Force re-download of all files.")
    parser.add_argument("--max-retries", type=int, default=3, help="Max retries for failed downloads.")
    args = parser.parse_args()

    asyncio.run(run_downloader(
        base_url=args.doc_url,
        output_dir=args.output_dir,
        concurrency=args.concurrency,
        force=args.force,
        max_retries=args.max_retries,
    ))

if __name__ == "__main__":
    main()
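
# Example invocation (script name, URL, and paths are illustrative):
#   python downloader.py --doc-url "https://docs.example.com/en/product/1.0/html/guide/" \
#       --output-dir ./downloaded_docs --concurrency 5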