Commit f04dad5

Merge pull request #438 from max-svistunov/lcore-308-arbitrary-product-fetching
LCORE-308 Generalize product docs fetching from OpenShift to any product(s)
2 parents 5d79087 + 44405e7 commit f04dad5
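The new script takes any documentation start URL, so the same entry point can be driven once per product. Below is a minimal sketch of calling run_downloader in a loop; the product names, URLs, and output directories are hypothetical and do not come from the PR.

import asyncio

from downloader import run_downloader  # assumption: downloader.py is on the import path

# Hypothetical product list; any documentation start URL works the same way.
PRODUCTS = {
    "product-a": "https://docs.example.com/product-a/html-single/guide/",
    "product-b": "https://docs.example.com/product-b/html-single/install/",
}

async def fetch_all() -> None:
    for name, url in PRODUCTS.items():
        ok, _, elapsed = await run_downloader(
            base_url=url,
            output_dir=f"downloaded_docs/{name}",
            concurrency=10,
            force=False,
            max_retries=3,
        )
        print(f"{name}: success={ok}, {elapsed:.1f}s")

if __name__ == "__main__":
    asyncio.run(fetch_all())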

File tree: 15 files changed (+724, −1537 lines)


scripts/doc_downloader/downloader.py

Lines changed: 280 additions & 0 deletions
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""A Red Hat documentation downloader.

Downloads HTML pages from a given starting URL, preserving the directory structure.
"""

import argparse
import asyncio
import datetime
import json
import logging
import sqlite3
import sys
import time
from pathlib import Path
from typing import Optional, Union
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# --- URL Handling ---

def normalize_url(url: str) -> str:
    """Normalize a URL by removing fragment identifiers and query parameters."""
    parsed = urlparse(url)
    return parsed._replace(fragment="", query="").geturl()

def is_in_scope(url: str, base_url: str) -> bool:
    """Check if a URL is within the same domain and path as the base URL."""
    parsed_url = urlparse(url)
    parsed_base_url = urlparse(base_url)
    if parsed_url.netloc != parsed_base_url.netloc:
        return False
    return parsed_url.path.startswith(parsed_base_url.path.rsplit('/', 1)[0])

def get_local_path(url: str, output_dir: Path, base_url: Optional[str] = None) -> Path:
    """Convert a URL to a local file path."""
    parsed_url = urlparse(url)
    path = parsed_url.path

    if base_url is not None:
        parsed_base_url = urlparse(base_url)
        base_path = parsed_base_url.path
        # If base_url is a directory-like path, ensure it ends with a slash for clean prefix removal
        if not base_path.endswith('/') and '.' not in base_path.split('/')[-1]:
            base_path += '/'

        if path.startswith(base_path):
            path = path[len(base_path):]

    path = path.lstrip("/")

    if path == "" or path.endswith('/'):
        path = path + "index.html"

    local_path = output_dir / path
    local_path.parent.mkdir(parents=True, exist_ok=True)
    return local_path

# --- Database Functions ---

def init_database(db_path: str) -> str:
    """Initialize SQLite database."""
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS downloads (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                local_path TEXT NOT NULL,
                status TEXT NOT NULL,
                etag TEXT,
                last_modified TEXT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
            )
            """
        )
        conn.commit()
    logger.info("Database initialized at %s", db_path)
    return db_path

def record_download(
    db_path: str,
    url: str,
    local_path: str,
    status: str = "success",
    etag: Optional[str] = None,
    last_modified: Optional[str] = None,
):
    """Record a download in the database."""
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT OR REPLACE INTO downloads (url, local_path, status, etag, last_modified, timestamp) VALUES (?, ?, ?, ?, ?, datetime('now'))",
            (url, str(local_path), status, etag, last_modified),
        )
        conn.commit()

def get_download_status(db_path: str, url: str) -> tuple:
    """Get download status from database."""
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT etag, last_modified FROM downloads WHERE url = ? AND status = 'success'",
            (url,),
        )
        result = cursor.fetchone()
        return result or (None, None)

# --- Network Functions ---

async def fetch_page(session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    """Fetch a single page."""
    try:
        async with semaphore:
            async with session.get(url, timeout=30) as response:
                if response.status == 200:
                    return await response.text()
                logger.warning("Failed to fetch %s: HTTP %s", url, response.status)
                return None
    except Exception as e:
        logger.error("Error fetching %s: %s", url, e)
        return None

async def download_page(
    session: aiohttp.ClientSession,
    url: str,
    output_dir: Path,
    db_path: str,
    semaphore: asyncio.Semaphore,
    force: bool,
    max_retries: int,
    base_url: str,
) -> tuple[str, bool]:
    """Download a single page and save it."""
    local_path = get_local_path(url, output_dir, base_url)

    for attempt in range(max_retries):
        try:
            async with semaphore:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.text()
                        with open(local_path, "w", encoding="utf-8") as f:
                            f.write(content)
                        record_download(db_path, url, str(local_path))
                        logger.info("Downloaded %s -> %s", url, local_path)
                        return url, True
                    else:
                        logger.warning("Failed to download %s: HTTP %s", url, response.status)
                        if response.status == 404:
                            break  # Don't retry on 404
        except Exception as e:
            logger.error("Error downloading %s: %s", url, e)

        if attempt < max_retries - 1:
            await asyncio.sleep(2 ** attempt)

    record_download(db_path, url, str(local_path), status="failed")
    return url, False

async def extract_links(
    session: aiohttp.ClientSession,
    url: str,
    base_url: str,
    visited_urls: set,
    semaphore: asyncio.Semaphore,
) -> set:
    """Extract all in-scope links from a page."""
    content = await fetch_page(session, url, semaphore)
    if not content:
        return set()

    soup = BeautifulSoup(content, "html.parser")
    new_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        absolute_url = normalize_url(urljoin(url, href))

        if absolute_url not in visited_urls and is_in_scope(absolute_url, base_url):
            new_links.add(absolute_url)

    return new_links

async def crawl(session: aiohttp.ClientSession, start_url: str, semaphore: asyncio.Semaphore) -> set:
    """Crawl a website to discover all pages."""
    base_url = normalize_url(start_url)
    to_visit = {base_url}
    visited_urls = set()

    while to_visit:
        url = to_visit.pop()
        if url in visited_urls:
            continue

        visited_urls.add(url)
        logger.debug("Crawling %s", url)

        new_links = await extract_links(session, url, base_url, visited_urls, semaphore)
        to_visit.update(new_links)

    # Heuristic: if the start_url doesn't look like a document page, don't include it for download.
    # Document pages usually end in .html or are in a /html-single/ directory.
    parsed_start_url = urlparse(base_url)
    if not parsed_start_url.path.endswith('.html') and '/html-single/' not in parsed_start_url.path:
        if base_url in visited_urls:
            logger.info("Excluding dispatch page from download list: %s", base_url)
            visited_urls.remove(base_url)

    logger.info("Crawling completed. Found %s pages.", len(visited_urls))
    return visited_urls

# --- Main Execution ---

async def run_downloader(
    base_url: str,
    output_dir: Union[str, Path],
    concurrency: int,
    force: bool,
    max_retries: int,
) -> tuple[bool, bool, float]:
    """Run the complete download process."""
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)
    db_path = str(output_dir_path / "download_database.sqlite")
    init_database(db_path)

    semaphore = asyncio.Semaphore(concurrency)
    start_time = time.time()

    async with aiohttp.ClientSession(trust_env=True) as session:
        discovered_urls = await crawl(session, base_url, semaphore)

        tasks = [
            download_page(session, url, output_dir_path, db_path, semaphore, force, max_retries, base_url)
            for url in discovered_urls
        ]
        results = await asyncio.gather(*tasks)

    successful_downloads = sum(1 for _, success in results if success)
    elapsed_time = time.time() - start_time

    logger.info(
        "Download process completed in %.2f seconds. %s/%s pages downloaded successfully.",
        elapsed_time,
        successful_downloads,
        len(discovered_urls),
    )

    return successful_downloads > 0, True, elapsed_time

def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(description="Download documentation from a URL.")
    parser.add_argument("--doc-url", required=True, help="The starting URL to crawl.")
    parser.add_argument("--output-dir", required=True, help="Directory to save files.")
    parser.add_argument("--concurrency", type=int, default=10, help="Concurrency level.")
    parser.add_argument("--force", action="store_true", help="Force re-download of all files.")
    parser.add_argument("--max-retries", type=int, default=3, help="Max retries for failed downloads.")
    args = parser.parse_args()

    asyncio.run(run_downloader(
        base_url=args.doc_url,
        output_dir=args.output_dir,
        concurrency=args.concurrency,
        force=args.force,
        max_retries=args.max_retries,
    ))

if __name__ == "__main__":
    main()
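
The SQLite database written to the output directory keeps a record of every page. Below is a minimal sketch of inspecting it after a run with the get_download_status helper defined above; the paths and URL are placeholders, and the import assumes downloader.py is on the import path.

import sqlite3

from downloader import get_download_status  # assumption: downloader.py is importable

db = "downloaded_docs/product-a/download_database.sqlite"  # placeholder path

# Per-URL lookup via the helper. Note that in this file download_page never passes
# etag/last_modified to record_download, so both values come back as None.
etag, last_modified = get_download_status(
    db, "https://docs.example.com/product-a/html-single/guide/index.html"  # placeholder URL
)
print(etag, last_modified)

# Failed pages are recorded too and can be queried straight from the downloads table.
with sqlite3.connect(db) as conn:
    for url, local_path in conn.execute(
        "SELECT url, local_path FROM downloads WHERE status = 'failed'"
    ):
        print("failed:", url, "->", local_path)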
