Commit 65bdc4a

Add parent image and skip build setting in staging rag.yaml. Fix labels in populate_index.py. Add use_dev_set setting in url_scraper.py. Remove unused code in url_scraping_utils.py. Update get_all_pages function in url_scraping_utils.py to fetch the zenml.io sitemap. Update logging in url_scraping_utils.py. Update get_nested_readme_urls function in url_scraping_utils.py.
1 parent ca3aed7 commit 65bdc4a

File tree

llm-complete-guide/configs/staging/rag.yaml
llm-complete-guide/steps/populate_index.py
llm-complete-guide/steps/url_scraper.py
llm-complete-guide/steps/url_scraping_utils.py

4 files changed: +24, -186 lines changed

llm-complete-guide/configs/staging/rag.yaml

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ settings:
       ZENML_ENABLE_RICH_TRACEBACK: FALSE
       ZENML_LOGGING_VERBOSITY: INFO
     python_package_installer: "uv"
-
+    parent_image: "339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml/llm_index_and_evaluate-orchestrator:bceb36ef0ab6"
+    skip_build: true
 steps:
   url_scraper:
     parameters:
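
The parent_image and skip_build pair tells ZenML to run the staging pipeline directly inside the prebuilt ECR image instead of building a fresh Docker image on every run. Below is a minimal sketch of the equivalent in-code configuration, assuming ZenML's DockerSettings API; the pipeline name is hypothetical and the image URI is taken from the YAML above.

# Minimal sketch, assuming ZenML's DockerSettings API; the pipeline name
# below is hypothetical.
from zenml import pipeline
from zenml.config import DockerSettings

docker_settings = DockerSettings(
    parent_image="339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml/llm_index_and_evaluate-orchestrator:bceb36ef0ab6",
    skip_build=True,  # reuse the prebuilt parent image instead of building a new one
)

@pipeline(settings={"docker": docker_settings})
def llm_index_and_evaluate():
    ...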

llm-complete-guide/steps/populate_index.py

Lines changed: 2 additions & 2 deletions
@@ -461,8 +461,8 @@ def draw_bar_chart(
     Returns:
         None
     """
-    if label is None:
-        label = ""
+    if labels is None:
+        labels = []

     max_value = max(data)

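The old guard referenced a singular label variable that does not exist in this scope; the fix normalizes the plural labels parameter instead. The following is a small, self-contained sketch of the pattern, assuming a simplified signature along the lines of draw_bar_chart(data, labels=None); the real function in populate_index.py takes more parameters.

# Sketch only: the real draw_bar_chart in populate_index.py has a richer
# signature; this shows just the None-guard the fix applies to labels.
from typing import List, Optional


def draw_bar_chart(data: List[int], labels: Optional[List[str]] = None) -> None:
    if labels is None:
        labels = []  # fall back to an empty list instead of a missing variable

    max_value = max(data)
    for i, value in enumerate(data):
        label = labels[i] if i < len(labels) else ""
        bar = "#" * int(20 * value / max_value) if max_value else ""
        print(f"{label:>12} | {bar} {value}")


draw_bar_chart([3, 7, 5], labels=["docs", "blog", "api"])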

llm-complete-guide/steps/url_scraper.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def url_scraper(
     """
     # We comment this out to make this pipeline faster
     # examples_readme_urls = get_nested_readme_urls(repo_url)
+    use_dev_set = False
     if use_dev_set:

         docs_urls = [
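
Hard-coding use_dev_set to False means the step always falls through to the full documentation scrape. A rough sketch of the branch as implied by the hunk above; the sample dev URL is a placeholder, and the import path for get_all_pages is an assumption about this repo's layout.

# Sketch of the toggle; the dev URL is a placeholder and the import path
# for get_all_pages is assumed.
from url_scraping_utils import get_all_pages

use_dev_set = False  # hard-coded: staging runs always scrape the full docs

if use_dev_set:
    docs_urls = [
        "https://docs.zenml.io/getting-started/installation",  # placeholder
    ]
else:
    docs_urls = get_all_pages()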

llm-complete-guide/steps/url_scraping_utils.py

Lines changed: 19 additions & 183 deletions
@@ -13,200 +13,36 @@
 # permissions and limitations under the License.

 import re
-from functools import lru_cache
-from logging import getLogger
-from time import sleep
-from typing import List, Set, Tuple
-from urllib.parse import urljoin, urlparse
-
 import requests
 from bs4 import BeautifulSoup
-from constants import RATE_LIMIT
-from ratelimit import limits, sleep_and_retry
-
-logger = getLogger(__name__)
-
-
-def is_valid_url(url: str, base: str) -> bool:
-    """
-    Check if the given URL is valid, has the same base as the provided base,
-    and does not contain any version-specific paths.
-
-    Args:
-        url (str): The URL to check.
-        base (str): The base URL to compare against.
-
-    Returns:
-        bool: True if the URL is valid, has the same base, and does not contain version-specific paths, False otherwise.
-    """
-    parsed = urlparse(url)
-    if not bool(parsed.netloc) or parsed.netloc != base:
-        return False
-
-    # Check if the URL contains a version pattern (e.g., /v/0.x.x/)
-    version_pattern = r"/v/0\.\d+\.\d+/"
-    return not re.search(version_pattern, url)
-
-
-def strip_query_params(url: str) -> str:
-    """Strip query parameters from a URL.
-
-    Args:
-        url (str): The URL to strip query parameters from.
-
-    Returns:
-        str: The URL without query parameters.
-    """
-    return url.split("?")[0]
-
-
-def get_all_pages(url: str) -> List[str]:
-    """
-    Retrieve all pages with the same base as the given URL.
-
-    Args:
-        url (str): The URL to retrieve pages from.
-
-    Returns:
-        List[str]: A list of all pages with the same base.
-    """
-    logger.info(f"Scraping all pages from {url}...")
-    base_url = urlparse(url).netloc
-
-    # Use a queue-based approach instead of recursion
-    pages = set()
-    queue = [url]
-    while queue:
-        current_url = queue.pop(0)
-        if current_url not in pages:
-            pages.add(current_url)
-            links = get_all_links(current_url, base_url)
-            queue.extend(links)
-            sleep(1 / RATE_LIMIT)  # Rate limit the requests
-
-    stripped_pages = [strip_query_params(page) for page in pages]
-
-    logger.info(f"Found {len(stripped_pages)} pages.")
-    logger.info("Done scraping pages.")
-    return list(stripped_pages)
-
-
-def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]:
-    """
-    Recursively crawl a URL and its links, retrieving all valid links with the same base.
-
-    Args:
-        url (str): The URL to crawl.
-        base (str): The base URL to compare against.
-        visited (Set[str]): A set of URLs that have been visited. Defaults to None.
-
-    Returns:
-        Set[str]: A set of all valid links with the same base.
-    """
-    if visited is None:
-        visited = set()
-
-    visited.add(url)
-    logger.debug(f"Crawling URL: {url}")
-    links = get_all_links(url, base)
-
-    for link in links:
-        if link not in visited:
-            visited.update(crawl(link, base, visited))
-            sleep(1 / RATE_LIMIT)  # Rate limit the recursive calls
-
-    return visited
-
-
-@sleep_and_retry
-@limits(calls=RATE_LIMIT, period=1)
-@lru_cache(maxsize=128)
-def get_all_links(url: str, base: str) -> List[str]:
-    """
-    Retrieve all valid links from a given URL with the same base.
-
-    Args:
-        url (str): The URL to retrieve links from.
-        base (str): The base URL to compare against.
-
-    Returns:
-        List[str]: A list of valid links with the same base.
-    """
-    logger.debug(f"Retrieving links from {url}")
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, "html.parser")
-    links = []
-
-    for link in soup.find_all("a", href=True):
-        href = link["href"]
-        full_url = urljoin(url, href)
-        parsed_url = urlparse(full_url)
-        cleaned_url = parsed_url._replace(fragment="").geturl()
-        if is_valid_url(cleaned_url, base):
-            print(cleaned_url)
-            links.append(cleaned_url)
-
-    logger.debug(f"Found {len(links)} valid links from {url}")
-    return links
-
-
-@sleep_and_retry
-@limits(calls=RATE_LIMIT, period=1)
-@lru_cache(maxsize=128)
-def get_readme_urls(repo_url: str) -> Tuple[List[str], List[str]]:
-    """
-    Retrieve folder and README links from a GitHub repository.
-
-    Args:
-        repo_url (str): The URL of the GitHub repository.
-
-    Returns:
-        Tuple[List[str], List[str]]: A tuple containing two lists: folder links and README links.
-    """
-    logger.debug(f"Retrieving README links from {repo_url}")
-    headers = {"Accept": "application/vnd.github+json"}
-    r = requests.get(repo_url, headers=headers)
-    soup = BeautifulSoup(r.text, "html.parser")
-
-    folder_links = []
-    readme_links = []
-
-    for link in soup.find_all("a", class_="js-navigation-open Link--primary"):
-        href = link["href"]
-        full_url = f"https://github.com{href}"
-        if "tree" in href:
-            folder_links.append(full_url)
-        elif "README.md" in href:
-            readme_links.append(full_url)
+from typing import List
+from logging import getLogger

-    logger.debug(
-        f"Found {len(folder_links)} folder links and {len(readme_links)} README links from {repo_url}"
-    )
-    return folder_links, readme_links

+logger = getLogger(__name__)

-def get_nested_readme_urls(repo_url: str) -> List[str]:
+def get_all_pages(base_url: str = "https://docs.zenml.io") -> List[str]:
     """
-    Retrieve all nested README links from a GitHub repository.
+    Retrieve all pages from the ZenML documentation sitemap.

     Args:
-        repo_url (str): The URL of the GitHub repository.
+        base_url (str): The base URL of the documentation. Defaults to "https://docs.zenml.io"

     Returns:
-        List[str]: A list of all nested README links.
+        List[str]: A list of all documentation page URLs.
     """
-    logger.info(f"Retrieving nested README links from {repo_url}...")
-    folder_links, readme_links = get_readme_urls(repo_url)
-
-    for folder_link in folder_links:
-        _, nested_readme_links = get_readme_urls(folder_link)
-        readme_links.extend(nested_readme_links)
-
-    logger.info(
-        f"Found {len(readme_links)} nested README links from {repo_url}"
-    )
-    return readme_links
-
+    logger.info("Fetching sitemap from docs.zenml.io...")
+
+    # Fetch the sitemap
+    sitemap_url = f"{base_url}/sitemap.xml"
+    response = requests.get(sitemap_url)
+    soup = BeautifulSoup(response.text, "xml")
+
+    # Extract all URLs from the sitemap
+    urls = [loc.text for loc in soup.find_all("loc")]
+
+    logger.info(f"Found {len(urls)} pages in the sitemap.")
+    return urls

 def extract_parent_section(url: str) -> str:
     """
