|
13 | 13 | # permissions and limitations under the License. |
14 | 14 |
|
15 | 15 | import re |
16 | | -from functools import lru_cache |
17 | | -from logging import getLogger |
18 | | -from time import sleep |
19 | | -from typing import List, Set, Tuple |
20 | | -from urllib.parse import urljoin, urlparse |
21 | | - |
22 | 16 | import requests |
23 | 17 | from bs4 import BeautifulSoup |
24 | | -from constants import RATE_LIMIT |
25 | | -from ratelimit import limits, sleep_and_retry |
26 | | - |
27 | | -logger = getLogger(__name__) |
28 | | - |
29 | | - |
30 | | -def is_valid_url(url: str, base: str) -> bool: |
31 | | - """ |
32 | | - Check if the given URL is valid, has the same base as the provided base, |
33 | | - and does not contain any version-specific paths. |
34 | | -
|
35 | | - Args: |
36 | | - url (str): The URL to check. |
37 | | - base (str): The base URL to compare against. |
38 | | -
|
39 | | - Returns: |
40 | | - bool: True if the URL is valid, has the same base, and does not contain version-specific paths, False otherwise. |
41 | | - """ |
42 | | - parsed = urlparse(url) |
43 | | - if not bool(parsed.netloc) or parsed.netloc != base: |
44 | | - return False |
45 | | - |
46 | | - # Check if the URL contains a version pattern (e.g., /v/0.x.x/) |
47 | | - version_pattern = r"/v/0\.\d+\.\d+/" |
48 | | - return not re.search(version_pattern, url) |
49 | | - |
50 | | - |
51 | | -def strip_query_params(url: str) -> str: |
52 | | - """Strip query parameters from a URL. |
53 | | -
|
54 | | - Args: |
55 | | - url (str): The URL to strip query parameters from. |
56 | | -
|
57 | | - Returns: |
58 | | - str: The URL without query parameters. |
59 | | - """ |
60 | | - return url.split("?")[0] |
61 | | - |
62 | | - |
63 | | -def get_all_pages(url: str) -> List[str]: |
64 | | - """ |
65 | | - Retrieve all pages with the same base as the given URL. |
66 | | -
|
67 | | - Args: |
68 | | - url (str): The URL to retrieve pages from. |
69 | | -
|
70 | | - Returns: |
71 | | - List[str]: A list of all pages with the same base. |
72 | | - """ |
73 | | - logger.info(f"Scraping all pages from {url}...") |
74 | | - base_url = urlparse(url).netloc |
75 | | - |
76 | | - # Use a queue-based approach instead of recursion |
77 | | - pages = set() |
78 | | - queue = [url] |
79 | | - while queue: |
80 | | - current_url = queue.pop(0) |
81 | | - if current_url not in pages: |
82 | | - pages.add(current_url) |
83 | | - links = get_all_links(current_url, base_url) |
84 | | - queue.extend(links) |
85 | | - sleep(1 / RATE_LIMIT) # Rate limit the requests |
86 | | - |
87 | | - stripped_pages = [strip_query_params(page) for page in pages] |
88 | | - |
89 | | - logger.info(f"Found {len(stripped_pages)} pages.") |
90 | | - logger.info("Done scraping pages.") |
91 | | - return list(stripped_pages) |
92 | | - |
93 | | - |
94 | | -def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]: |
95 | | - """ |
96 | | - Recursively crawl a URL and its links, retrieving all valid links with the same base. |
97 | | -
|
98 | | - Args: |
99 | | - url (str): The URL to crawl. |
100 | | - base (str): The base URL to compare against. |
101 | | - visited (Set[str]): A set of URLs that have been visited. Defaults to None. |
102 | | -
|
103 | | - Returns: |
104 | | - Set[str]: A set of all valid links with the same base. |
105 | | - """ |
106 | | - if visited is None: |
107 | | - visited = set() |
108 | | - |
109 | | - visited.add(url) |
110 | | - logger.debug(f"Crawling URL: {url}") |
111 | | - links = get_all_links(url, base) |
112 | | - |
113 | | - for link in links: |
114 | | - if link not in visited: |
115 | | - visited.update(crawl(link, base, visited)) |
116 | | - sleep(1 / RATE_LIMIT) # Rate limit the recursive calls |
117 | | - |
118 | | - return visited |
119 | | - |
120 | | - |
121 | | -@sleep_and_retry |
122 | | -@limits(calls=RATE_LIMIT, period=1) |
123 | | -@lru_cache(maxsize=128) |
124 | | -def get_all_links(url: str, base: str) -> List[str]: |
125 | | - """ |
126 | | - Retrieve all valid links from a given URL with the same base. |
127 | | -
|
128 | | - Args: |
129 | | - url (str): The URL to retrieve links from. |
130 | | - base (str): The base URL to compare against. |
131 | | -
|
132 | | - Returns: |
133 | | - List[str]: A list of valid links with the same base. |
134 | | - """ |
135 | | - logger.debug(f"Retrieving links from {url}") |
136 | | - response = requests.get(url) |
137 | | - soup = BeautifulSoup(response.text, "html.parser") |
138 | | - links = [] |
139 | | - |
140 | | - for link in soup.find_all("a", href=True): |
141 | | - href = link["href"] |
142 | | - full_url = urljoin(url, href) |
143 | | - parsed_url = urlparse(full_url) |
144 | | - cleaned_url = parsed_url._replace(fragment="").geturl() |
145 | | - if is_valid_url(cleaned_url, base): |
146 | | - print(cleaned_url) |
147 | | - links.append(cleaned_url) |
148 | | - |
149 | | - logger.debug(f"Found {len(links)} valid links from {url}") |
150 | | - return links |
151 | | - |
152 | | - |
153 | | -@sleep_and_retry |
154 | | -@limits(calls=RATE_LIMIT, period=1) |
155 | | -@lru_cache(maxsize=128) |
156 | | -def get_readme_urls(repo_url: str) -> Tuple[List[str], List[str]]: |
157 | | - """ |
158 | | - Retrieve folder and README links from a GitHub repository. |
159 | | -
|
160 | | - Args: |
161 | | - repo_url (str): The URL of the GitHub repository. |
162 | | -
|
163 | | - Returns: |
164 | | - Tuple[List[str], List[str]]: A tuple containing two lists: folder links and README links. |
165 | | - """ |
166 | | - logger.debug(f"Retrieving README links from {repo_url}") |
167 | | - headers = {"Accept": "application/vnd.github+json"} |
168 | | - r = requests.get(repo_url, headers=headers) |
169 | | - soup = BeautifulSoup(r.text, "html.parser") |
170 | | - |
171 | | - folder_links = [] |
172 | | - readme_links = [] |
173 | | - |
174 | | - for link in soup.find_all("a", class_="js-navigation-open Link--primary"): |
175 | | - href = link["href"] |
176 | | - full_url = f"https://github.com{href}" |
177 | | - if "tree" in href: |
178 | | - folder_links.append(full_url) |
179 | | - elif "README.md" in href: |
180 | | - readme_links.append(full_url) |
| 18 | +from typing import List |
| 19 | +from logging import getLogger |
181 | 20 |
|
182 | | - logger.debug( |
183 | | - f"Found {len(folder_links)} folder links and {len(readme_links)} README links from {repo_url}" |
184 | | - ) |
185 | | - return folder_links, readme_links |
186 | 21 |
|
| 22 | +logger = getLogger(__name__) |
187 | 23 |
|
188 | | -def get_nested_readme_urls(repo_url: str) -> List[str]: |
| 24 | +def get_all_pages(base_url: str = "https://docs.zenml.io") -> List[str]: |
189 | 25 | """ |
190 | | - Retrieve all nested README links from a GitHub repository. |
| 26 | + Retrieve all pages from the ZenML documentation sitemap. |
191 | 27 |
|
192 | 28 | Args: |
193 | | - repo_url (str): The URL of the GitHub repository. |
| 29 | + base_url (str): The base URL of the documentation. Defaults to "https://docs.zenml.io".
194 | 30 |
|
195 | 31 | Returns: |
196 | | - List[str]: A list of all nested README links. |
| 32 | + List[str]: A list of all documentation page URLs. |
197 | 33 | """ |
198 | | - logger.info(f"Retrieving nested README links from {repo_url}...") |
199 | | - folder_links, readme_links = get_readme_urls(repo_url) |
200 | | - |
201 | | - for folder_link in folder_links: |
202 | | - _, nested_readme_links = get_readme_urls(folder_link) |
203 | | - readme_links.extend(nested_readme_links) |
204 | | - |
205 | | - logger.info( |
206 | | - f"Found {len(readme_links)} nested README links from {repo_url}" |
207 | | - ) |
208 | | - return readme_links |
209 | | - |
| 34 | + logger.info(f"Fetching sitemap from {base_url}...")
| 35 | + |
| 36 | + # Fetch the sitemap |
| 37 | + sitemap_url = f"{base_url}/sitemap.xml" |
| 38 | + response = requests.get(sitemap_url, timeout=30)
| 39 | + soup = BeautifulSoup(response.text, "xml") |
| 40 | + |
| 41 | + # Extract all URLs from the sitemap |
| 42 | + urls = [loc.text for loc in soup.find_all("loc")] |
| 43 | + |
| 44 | + logger.info(f"Found {len(urls)} pages in the sitemap.") |
| 45 | + return urls |
210 | 46 |
|
211 | 47 | def extract_parent_section(url: str) -> str: |
212 | 48 | """ |
|
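For reference, a minimal usage sketch of the sitemap-based get_all_pages introduced in this commit. The importable module name (url_scraping_utils) and the section-grouping step are illustrative assumptions, not part of the diff:

from urllib.parse import urlparse

from url_scraping_utils import get_all_pages  # hypothetical module name

# Fetch every documentation page URL listed in the sitemap.
pages = get_all_pages(base_url="https://docs.zenml.io")
print(f"Discovered {len(pages)} documentation pages")

# Group pages by their first path segment (e.g. "user-guide", "stacks").
sections = {}
for page in pages:
    segment = urlparse(page).path.strip("/").split("/")[0] or "home"
    sections.setdefault(segment, []).append(page)

for section, urls in sorted(sections.items()):
    print(f"{section}: {len(urls)} pages")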