Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ __pycache__/
# Generated at build time by generate.py
source/_static/projects.json
source/_static/search-index.json

# Generated at build time by generate_sitemap.py
sitemap.xml
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ help:
# Generate projects.json and search-index.json from projects.yaml
generate:
@SPHINXPY=$$(head -1 "$$(which $(SPHINXBUILD))" | sed 's/^#!//') && \
$$SPHINXPY generate.py
$$SPHINXPY generate.py && \
$$SPHINXPY scripts/generate_sitemap.py

# Generate (offline, no sitemap crawling)
generate-offline:
Expand Down
4 changes: 4 additions & 0 deletions projects.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
description: >-
A PyTorch-native agentic reinforcement learning library.
docs: https://meta-pytorch.org/torchforge/
sitemap: https://meta-pytorch.org/torchforge/main/sitemap.xml
keywords: >-
agentic reinforcement learning agentic RL PyTorch native RL library
RL algorithms sequential decision making agent-based
Expand All @@ -98,6 +99,7 @@
description: >-
Communication primitives for distributed PyTorch training.
docs: https://meta-pytorch.org/torchcomms/
sitemap: https://meta-pytorch.org/torchcomms/main/sitemap.xml
keywords: >-
torchcomms communication primitives distributed training split_group
DeviceMesh AllGather AllToAll AllReduce NCCLX RCCL NCCL Gloo comms pool
Expand Down Expand Up @@ -128,6 +130,7 @@
Domain library for recommendation systems, including distributed
embedding tables and data pipelining.
docs: https://meta-pytorch.org/torchrec/
sitemap: null # TorchRec does not have a sitemap
keywords: >-
recommendation systems distributed embedding tables embedding sharding
model parallelism DMP DistributedModelParallel EmbeddingBagCollection
Expand Down Expand Up @@ -156,6 +159,7 @@
Fast, hardware-accelerated video and audio decoding with native
PyTorch tensor output.
docs: https://meta-pytorch.org/torchcodec/
sitemap: https://meta-pytorch.org/torchcodec/main/sitemap.xml
keywords: >-
VideoDecoder AudioDecoder VideoEncoder video decoding audio decoding
GPU video encoding CUDA video decoding hardware accelerated FFmpeg
Expand Down
7 changes: 7 additions & 0 deletions robots.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Robots.txt for meta-pytorch.org
# https://www.robotstxt.org/

User-agent: *
Allow: /

Sitemap: https://meta-pytorch.org/sitemap.xml
213 changes: 213 additions & 0 deletions scripts/generate_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
Sitemap Generator for meta-pytorch.org

This script generates a unified sitemap by:
1. Reading projects from projects.yaml
2. Fetching individual project sitemaps
3. Generating a single sitemap.xml with all URLs

Can be run locally or via GitHub Actions.

Usage:
python scripts/generate_sitemap.py

Options:
--validate Only validate that all project sitemaps exist
"""

import argparse
import os
import sys
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional
from xml.dom import minidom
from xml.etree import ElementTree as ET

import yaml

# Configuration
BASE_URL = "https://meta-pytorch.org"  # canonical site root used for the homepage <url> entry
PROJECTS_YAML_PATH = "projects.yaml"   # resolved relative to the repo root (main() chdirs there)
SITEMAP_OUTPUT_PATH = "sitemap.xml"    # output filename inside --output-dir

# XML namespaces
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
# Register the sitemap namespace as the default so serialized tags come out
# unprefixed (<url> instead of <ns0:url>).
ET.register_namespace("", SITEMAP_NS)


def prettify_xml(elem: ET.Element) -> str:
    """Serialize *elem* as an indented, human-readable XML string.

    The tree is round-tripped through minidom purely for its pretty
    printer; the content is unchanged.
    """
    # ET → string → minidom DOM → pretty string, two-space indent.
    serialized = ET.tostring(elem, encoding="unicode")
    return minidom.parseString(serialized).toprettyxml(indent="  ", encoding=None)


def load_projects(projects_path: str) -> list:
    """Parse the projects YAML file and return its top-level list.

    Args:
        projects_path: Path to a YAML file whose top level is a sequence
            of project mappings.

    Returns:
        The deserialized list of project dicts.
    """
    # Explicit encoding: YAML is UTF-8 by spec, and the platform default
    # encoding is not guaranteed to be UTF-8 (e.g. on Windows).
    with open(projects_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def fetch_sitemap(url: str, timeout: int = 10) -> Optional[ET.Element]:
    """Fetch *url* and parse the response body as XML.

    Args:
        url: The sitemap URL to download.
        timeout: Socket timeout in seconds for the request.

    Returns:
        The parsed root element, or ``None`` if the URL could not be
        fetched or the response was not well-formed XML. A warning is
        printed to stderr in the failure case.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            content = response.read()
        return ET.fromstring(content)
    # URLError already covers HTTPError (its subclass), so catching both
    # was redundant. Warnings go to stderr so they don't get mixed into
    # piped or logged stdout output.
    except (urllib.error.URLError, ET.ParseError) as e:
        print(f"Warning: Could not fetch sitemap from {url}: {e}", file=sys.stderr)
        return None


def get_project_sitemap_url(project: dict) -> Optional[str]:
    """Resolve the sitemap URL for a single project entry.

    Resolution order:
      1. An explicit ``sitemap`` key wins — its value may be a URL, or
         ``null`` to declare the project has no sitemap.
      2. Otherwise default to ``{docs}/sitemap.xml``.
      3. If the project has no ``docs`` URL either, return ``None``
         (previously this produced a bogus ``"/sitemap.xml"``).

    Args:
        project: One project mapping from projects.yaml.

    Returns:
        The sitemap URL, or ``None`` when no sitemap can be derived.
    """
    if "sitemap" in project:
        return project["sitemap"]  # Could be a URL or None
    docs_url = project.get("docs", "").rstrip("/")
    if not docs_url:
        # Nothing to derive a sitemap URL from — treat like an explicit null.
        return None
    return f"{docs_url}/sitemap.xml"


def _append_url(
    root: ET.Element,
    loc: str,
    lastmod: str,
    priority: str,
    changefreq: Optional[str] = None,
) -> None:
    """Append one <url> entry with the given child values to *root*."""
    url_elem = ET.SubElement(root, "url")
    ET.SubElement(url_elem, "loc").text = loc
    ET.SubElement(url_elem, "lastmod").text = lastmod
    ET.SubElement(url_elem, "priority").text = priority
    if changefreq is not None:
        ET.SubElement(url_elem, "changefreq").text = changefreq


def generate_sitemap(projects: list) -> str:
    """Generate a unified sitemap with all URLs from all projects.

    The result contains one entry for the site root, plus every <url>
    entry from each project's own sitemap. Projects whose sitemap is
    missing or unreachable contribute a single entry for their docs
    root instead.

    Args:
        projects: Project mappings loaded from projects.yaml.

    Returns:
        The pretty-printed sitemap XML as a string.
    """
    root = ET.Element("urlset", xmlns=SITEMAP_NS)
    today = datetime.now().strftime("%Y-%m-%d")

    # Main site landing page gets top priority.
    _append_url(root, f"{BASE_URL}/", today, "1.0", changefreq="weekly")

    # Fetch and merge each project's sitemap.
    for project in projects:
        sitemap_url = get_project_sitemap_url(project)
        docs_root = project.get("docs", "").rstrip("/") + "/"

        if sitemap_url is not None:
            print(f"Fetching sitemap: {sitemap_url}")
            sitemap_root = fetch_sitemap(sitemap_url)
        else:
            sitemap_root = None

        if sitemap_root is None:
            # No sitemap configured, or fetch/parse failed: index at
            # least the project's docs root. (Previously this fallback
            # was copy-pasted in two places.)
            _append_url(root, docs_root, today, "0.8")
            continue

        # Clone every <url> entry from the project sitemap, stripping
        # the namespace so the unified document uses the default one.
        for url_entry in sitemap_root.findall(f".//{{{SITEMAP_NS}}}url"):
            new_url = ET.SubElement(root, "url")
            has_lastmod = False
            for child in url_entry:
                tag_name = child.tag.replace(f"{{{SITEMAP_NS}}}", "")
                new_child = ET.SubElement(new_url, tag_name)
                new_child.text = child.text
                if tag_name == "lastmod":
                    has_lastmod = True
            # Add lastmod if the source sitemap didn't carry one.
            if not has_lastmod:
                ET.SubElement(new_url, "lastmod").text = today

    return prettify_xml(root)


def validate_sitemaps(projects: list) -> bool:
    """Check that every configured project sitemap is fetchable and parseable.

    Projects whose sitemap resolves to ``None`` are skipped with a note.

    Args:
        projects: Project mappings loaded from projects.yaml.

    Returns:
        True when every configured sitemap was fetched successfully.
    """
    failures = 0
    for project in projects:
        sitemap_url = get_project_sitemap_url(project)
        if sitemap_url is None:
            print(f"Skipping {project.get('id', 'unknown')}: no sitemap configured")
            continue

        print(f"Validating: {sitemap_url} ... ", end="")
        ok = fetch_sitemap(sitemap_url) is not None
        print("OK" if ok else "MISSING or INVALID")
        if not ok:
            failures += 1

    return failures == 0


def main():
    """CLI entry point: validate project sitemaps, or generate sitemap.xml."""
    parser = argparse.ArgumentParser(
        description="Generate sitemap for meta-pytorch.org"
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Only validate that all project sitemaps exist",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=".",
        help="Output directory for generated sitemap",
    )

    args = parser.parse_args()

    # Run from the repo root so PROJECTS_YAML_PATH and a relative
    # --output-dir resolve the same way no matter where the script was
    # launched from.
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    os.chdir(project_root)

    # Load projects
    print(f"Loading projects from {PROJECTS_YAML_PATH}...")
    projects = load_projects(PROJECTS_YAML_PATH)
    print(f"Found {len(projects)} projects")

    if args.validate:
        print("\nValidating project sitemaps...")
        if validate_sitemaps(projects):
            print("\nAll sitemaps are valid!")
            sys.exit(0)
        else:
            print("\nSome sitemaps are missing or invalid.")
            sys.exit(1)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Generate sitemap
    print("\nGenerating sitemap...")
    sitemap = generate_sitemap(projects)
    sitemap_path = output_dir / SITEMAP_OUTPUT_PATH
    # Write explicitly as UTF-8: merged project sitemaps may contain
    # non-ASCII characters, and the platform default encoding may not
    # handle them (XML defaults to UTF-8 absent a declaration).
    sitemap_path.write_text(sitemap, encoding="utf-8")
    print(f"Saved sitemap to {sitemap_path}")

    print("\nDone!")


if __name__ == "__main__":
    main()
6 changes: 5 additions & 1 deletion source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@

html_title = "Meta-PyTorch"
html_static_path = ["_static"]
html_extra_path = ["_html_extra"]
html_extra_path = [
"_html_extra",
"../sitemap.xml",
"../robots.txt",
]
html_css_files = ["css/custom.css"]
html_show_sourcelink = False
html_sidebars = {"**": []}
Expand Down