Skip to content

Commit 50b1790

Browse files
authored
Fall back to the latest version zip when main.zip doesn't exist (#748)
Some libraries (e.g., reachy_mini) don't have a main.zip file in the hf-doc-build/doc-build dataset. This change adds fallback logic to:
- Query the API to find available version zips when main.zip returns 404
- Sort versions using the packaging library to find the latest
- Download and extract the latest versioned zip instead
- Dynamically find the version folder instead of hardcoding "main"

Also updates the workflow to test with the reachy_mini library.
1 parent 024a050 commit 50b1790

File tree

2 files changed

+83
-9
lines changed

2 files changed

+83
-9
lines changed

.github/workflows/populate_search_engine.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
HF_IE_URL: ${{ secrets.HF_IE_URL }}
3535
HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
3636
MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
37-
run: uv run doc-builder populate-search-engine --libraries transformers
37+
run: uv run doc-builder populate-search-engine --libraries reachy_mini
3838

3939
# gradio-job:
4040
# runs-on: ubuntu-latest

src/doc_builder/process_hf_docs.py

Lines changed: 82 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pathlib import Path
2727

2828
import requests
29+
from packaging import version as package_version
2930
from tqdm import tqdm
3031

3132
from .build_embeddings import Chunk, split_markdown_by_headings
@@ -35,6 +36,55 @@
3536
HF_DATASET_BASE_URL = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main"
3637

3738

39+
def get_latest_version_zip(library_name: str, timeout: float = 30.0) -> str | None:
    """
    Get the latest version zip filename for a library by querying the API.

    The API response is expected to be a JSON list of entries, each with a
    "type" and a "path" key. Every entry of type "file" whose filename ends in
    ".zip" (other than "main.zip" itself) is treated as a versioned archive.

    Args:
        library_name: Name of the library (e.g., 'reachy_mini')
        timeout: Seconds to wait for the API before giving up, so that a
            stalled connection cannot hang the whole run

    Returns:
        The filename of the latest version zip (e.g., 'v1.2.13.zip'), or None
        if no version zip exists or the API could not be queried
    """
    api_url = f"{HF_DATASET_API_URL}/{library_name}"
    print(f" Querying API for available versions: {api_url}")

    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        files = response.json()

        # Compare on the bare filename: a substring test against the full path
        # would also discard files such as "domain.zip".
        zip_files = []
        for entry in files:
            filename = entry["path"].split("/")[-1]
            if entry["type"] == "file" and filename.endswith(".zip") and filename != "main.zip":
                zip_files.append(filename)
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
        # Network/HTTP failure, a non-JSON body, or a payload with an unexpected
        # shape: report it and let the caller skip this library.
        print(f" Error querying API: {e}")
        return None

    if not zip_files:
        print(f" No version zips found for {library_name}")
        return None

    def version_key(filename: str):
        # "v1.2.13.zip" -> "1.2.13": drop exactly one suffix and one prefix.
        version_str = filename.removesuffix(".zip").removeprefix("v")
        try:
            return package_version.parse(version_str)
        except package_version.InvalidVersion:
            # Names that are not valid versions rank alongside version "0".
            return package_version.parse("0")

    # max() returns the first of equally ranked names, matching a stable
    # descending sort, without sorting the whole list.
    latest = max(zip_files, key=version_key)

    print(f" Found {len(zip_files)} versions, latest: {latest}")
    return latest
86+
87+
3888
def fetch_library_directories() -> list[dict]:
3989
"""
4090
Fetch the list of library directories from the HF doc-build dataset.
@@ -55,18 +105,19 @@ def fetch_library_directories() -> list[dict]:
55105
return directories
56106

57107

58-
def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None:
108+
def download_and_extract_zip(library_name: str, output_dir: Path, zip_filename: str = "main.zip") -> Path | None:
59109
"""
60-
Download and extract the main.zip file for a library.
110+
Download and extract a zip file for a library.
61111
62112
Args:
63113
library_name: Name of the library (e.g., 'accelerate')
64114
output_dir: Directory to extract files to
115+
zip_filename: Name of the zip file to download (default: 'main.zip')
65116
66117
Returns:
67118
Path to extracted directory, or None if download failed
68119
"""
69-
zip_url = f"{HF_DATASET_BASE_URL}/{library_name}/main.zip"
120+
zip_url = f"{HF_DATASET_BASE_URL}/{library_name}/{zip_filename}"
70121

71122
try:
72123
print(f" Downloading {zip_url}...")
@@ -96,8 +147,18 @@ def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None
96147

97148
except requests.exceptions.HTTPError as e:
98149
if e.response.status_code == 404:
99-
print(f" ⚠️ No main.zip found for {library_name}, skipping...")
100-
return None
150+
if zip_filename == "main.zip":
151+
# Try to find and download the latest version instead
152+
print(f" ⚠️ No main.zip found for {library_name}, looking for latest version...")
153+
latest_zip = get_latest_version_zip(library_name)
154+
if latest_zip:
155+
return download_and_extract_zip(library_name, output_dir, zip_filename=latest_zip)
156+
else:
157+
print(f" ⚠️ No versions found for {library_name}, skipping...")
158+
return None
159+
else:
160+
print(f" ⚠️ {zip_filename} not found for {library_name}, skipping...")
161+
return None
101162
raise
102163
except Exception as e:
103164
print(f" ❌ Error processing {library_name}: {e}")
@@ -272,11 +333,24 @@ def process_library(
272333
if extract_path is None:
273334
return []
274335

275-
# The zip extracts to: extract_path/library_name/main/en/
336+
# The zip extracts to: extract_path/library_name/{version}/en/
337+
# where {version} can be "main" or a version like "v1.2.13"
276338
# We only process the 'en' (English) folder
277-
base_dir = extract_path / library_name / "main" / "en"
339+
library_dir = extract_path / library_name
340+
341+
# Find the version folder (main, v1.2.13, etc.)
342+
version_folders = [d for d in library_dir.iterdir() if d.is_dir() and not d.name.startswith("_")]
343+
if not version_folders:
344+
print(f" ⚠️ No version folder found for {library_name}")
345+
return []
346+
347+
# Use the first (and typically only) version folder
348+
version_folder = version_folders[0]
349+
print(f" Using version folder: {version_folder.name}")
350+
351+
base_dir = version_folder / "en"
278352
if not base_dir.exists():
279-
print(f" ⚠️ No 'main/en' folder found for {library_name}")
353+
print(f" ⚠️ No 'en' folder found in {version_folder.name} for {library_name}")
280354
return []
281355
print(f" Using English docs at {base_dir}")
282356

0 commit comments

Comments
 (0)