2626from pathlib import Path
2727
2828import requests
29+ from packaging import version as package_version
2930from tqdm import tqdm
3031
3132from .build_embeddings import Chunk , split_markdown_by_headings
3536HF_DATASET_BASE_URL = f"https://huggingface.co/datasets/{ HF_DATASET_REPO } /resolve/main"
3637
3738
def get_latest_version_zip(library_name: str) -> str | None:
    """
    Get the latest version zip filename for a library by querying the API.

    The API listing for the library is fetched, every ``*.zip`` file other
    than ``main.zip`` is treated as a versioned build, and the filename with
    the highest version (compared with ``packaging.version``) is returned.
    Filenames whose stem is not a valid version sort lowest, so they are only
    selected when no properly versioned zip exists.

    Args:
        library_name: Name of the library (e.g., 'reachy_mini')

    Returns:
        The filename of the latest version zip (e.g., 'v1.2.13.zip'), or None
        if the API call failed or no version zip is listed.
    """
    api_url = f"{HF_DATASET_API_URL}/{library_name}"
    print(f" Querying API for available versions: {api_url}")

    try:
        # Bounded timeout so an unresponsive API cannot stall the whole run.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        files = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable JSON are treated as
        # "no version available" (best-effort lookup).
        print(f" Error querying API: {e}")
        return None

    # Anything other than a JSON list of entries is treated as "no files".
    entries = files if isinstance(files, list) else []

    # Keep only versioned zip files; non-zip entries (e.g. _versions.yml)
    # and main.zip itself are excluded.
    zip_files = []
    for entry in entries:
        if not isinstance(entry, dict) or entry.get("type") != "file":
            continue
        path = entry.get("path")
        if not isinstance(path, str):
            continue
        filename = path.rsplit("/", 1)[-1]  # Get just the filename
        # Compare the exact filename so names such as "domain.zip" are kept.
        if filename.endswith(".zip") and filename != "main.zip":
            zip_files.append(filename)

    if not zip_files:
        print(f" No version zips found for {library_name}")
        return None

    def version_key(filename: str):
        # Filenames are like "v1.2.13.zip" -> "1.2.13": drop exactly one
        # ".zip" suffix and one leading "v", nothing more.
        version_str = filename.removesuffix(".zip").removeprefix("v")
        try:
            return package_version.parse(version_str)
        except package_version.InvalidVersion:
            # Unparseable names rank below every real version.
            return package_version.parse("0")

    # max() returns the first of equally-ranked names, matching a stable
    # descending sort, without sorting the whole list.
    latest = max(zip_files, key=version_key)

    print(f" Found {len(zip_files)} versions, latest: {latest}")
    return latest
86+
87+
3888def fetch_library_directories () -> list [dict ]:
3989 """
4090 Fetch the list of library directories from the HF doc-build dataset.
@@ -55,18 +105,19 @@ def fetch_library_directories() -> list[dict]:
55105 return directories
56106
57107
58- def download_and_extract_zip (library_name : str , output_dir : Path ) -> Path | None :
108+ def download_and_extract_zip (library_name : str , output_dir : Path , zip_filename : str = "main.zip" ) -> Path | None :
59109 """
60- Download and extract the main. zip file for a library.
110+ Download and extract a zip file for a library.
61111
62112 Args:
63113 library_name: Name of the library (e.g., 'accelerate')
64114 output_dir: Directory to extract files to
115+ zip_filename: Name of the zip file to download (default: 'main.zip')
65116
66117 Returns:
67118 Path to extracted directory, or None if download failed
68119 """
69- zip_url = f"{ HF_DATASET_BASE_URL } /{ library_name } /main.zip "
120+ zip_url = f"{ HF_DATASET_BASE_URL } /{ library_name } /{ zip_filename } "
70121
71122 try :
72123 print (f" Downloading { zip_url } ..." )
@@ -96,8 +147,18 @@ def download_and_extract_zip(library_name: str, output_dir: Path) -> Path | None
96147
97148 except requests .exceptions .HTTPError as e :
98149 if e .response .status_code == 404 :
99- print (f" ⚠️ No main.zip found for { library_name } , skipping..." )
100- return None
150+ if zip_filename == "main.zip" :
151+ # Try to find and download the latest version instead
152+ print (f" ⚠️ No main.zip found for { library_name } , looking for latest version..." )
153+ latest_zip = get_latest_version_zip (library_name )
154+ if latest_zip :
155+ return download_and_extract_zip (library_name , output_dir , zip_filename = latest_zip )
156+ else :
157+ print (f" ⚠️ No versions found for { library_name } , skipping..." )
158+ return None
159+ else :
160+ print (f" ⚠️ { zip_filename } not found for { library_name } , skipping..." )
161+ return None
101162 raise
102163 except Exception as e :
103164 print (f" ❌ Error processing { library_name } : { e } " )
@@ -272,11 +333,24 @@ def process_library(
272333 if extract_path is None :
273334 return []
274335
275- # The zip extracts to: extract_path/library_name/main/en/
336+ # The zip extracts to: extract_path/library_name/{version}/en/
337+ # where {version} can be "main" or a version like "v1.2.13"
276338 # We only process the 'en' (English) folder
277- base_dir = extract_path / library_name / "main" / "en"
339+ library_dir = extract_path / library_name
340+
341+ # Find the version folder (main, v1.2.13, etc.)
342+ version_folders = [d for d in library_dir .iterdir () if d .is_dir () and not d .name .startswith ("_" )]
343+ if not version_folders :
344+ print (f" ⚠️ No version folder found for { library_name } " )
345+ return []
346+
347+ # Use the first (and typically only) version folder
348+ version_folder = version_folders [0 ]
349+ print (f" Using version folder: { version_folder .name } " )
350+
351+ base_dir = version_folder / "en"
278352 if not base_dir .exists ():
279- print (f" ⚠️ No 'main/ en' folder found for { library_name } " )
353+ print (f" ⚠️ No 'en' folder found in { version_folder . name } for { library_name } " )
280354 return []
281355 print (f" Using English docs at { base_dir } " )
282356
0 commit comments