
Commit 3d54b84

Change dependency update function to operate on index.html only rather than copy whls over (#7487)

Fixes: pytorch/pytorch#159409

We now process the index.html file rather than the whl files themselves. This will allow us to remove all conda and PyPI dependencies from our S3 bucket and CDN.

CUDA package example: https://pypi.nvidia.com/nvidia-cuda-nvrtc/
Simple PyPI index example: https://pypi.org/simple/filelock/

Deployment strategy:
1. Deploy only one CUDA and one non-CUDA package index in nightly
2. Delete these packages from the nightly repo
3. Test
4. Deploy the rest of the dependencies
5. Test
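For background, a PEP 503 "simple" index is just an HTML page of <a> links to package artifacts, which is why the whole sync can be reduced to copying and rewriting one index.html per package. The sketch below is not part of this commit; it fetches the filelock index cited above using only the standard library and lists its entries with the same link regex that update_dependencies.py uses:

import re
from urllib.request import urlopen

# Fetch the PEP 503 simple index page cited in the commit message.
with urlopen("https://pypi.org/simple/filelock/") as conn:
    html = conn.read().decode("utf-8", errors="ignore")

# Each artifact is an <a href="...">filename</a> entry. On pypi.org the hrefs
# are absolute; indexes that use relative hrefs must be rewritten before the
# page can be re-hosted under download.pytorch.org.
links = re.findall('<a href="([^"]+)"[^>]*>([^>]+)</a>', html)
print(f"{len(links)} artifacts listed, first entry: {links[0][1]}")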
1 parent 4ae55aa commit 3d54b84

File tree

3 files changed: +151 -96 lines changed

.github/workflows/update-s3-dependencies.yml

Lines changed: 0 additions & 5 deletions
@@ -1,11 +1,6 @@
 name: Update S3 HTML dependencies for download.pytorch.org nightly and test
 
 on:
-  push:
-    branches:
-      - main
-    paths:
-      - s3_management/update_dependencies.py
   workflow_dispatch:
     inputs:
       dryrun:
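Net effect: the automatic run on pushes to main that touched s3_management/update_dependencies.py is removed, leaving workflow_dispatch as the only trigger. If dispatching from the GitHub CLI, something like gh workflow run update-s3-dependencies.yml -f dryrun=true should work, though the exact value is an assumption since the dryrun input definition is truncated in this diff.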

s3_management/manage.py

Lines changed: 43 additions & 11 deletions
@@ -563,7 +563,36 @@ def to_simple_packages_html(
         out.append("<!DOCTYPE html>")
         out.append("<html>")
         out.append(" <body>")
-        for pkg_name in sorted(self.get_package_names(subdir)):
+
+        # Get packages from wheel files
+        packages_from_wheels = set(self.get_package_names(subdir))
+
+        # Also find packages that have index.html but no wheels
+        packages_with_index_only = set()
+        resolved_subdir = self._resolve_subdir(subdir)
+
+        # List all objects in the subdir to find packagename/index.html patterns
+        prefix_to_search = f"{resolved_subdir}/"
+        for obj in BUCKET.objects.filter(Prefix=prefix_to_search):
+            # Check if this is a packagename/index.html file
+            relative_key = obj.key[len(prefix_to_search) :]
+            parts = relative_key.split("/")
+            if len(parts) == 2 and parts[1] == "index.html":
+                package_name = parts[0].replace("-", "_")
+                # Convert back to the format used in wheel names (use _ not -)
+                # But we need to check if this package already has wheels
+                if package_name.lower() not in {
+                    p.lower() for p in packages_from_wheels
+                }:
+                    packages_with_index_only.add(package_name)
+                    print(
+                        f"INFO: Including package '{package_name}' in {prefix_to_search} (has index.html but no wheels)"
+                    )
+
+        # Combine both sets of packages
+        all_packages = packages_from_wheels | packages_with_index_only
+
+        for pkg_name in sorted(all_packages):
             out.append(
                 f' <a href="{pkg_name.lower().replace("_", "-")}/">{pkg_name.replace("_", "-")}</a><br/>'
             )
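The discovery pass above hinges on a simple key-shape test: an object counts as a package index only if its key, relative to the subdir prefix, is exactly <package>/index.html. A standalone illustration of that string handling (the example key is hypothetical; no S3 access needed):

# Hypothetical S3 object key under a nightly subdir.
prefix_to_search = "whl/nightly/"
key = "whl/nightly/nvidia-cuda-nvrtc-cu12/index.html"

relative_key = key[len(prefix_to_search):]  # "nvidia-cuda-nvrtc-cu12/index.html"
parts = relative_key.split("/")             # ["nvidia-cuda-nvrtc-cu12", "index.html"]
if len(parts) == 2 and parts[1] == "index.html":
    # Normalize to the underscore form used in wheel file names.
    package_name = parts[0].replace("-", "_")
    print(package_name)  # nvidia_cuda_nvrtc_cu12

Note that a deeper key such as subdir/foo/bar/index.html splits into three parts and is skipped, which keeps nested objects out of the package list.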
@@ -588,6 +617,7 @@ def upload_libtorch_html(self) -> None:
     def upload_pep503_htmls(self) -> None:
         for subdir in self.subdirs:
             index_html = self.to_simple_packages_html(subdir=subdir)
+
             for bucket in INDEX_BUCKETS:
                 print(f"INFO Uploading {subdir}/index.html to {bucket.name}")
                 bucket.Object(key=f"{subdir}/index.html").put(
@@ -692,16 +722,18 @@ def fetch_metadata(self) -> None:
         # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible.
         regex_multipart_upload = r"^[A-Za-z0-9+/=]+=-[0-9]+$"
         with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
-            for idx, future in {
-                idx: executor.submit(
-                    lambda key: CLIENT.head_object(
-                        Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled"
-                    ),
-                    obj.orig_key,
-                )
-                for (idx, obj) in enumerate(self.objects)
-                if obj.size is None
-            }.items():
+            futures = {}
+            for idx, obj in enumerate(self.objects):
+                if obj.size is None:
+                    future = executor.submit(
+                        lambda key: CLIENT.head_object(
+                            Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled"
+                        ),
+                        obj.orig_key,
+                    )
+                    futures[idx] = future
+
+            for idx, future in futures.items():
                 response = future.result()
                 raw = response.get("ChecksumSHA256")
                 if raw and match(regex_multipart_upload, raw):
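This fetch_metadata change is behavior-preserving: the dict comprehension that submitted HEAD requests while being iterated is unrolled into an explicit submit loop followed by a collection loop, so every request is in flight before the first result is awaited. The same pattern in isolation (a self-contained sketch, not the repo's code; fetch is a stand-in for CLIENT.head_object):

import concurrent.futures

def fetch(key: str) -> str:
    return f"metadata-for-{key}"  # stand-in for the real S3 HEAD request

keys = ["a", "b", "c"]
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # Submit all work first so the calls overlap...
    futures = {idx: executor.submit(fetch, key) for idx, key in enumerate(keys)}
    # ...then drain results; future.result() blocks until that call completes.
    for idx, future in futures.items():
        print(idx, future.result())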

s3_management/update_dependencies.py

Lines changed: 108 additions & 80 deletions
@@ -569,108 +569,141 @@
 }
 
 
+def is_nvidia_package(pkg_name: str) -> bool:
+    """Check if a package is from NVIDIA and should use pypi.nvidia.com"""
+    return pkg_name.startswith("nvidia-")
+
+
+def get_package_source_url(pkg_name: str) -> str:
+    """Get the source URL for a package based on its type"""
+    if is_nvidia_package(pkg_name):
+        return f"https://pypi.nvidia.com/{pkg_name}/"
+    else:
+        return f"https://pypi.org/simple/{pkg_name}/"
+
+
 def download(url: str) -> bytes:
     from urllib.request import urlopen
 
     with urlopen(url) as conn:
         return conn.read()
 
 
-def is_stable(package_version: str) -> bool:
-    return bool(re.match(r"^([0-9]+\.)+[0-9]+$", package_version))
+def replace_relative_links_with_absolute(html: str, base_url: str) -> str:
+    """
+    Replace all relative links in HTML with absolute links.
+
+    Args:
+        html: HTML content as string
+        base_url: Base URL to prepend to relative links
+
+    Returns:
+        Modified HTML with absolute links
+    """
+    # Ensure base_url ends with /
+    if not base_url.endswith("/"):
+        base_url += "/"
+
+    # Pattern to match href attributes with relative URLs (not starting with http:// or https://)
+    def replace_href(match):
+        full_match = match.group(0)
+        url = match.group(1)
+
+        # If URL is already absolute, don't modify it
+        if (
+            url.startswith("http://")
+            or url.startswith("https://")
+            or url.startswith("//")
+        ):
+            return full_match
+
+        # Remove leading ./ or /
+        url = url.lstrip("./")
+        url = url.lstrip("/")
+
+        # Replace with absolute URL
+        return f'href="{base_url}{url}"'
+
+    # Replace href="..." patterns
+    html = re.sub(r'href="([^"]+)"', replace_href, html)
+
+    return html
 
 
-def parse_simple_idx(url: str) -> Dict[str, str]:
-    html = download(url).decode("ascii")
-    return {
+def parse_simple_idx(url: str) -> tuple[Dict[str, str], str]:
+    """
+    Parse a simple package index and return package dict and raw HTML.
+
+    Returns:
+        Tuple of (package_dict, raw_html)
+    """
+    html = download(url).decode("utf-8", errors="ignore")
+    packages = {
         name: url
         for (url, name) in re.findall('<a href="([^"]+)"[^>]*>([^>]+)</a>', html)
     }
+    return packages, html
 
 
-def get_whl_versions(idx: Dict[str, str]) -> List[str]:
-    return [
-        k.split("-")[1]
-        for k in idx.keys()
-        if k.endswith(".whl") and is_stable(k.split("-")[1])
-    ]
+def upload_index_html(
+    pkg_name: str,
+    prefix: str,
+    html: str,
+    base_url: str,
+    *,
+    dry_run: bool = False,
+) -> None:
+    """Upload modified index.html to S3 with absolute links"""
+    # Replace relative links with absolute links
+    modified_html = replace_relative_links_with_absolute(html, base_url)
 
+    index_key = f"{prefix}/{pkg_name}/index.html"
 
-def get_wheels_of_version(idx: Dict[str, str], version: str) -> Dict[str, str]:
-    return {
-        k: v
-        for (k, v) in idx.items()
-        if k.endswith(".whl") and k.split("-")[1] == version
-    }
+    if dry_run:
+        print(f"Dry Run - not uploading index.html to s3://pytorch/{index_key}")
+        return
+
+    print(f"Uploading index.html to s3://pytorch/{index_key}")
+    BUCKET.Object(key=index_key).put(
+        ACL="public-read", ContentType="text/html", Body=modified_html.encode("utf-8")
+    )
 
 
-def upload_missing_whls(
-    pkg_name: str = "numpy",
-    prefix: str = "whl/test",
+def upload_package_using_simple_index(
+    pkg_name: str,
+    prefix: str,
     *,
     dry_run: bool = False,
-    only_pypi: bool = False,
-    target_version: str = "latest",
 ) -> None:
-    pypi_idx = parse_simple_idx(f"https://pypi.org/simple/{pkg_name}")
-    pypi_versions = get_whl_versions(pypi_idx)
-
-    # Determine which version to use
-    if target_version == "latest" or not target_version:
-        selected_version = pypi_versions[-1] if pypi_versions else None
-    elif target_version in pypi_versions:
-        selected_version = target_version
-    else:
-        print(
-            f"Warning: Version {target_version} not found for {pkg_name}, using latest"
-        )
-        selected_version = pypi_versions[-1] if pypi_versions else None
+    """
+    Upload package index.html from PyPI Simple Index.
+    Simply copies the index.html with absolute links - no wheel uploads or version filtering.
+    Works for both NVIDIA and non-NVIDIA packages.
+    """
+    source_url = get_package_source_url(pkg_name)
+    is_nvidia = is_nvidia_package(pkg_name)
 
-    if not selected_version:
-        print(f"No stable versions found for {pkg_name}")
+    print(
+        f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}"
+    )
+
+    # Parse the index and get raw HTML
+    try:
+        _, raw_html = parse_simple_idx(source_url)
+    except Exception as e:
+        print(f"Error fetching package {pkg_name}: {e}")
         return
 
-    pypi_latest_packages = get_wheels_of_version(pypi_idx, selected_version)
-
-    download_latest_packages: Dict[str, str] = {}
-    if not only_pypi:
-        download_idx = parse_simple_idx(
-            f"https://download.pytorch.org/{prefix}/{pkg_name}"
-        )
-        download_latest_packages = get_wheels_of_version(download_idx, selected_version)
-
-    has_updates = False
-    for pkg in pypi_latest_packages:
-        if pkg in download_latest_packages:
-            continue
-        # Skip pp packages
-        if "-pp3" in pkg:
-            continue
-        # Skip win32 packages
-        if "-win32" in pkg:
-            continue
-        # Skip muslinux packages
-        if "-musllinux" in pkg:
-            continue
-        print(f"Downloading {pkg}")
-        if dry_run:
-            has_updates = True
-            print(f"Dry Run - not Uploading {pkg} to s3://pytorch/{prefix}/")
-            continue
-        data = download(pypi_idx[pkg])
-        print(f"Uploading {pkg} to s3://pytorch/{prefix}/")
-        BUCKET.Object(key=f"{prefix}/{pkg}").put(
-            ACL="public-read", ContentType="binary/octet-stream", Body=data
-        )
-        has_updates = True
-    if not has_updates:
-        print(f"{pkg_name} is already at version {selected_version} for {prefix}")
+    # Upload modified index.html with absolute links
+    upload_index_html(pkg_name, prefix, raw_html, source_url, dry_run=dry_run)
+
+    print(f"Successfully processed index.html for {pkg_name}")
 
 
 def main() -> None:
     from argparse import ArgumentParser
 
-    parser = ArgumentParser("Upload dependent packages to s3://pytorch")
+    parser = ArgumentParser("Upload dependent package indexes to s3://pytorch")
     # Get unique paths from the packages list
     project_paths = list(
         {
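To make the rewrite concrete, here is what replace_relative_links_with_absolute does to a relative href (a minimal sketch assuming the function above is importable; the package and file names are made up):

# Hypothetical index snippet containing one relative link.
html = '<a href="example_pkg-1.0-py3-none-any.whl">example_pkg-1.0-py3-none-any.whl</a>'
base = "https://pypi.nvidia.com/example-pkg/"

print(replace_relative_links_with_absolute(html, base))
# The href becomes "https://pypi.nvidia.com/example-pkg/example_pkg-1.0-py3-none-any.whl";
# the link text is left untouched.

One subtlety: url.lstrip("./") strips any run of leading "." and "/" characters rather than the literal "./" prefix, which is harmless for typical index links but would also consume a leading "../".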
@@ -682,7 +715,6 @@ def main() -> None:
     project_paths += ["all"]
     parser.add_argument("--package", choices=project_paths, default="torch")
     parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--only-pypi", action="store_true")
     parser.add_argument("--include-stable", action="store_true")
     args = parser.parse_args()
 
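After this change the CLI surface is just --package, --dry-run, and --include-stable. A full dry run over every dependency index should look something like python s3_management/update_dependencies.py --package all --dry-run, since "all" is appended to the package choices above.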
@@ -707,12 +739,8 @@ def main() -> None:
         else:
             full_path = f"{prefix}"
 
-        upload_missing_whls(
-            pkg_name,
-            full_path,
-            dry_run=args.dry_run,
-            only_pypi=args.only_pypi,
-            target_version=pkg_config["version"],
+        upload_package_using_simple_index(
+            pkg_name, full_path, dry_run=args.dry_run
         )
 
 
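One way to exercise the "Test" steps of the deployment strategy is to fetch a re-hosted index and confirm that no relative hrefs survived the rewrite. A minimal check (the URL is hypothetical; the absoluteness test mirrors replace_href above):

import re
from urllib.request import urlopen

# Hypothetical location of an index re-hosted by this script.
url = "https://download.pytorch.org/whl/nightly/nvidia-cuda-nvrtc-cu12/index.html"
with urlopen(url) as conn:
    html = conn.read().decode("utf-8", errors="ignore")

relative = [
    href
    for href in re.findall(r'href="([^"]+)"', html)
    if not href.startswith(("http://", "https://", "//"))
]
assert not relative, f"relative links remain: {relative}"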
0 commit comments