Commit a6810e9
fix(pypi): change the parallelisation scheme for querying SimpleAPI
Instead of querying every index in parallel and yielding a lot of 404 warnings, let's query the main index first and then query the other indexes only for the packages that were not yet found. This should make the problem described in #2100 less severe. What is more, we can print a suggested `experimental_index_url_overrides` value for users to use.

Work towards #2100
1 parent b5729b4 commit a6810e9
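
In outline, the new scheme works like the following standalone sketch (plain Starlark with illustrative names; `read_index` is a stand-in for the real `_read_simpleapi` call, and the actual implementation in `simpleapi_download.bzl` below additionally handles Bazel's non-blocking downloads):

def fetch_metadata(index_urls, packages, read_index):
    # `read_index(index_url, pkg)` returns metadata, or None on a 404.
    contents = {}
    found_on_index = {}
    for index_url in index_urls:
        # Only query this index for packages not found on earlier indexes,
        # so the main index (first in the list) absorbs most queries and
        # the extra indexes no longer emit a 404 warning per package.
        for pkg in [p for p in packages if p not in found_on_index]:
            result = read_index(index_url, pkg)
            if result != None:
                contents[pkg] = result
                found_on_index[pkg] = index_url

    missing = [p for p in packages if p not in found_on_index]
    if missing:
        fail("Failed to download metadata for {} from urls: {}".format(missing, index_urls))

    return contents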

File tree

3 files changed (+54, -36 lines)

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -56,6 +56,10 @@ Unreleased changes template.
 * Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported
   version, per our Bazel support matrix. Earlier versions are not
   tested by CI, so functionality cannot be guaranteed.
+* ({bzl:obj}`pip.parse`) From now on, we will make fewer calls to indexes when
+  fetching the metadata from SimpleAPI. The calls will be done in parallel to
+  each index separately, so the extension evaluation time might slow down if
+  not using {bzl:obj}`pip.parse.experimental_index_url_overrides`.

 {#v0-0-0-fixed}
 ### Fixed
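
For users hitting the slowdown mentioned in the changelog entry, here is a minimal `MODULE.bazel` sketch of pinning packages to the index that serves them; the package name and the extra index URL are hypothetical, while the attributes are existing `pip.parse` attributes:

pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
pip.parse(
    hub_name = "pypi",
    python_version = "3.11",
    requirements_lock = "//:requirements_lock.txt",
    experimental_index_url = "https://pypi.org/simple",
    experimental_extra_index_urls = ["https://download.pytorch.org/whl/cpu"],
    # Pinning a package to its index skips probing the other indexes for it,
    # avoiding both the extra SimpleAPI calls and the 404 warnings.
    experimental_index_url_overrides = {
        "torch": "https://download.pytorch.org/whl/cpu",
    },
)
use_repo(pip, "pypi")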

python/private/pypi/extension.bzl

Lines changed: 5 additions & 0 deletions

@@ -653,6 +653,11 @@ The indexes must support Simple API as described here:
 https://packaging.python.org/en/latest/specifications/simple-repository-api/

 This is equivalent to `--extra-index-urls` `pip` option.
+
+:::{versionchanged} 1.1.0
+Starting with this version we will iterate over each index specified until
+we find metadata for all referenced distributions.
+:::
 """,
             default = [],
         ),

python/private/pypi/simpleapi_download.bzl

Lines changed: 45 additions & 36 deletions

@@ -20,6 +20,7 @@ load("@bazel_features//:features.bzl", "bazel_features")
 load("//python/private:auth.bzl", "get_auth")
 load("//python/private:envsubst.bzl", "envsubst")
 load("//python/private:normalize_name.bzl", "normalize_name")
+load("//python/private:text_util.bzl", "render")
 load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

 def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
@@ -64,14 +65,20 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):

     # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
     # to replicate how `pip` would handle this case.
-    async_downloads = {}
     contents = {}
     index_urls = [attr.index_url] + attr.extra_index_urls
-    for pkg in attr.sources:
-        pkg_normalized = normalize_name(pkg)

-        success = False
-        for index_url in index_urls:
+    found_on_index = {}
+    warn_overrides = False
+    for i, index_url in enumerate(index_urls):
+        if i != 0:
+            # Warn the user about a potential fix for the overrides
+            warn_overrides = True
+
+        async_downloads = {}
+        sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+        for pkg in sources:
+            pkg_normalized = normalize_name(pkg)
             result = _read_simpleapi(
                 ctx = ctx,
                 url = "{}/{}/".format(
@@ -84,42 +91,44 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
             )
             if hasattr(result, "wait"):
                 # We will process it in a separate loop:
-                async_downloads.setdefault(pkg_normalized, []).append(
-                    struct(
-                        pkg_normalized = pkg_normalized,
-                        wait = result.wait,
-                    ),
+                async_downloads[pkg] = struct(
+                    pkg_normalized = pkg_normalized,
+                    wait = result.wait,
                 )
-                continue
-
-            if result.success:
+            else:
                 contents[pkg_normalized] = result.output
-                success = True
-                break
-
-        if not async_downloads and not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
-
-        if not async_downloads:
-            return contents
-
-        # If we use `block` == False, then we need to have a second loop that is
-        # collecting all of the results as they were being downloaded in parallel.
-        for pkg, downloads in async_downloads.items():
-            success = False
-            for download in downloads:
+                found_on_index[pkg] = index_url
+
+        if not async_downloads:
+            continue
+
+        # If we use `block` == False, then we need to have a second loop that is
+        # collecting all of the results as they were being downloaded in parallel.
+        for pkg, download in async_downloads.items():
             result = download.wait()

-            if result.success and download.pkg_normalized not in contents:
+            if result.success:
                 contents[download.pkg_normalized] = result.output
-                success = True
-
-        if not success:
-            fail("Failed to download metadata from urls: {}".format(
-                ", ".join(index_urls),
-            ))
+                found_on_index[pkg] = index_url
+
+    failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
+    if failed_sources:
+        fail("Failed to download metadata for {} from urls: {}".format(
+            failed_sources,
+            index_urls,
+        ))
+
+    if warn_overrides:
+        index_url_overrides = {
+            pkg: found_on_index[pkg]
+            for pkg in attr.sources
+            if found_on_index[pkg] != attr.index_url
+        }
+
+        # buildifier: disable=print
+        print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
+            render.dict(index_url_overrides),
+        ))

     return contents
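
The `wait()` handling above relies on Bazel's non-blocking repository downloads. A minimal sketch of that two-phase pattern (the real file guards availability via the `bazel_features` check; the URLs and output names here are illustrative):

def _download_all(ctx, urls):
    pending = {}
    for url in urls:
        # `block = False` starts the download and returns a handle instead
        # of a result; calling `wait()` on the handle later yields the usual
        # download result struct.
        pending[url] = ctx.download(
            url = [url],
            output = url.rpartition("/")[-1],
            block = False,
        )

    # Second loop: collect results only after every download has started,
    # so the downloads run in parallel rather than one at a time.
    return {url: p.wait() for url, p in pending.items()}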
