diff --git a/CHANGELOG.md b/CHANGELOG.md index fd5d455147..ae9813d51c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,10 @@ Unreleased changes template. * Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported version, per our Bazel support matrix. Earlier versions are not tested by CI, so functionality cannot be guaranteed. +* ({bzl:obj}`pip.parse`) From now we will make fewer calls to indexes when + fetching the metadata from SimpleAPI. The calls will be done in parallel to + each index separately, so the extension evaluation time might slow down if + not using {bzl:obj}`pip.parse.experimental_index_url_overrides`. * ({bzl:obj}`pip.parse`) Only query SimpleAPI for packages that have sha values in the `requirements.txt` file. diff --git a/python/private/pypi/extension.bzl b/python/private/pypi/extension.bzl index d16a7cce2f..6409bccdd6 100644 --- a/python/private/pypi/extension.bzl +++ b/python/private/pypi/extension.bzl @@ -657,6 +657,11 @@ The indexes must support Simple API as described here: https://packaging.python.org/en/latest/specifications/simple-repository-api/ This is equivalent to `--extra-index-urls` `pip` option. + +:::{versionchanged} 1.1.0 +Starting with this version we will iterate over each index specified until +we find metadata for all references distributions. +::: """, default = [], ), diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index c730c20439..6401a066c2 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -20,9 +20,17 @@ load("@bazel_features//:features.bzl", "bazel_features") load("//python/private:auth.bzl", "get_auth") load("//python/private:envsubst.bzl", "envsubst") load("//python/private:normalize_name.bzl", "normalize_name") +load("//python/private:text_util.bzl", "render") load(":parse_simpleapi_html.bzl", "parse_simpleapi_html") -def simpleapi_download(ctx, *, attr, cache, parallel_download = True): +def simpleapi_download( + ctx, + *, + attr, + cache, + parallel_download = True, + read_simpleapi = None, + _fail = fail): """Download Simple API HTML. Args: @@ -49,6 +57,9 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True): reflected when re-evaluating the extension unless we do `bazel clean --expunge`. parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads. + read_simpleapi: a function for reading and parsing of the SimpleAPI contents. + Used in tests. + _fail: a function to print a failure. Used in tests. Returns: dict of pkg name to the parsed HTML contents - a list of structs. @@ -64,15 +75,22 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True): # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes # to replicate how `pip` would handle this case. - async_downloads = {} contents = {} index_urls = [attr.index_url] + attr.extra_index_urls - for pkg in attr.sources: - pkg_normalized = normalize_name(pkg) - - success = False - for index_url in index_urls: - result = _read_simpleapi( + read_simpleapi = read_simpleapi or _read_simpleapi + + found_on_index = {} + warn_overrides = False + for i, index_url in enumerate(index_urls): + if i != 0: + # Warn the user about a potential fix for the overrides + warn_overrides = True + + async_downloads = {} + sources = [pkg for pkg in attr.sources if pkg not in found_on_index] + for pkg in sources: + pkg_normalized = normalize_name(pkg) + result = read_simpleapi( ctx = ctx, url = "{}/{}/".format( index_url_overrides.get(pkg_normalized, index_url).rstrip("/"), @@ -84,42 +102,45 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True): ) if hasattr(result, "wait"): # We will process it in a separate loop: - async_downloads.setdefault(pkg_normalized, []).append( - struct( - pkg_normalized = pkg_normalized, - wait = result.wait, - ), + async_downloads[pkg] = struct( + pkg_normalized = pkg_normalized, + wait = result.wait, ) - continue - - if result.success: + elif result.success: contents[pkg_normalized] = result.output - success = True - break - - if not async_downloads and not success: - fail("Failed to download metadata from urls: {}".format( - ", ".join(index_urls), - )) - - if not async_downloads: - return contents - - # If we use `block` == False, then we need to have a second loop that is - # collecting all of the results as they were being downloaded in parallel. - for pkg, downloads in async_downloads.items(): - success = False - for download in downloads: + found_on_index[pkg] = index_url + + if not async_downloads: + continue + + # If we use `block` == False, then we need to have a second loop that is + # collecting all of the results as they were being downloaded in parallel. + for pkg, download in async_downloads.items(): result = download.wait() - if result.success and download.pkg_normalized not in contents: + if result.success: contents[download.pkg_normalized] = result.output - success = True - - if not success: - fail("Failed to download metadata from urls: {}".format( - ", ".join(index_urls), - )) + found_on_index[pkg] = index_url + + failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index] + if failed_sources: + _fail("Failed to download metadata for {} for from urls: {}".format( + failed_sources, + index_urls, + )) + return None + + if warn_overrides: + index_url_overrides = { + pkg: found_on_index[pkg] + for pkg in attr.sources + if found_on_index[pkg] != attr.index_url + } + + # buildifier: disable=print + print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format( + render.dict(index_url_overrides), + )) return contents diff --git a/tests/pypi/simpleapi_download/BUILD.bazel b/tests/pypi/simpleapi_download/BUILD.bazel new file mode 100644 index 0000000000..04747b6246 --- /dev/null +++ b/tests/pypi/simpleapi_download/BUILD.bazel @@ -0,0 +1,5 @@ +load("simpleapi_download_tests.bzl", "simpleapi_download_test_suite") + +simpleapi_download_test_suite( + name = "simpleapi_download_tests", +) diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl new file mode 100644 index 0000000000..9b2967b0da --- /dev/null +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -0,0 +1,128 @@ +# Copyright 2024 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"" + +load("@rules_testing//lib:test_suite.bzl", "test_suite") +load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download") # buildifier: disable=bzl-visibility + +_tests = [] + +def _test_simple(env): + calls = [] + + def read_simpleapi(ctx, url, attr, cache, block): + _ = ctx # buildifier: disable=unused-variable + _ = attr + _ = cache + env.expect.that_bool(block).equals(False) + calls.append(url) + if "foo" in url and "main" in url: + return struct( + output = "", + success = False, + ) + else: + return struct( + output = "data from {}".format(url), + success = True, + ) + + contents = simpleapi_download( + ctx = struct( + os = struct(environ = {}), + ), + attr = struct( + index_url_overrides = {}, + index_url = "main", + extra_index_urls = ["extra"], + sources = ["foo", "bar", "baz"], + envsubst = [], + ), + cache = {}, + parallel_download = True, + read_simpleapi = read_simpleapi, + ) + + env.expect.that_collection(calls).contains_exactly([ + "extra/foo/", + "main/bar/", + "main/baz/", + "main/foo/", + ]) + env.expect.that_dict(contents).contains_exactly({ + "bar": "data from main/bar/", + "baz": "data from main/baz/", + "foo": "data from extra/foo/", + }) + +_tests.append(_test_simple) + +def _test_fail(env): + calls = [] + fails = [] + + def read_simpleapi(ctx, url, attr, cache, block): + _ = ctx # buildifier: disable=unused-variable + _ = attr + _ = cache + env.expect.that_bool(block).equals(False) + calls.append(url) + if "foo" in url: + return struct( + output = "", + success = False, + ) + else: + return struct( + output = "data from {}".format(url), + success = True, + ) + + simpleapi_download( + ctx = struct( + os = struct(environ = {}), + ), + attr = struct( + index_url_overrides = {}, + index_url = "main", + extra_index_urls = ["extra"], + sources = ["foo", "bar", "baz"], + envsubst = [], + ), + cache = {}, + parallel_download = True, + read_simpleapi = read_simpleapi, + _fail = fails.append, + ) + + env.expect.that_collection(fails).contains_exactly([ + """Failed to download metadata for ["foo"] for from urls: ["main", "extra"]""", + ]) + env.expect.that_collection(calls).contains_exactly([ + "extra/foo/", + "main/bar/", + "main/baz/", + "main/foo/", + ]) + +_tests.append(_test_fail) + +def simpleapi_download_test_suite(name): + """Create the test suite. + + Args: + name: the name of the test suite + """ + test_suite(name = name, basic_tests = _tests)