feat: compute top dependent corpus into cache (#17797)

miketheman · web-flow · commit dc21a3404f19 · 2025-03-18T13:51:33.000Z
* feat: compute top dependent corpus into cache

In preparation for using a larger corpus, compute the top 10k
dependents and place them in the query results cache.

The results are not yet used, so this is a good way of exercising the
task logic without impeding any other processing.

Signed-off-by: Mike Fiedler &lt;miketheman@gmail.com&gt;

* fix: account for periods in names and normalize

Also add a more complex specifier from the documentation.

Signed-off-by: Mike Fiedler &lt;miketheman@gmail.com&gt;

* refactor: use inclusion vs exclusion

We don't care about `obsoletes`, `provides`, or legacy keys, only the
ones that are requirement dependencies on other projects.

Signed-off-by: Mike Fiedler &lt;miketheman@gmail.com&gt;

* lint

Signed-off-by: Mike Fiedler &lt;miketheman@gmail.com&gt;

---------

Signed-off-by: Mike Fiedler &lt;miketheman@gmail.com&gt;
diff --git a/tests/unit/packaging/test_tasks.py b/tests/unit/packaging/test_tasks.py
@@ -23,11 +23,12 @@
 import warehouse.packaging.tasks
 
 from warehouse.accounts.models import WebAuthn
-from warehouse.packaging.models import Description
+from warehouse.packaging.models import DependencyKind, Description
 from warehouse.packaging.tasks import (
     check_file_cache_tasks_outstanding,
     compute_2fa_metrics,
     compute_packaging_metrics,
+    compute_top_dependents_corpus,
     sync_file_to_cache,
     update_bigquery_release_files,
     update_description_html,
@@ -37,6 +38,7 @@
 from warehouse.utils.row_counter import compute_row_counts
 
 from ...common.db.packaging import (
+    DependencyFactory,
     DescriptionFactory,
     FileFactory,
     ProjectFactory,
@@ -707,3 +709,55 @@ def test_compute_2fa_metrics(db_request, monkeypatch):
         pretend.call("warehouse.2fa.total_users_with_webauthn_enabled", 1),
         pretend.call("warehouse.2fa.total_users_with_two_factor_enabled", 2),
     ]
+
+
+@pytest.mark.parametrize(
+    ("project_name", "specifier_string"),
+    [
+        (
+            "requests",
+            'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
+        ),
+        ("xml.parsers.expat", "xml.parsers.expat (>1.0)"),
+        ("zope.event", "zope.event (==4.5.0)"),
+    ],
+)
+def test_compute_top_dependents_corpus(db_request, project_name, specifier_string):
+    # A base project, others depend on it via the parametrized specifier
+    base_proj = ProjectFactory.create(name=project_name)
+    # A project with no recent dependents
+    leaf_proj = ProjectFactory.create()
+
+    # A Project with multiple Releases, the most recent of which is yanked
+    project_a = ProjectFactory.create()
+    rel_a1 = ReleaseFactory.create(project=project_a, version="1.0")
+    rel_a2 = ReleaseFactory.create(project=project_a, version="2.0", yanked=True)
+    # Use the full specifier so the task's name extraction is actually exercised
+    DependencyFactory.create(
+        release=rel_a1, kind=DependencyKind.requires_dist, specifier=specifier_string
+    )
+    DependencyFactory.create(
+        release=rel_a2, kind=DependencyKind.requires_dist, specifier=specifier_string
+    )
+
+    # A project with an older release depending on leaf_proj, now base_proj instead
+    project_b = ProjectFactory.create()
+    rel_b1 = ReleaseFactory.create(project=project_b, version="1.0")
+    rel_b2 = ReleaseFactory.create(project=project_b, version="2.0")
+    DependencyFactory.create(
+        release=rel_b1, kind=DependencyKind.requires_dist, specifier=leaf_proj.name
+    )
+    DependencyFactory.create(
+        release=rel_b2, kind=DependencyKind.requires_dist, specifier=specifier_string
+    )
+
+    # legacy `project_url` kind, should not be included in corpus
+    legacy_proj = ProjectFactory.create()
+    legacy_release = ReleaseFactory.create(project=legacy_proj, version="1.0")
+    DependencyFactory.create(
+        release=legacy_release, kind=8, specifier="https://example.com"
+    )
+
+    results = compute_top_dependents_corpus(db_request)
+
+    assert results == {base_proj.normalized_name: 2}
diff --git a/warehouse/packaging/__init__.py b/warehouse/packaging/__init__.py
@@ -30,6 +30,7 @@
     check_file_cache_tasks_outstanding,
     compute_2fa_metrics,
     compute_packaging_metrics,
+    compute_top_dependents_corpus,
     update_description_html,
 )
 from warehouse.rate_limiting import IRateLimiter, RateLimit
@@ -196,3 +197,6 @@ def includeme(config):
 
     # Add a periodic task to generate general metrics
     config.add_periodic_task(crontab(minute="*/5"), compute_packaging_metrics)
+
+    # Add a periodic task to compute dependents corpus once a day
+    config.add_periodic_task(crontab(minute=0, hour=5), compute_top_dependents_corpus)
diff --git a/warehouse/packaging/tasks.py b/warehouse/packaging/tasks.py
@@ -10,23 +10,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import datetime
 import logging
 import tempfile
+import typing
 
 from collections import namedtuple
 
 from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
+from sqlalchemy import desc, func, nulls_last, select
 from sqlalchemy.orm import joinedload
 
 from warehouse import tasks
 from warehouse.accounts.models import User, WebAuthn
+from warehouse.cache.interfaces import IQueryResultsCache
 from warehouse.metrics import IMetricsService
 from warehouse.packaging.interfaces import IFileStorage
-from warehouse.packaging.models import Description, File, Project, Release
+from warehouse.packaging.models import (
+    Dependency,
+    DependencyKind,
+    Description,
+    File,
+    Project,
+    Release,
+)
 from warehouse.utils import readme
 from warehouse.utils.row_counter import RowCount
 
+if typing.TYPE_CHECKING:
+    from pyramid.request import Request
+
 logger = logging.getLogger(__name__)
 
 
@@ -342,3 +357,84 @@ def update_bigquery_release_files(task, request, dist_metadata):
         json_rows = [json_rows]
 
         bq.insert_rows_json(table=table_name, json_rows=json_rows)
+
+
+@tasks.task(ignore_result=True, acks_late=True)
+def compute_top_dependents_corpus(request: Request) -> dict[str, int]:
+    """
+    Query to collect all dependents from projects' most recent release
+    and rank them by the number of dependents.
+    Store in query results cache for retrieval during `file_upload`.
+    """
+    # Create a CTE that ranks the non-yanked releases of each project.
+    # Selects each release's ID, project ID, and version, with a row number
+    # partitioned by project: non-prereleases first, then highest `_pypi_ordering`.
+    recent_releases_cte = (
+        select(
+            Release.id.label("release_id"),
+            Release.project_id,
+            Release.version,
+            func.row_number()
+            .over(
+                partition_by=Release.project_id,
+                order_by=[
+                    nulls_last(
+                        Release.is_prerelease
+                    ),  # False first, True next, nulls last
+                    desc(Release._pypi_ordering),
+                ],
+            )
+            .label("rn"),
+        )
+        .where(Release.yanked.is_(False))
+        .cte("recent_releases")
+    )
+    # Create a CTE that parses dependency names from `Dependency` rows.
+    #
+    # Extracts normalized dependency names by:
+    # 1. Taking `Dependency.specifier` for each top-ranked release
+    # 2. Using regex to keep only the leading run of name characters
+    # 3. Normalizing that name with the `normalize_pep426_name` SQL function
+    parsed_dependencies_cte = (
+        select(
+            func.normalize_pep426_name(
+                # TODO: this isn't perfect, but it's a start.
+                #  A better solution would be to use a proper parser, but we'd need
+                #  to teach Postgres how to parse it.
+                func.regexp_replace(Dependency.specifier, "^([A-Za-z0-9_.-]+).*", "\\1")
+            ).label("dependent_name")
+        )
+        .select_from(recent_releases_cte)
+        .join(Dependency, Dependency.release_id == recent_releases_cte.c.release_id)
+        .where(
+            recent_releases_cte.c.rn == 1,  # "latest" release per-project
+            Dependency.kind.in_(
+                [DependencyKind.requires_dist, DependencyKind.requires]
+            ),
+        )
+        .cte("parsed_dependencies")
+    )
+
+    # Rank names by how many dependency rows reference them, keeping the top 10,000
+    top_dependents_stmt = (
+        select(
+            parsed_dependencies_cte.c.dependent_name,
+            func.count().label("dependent_count"),
+        )
+        .group_by(parsed_dependencies_cte.c.dependent_name)
+        .order_by(desc("dependent_count"), parsed_dependencies_cte.c.dependent_name)
+        .limit(10000)
+    )
+
+    # Execute the query and fetch every resulting row
+    results = request.db.execute(top_dependents_stmt).fetchall()
+    # Result is a list of Rows, so convert to a dict of "name: count" pairs
+    results = {row.dependent_name: row.dependent_count for row in results}
+
+    # Store the results in the query results cache
+    cache = request.find_service(IQueryResultsCache)
+    cache_key = "top_dependents_corpus"
+    cache.set(cache_key, results)
+    logger.info("Stored `top_dependents_corpus` in query results cache.")
+
+    return results