Skip to content

Commit dc21a34

Browse files
authored
feat: compute top dependent corpus into cache (#17797)
* feat: compute top dependent corpus into cache In preparation of being able to use a longer corpus, compute the top 10k dependents and place them in the query results cache. The results are not yet used, so this is a good way of exercising the task logic without impeding any other processing. Signed-off-by: Mike Fiedler <[email protected]> * fix: account for periods in names and normalize Also add a more complex specifier from the documentation. Signed-off-by: Mike Fiedler <[email protected]> * refactor: use inclusion vs exclusion We don't care about `obsoletes`, `provides`, or legacy keys, only the ones that are requirement dependencies on other projects. Signed-off-by: Mike Fiedler <[email protected]> * lint Signed-off-by: Mike Fiedler <[email protected]> --------- Signed-off-by: Mike Fiedler <[email protected]>
1 parent 8fbbcd4 commit dc21a34

File tree

3 files changed

+156
-2
lines changed

3 files changed

+156
-2
lines changed

tests/unit/packaging/test_tasks.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@
2323
import warehouse.packaging.tasks
2424

2525
from warehouse.accounts.models import WebAuthn
26-
from warehouse.packaging.models import Description
26+
from warehouse.packaging.models import DependencyKind, Description
2727
from warehouse.packaging.tasks import (
2828
check_file_cache_tasks_outstanding,
2929
compute_2fa_metrics,
3030
compute_packaging_metrics,
31+
compute_top_dependents_corpus,
3132
sync_file_to_cache,
3233
update_bigquery_release_files,
3334
update_description_html,
@@ -37,6 +38,7 @@
3738
from warehouse.utils.row_counter import compute_row_counts
3839

3940
from ...common.db.packaging import (
41+
DependencyFactory,
4042
DescriptionFactory,
4143
FileFactory,
4244
ProjectFactory,
@@ -707,3 +709,55 @@ def test_compute_2fa_metrics(db_request, monkeypatch):
707709
pretend.call("warehouse.2fa.total_users_with_webauthn_enabled", 1),
708710
pretend.call("warehouse.2fa.total_users_with_two_factor_enabled", 2),
709711
]
712+
713+
714+
@pytest.mark.parametrize(
    ("project_name", "specifier_string"),
    [
        (
            "requests",
            'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"',
        ),
        ("xml.parsers.expat", "xml.parsers.expat (>1.0)"),
        ("zope.event", "zope.event (==4.5.0)"),
    ],
)
def test_compute_top_dependents_corpus(db_request, project_name, specifier_string):
    """
    The corpus counts, per normalized project name, how many projects depend
    on it from their selected (non-yanked) release.

    `specifier_string` is the raw specifier stored on each dependency that
    points at the base project, so that extras, version clauses, markers and
    dotted names all pass through the name extraction done by the task.
    """
    # A base project, others depend on it
    base_proj = ProjectFactory.create(name=project_name)
    # A project with no recent dependents
    leaf_proj = ProjectFactory.create()

    # A Project with multiple Releases, the most recent of which is yanked
    project_a = ProjectFactory.create()
    release_a1 = ReleaseFactory.create(project=project_a, version="1.0")
    release_a2 = ReleaseFactory.create(project=project_a, version="2.0", yanked=True)
    # Add dependency relationships, using the full specifier rather than the
    # bare project name so the parsing logic is actually exercised
    DependencyFactory.create(
        release=release_a1,
        kind=DependencyKind.requires_dist,
        specifier=specifier_string,
    )
    DependencyFactory.create(
        release=release_a2,
        kind=DependencyKind.requires_dist,
        specifier=specifier_string,
    )

    # A project with an older release depending on leaf_proj, now base_proj instead
    project_b = ProjectFactory.create()
    release_b1 = ReleaseFactory.create(project=project_b, version="1.0")
    release_b2 = ReleaseFactory.create(project=project_b, version="2.0")
    DependencyFactory.create(
        release=release_b1, kind=DependencyKind.requires_dist, specifier=leaf_proj.name
    )
    DependencyFactory.create(
        release=release_b2,
        kind=DependencyKind.requires_dist,
        specifier=specifier_string,
    )

    # legacy `project_url` kind, should not be included in corpus
    legacy_proj = ProjectFactory.create()
    legacy_release = ReleaseFactory.create(project=legacy_proj, version="1.0")
    DependencyFactory.create(
        release=legacy_release, kind=8, specifier="https://example.com"
    )

    results = compute_top_dependents_corpus(db_request)

    # project_a (via 1.0, since 2.0 is yanked) and project_b (via 2.0)
    assert results == {base_proj.normalized_name: 2}

warehouse/packaging/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
check_file_cache_tasks_outstanding,
3131
compute_2fa_metrics,
3232
compute_packaging_metrics,
33+
compute_top_dependents_corpus,
3334
update_description_html,
3435
)
3536
from warehouse.rate_limiting import IRateLimiter, RateLimit
@@ -196,3 +197,6 @@ def includeme(config):
196197

197198
# Add a periodic task to generate general metrics
198199
config.add_periodic_task(crontab(minute="*/5"), compute_packaging_metrics)
200+
201+
# Add a periodic task to compute dependents corpus once a day
202+
config.add_periodic_task(crontab(minute=0, hour=5), compute_top_dependents_corpus)

warehouse/packaging/tasks.py

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,38 @@
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
1212

13+
from __future__ import annotations
14+
1315
import datetime
1416
import logging
1517
import tempfile
18+
import typing
1619

1720
from collections import namedtuple
1821

1922
from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
23+
from sqlalchemy import desc, func, nulls_last, select
2024
from sqlalchemy.orm import joinedload
2125

2226
from warehouse import tasks
2327
from warehouse.accounts.models import User, WebAuthn
28+
from warehouse.cache.interfaces import IQueryResultsCache
2429
from warehouse.metrics import IMetricsService
2530
from warehouse.packaging.interfaces import IFileStorage
26-
from warehouse.packaging.models import Description, File, Project, Release
31+
from warehouse.packaging.models import (
32+
Dependency,
33+
DependencyKind,
34+
Description,
35+
File,
36+
Project,
37+
Release,
38+
)
2739
from warehouse.utils import readme
2840
from warehouse.utils.row_counter import RowCount
2941

42+
if typing.TYPE_CHECKING:
43+
from pyramid.request import Request
44+
3045
logger = logging.getLogger(__name__)
3146

3247

@@ -342,3 +357,84 @@ def update_bigquery_release_files(task, request, dist_metadata):
342357
json_rows = [json_rows]
343358

344359
bq.insert_rows_json(table=table_name, json_rows=json_rows)
360+
361+
362+
@tasks.task(ignore_result=True, acks_late=True)
def compute_top_dependents_corpus(request: Request) -> dict[str, int]:
    """
    Query to collect all dependents from projects' most recent release
    and rank them by the number of dependents.
    Store in query results cache for retrieval during `file_upload`.

    Each depending project is counted at most once per dependency name, even
    when its selected release lists that dependency several times (for
    example once per environment marker).

    Returns a dict mapping the normalized dependency name to the number of
    projects depending on it, limited to the 10000 highest counts. The same
    dict is stored in the query results cache under `top_dependents_corpus`.
    """
    # Create a CTE with the most recent releases for each project.
    # Selects each release's ID, project ID, and version, with a row number
    # partitioned by project and ordered to get the most recent non-yanked releases.
    recent_releases_cte = (
        select(
            Release.id.label("release_id"),
            Release.project_id,
            Release.version,
            func.row_number()
            .over(
                partition_by=Release.project_id,
                order_by=[
                    nulls_last(
                        Release.is_prerelease
                    ),  # False first, True next, nulls last
                    desc(Release._pypi_ordering),
                ],
            )
            .label("rn"),
        )
        .where(Release.yanked.is_(False))
        .cte("recent_releases")
    )
    # Create a CTE that parses dependency names from release_dependencies.
    #
    # Extracts normalized dependency names by:
    # 1. Taking the specifier from release_dependencies
    # 2. Using regex to extract just the package name portion
    # 3. Passing the name through `normalize_pep426_name`
    #
    # The project ID is selected alongside the name and the rows are made
    # DISTINCT, so a project that declares the same dependency more than once
    # contributes a single row to the count below.
    parsed_dependencies_cte = (
        select(
            recent_releases_cte.c.project_id,
            func.normalize_pep426_name(
                # TODO: this isn't perfect, but it's a start.
                #  A better solution would be to use a proper parser, but we'd need
                #  to teach Postgres how to parse it.
                func.regexp_replace(Dependency.specifier, "^([A-Za-z0-9_.-]+).*", "\\1")
            ).label("dependent_name"),
        )
        .select_from(recent_releases_cte)
        .join(Dependency, Dependency.release_id == recent_releases_cte.c.release_id)
        .where(
            recent_releases_cte.c.rn == 1,  # "latest" release per-project
            Dependency.kind.in_(
                [DependencyKind.requires_dist, DependencyKind.requires]
            ),
        )
        .distinct()
        .cte("parsed_dependencies")
    )

    # Final query that gets the top dependents by count.
    # Ties are broken by name so the 10000-row cut-off is deterministic.
    top_dependents_stmt = (
        select(
            parsed_dependencies_cte.c.dependent_name,
            func.count().label("dependent_count"),
        )
        .group_by(parsed_dependencies_cte.c.dependent_name)
        .order_by(desc("dependent_count"), parsed_dependencies_cte.c.dependent_name)
        .limit(10000)
    )

    # Execute the query and fetch the constructed object
    rows = request.db.execute(top_dependents_stmt).fetchall()
    # Result is a list of Rows, so convert to a dict of "name: count" pairs
    results = {row.dependent_name: row.dependent_count for row in rows}

    # Store the results in the query results cache
    cache = request.find_service(IQueryResultsCache)
    cache_key = "top_dependents_corpus"
    cache.set(cache_key, results)
    logger.info("Stored `top_dependents_corpus` in query results cache.")

    return results

0 commit comments

Comments
 (0)