|
10 | 10 | # See the License for the specific language governing permissions and
|
11 | 11 | # limitations under the License.
|
12 | 12 |
|
| 13 | +from __future__ import annotations |
| 14 | + |
13 | 15 | import datetime
|
14 | 16 | import logging
|
15 | 17 | import tempfile
|
| 18 | +import typing |
16 | 19 |
|
17 | 20 | from collections import namedtuple
|
18 | 21 |
|
19 | 22 | from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
|
| 23 | +from sqlalchemy import desc, func, nulls_last, select |
20 | 24 | from sqlalchemy.orm import joinedload
|
21 | 25 |
|
22 | 26 | from warehouse import tasks
|
23 | 27 | from warehouse.accounts.models import User, WebAuthn
|
| 28 | +from warehouse.cache.interfaces import IQueryResultsCache |
24 | 29 | from warehouse.metrics import IMetricsService
|
25 | 30 | from warehouse.packaging.interfaces import IFileStorage
|
26 |
| -from warehouse.packaging.models import Description, File, Project, Release |
| 31 | +from warehouse.packaging.models import ( |
| 32 | + Dependency, |
| 33 | + DependencyKind, |
| 34 | + Description, |
| 35 | + File, |
| 36 | + Project, |
| 37 | + Release, |
| 38 | +) |
27 | 39 | from warehouse.utils import readme
|
28 | 40 | from warehouse.utils.row_counter import RowCount
|
29 | 41 |
|
| 42 | +if typing.TYPE_CHECKING: |
| 43 | + from pyramid.request import Request |
| 44 | + |
30 | 45 | logger = logging.getLogger(__name__)
|
31 | 46 |
|
32 | 47 |
|
@@ -342,3 +357,84 @@ def update_bigquery_release_files(task, request, dist_metadata):
|
342 | 357 | json_rows = [json_rows]
|
343 | 358 |
|
344 | 359 | bq.insert_rows_json(table=table_name, json_rows=json_rows)
|
| 360 | + |
| 361 | + |
| 362 | +@tasks.task(ignore_result=True, acks_late=True) |
| 363 | +def compute_top_dependents_corpus(request: Request) -> dict[str, int]: |
| 364 | + """ |
| 365 | + Query to collect all dependents from projects' most recent release |
| 366 | + and rank them by the number of dependents. |
| 367 | + Store in query results cache for retrieval during `file_upload`. |
| 368 | + """ |
| 369 | + # Create a CTE with the most recent releases for each project. |
| 370 | + # Selects each release's ID, project ID, and version, with a row number |
| 371 | + # partitioned by project and ordered to get the most recent non-yanked releases. |
| 372 | + recent_releases_cte = ( |
| 373 | + select( |
| 374 | + Release.id.label("release_id"), |
| 375 | + Release.project_id, |
| 376 | + Release.version, |
| 377 | + func.row_number() |
| 378 | + .over( |
| 379 | + partition_by=Release.project_id, |
| 380 | + order_by=[ |
| 381 | + nulls_last( |
| 382 | + Release.is_prerelease |
| 383 | + ), # False first, True next, nulls last |
| 384 | + desc(Release._pypi_ordering), |
| 385 | + ], |
| 386 | + ) |
| 387 | + .label("rn"), |
| 388 | + ) |
| 389 | + .where(Release.yanked.is_(False)) |
| 390 | + .cte("recent_releases") |
| 391 | + ) |
| 392 | + # Create a CTE that parses dependency names from release_dependencies. |
| 393 | + # |
| 394 | + # Extracts normalized dependency names by: |
| 395 | + # 1. Taking the specifier from release_dependencies |
| 396 | + # 2. Using regex to extract just the package name portion |
| 397 | + # 3. Converting to lowercase for normalization |
| 398 | + parsed_dependencies_cte = ( |
| 399 | + select( |
| 400 | + func.normalize_pep426_name( |
| 401 | + # TODO: this isn't perfect, but it's a start. |
| 402 | + # A better solution would be to use a proper parser, but we'd need |
| 403 | + # to teach Postgres how to parse it. |
| 404 | + func.regexp_replace(Dependency.specifier, "^([A-Za-z0-9_.-]+).*", "\\1") |
| 405 | + ).label("dependent_name") |
| 406 | + ) |
| 407 | + .select_from(recent_releases_cte) |
| 408 | + .join(Dependency, Dependency.release_id == recent_releases_cte.c.release_id) |
| 409 | + .where( |
| 410 | + recent_releases_cte.c.rn == 1, # "latest" release per-project |
| 411 | + Dependency.kind.in_( |
| 412 | + [DependencyKind.requires_dist, DependencyKind.requires] |
| 413 | + ), |
| 414 | + ) |
| 415 | + .cte("parsed_dependencies") |
| 416 | + ) |
| 417 | + |
| 418 | + # Final query that gets the top dependents by count |
| 419 | + top_dependents_stmt = ( |
| 420 | + select( |
| 421 | + parsed_dependencies_cte.c.dependent_name, |
| 422 | + func.count().label("dependent_count"), |
| 423 | + ) |
| 424 | + .group_by(parsed_dependencies_cte.c.dependent_name) |
| 425 | + .order_by(desc("dependent_count"), parsed_dependencies_cte.c.dependent_name) |
| 426 | + .limit(10000) |
| 427 | + ) |
| 428 | + |
| 429 | + # Execute the query and fetch the constructed object |
| 430 | + results = request.db.execute(top_dependents_stmt).fetchall() |
| 431 | + # Result is Rows, so convert to a dicts of "name: count" pairs |
| 432 | + results = {row.dependent_name: row.dependent_count for row in results} |
| 433 | + |
| 434 | + # Store the results in the query results cache |
| 435 | + cache = request.find_service(IQueryResultsCache) |
| 436 | + cache_key = "top_dependents_corpus" |
| 437 | + cache.set(cache_key, results) |
| 438 | + logger.info("Stored `top_dependents_corpus` in query results cache.") |
| 439 | + |
| 440 | + return results |
0 commit comments