Skip to content

Commit 5aaa62e

Browse files
committed
add prerelease, postrelease, devrelease
1 parent ff20930 commit 5aaa62e

File tree

8 files changed

+94
-41
lines changed

8 files changed

+94
-41
lines changed

.github/workflows/daily_collection.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ jobs:
2727
timeout-minutes: 25
2828
steps:
2929
- uses: actions/checkout@v4
30+
with:
31+
repository: sdv-dev/PyMetrics
32+
token: ${{ secrets.GH_TOKEN }}
3033
- name: Install uv
3134
uses: astral-sh/setup-uv@v6
3235
with:
@@ -56,6 +59,13 @@ jobs:
5659
env:
5760
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
5861
ANACONDA_OUTPUT_FOLDER: ${{ secrets.ANACONDA_OUTPUT_FOLDER }}
62+
- name: Collect GitHub Downloads
63+
run: |
64+
uv run pymetrics collect-github \
65+
--output-folder ${{ secrets.GH_OUTPUT_FOLDER }}
66+
env:
67+
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
68+
GH_OUTPUT_FOLDER: ${{ secrets.GH_OUTPUT_FOLDER }}
5969
alert:
6070
needs: [collect]
6171
runs-on: ubuntu-latest
@@ -77,4 +87,4 @@ jobs:
7787
-c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
7888
-m 'Daily Collection PyMetrics failed :fire: :dumpster-fire: :fire:'
7989
env:
80-
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
90+
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

.github/workflows/daily_summarization.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Daily Summarization
1+
name: Daily Summarize
22

33
on:
44
workflow_dispatch:
@@ -17,6 +17,9 @@ jobs:
1717
timeout-minutes: 10
1818
steps:
1919
- uses: actions/checkout@v4
20+
with:
21+
repository: sdv-dev/PyMetrics
22+
token: ${{ secrets.GH_TOKEN }}
2023
- name: Install uv
2124
uses: astral-sh/setup-uv@v6
2225
with:
@@ -71,4 +74,4 @@ jobs:
7174
-c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
7275
-m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:'
7376
env:
74-
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
77+
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ The aggregation metrics spreadsheets contain the following tabs:
142142
* **By Month and Python Version:** Absolute number of downloads per month and Python version.
143143
* **By Month and Country Code:** Absolute number of downloads per month and country.
144144
* **By Month and Installer Name:** Absolute number of downloads per month and Installer.
145+
* **By Prerelease**: Absolute and relative number of downloads for pre-release versions (alpha, beta, release candidate, and development versions).
146+
* **By Postrelease**: Absolute and relative number of downloads for post-release versions.
147+
* **By Devrelease**: Absolute and relative number of downloads for development release versions.
145148

146149
## Known Issues
147150
1. The conda package download data for Anaconda does not match the download count shown on the website. This is due to missing download data in the conda package download data. See this: https://github.com/anaconda/anaconda-package-data/issues/45

pymetrics/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def _get_parser():
267267
'-c',
268268
'--config-file',
269269
type=str,
270-
default='config.yaml',
270+
default='github_config.yaml',
271271
help='Path to the configuration file.',
272272
)
273273
collect_github.add_argument(

pymetrics/anaconda.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from tqdm import tqdm
1111

1212
from pymetrics.output import append_row, create_csv, get_path, load_csv
13-
from pymetrics.time_utils import drop_duplicates_by_date
13+
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
1414

1515
LOGGER = logging.getLogger(__name__)
1616
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -89,7 +89,7 @@ def _get_downloads_from_anaconda_org(packages, channel='conda-forge'):
8989

9090
for pkg_name in packages:
9191
URL = f'https://api.anaconda.org/package/{channel}/{pkg_name}'
92-
timestamp = datetime.now(ZoneInfo('UTC'))
92+
timestamp = get_current_utc()
9393
response = requests.get(URL)
9494
row_info = {'pkg_name': [pkg_name], TIME_COLUMN: [timestamp], 'total_ndownloads': 0}
9595
data = response.json()
@@ -158,6 +158,8 @@ def collect_anaconda_downloads(
158158
`start_date` has not been provided. Defaults to 90 days.
159159
dry_run (bool):
160160
If `True`, do not upload the results. Defaults to `False`.
161+
verbose (bool):
162+
If `True`, log the tail of the Anaconda data dataframes. Defaults to `False`.
161163
"""
162164
overall_df, version_downloads = _collect_ananconda_downloads_from_website(
163165
projects, output_folder=output_folder

pymetrics/gh_downloads.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33
import logging
44
import os
55
from collections import defaultdict
6-
from datetime import datetime
7-
from zoneinfo import ZoneInfo
86

97
import pandas as pd
108
from tqdm import tqdm
119

1210
from pymetrics.github import GithubClient
1311
from pymetrics.output import append_row, create_csv, get_path, load_csv
14-
from pymetrics.time_utils import drop_duplicates_by_date
12+
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
1513

1614
LOGGER = logging.getLogger(__name__)
1715
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -21,6 +19,7 @@
2119

2220

2321
def get_previous_github_downloads(output_folder, dry_run=False):
22+
"""Get previous GitHub Downloads."""
2423
csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
2524
read_csv_kwargs = {
2625
'parse_dates': [
@@ -42,24 +41,29 @@ def get_previous_github_downloads(output_folder, dry_run=False):
4241
def collect_github_downloads(
4342
projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
4443
):
44+
"""Pull data about the downloads of a GitHub project.
45+
46+
Args:
47+
projects (dict[str, list[str]]):
48+
List of projects to analyze. Each key is the name of the ecosystem, and
49+
each value is a list of github repositories (including organization).
50+
output_folder (str):
51+
Folder in which project downloads will be stored.
52+
It can be passed as a local folder or as a Google Drive path in the format
53+
`gdrive://{folder_id}`.
54+
The folder must contain 'github_download_counts.csv'.
55+
dry_run (bool):
56+
If `True`, do not upload the results. Defaults to `False`.
57+
verbose (bool):
58+
If `True`, log the tail of the GitHub download dataframe. Defaults to `False`.
59+
"""
4560
overall_df = get_previous_github_downloads(output_folder=output_folder)
46-
# overall_df = pd.DataFrame(
47-
# columns=[
48-
# TIME_COLUMN,
49-
# 'created_at',
50-
# 'ecosystem_name',
51-
# 'org_repo',
52-
# 'tag_name',
53-
# 'prerelease',
54-
# 'download_count',
55-
# ]
56-
# )
5761

5862
gh_client = GithubClient()
5963
download_counts = defaultdict(int)
6064

61-
for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'):
62-
for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'):
65+
for ecosystem_name, repositories in projects.items():
66+
for org_repo in tqdm(repositories, position=1, desc=f'Ecosystem: {ecosystem_name}'):
6367
pages_remain = True
6468
page = 1
6569
per_page = 100
@@ -85,18 +89,19 @@ def collect_github_downloads(
8589

8690
# Get download count
8791
for release_info in tqdm(
88-
release_data, position=0, desc=f'For {repo} releases, page: {page}'
92+
release_data, position=0, desc=f'{repo} releases, page={page}'
8993
):
9094
release_id = release_info.get('id')
9195
tag_name = release_info.get('tag_name')
9296
prerelease = release_info.get('prerelease')
9397
created_at = release_info.get('created_at')
9498
endpoint = f'releases/{release_id}'
95-
timestamp = datetime.now(ZoneInfo('UTC'))
9699

100+
timestamp = get_current_utc()
97101
response = gh_client.get(github_org, repo, endpoint=endpoint)
98102
data = response.json()
99103
assets = data.get('assets')
104+
100105
tag_row = {
101106
'ecosystem_name': [ecosystem_name],
102107
'org_repo': [org_repo],
@@ -122,6 +127,10 @@ def collect_github_downloads(
122127
time_column=TIME_COLUMN,
123128
group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
124129
)
130+
if verbose:
131+
LOGGER.info(f'{GITHUB_DOWNLOAD_COUNT_FILENAME} tail')
132+
LOGGER.info(overall_df.tail(5).to_string())
133+
125134
overall_df.to_csv('github_download_counts.csv', index=False)
126135

127136
if not dry_run:

pymetrics/metrics.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ def _get_sheet_name(column):
8080
'OS_type',
8181
'cpu',
8282
'ci',
83+
'is_prerelease',
84+
'is_postrelease',
85+
'is_devrelease',
8386
]
8487
SORT_BY_DOWNLOADS = [
8588
'country_code',
@@ -106,6 +109,30 @@ def _get_sheet_name(column):
106109
]
107110

108111

112+
def _safe_version_parse(version_str):
113+
if pd.isna(version_str):
114+
return np.nan
115+
116+
try:
117+
version = Version(str(version_str))
118+
except InvalidVersion:
119+
cleaned = str(version_str).rstrip('+~')
120+
try:
121+
version = Version(cleaned)
122+
except (InvalidVersion, TypeError):
123+
LOGGER.info(f'Unable to parse version: {version_str}')
124+
version = np.nan
125+
126+
return version
127+
128+
129+
def _extract_version_attribute(version_str, attribute):
130+
version_obj = _safe_version_parse(version_str)
131+
if isinstance(version_obj, Version):
132+
return getattr(version_obj, attribute)
133+
return np.nan
134+
135+
109136
def _mangle_columns(downloads):
110137
downloads = downloads.rename(columns=RENAME_COLUMNS)
111138
for col in [
@@ -124,24 +151,17 @@ def _mangle_columns(downloads):
124151
downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
125152
downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
126153

127-
return downloads
128-
129-
130-
def _safe_version_parse(version_str):
131-
if pd.isna(version_str):
132-
return np.nan
133-
134-
try:
135-
version = Version(str(version_str))
136-
except InvalidVersion:
137-
cleaned = str(version_str).rstrip('+~')
138-
try:
139-
version = Version(cleaned)
140-
except (InvalidVersion, TypeError):
141-
LOGGER.info(f'Unable to parse version: {version_str}')
142-
version = np.nan
154+
downloads['is_prerelease'] = downloads['version'].apply(
155+
_extract_version_attribute, args=('is_prerelease',)
156+
)
157+
downloads['is_postrelease'] = downloads['version'].apply(
158+
_extract_version_attribute, args=('is_postrelease',)
159+
)
160+
downloads['is_devrelease'] = downloads['version'].apply(
161+
_extract_version_attribute, args=('is_devrelease',)
162+
)
143163

144-
return version
164+
return downloads
145165

146166

147167
def _version_order_key(version_column):

pymetrics/time_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Time utility functions."""
22

33
from datetime import datetime
4+
from zoneinfo import ZoneInfo
45

56
import pandas as pd
67
from pandas.api.types import is_datetime64_any_dtype
@@ -11,6 +12,11 @@ def get_current_year(tz=None):
1112
return datetime.now(tz=tz).year
1213

1314

15+
def get_current_utc():
16+
"""Get the current datetime UTC."""
17+
return datetime.now(ZoneInfo('UTC'))
18+
19+
1420
def get_first_datetime_in_year(year, tzinfo=None):
1521
"""Get the first possible datetime value in a given year."""
1622
min_date = datetime(year, day=1, month=1).date()

0 commit comments

Comments
 (0)