Skip to content

Commit ff20930

Browse files
committed
wip
1 parent 538f270 commit ff20930

File tree

5 files changed

+276
-3
lines changed

5 files changed

+276
-3
lines changed

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ Currently, the download data is collected from the following distributions:
4444
- Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel
4545
- For each file returned by the API endpoint, the current number of downloads is saved. Over time, a historical download recording can be built.
4646

47-
### Future Data Sources
48-
In the future, we may expand the source distributions to include:
49-
* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases.
47+
* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub release assets.
48+
See the [GitHub API reference](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release) for details on the release endpoint.
5049

5150
# Install
5251
Install pymetrics using pip (or uv):

github_config.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
projects:
2+
sdv-dev:
3+
- sdv-dev/SDV
4+
- sdv-dev/RDT
5+
- sdv-dev/SDMetrics
6+
- sdv-dev/SDGym
7+
- sdv-dev/Copulas
8+
- sdv-dev/CTGAN
9+
- sdv-dev/DeepEcho
10+
gretel:
11+
- gretelai/gretel-python-client
12+
- gretelai/trainer
13+
- gretelai/gretel-synthetics
14+
mostly-ai:
15+
- mostly-ai/mostlyai
16+
- mostly-ai/mostlyai-mock
17+
ydata:
18+
- ydataai/ydata-synthetic
19+
- ydataai/ydata-quality
20+
- ydataai/ydata-fabric-sdk
21+
realtabformer:
22+
- worldbank/REaLTabFormer
23+
synthcity:
24+
- vanderschaarlab/synthcity
25+
smartnoise-sdk:
26+
- opendp/smartnoise-sdk
27+
be_great:
28+
- kathrinse/be_great

pymetrics/__main__.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import yaml
1111

1212
from pymetrics.anaconda import collect_anaconda_downloads
13+
from pymetrics.gh_downloads import collect_github_downloads
1314
from pymetrics.main import collect_pypi_downloads
1415
from pymetrics.summarize import summarize_downloads
1516

@@ -76,6 +77,19 @@ def _collect_anaconda(args):
7677
)
7778

7879

80+
def _collect_github(args):
    """Entry point for the ``collect-github`` CLI action.

    Loads the project configuration from ``args.config_file`` and kicks off
    the GitHub release-download collection for every configured repository.

    Args:
        args (argparse.Namespace):
            Parsed CLI arguments; must provide ``config_file``,
            ``output_folder``, ``dry_run`` and ``verbose``.
    """
    config = _load_config(args.config_file)
    collect_github_downloads(
        projects=config['projects'],
        output_folder=args.output_folder,
        dry_run=args.dry_run,
        verbose=args.verbose,
    )
91+
92+
7993
def _summarize(args):
8094
config = _load_config(args.config_file)
8195
projects = config['projects']
@@ -243,6 +257,29 @@ def _get_parser():
243257
default=90,
244258
help='Max days of data to pull. Default to last 90 days.',
245259
)
260+
261+
# collect GitHub
262+
collect_github = action.add_parser(
263+
'collect-github', help='Collect download data from GitHub.', parents=[logging_args]
264+
)
265+
collect_github.set_defaults(action=_collect_github)
266+
collect_github.add_argument(
267+
'-c',
268+
'--config-file',
269+
type=str,
270+
default='config.yaml',
271+
help='Path to the configuration file.',
272+
)
273+
collect_github.add_argument(
274+
'-o',
275+
'--output-folder',
276+
type=str,
277+
required=True,
278+
help=(
279+
'Path to the folder where data will be outputted. It can be a local path or a'
280+
' Google Drive folder path in the format gdrive://<folder-id>'
281+
),
282+
)
246283
return parser
247284

248285

pymetrics/gh_downloads.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Functions to get GitHub downloads from GitHub."""
2+
3+
import logging
4+
import os
5+
from collections import defaultdict
6+
from datetime import datetime
7+
from zoneinfo import ZoneInfo
8+
9+
import pandas as pd
10+
from tqdm import tqdm
11+
12+
from pymetrics.github import GithubClient
13+
from pymetrics.output import append_row, create_csv, get_path, load_csv
14+
from pymetrics.time_utils import drop_duplicates_by_date
15+
16+
LOGGER = logging.getLogger(__name__)
17+
dir_path = os.path.dirname(os.path.realpath(__file__))
18+
TIME_COLUMN = 'timestamp'
19+
20+
GITHUB_DOWNLOAD_COUNT_FILENAME = 'github_download_counts.csv'
21+
22+
23+
def get_previous_github_downloads(output_folder, dry_run=False):
    """Load previously recorded GitHub download counts from ``output_folder``.

    Args:
        output_folder (str):
            Folder holding the historical CSV. It can be a local path or a
            Google Drive folder path in the format ``gdrive://<folder-id>``.
        dry_run (bool):
            Currently unused; kept for signature parity with the other
            collectors. Defaults to ``False``.

    Returns:
        pandas.DataFrame:
            The historical download data, with dtypes matching the rows
            appended by ``collect_github_downloads``.
    """
    read_csv_kwargs = {
        'parse_dates': [TIME_COLUMN, 'created_at'],
        'dtype': {
            'ecosystem_name': pd.CategoricalDtype(),
            'org_repo': pd.CategoricalDtype(),
            'tag_name': pd.CategoricalDtype(),
            'prerelease': pd.BooleanDtype(),
            'download_count': pd.Int64Dtype(),
        },
    }
    csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
    return load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
40+
41+
42+
def collect_github_downloads(
    projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
):
    """Collect per-release asset download counts for configured GitHub repos.

    For each ``org/repo`` in ``projects``, pages through the repository's
    releases, sums the ``download_count`` of each release's assets, and
    appends one row per release tag to the historical CSV in
    ``output_folder``.

    Args:
        projects (dict[str, list[str]]):
            Mapping of ecosystem name to a list of ``org/repo`` strings.
        output_folder (str):
            Folder where data will be outputted. It can be a local path or a
            Google Drive folder path in the format ``gdrive://<folder-id>``.
        dry_run (bool):
            If True, collect data but do not write the output CSV.
            Defaults to False.
        verbose (bool):
            Accepted for CLI parity; not used directly here.
            Defaults to False.
    """
    overall_df = get_previous_github_downloads(output_folder=output_folder, dry_run=dry_run)
    gh_client = GithubClient()

    for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'):
        for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'):
            github_org, repo = org_repo.split('/', 1)
            page = 1
            per_page = 100

            while True:
                response = gh_client.get(
                    github_org,
                    repo,
                    endpoint='releases',
                    query_params={'per_page': per_page, 'page': page},
                )
                # Bail out before parsing the body if the org/repo is gone.
                if response.status_code == 404:
                    LOGGER.debug(f'Skipping: {org_repo} because org/repo does not exist')
                    break

                release_data = response.json()
                link_header = response.headers.get('link')

                for release_info in tqdm(
                    release_data, position=0, desc=f'For {repo} releases, page: {page}'
                ):
                    release_id = release_info.get('id')
                    timestamp = datetime.now(ZoneInfo('UTC'))

                    # Fetch the single release to read its asset download counts.
                    # NOTE(review): the list response above already appears to
                    # include `assets`; this per-release request may be avoidable.
                    detail = gh_client.get(github_org, repo, endpoint=f'releases/{release_id}')
                    assets = detail.json().get('assets') or []

                    tag_row = {
                        'ecosystem_name': [ecosystem_name],
                        'org_repo': [org_repo],
                        'timestamp': [timestamp],
                        'tag_name': [release_info.get('tag_name')],
                        'prerelease': [release_info.get('prerelease')],
                        'created_at': [release_info.get('created_at')],
                        'download_count': [
                            sum(asset.get('download_count', 0) for asset in assets)
                        ],
                    }
                    overall_df = append_row(overall_df, tag_row)

                # Follow pagination via the `link` response header.
                if link_header and 'rel="next"' in link_header:
                    page += 1
                else:
                    break

    # Keep at most one record per release tag per day.
    overall_df = drop_duplicates_by_date(
        overall_df,
        time_column=TIME_COLUMN,
        group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
    )

    if not dry_run:
        gfolder_path = f'{output_folder}/{GITHUB_DOWNLOAD_COUNT_FILENAME}'
        create_csv(output_path=gfolder_path, data=overall_df)

pymetrics/github.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""Clients for making requests to Github APIs."""
2+
3+
import os
4+
5+
import requests
6+
7+
8+
class BaseClient:
    """Base GitHub client.

    Builds the common request headers, reading the access token from the
    ``GH_ACCESS_TOKEN`` environment variable.
    """

    def __init__(self):
        access_token = os.getenv('GH_ACCESS_TOKEN')
        self.headers = {
            'Authorization': f'Bearer {access_token}',
            'Accept': 'application/vnd.github+json',
            'X-GitHub-Api-Version': '2022-11-28',
        }
18+
19+
20+
class GithubClient(BaseClient):
    """Client for the GitHub REST API (repository-scoped endpoints)."""

    def __init__(self):
        super().__init__()
        self.base_url = 'https://api.github.com/repos'

    def _construct_url(
        self, github_org: str, repo: str, resource: str, resource_id: str | None = None
    ):
        """Build ``{base_url}/{github_org}/{repo}/{resource}[/{resource_id}]``."""
        # Renamed `id` -> `resource_id` to avoid shadowing the builtin.
        url = f'{self.base_url}/{github_org}/{repo}/{resource}'
        if resource_id:
            url += f'/{resource_id}'
        return url

    def get(
        self,
        github_org: str,
        repo: str,
        endpoint: str,
        query_params: dict | None = None,
        timeout: int | None = None,
    ):
        """Get a specific value of a resource from an endpoint in the GitHub API.

        Args:
            github_org (str):
                The name of the GitHub organization to search.
            repo (str):
                The name of the repository to search.
            endpoint (str):
                The endpoint for the resource. For example, issues/{issue_number}. This means we'd
                be making a request to https://api.github.com/repos/{github_org}/{repo}/issues/{issue_number}.
            query_params (dict):
                A dictionary mapping any query parameters to the desired value. Defaults to None.
            timeout (int):
                How long to wait before the request times out. Defaults to None.

        Returns:
            requests.models.Response
        """
        url = self._construct_url(github_org, repo, endpoint)
        return requests.get(url, headers=self.headers, params=query_params, timeout=timeout)

    def post(
        self,
        github_org: str,
        repo: str,
        endpoint: str,
        payload: dict,
        timeout: int | None = None,
    ):
        """Post to an endpoint in the GitHub API.

        Args:
            github_org (str):
                The name of the GitHub organization to search.
            repo (str):
                The name of the repository to search.
            endpoint (str):
                The endpoint for the resource. For example, issues. This means we'd be
                making a request to https://api.github.com/repos/{github_org}/{repo}/issues.
            payload (dict):
                The payload to post.
            timeout (int):
                How long to wait before the request times out. Defaults to None,
                which (per requests) waits indefinitely.

        Returns:
            requests.models.Response
        """
        url = self._construct_url(github_org, repo, endpoint)
        return requests.post(url, headers=self.headers, json=payload, timeout=timeout)

0 commit comments

Comments
 (0)