Skip to content

Commit ff20930

Browse files
committed
wip
1 parent 538f270 commit ff20930

File tree

5 files changed

+276
-3
lines changed

5 files changed

+276
-3
lines changed

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ Currently, the download data is collected from the following distributions:
4444
- Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel
4545
- For each file returned by the API endpoint, the current number of downloads is saved. Over time, a historical download recording can be built.
4646

47-
### Future Data Sources
48-
In the future, we may expand the source distributions to include:
49-
* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases.
47+
* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub release assets.
48+
See the [GitHub API reference](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release) for details on the release endpoint.
5049

5150
# Install
5251
Install pymetrics using pip (or uv):

github_config.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
projects:
2+
sdv-dev:
3+
- sdv-dev/SDV
4+
- sdv-dev/RDT
5+
- sdv-dev/SDMetrics
6+
- sdv-dev/SDGym
7+
- sdv-dev/Copulas
8+
- sdv-dev/CTGAN
9+
- sdv-dev/DeepEcho
10+
gretel:
11+
- gretelai/gretel-python-client
12+
- gretelai/trainer
13+
- gretelai/gretel-synthetics
14+
mostly-ai:
15+
- mostly-ai/mostlyai
16+
- mostly-ai/mostlyai-mock
17+
ydata:
18+
- ydataai/ydata-synthetic
19+
- ydataai/ydata-quality
20+
- ydataai/ydata-fabric-sdk
21+
realtabformer:
22+
- worldbank/REaLTabFormer
23+
synthcity:
24+
- vanderschaarlab/synthcity
25+
smartnoise-sdk:
26+
- opendp/smartnoise-sdk
27+
be_great:
28+
- kathrinse/be_great

pymetrics/__main__.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import yaml
1111

1212
from pymetrics.anaconda import collect_anaconda_downloads
13+
from pymetrics.gh_downloads import collect_github_downloads
1314
from pymetrics.main import collect_pypi_downloads
1415
from pymetrics.summarize import summarize_downloads
1516

@@ -76,6 +77,19 @@ def _collect_anaconda(args):
7677
)
7778

7879

80+
def _collect_github(args):
    """Entry point for the ``collect-github`` CLI action.

    Loads the project configuration from ``args.config_file`` and kicks off
    the GitHub release-download collection for every configured repository.

    Args:
        args (argparse.Namespace):
            Parsed CLI arguments; must provide ``config_file``,
            ``output_folder``, ``dry_run`` and ``verbose``.
    """
    config = _load_config(args.config_file)
    collect_github_downloads(
        projects=config['projects'],
        output_folder=args.output_folder,
        dry_run=args.dry_run,
        verbose=args.verbose,
    )
91+
92+
7993
def _summarize(args):
8094
config = _load_config(args.config_file)
8195
projects = config['projects']
@@ -243,6 +257,29 @@ def _get_parser():
243257
default=90,
244258
help='Max days of data to pull. Default to last 90 days.',
245259
)
260+
261+
# collect GitHub
262+
collect_github = action.add_parser(
263+
'collect-github', help='Collect download data from GitHub.', parents=[logging_args]
264+
)
265+
collect_github.set_defaults(action=_collect_github)
266+
collect_github.add_argument(
267+
'-c',
268+
'--config-file',
269+
type=str,
270+
default='config.yaml',
271+
help='Path to the configuration file.',
272+
)
273+
collect_github.add_argument(
274+
'-o',
275+
'--output-folder',
276+
type=str,
277+
required=True,
278+
help=(
279+
'Path to the folder where data will be outputted. It can be a local path or a'
280+
' Google Drive folder path in the format gdrive://<folder-id>'
281+
),
282+
)
246283
return parser
247284

248285

pymetrics/gh_downloads.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Functions to get GitHub downloads from GitHub."""
2+
3+
import logging
4+
import os
5+
from collections import defaultdict
6+
from datetime import datetime
7+
from zoneinfo import ZoneInfo
8+
9+
import pandas as pd
10+
from tqdm import tqdm
11+
12+
from pymetrics.github import GithubClient
13+
from pymetrics.output import append_row, create_csv, get_path, load_csv
14+
from pymetrics.time_utils import drop_duplicates_by_date
15+
16+
LOGGER = logging.getLogger(__name__)
17+
dir_path = os.path.dirname(os.path.realpath(__file__))
18+
TIME_COLUMN = 'timestamp'
19+
20+
GITHUB_DOWNLOAD_COUNT_FILENAME = 'github_download_counts.csv'
21+
22+
23+
def get_previous_github_downloads(output_folder, dry_run=False):
    """Load previously recorded GitHub download counts from ``output_folder``.

    Args:
        output_folder (str):
            Folder holding the historical CSV. It can be a local path or a
            Google Drive folder path in the format ``gdrive://<folder-id>``.
        dry_run (bool):
            Currently unused; kept for signature parity with the other
            collectors. Defaults to ``False``.

    Returns:
        pandas.DataFrame:
            The historical download data, with dtypes matching the rows
            appended by ``collect_github_downloads``.
    """
    read_csv_kwargs = {
        'parse_dates': [TIME_COLUMN, 'created_at'],
        'dtype': {
            'ecosystem_name': pd.CategoricalDtype(),
            'org_repo': pd.CategoricalDtype(),
            'tag_name': pd.CategoricalDtype(),
            'prerelease': pd.BooleanDtype(),
            'download_count': pd.Int64Dtype(),
        },
    }
    csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
    return load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
40+
41+
42+
def collect_github_downloads(
    projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
):
    """Collect per-release asset download counts for configured GitHub repos.

    For each ``org/repo`` in ``projects``, pages through the repository's
    releases, sums the ``download_count`` of each release's assets, and
    appends one row per release tag to the historical CSV in
    ``output_folder``.

    Args:
        projects (dict[str, list[str]]):
            Mapping of ecosystem name to a list of ``org/repo`` strings.
        output_folder (str):
            Folder where data will be outputted. It can be a local path or a
            Google Drive folder path in the format ``gdrive://<folder-id>``.
        dry_run (bool):
            If True, collect data but do not write the output CSV.
            Defaults to False.
        verbose (bool):
            Accepted for CLI parity; not used directly here.
            Defaults to False.
    """
    overall_df = get_previous_github_downloads(output_folder=output_folder, dry_run=dry_run)
    gh_client = GithubClient()

    for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'):
        for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'):
            github_org, repo = org_repo.split('/', 1)
            page = 1
            per_page = 100

            while True:
                response = gh_client.get(
                    github_org,
                    repo,
                    endpoint='releases',
                    query_params={'per_page': per_page, 'page': page},
                )
                # Bail out before parsing the body if the org/repo is gone.
                if response.status_code == 404:
                    LOGGER.debug(f'Skipping: {org_repo} because org/repo does not exist')
                    break

                release_data = response.json()
                link_header = response.headers.get('link')

                for release_info in tqdm(
                    release_data, position=0, desc=f'For {repo} releases, page: {page}'
                ):
                    release_id = release_info.get('id')
                    timestamp = datetime.now(ZoneInfo('UTC'))

                    # Fetch the single release to read its asset download counts.
                    # NOTE(review): the list response above already appears to
                    # include `assets`; this per-release request may be avoidable.
                    detail = gh_client.get(github_org, repo, endpoint=f'releases/{release_id}')
                    assets = detail.json().get('assets') or []

                    tag_row = {
                        'ecosystem_name': [ecosystem_name],
                        'org_repo': [org_repo],
                        'timestamp': [timestamp],
                        'tag_name': [release_info.get('tag_name')],
                        'prerelease': [release_info.get('prerelease')],
                        'created_at': [release_info.get('created_at')],
                        'download_count': [
                            sum(asset.get('download_count', 0) for asset in assets)
                        ],
                    }
                    overall_df = append_row(overall_df, tag_row)

                # Follow pagination via the `link` response header.
                if link_header and 'rel="next"' in link_header:
                    page += 1
                else:
                    break

    # Keep at most one record per release tag per day.
    overall_df = drop_duplicates_by_date(
        overall_df,
        time_column=TIME_COLUMN,
        group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
    )

    if not dry_run:
        gfolder_path = f'{output_folder}/{GITHUB_DOWNLOAD_COUNT_FILENAME}'
        create_csv(output_path=gfolder_path, data=overall_df)

pymetrics/github.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""Clients for making requests to Github APIs."""
2+
3+
import os
4+
5+
import requests
6+
7+
8+
class BaseClient:
    """Base GitHub client.

    Builds the common request headers, reading the access token from the
    ``GH_ACCESS_TOKEN`` environment variable.
    """

    def __init__(self):
        access_token = os.getenv('GH_ACCESS_TOKEN')
        self.headers = {
            'Authorization': f'Bearer {access_token}',
            'Accept': 'application/vnd.github+json',
            'X-GitHub-Api-Version': '2022-11-28',
        }
18+
19+
20+
class GithubClient(BaseClient):
    """Client for the GitHub REST API (repository-scoped endpoints)."""

    def __init__(self):
        super().__init__()
        self.base_url = 'https://api.github.com/repos'

    def _construct_url(
        self, github_org: str, repo: str, resource: str, resource_id: str | None = None
    ):
        """Build ``{base_url}/{github_org}/{repo}/{resource}[/{resource_id}]``."""
        # Renamed `id` -> `resource_id` to avoid shadowing the builtin.
        url = f'{self.base_url}/{github_org}/{repo}/{resource}'
        if resource_id:
            url += f'/{resource_id}'
        return url

    def get(
        self,
        github_org: str,
        repo: str,
        endpoint: str,
        query_params: dict | None = None,
        timeout: int | None = None,
    ):
        """Get a specific value of a resource from an endpoint in the GitHub API.

        Args:
            github_org (str):
                The name of the GitHub organization to search.
            repo (str):
                The name of the repository to search.
            endpoint (str):
                The endpoint for the resource. For example, issues/{issue_number}. This means we'd
                be making a request to https://api.github.com/repos/{github_org}/{repo}/issues/{issue_number}.
            query_params (dict):
                A dictionary mapping any query parameters to the desired value. Defaults to None.
            timeout (int):
                How long to wait before the request times out. Defaults to None.

        Returns:
            requests.models.Response
        """
        url = self._construct_url(github_org, repo, endpoint)
        return requests.get(url, headers=self.headers, params=query_params, timeout=timeout)

    def post(
        self,
        github_org: str,
        repo: str,
        endpoint: str,
        payload: dict,
        timeout: int | None = None,
    ):
        """Post to an endpoint in the GitHub API.

        Args:
            github_org (str):
                The name of the GitHub organization to search.
            repo (str):
                The name of the repository to search.
            endpoint (str):
                The endpoint for the resource. For example, issues. This means we'd be
                making a request to https://api.github.com/repos/{github_org}/{repo}/issues.
            payload (dict):
                The payload to post.
            timeout (int):
                How long to wait before the request times out. Defaults to None,
                which (per requests) waits indefinitely.

        Returns:
            requests.models.Response
        """
        url = self._construct_url(github_org, repo, endpoint)
        return requests.post(url, headers=self.headers, json=payload, timeout=timeout)

0 commit comments

Comments
 (0)