Skip to content

Commit 5aaa62e

Browse files
committed
add prerelease, postrelease, devrelease
1 parent ff20930 commit 5aaa62e

File tree

8 files changed

+94
-41
lines changed

8 files changed

+94
-41
lines changed

.github/workflows/daily_collection.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ jobs:
2727
timeout-minutes: 25
2828
steps:
2929
- uses: actions/checkout@v4
30+
with:
31+
repository: sdv-dev/PyMetrics
32+
token: ${{ secrets.GH_TOKEN }}
3033
- name: Install uv
3134
uses: astral-sh/setup-uv@v6
3235
with:
@@ -56,6 +59,13 @@ jobs:
5659
env:
5760
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
5861
ANACONDA_OUTPUT_FOLDER: ${{ secrets.ANACONDA_OUTPUT_FOLDER }}
62+
- name: Collect GitHub Downloads
63+
run: |
64+
uv run pymetrics collect-github \
65+
--output-folder ${{ secrets.GH_OUTPUT_FOLDER }}
66+
env:
67+
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
68+
GH_OUTPUT_FOLDER: ${{ secrets.GH_OUTPUT_FOLDER }}
5969
alert:
6070
needs: [collect]
6171
runs-on: ubuntu-latest
@@ -77,4 +87,4 @@ jobs:
7787
-c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
7888
-m 'Daily Collection PyMetrics failed :fire: :dumpster-fire: :fire:'
7989
env:
80-
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
90+
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

.github/workflows/daily_summarization.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Daily Summarization
1+
name: Daily Summarize
22

33
on:
44
workflow_dispatch:
@@ -17,6 +17,9 @@ jobs:
1717
timeout-minutes: 10
1818
steps:
1919
- uses: actions/checkout@v4
20+
with:
21+
repository: sdv-dev/PyMetrics
22+
token: ${{ secrets.GH_TOKEN }}
2023
- name: Install uv
2124
uses: astral-sh/setup-uv@v6
2225
with:
@@ -71,4 +74,4 @@ jobs:
7174
-c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
7275
-m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:'
7376
env:
74-
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
77+
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ The aggregation metrics spreadsheets contain the following tabs:
142142
* **By Month and Python Version:** Absolute number of downloads per month and Python version.
143143
* **By Month and Country Code:** Absolute number of downloads per month and country.
144144
* **By Month and Installer Name:** Absolute number of downloads per month and Installer.
145+
* **By Prerelease**: Absolute and relative number of downloads for pre-release versions (alpha, beta, release candidate, and development versions).
146+
* **By Postrelease**: Absolute and relative number of downloads for post-release versions.
147+
* **By Devrelease**: Absolute and relative number of downloads for development release versions.
145148

146149
## Known Issues
147150
1. The conda package download data for Anaconda does not match the download count shown on the website. This is due to missing download data in the conda package download data. See this: https://github.com/anaconda/anaconda-package-data/issues/45

pymetrics/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def _get_parser():
267267
'-c',
268268
'--config-file',
269269
type=str,
270-
default='config.yaml',
270+
default='github_config.yaml',
271271
help='Path to the configuration file.',
272272
)
273273
collect_github.add_argument(

pymetrics/anaconda.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from tqdm import tqdm
1111

1212
from pymetrics.output import append_row, create_csv, get_path, load_csv
13-
from pymetrics.time_utils import drop_duplicates_by_date
13+
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
1414

1515
LOGGER = logging.getLogger(__name__)
1616
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -89,7 +89,7 @@ def _get_downloads_from_anaconda_org(packages, channel='conda-forge'):
8989

9090
for pkg_name in packages:
9191
URL = f'https://api.anaconda.org/package/{channel}/{pkg_name}'
92-
timestamp = datetime.now(ZoneInfo('UTC'))
92+
timestamp = get_current_utc()
9393
response = requests.get(URL)
9494
row_info = {'pkg_name': [pkg_name], TIME_COLUMN: [timestamp], 'total_ndownloads': 0}
9595
data = response.json()
@@ -158,6 +158,8 @@ def collect_anaconda_downloads(
158158
`start_date` has not been provided. Defaults to 90 days.
159159
dry_run (bool):
160160
If `True`, do not upload the results. Defaults to `False`.
161+
verbose (bool):
162+
If `True`, log the tail of the Anaconda data dataframes. Defaults to `False`.
161163
"""
162164
overall_df, version_downloads = _collect_ananconda_downloads_from_website(
163165
projects, output_folder=output_folder

pymetrics/gh_downloads.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33
import logging
44
import os
55
from collections import defaultdict
6-
from datetime import datetime
7-
from zoneinfo import ZoneInfo
86

97
import pandas as pd
108
from tqdm import tqdm
119

1210
from pymetrics.github import GithubClient
1311
from pymetrics.output import append_row, create_csv, get_path, load_csv
14-
from pymetrics.time_utils import drop_duplicates_by_date
12+
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
1513

1614
LOGGER = logging.getLogger(__name__)
1715
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -21,6 +19,7 @@
2119

2220

2321
def get_previous_github_downloads(output_folder, dry_run=False):
22+
"""Get previous GitHub Downloads."""
2423
csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
2524
read_csv_kwargs = {
2625
'parse_dates': [
@@ -42,24 +41,29 @@ def get_previous_github_downloads(output_folder, dry_run=False):
4241
def collect_github_downloads(
4342
projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
4443
):
44+
"""Pull data about the downloads of a GitHub project.
45+
46+
Args:
47+
projects (dict[str, list[str]]):
48+
List of projects to analyze. Each key is the name of the ecosystem, and
49+
each value is a list of github repositories (including organization).
50+
output_folder (str):
51+
Folder in which project downloads will be stored.
52+
It can be passed as a local folder or as a Google Drive path in the format
53+
`gdrive://{folder_id}`.
54+
The folder must contain 'github_download_counts.csv'.
55+
dry_run (bool):
56+
If `True`, do not upload the results. Defaults to `False`.
57+
verbose (bool):
58+
If `True`, log the tail of the GitHub download dataframe. Defaults to `False`.
59+
"""
4560
overall_df = get_previous_github_downloads(output_folder=output_folder)
46-
# overall_df = pd.DataFrame(
47-
# columns=[
48-
# TIME_COLUMN,
49-
# 'created_at',
50-
# 'ecosystem_name',
51-
# 'org_repo',
52-
# 'tag_name',
53-
# 'prerelease',
54-
# 'download_count',
55-
# ]
56-
# )
5761

5862
gh_client = GithubClient()
5963
download_counts = defaultdict(int)
6064

61-
for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'):
62-
for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'):
65+
for ecosystem_name, repositories in projects.items():
66+
for org_repo in tqdm(repositories, position=1, desc=f'Ecosystem: {ecosystem_name}'):
6367
pages_remain = True
6468
page = 1
6569
per_page = 100
@@ -85,18 +89,19 @@ def collect_github_downloads(
8589

8690
# Get download count
8791
for release_info in tqdm(
88-
release_data, position=0, desc=f'For {repo} releases, page: {page}'
92+
release_data, position=0, desc=f'{repo} releases, page={page}'
8993
):
9094
release_id = release_info.get('id')
9195
tag_name = release_info.get('tag_name')
9296
prerelease = release_info.get('prerelease')
9397
created_at = release_info.get('created_at')
9498
endpoint = f'releases/{release_id}'
95-
timestamp = datetime.now(ZoneInfo('UTC'))
9699

100+
timestamp = get_current_utc()
97101
response = gh_client.get(github_org, repo, endpoint=endpoint)
98102
data = response.json()
99103
assets = data.get('assets')
104+
100105
tag_row = {
101106
'ecosystem_name': [ecosystem_name],
102107
'org_repo': [org_repo],
@@ -122,6 +127,10 @@ def collect_github_downloads(
122127
time_column=TIME_COLUMN,
123128
group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
124129
)
130+
if verbose:
131+
LOGGER.info(f'{GITHUB_DOWNLOAD_COUNT_FILENAME} tail')
132+
LOGGER.info(overall_df.tail(5).to_string())
133+
125134
overall_df.to_csv('github_download_counts.csv', index=False)
126135

127136
if not dry_run:

pymetrics/metrics.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ def _get_sheet_name(column):
8080
'OS_type',
8181
'cpu',
8282
'ci',
83+
'is_prerelease',
84+
'is_postrelease',
85+
'is_devrelease',
8386
]
8487
SORT_BY_DOWNLOADS = [
8588
'country_code',
@@ -106,6 +109,30 @@ def _get_sheet_name(column):
106109
]
107110

108111

112+
def _safe_version_parse(version_str):
113+
if pd.isna(version_str):
114+
return np.nan
115+
116+
try:
117+
version = Version(str(version_str))
118+
except InvalidVersion:
119+
cleaned = str(version_str).rstrip('+~')
120+
try:
121+
version = Version(cleaned)
122+
except (InvalidVersion, TypeError):
123+
LOGGER.info(f'Unable to parse version: {version_str}')
124+
version = np.nan
125+
126+
return version
127+
128+
129+
def _extract_version_attribute(version_str, attribute):
130+
version_obj = _safe_version_parse(version_str)
131+
if isinstance(version_obj, Version):
132+
return getattr(version_obj, attribute)
133+
return np.nan
134+
135+
109136
def _mangle_columns(downloads):
110137
downloads = downloads.rename(columns=RENAME_COLUMNS)
111138
for col in [
@@ -124,24 +151,17 @@ def _mangle_columns(downloads):
124151
downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
125152
downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
126153

127-
return downloads
128-
129-
130-
def _safe_version_parse(version_str):
131-
if pd.isna(version_str):
132-
return np.nan
133-
134-
try:
135-
version = Version(str(version_str))
136-
except InvalidVersion:
137-
cleaned = str(version_str).rstrip('+~')
138-
try:
139-
version = Version(cleaned)
140-
except (InvalidVersion, TypeError):
141-
LOGGER.info(f'Unable to parse version: {version_str}')
142-
version = np.nan
154+
downloads['is_prerelease'] = downloads['version'].apply(
155+
_extract_version_attribute, args=('is_prerelease',)
156+
)
157+
downloads['is_postrelease'] = downloads['version'].apply(
158+
_extract_version_attribute, args=('is_postrelease',)
159+
)
160+
downloads['is_devrelease'] = downloads['version'].apply(
161+
_extract_version_attribute, args=('is_devrelease',)
162+
)
143163

144-
return version
164+
return downloads
145165

146166

147167
def _version_order_key(version_column):

pymetrics/time_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Time utility functions."""
22

33
from datetime import datetime
4+
from zoneinfo import ZoneInfo
45

56
import pandas as pd
67
from pandas.api.types import is_datetime64_any_dtype
@@ -11,6 +12,11 @@ def get_current_year(tz=None):
1112
return datetime.now(tz=tz).year
1213

1314

15+
def get_current_utc():
16+
"""Get the current datetime UTC."""
17+
return datetime.now(ZoneInfo('UTC'))
18+
19+
1420
def get_first_datetime_in_year(year, tzinfo=None):
1521
"""Get the first possible datetime value in a given year."""
1622
min_date = datetime(year, day=1, month=1).date()

0 commit comments

Comments
 (0)