Skip to content

Commit 98503ca

Browse files
committed
fix parse version
1 parent 62291f6 commit 98503ca

File tree

6 files changed

+91
-40
lines changed

6 files changed

+91
-40
lines changed

.github/workflows/daily_summarize.yaml renamed to .github/workflows/daily_summarization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Daily Summarize
1+
name: Daily Summarization
22

33
on:
44
workflow_dispatch:

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ pymetrics collect-pypi --max-days 30 --add-metrics --output-folder {OUTPUT_FOLDE
7878
### Daily Collection
7979
On a daily basis, this workflow collects download data from PyPI and Anaconda. The data is then published in CSV format (`pypi.csv`). In addition, it computes metrics for the PyPI downloads (see [Aggregation Metrics](#aggregation-metrics)).
8080

81-
### Daily Summarize
81+
### Daily Summarization
8282

8383
On a daily basis, this workflow summarizes the PyPI download data from `pypi.csv` and calculates downloads for libraries. The summarized data is published to a GitHub repo:
8484
- [Downloads_Summary.xlsx](https://github.com/sdv-dev/sdv-dev.github.io/blob/gatsby-home/assets/Downloads_Summary.xlsx)

pymetrics/metrics.py

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""Functions to compute aggregation metrics over raw downloads."""
22

33
import logging
4-
import re
54

5+
import numpy as np
66
import pandas as pd
7+
from packaging.version import InvalidVersion, Version
78

89
from pymetrics.output import create_spreadsheet
910

@@ -105,34 +106,6 @@ def _get_sheet_name(column):
105106
]
106107

107108

108-
RE_NUMERIC = re.compile(r'^\d+')
109-
110-
111-
def _version_element_order_key(version):
112-
components = []
113-
last_component = None
114-
last_numeric = None
115-
for component in version.split('.', 2):
116-
if RE_NUMERIC.match(component):
117-
try:
118-
numeric = RE_NUMERIC.match(component).group(0)
119-
components.append(int(numeric))
120-
last_component = component
121-
last_numeric = numeric
122-
except AttributeError:
123-
# From time to time this errors out in github actions
124-
# while it shouldn't enter the `if`.
125-
pass
126-
127-
components.append(last_component[len(last_numeric) :])
128-
129-
return components
130-
131-
132-
def _version_order_key(version_column):
133-
return version_column.apply(_version_element_order_key)
134-
135-
136109
def _mangle_columns(downloads):
137110
downloads = downloads.rename(columns=RENAME_COLUMNS)
138111
for col in [
@@ -154,6 +127,32 @@ def _mangle_columns(downloads):
154127
return downloads
155128

156129

130+
def _safe_version_parse(version_str):
131+
if pd.isna(version_str):
132+
return np.nan
133+
134+
try:
135+
version = Version(str(version_str))
136+
except InvalidVersion:
137+
cleaned = str(version_str).rstrip('+~')
138+
try:
139+
version = Version(cleaned)
140+
except (InvalidVersion, TypeError):
141+
LOGGER.info(f'Unable to parse version: {version_str}')
142+
version = np.nan
143+
144+
return version
145+
146+
147+
def _version_order_key(version_column):
148+
return version_column.apply(_safe_version_parse)
149+
150+
151+
def _sort_by_version(data, column, ascending=False):
152+
data = data.sort_values(by=column, key=_version_order_key, ascending=ascending)
153+
return data
154+
155+
157156
def compute_metrics(downloads, output_path=None):
158157
"""Compute aggregation metrics over the given downloads.
159158
@@ -172,8 +171,7 @@ def compute_metrics(downloads, output_path=None):
172171
if column in SORT_BY_DOWNLOADS:
173172
sheet = sheet.sort_values('downloads', ascending=False)
174173
elif column in SORT_BY_VERSION:
175-
sheet = sheet.sort_values(column, ascending=False, key=_version_order_key)
176-
174+
sheet = _sort_by_version(sheet, column=column, ascending=False)
177175
sheets[name] = sheet
178176

179177
for column in HISTORICAL_COLUMNS:
@@ -182,7 +180,7 @@ def compute_metrics(downloads, output_path=None):
182180
sheets[name] = _historical_groupby(downloads, [column])
183181

184182
if output_path:
185-
create_spreadsheet(output_path, sheets)
183+
create_spreadsheet(output_path, sheets, na_rep='<NaN>')
186184
return None
187185

188186
return sheets

pymetrics/output.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ def get_path(folder, filename):
3434
return str(pathlib.Path(folder) / filename)
3535

3636

37-
def _add_sheet(writer, data, sheet_name):
38-
data.to_excel(writer, sheet_name=sheet_name, index=False, engine='xlsxwriter')
37+
def _add_sheet(writer, data, sheet_name, na_rep=''):
38+
data.to_excel(writer, sheet_name=sheet_name, index=False, engine='xlsxwriter', na_rep=na_rep)
3939

4040
for column in data:
4141
column_length = None
@@ -51,7 +51,7 @@ def _add_sheet(writer, data, sheet_name):
5151
)
5252

5353

54-
def create_spreadsheet(output_path, sheets):
54+
def create_spreadsheet(output_path, sheets, na_rep=''):
5555
"""Create a spreadsheet with the indicated name and data.
5656
5757
If the ``output_path`` variable starts with ``gdrive://`` it is interpreted
@@ -74,7 +74,7 @@ def create_spreadsheet(output_path, sheets):
7474

7575
with pd.ExcelWriter(output, engine='xlsxwriter') as writer: # pylint: disable=E0110
7676
for title, data in sheets.items():
77-
_add_sheet(writer, data, title)
77+
_add_sheet(writer, data, title, na_rep=na_rep)
7878

7979
if drive.is_drive_path(output_path):
8080
LOGGER.info('Creating file %s', output_path)

pymetrics/summarize.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,7 @@ def get_previous_pypi_downloads(output_folder, dry_run=False):
147147
read_csv_kwargs['nrows'] = 10_000
148148
data = load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
149149
LOGGER.info('Parsing version column to Version class objects')
150-
if 'version' in data.columns:
151-
data['version'] = data['version'].apply(parse)
150+
data['version'] = data['version'].apply(parse)
152151
return data
153152

154153

tests/unit/test_metrics.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from pymetrics.metrics import _sort_by_version
5+
6+
7+
def test__sort_by_version():
8+
# Setup
9+
data = pd.DataFrame({
10+
'version': pd.Series(
11+
['1.9.0', '1.9.0.dev0', '1.24.1', '0.9.1', '0.16.0', '0.0.0'], dtype='object'
12+
),
13+
'name': ['v5', 'v4', 'v6', 'v2', 'v3', 'v1'],
14+
})
15+
16+
# Run
17+
sorted_df = _sort_by_version(data, 'version', ascending=False)
18+
19+
# Assert
20+
expected_versions = ['1.24.1', '1.9.0', '1.9.0.dev0', '0.16.0', '0.9.1', '0.0.0']
21+
assert sorted_df['version'].map(str).tolist() == expected_versions
22+
assert sorted_df['name'].tolist() == ['v6', 'v5', 'v4', 'v3', 'v2', 'v1']
23+
24+
25+
def test__sort_by_version_with_invalid_versions():
26+
# Setup
27+
data = pd.DataFrame({
28+
'version': pd.Series(['2.7.11+', '2.0.0', 'invalid', '3.0', np.nan], dtype='object'),
29+
'name': ['v4', 'v3', 'v2', 'v5', 'v1'],
30+
})
31+
32+
# Run
33+
sorted_df = _sort_by_version(data, 'version')
34+
35+
# Assert
36+
expected_versions = ['3.0', '2.7.11+', '2.0.0', 'invalid', np.nan]
37+
assert sorted_df['version'].tolist() == expected_versions
38+
assert sorted_df['name'].tolist() == ['v5', 'v4', 'v3', 'v2', 'v1']
39+
40+
41+
def test__sort_by_version_with_mixed_version_formats():
42+
# Setup
43+
data = pd.DataFrame({
44+
'version': ['1.0a1', '1.0b2', '1.0rc3', '1.0', '1.0.post0'],
45+
'name': ['alpha', 'beta', 'rc', 'stable', 'post'],
46+
})
47+
48+
# Run
49+
sorted_df = _sort_by_version(data, 'version', ascending=False)
50+
51+
# Assert
52+
expected_versions = ['1.0.post0', '1.0', '1.0rc3', '1.0b2', '1.0a1']
53+
assert sorted_df['version'].tolist() == expected_versions
54+
assert sorted_df['name'].tolist() == ['post', 'stable', 'rc', 'beta', 'alpha']

0 commit comments

Comments (0)