Skip to content

Commit cd80f18

Browse files
authored
Add daily workflow to summarize download counts for sdv and external libraries (#20)
* wip * lint * fix start date * add project print * fix print * update message * update to use pyarrow dtypes * fix string * update to ubuntu-latest-largeA * update to ubuntu * fix engine * docstring * use category dtype * remove pyarrow * fix ns * lint * use pyarrow everywhere * remove pyarrow dtypes * add readme instructions * fix manual * cleanup * fix manual * fix manual * fix max_days * fix docs * wip * wip * wip * fix workflow * fix workflow * add message to workflow * cleanup * fix repo * fix slack msg * fix slack msg * use extensions * summarize fix" * use uv * fix uv * use cache * change token * add unit tests * add unit workflow * add dry-run * remove unused arg * fix dry run * use uv in lint * add date * cleanup readme * Rename daily_collect.yaml to daily_collection.yaml * Update daily_collection.yaml * Update daily_summarize.yaml * Update dryrun.yaml * Update lint.yaml * Update manual.yaml * Update unit.yaml * Address feedback 2 * add version parse * use object dtype * fix local write * lint * Update daily_summarize.yaml * exclude pre-releases * cleanup
1 parent 6c3180d commit cd80f18

File tree

18 files changed

+724
-84
lines changed

18 files changed

+724
-84
lines changed
Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,21 @@ on:
1313
jobs:
1414
collect:
1515
runs-on: ubuntu-latest-large
16-
timeout-minutes: 30
16+
timeout-minutes: 20
1717
steps:
1818
- uses: actions/checkout@v4
19-
- name: Set up Python ${{ matrix.python-version }}
20-
uses: actions/setup-python@v5
19+
- name: Install uv
20+
uses: astral-sh/setup-uv@v6
2121
with:
22-
python-version: '3.13'
23-
- name: Install dependencies
22+
enable-cache: true
23+
activate-environment: true
24+
- name: Install pip and dependencies
2425
run: |
25-
python -m pip install --upgrade pip
26-
python -m pip install .
26+
uv pip install -U pip
27+
uv pip install -e .
2728
- name: Collect Downloads Data
2829
run: |
29-
download-analytics collect \
30+
uv run download-analytics collect \
3031
--verbose \
3132
--max-days 30 \
3233
--add-metrics \
@@ -41,15 +42,16 @@ jobs:
4142
if: failure()
4243
steps:
4344
- uses: actions/checkout@v4
44-
- uses: actions/setup-python@v5
45+
- name: Install uv
46+
uses: astral-sh/setup-uv@v6
4547
with:
46-
python-version: '3.13'
47-
- name: Install slack dependencies
48+
enable-cache: true
49+
activate-environment: true
50+
- name: Install pip and dependencies
4851
run: |
49-
python -m pip install --upgrade pip
50-
python -m pip install invoke
51-
python -m pip install .[dev]
52+
uv pip install -U pip
53+
uv pip install -e .[dev]
5254
- name: Slack alert if failure
53-
run: python -m download_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
55+
run: uv run python -m download_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
5456
env:
5557
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

.github/workflows/daily_summarize.yaml

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,65 @@ on:
88
required: false
99
default: "sdv-alerts-debug"
1010
schedule:
11-
- cron: '0 0 * * *'
11+
- cron: '0 1 * * *'
1212

1313
jobs:
14-
collect:
14+
summarize:
1515
runs-on: ubuntu-latest-large
1616
timeout-minutes: 10
1717
steps:
1818
- uses: actions/checkout@v4
19+
- name: Install uv
20+
uses: astral-sh/setup-uv@v6
21+
with:
22+
enable-cache: true
23+
activate-environment: true
24+
- name: Install pip and dependencies
25+
run: |
26+
uv pip install -U pip
27+
uv pip install .
28+
- name: Run Summarize
29+
run: |
30+
uv run download-analytics summarize \
31+
--output-folder gdrive://***REMOVED***
32+
env:
33+
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
34+
- uses: actions/checkout@v4
35+
with:
36+
repository: sdv-dev/sdv-dev.github.io
37+
path: sdv-dev.github.io
38+
ref: 'gatsby-home'
39+
token: ${{ secrets.GH_TOKEN }}
40+
- name: Write to sdv-dev.github.io repo
41+
run: |
42+
find ./ -name '*.xlsx' -exec cp -prv '{}' 'sdv-dev.github.io/assets/' ';'
43+
cd sdv-dev.github.io
44+
git config --local user.name "github-actions[bot]"
45+
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
46+
git add --force assets/*.xlsx
47+
git commit -m "Upload Download Summary - $(date '+%Y-%m-%d %H:%M:%S')"
48+
git remote set-url origin https://sdv-team:${{ secrets.GH_TOKEN }}@github.com/sdv-dev/sdv-dev.github.io
49+
git push origin gatsby-home
50+
alert:
51+
needs: [summarize]
52+
runs-on: ubuntu-latest
53+
if: failure()
54+
steps:
55+
- uses: actions/checkout@v4
56+
- name: Install uv
57+
uses: astral-sh/setup-uv@v6
58+
with:
59+
enable-cache: true
60+
activate-environment: true
61+
- name: Install pip and dependencies
62+
run: |
63+
uv pip install -U pip
64+
uv pip install .[dev]
65+
- name: Slack alert if failure
66+
run: |
67+
uv run python -m download_analytics.slack_utils \
68+
-r ${{ github.run_id }} \
69+
-c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
70+
-m 'Summarize Analytics build failed :fire: :dumpster-fire: :fire:'
71+
env:
72+
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

.github/workflows/dryrun.yaml

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,5 @@
1-
name: Health-check Dry Run
1+
name: Dry Run Collect and Summarize
22
on:
3-
workflow_dispatch:
4-
inputs:
5-
slack_channel:
6-
description: Slack channel to post the error message to if the builds fail.
7-
required: false
8-
default: "sdv-alerts-debug"
93
pull_request:
104
types:
115
- opened
@@ -19,17 +13,18 @@ jobs:
1913
runs-on: ubuntu-latest
2014
steps:
2115
- uses: actions/checkout@v4
22-
- name: Set up Python ${{ matrix.python-version }}
23-
uses: actions/setup-python@v5
16+
- name: Install uv
17+
uses: astral-sh/setup-uv@v6
2418
with:
25-
python-version: '3.13'
26-
- name: Install dependencies
19+
enable-cache: true
20+
activate-environment: true
21+
- name: Install pip and dependencies
2722
run: |
28-
python -m pip install --upgrade pip
29-
python -m pip install .
30-
- name: Collect Downloads Data
23+
uv pip install -U pip
24+
uv pip install .
25+
- name: Collect Downloads Data - Dry Run
3126
run: |
32-
download-analytics collect \
27+
uv run download-analytics collect \
3328
--verbose \
3429
--max-days 30 \
3530
--add-metrics \
@@ -38,22 +33,11 @@ jobs:
3833
env:
3934
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
4035
BIGQUERY_CREDENTIALS: ${{ secrets.BIGQUERY_CREDENTIALS }}
41-
42-
alert:
43-
needs: [dry_run]
44-
runs-on: ubuntu-latest
45-
if: failure()
46-
steps:
47-
- uses: actions/checkout@v4
48-
- uses: actions/setup-python@v5
49-
with:
50-
python-version: '3.13'
51-
- name: Install slack dependencies
36+
- name: Run Summarize - Dry Run
5237
run: |
53-
python -m pip install --upgrade pip
54-
python -m pip install invoke
55-
python -m pip install .[dev]
56-
- name: Slack alert if failure
57-
run: python -m download_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
38+
uv run download-analytics summarize \
39+
--verbose \
40+
--output-folder gdrive://***REMOVED*** \
41+
--dry-run
5842
env:
59-
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
43+
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

.github/workflows/lint.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ jobs:
1313
runs-on: ubuntu-latest
1414
steps:
1515
- uses: actions/checkout@v4
16-
- name: Set up Python 3.13
17-
uses: actions/setup-python@v5
16+
- name: Install uv
17+
uses: astral-sh/setup-uv@v6
1818
with:
19-
python-version: '3.13'
20-
- name: Install dependencies
19+
enable-cache: true
20+
activate-environment: true
21+
- name: Install pip and dependencies
2122
run: |
22-
python -m pip install --upgrade pip
23-
python -m pip install invoke .[dev]
23+
uv pip install -U pip
24+
uv pip install .[dev]
2425
- name: Run lint checks
25-
run: invoke lint
26-
26+
run: uv run invoke lint

.github/workflows/unit.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
name: Unit Tests
2+
on:
3+
pull_request:
4+
types:
5+
- opened
6+
- synchronize
7+
- ready_for_review
8+
push:
9+
branches:
10+
- main
11+
workflow_dispatch:
12+
concurrency:
13+
group: ${{ github.workflow }}-${{ github.ref }}
14+
cancel-in-progress: true
15+
jobs:
16+
tests:
17+
name: unit tests
18+
runs-on: ubuntu-latest
19+
timeout-minutes: 5
20+
steps:
21+
- uses: actions/checkout@v4
22+
- name: Install uv
23+
uses: astral-sh/setup-uv@v6
24+
with:
25+
enable-cache: true
26+
activate-environment: true
27+
- name: Install pip and dependencies
28+
run: |
29+
uv pip install -U pip
30+
uv pip install -e .[test,dev]
31+
- name: Run unit tests
32+
run: |
33+
uv run pytest -s -vv tests/unit/

.gitignore

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
settings.yaml
2+
bigquery_creds.json
3+
client_secrets.json
4+
credentials.json
5+
sdv-dev.github.io/*
6+
7+
notebooks
8+
*.xlsx
9+
*.csv
10+
11+
secrets/
12+
*.bak
13+
114
# Byte-compiled / optimized / DLL files
215
__pycache__/
316
*.py[cod]
@@ -107,14 +120,4 @@ ENV/
107120
.DS_Store
108121

109122
# Vim
110-
.*.swp
111-
112-
notebooks
113-
*.xlsx
114-
115-
settings.yaml
116-
credentials.json
117-
client_secrets.json
118-
119-
secrets/
120-
*.bak
123+
.*.swp

README.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Download Analytics
22

3-
Scripts to extract metrics about OSS project downloads.
3+
The Download Analytics project allows you to extract download metrics from a Python library published on [PyPI](https://pypi.org/).
44

55
## Overview
66

@@ -10,13 +10,13 @@ engagement metrics.
1010

1111
### Data sources
1212

13-
Currently the data is being downloaded from the following sources:
13+
Currently the download data is coming from the following distributions:
1414

1515
* [PyPI](https://pypi.org/): Information about the project downloads from [PyPI](https://pypi.org/)
1616
obtained from the public Big Query dataset, equivalent to the information shown on
1717
[pepy.tech](https://pepy.tech).
1818

19-
In the future, these sources may also be added:
19+
In the future, we may also expand the source distributions to include:
2020

2121
* [conda-forge](https://conda-forge.org/): Information about the project downloads from the
2222
`conda-forge` channel on `conda`.
@@ -37,8 +37,16 @@ In order add new libraries, it is important to follow these steps to ensure that
3737
4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days and future days.
3838
- Note: The collection script looks at timestamps and avoids adding overlapping data.
3939

40-
## Resources
40+
### Metrics
41+
This library collects the number of downloads for your chosen software. You can break these up along several dimensions:
42+
43+
- **By Month**: The number of downloads per month
44+
- **By Version**: The number of downloads per version of the software, as determined by the software maintainers
45+
- **By Python Version**: The number of downloads per minor Python version (e.g. 3.8)
46+
- **And more!** See the resources below for more information.
4147

48+
## Resources
49+
For more information about the configuration, workflows and metrics, see the resources below.
4250
| | Document | Description |
4351
| ------------- | ----------------------------------- | ----------- |
4452
| :pilot: | [WORKFLOWS](docs/WORKFLOWS.md) | How to collect data and add new libraries to the Github actions. |

download_analytics/__main__.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import yaml
1111

1212
from download_analytics.main import collect_downloads
13+
from download_analytics.summarize import summarize_downloads
1314

1415
LOGGER = logging.getLogger(__name__)
1516

@@ -61,6 +62,22 @@ def _collect(args):
6162
)
6263

6364

65+
def _summarize(args):
66+
config = _load_config(args.config_file)
67+
projects = config['projects']
68+
vendors = config['vendors']
69+
output_folder = args.output_folder or config.get('output-folder', '.')
70+
71+
summarize_downloads(
72+
projects=projects,
73+
vendors=vendors,
74+
input_file=args.input_file,
75+
output_folder=output_folder,
76+
dry_run=args.dry_run,
77+
verbose=args.verbose,
78+
)
79+
80+
6481
def _valid_date(arg):
6582
try:
6683
return datetime.strptime(arg, '%Y-%m-%d')
@@ -159,6 +176,41 @@ def _get_parser():
159176
help='Compute the aggregation metrics and create the corresponding spreadsheets.',
160177
)
161178

179+
# summarize
180+
summarize = action.add_parser(
181+
'summarize', help='Summarize the downloads data.', parents=[logging_args]
182+
)
183+
summarize.set_defaults(action=_summarize)
184+
summarize.add_argument(
185+
'-c',
186+
'--config-file',
187+
type=str,
188+
default='summarize_config.yaml',
189+
help='Path to the configuration file.',
190+
)
191+
summarize.add_argument(
192+
'-i',
193+
'--input-file',
194+
type=str,
195+
default=None,
196+
help='Path to the pypi.csv. Default None, which means to use output-folder for pypi.csv',
197+
)
198+
summarize.add_argument(
199+
'-o',
200+
'--output-folder',
201+
type=str,
202+
required=False,
203+
help=(
204+
'Path to the folder where data will be outputted. It can be a local path or a'
205+
' Google Drive folder path in the format gdrive://<folder-id>'
206+
),
207+
)
208+
summarize.add_argument(
209+
'-d',
210+
'--dry-run',
211+
action='store_true',
212+
help='Do not upload the summary results. Just calculate them.',
213+
)
162214
return parser
163215

164216

0 commit comments

Comments
 (0)