Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
100 commits
Select commit Hold shift + click to select a range
7564a82
wip
gsheni Jun 3, 2025
b4bcf87
lint
gsheni Jun 3, 2025
cfa02fa
fix start date
gsheni Jun 3, 2025
18bbb95
add project print
gsheni Jun 3, 2025
059e9ac
fix print
gsheni Jun 3, 2025
c735af7
update message
gsheni Jun 3, 2025
f1d9a31
update to use pyarrow dtypes
gsheni Jun 3, 2025
f73291f
fix string
gsheni Jun 3, 2025
20a0108
update to ubuntu-latest-largeA
gsheni Jun 3, 2025
ecdc74d
update to ubuntu
gsheni Jun 3, 2025
a694d4c
fix engine
gsheni Jun 3, 2025
420cf09
docstring
gsheni Jun 3, 2025
e6b751b
use category dtype
gsheni Jun 3, 2025
4924d5b
remove pyarrow
gsheni Jun 3, 2025
3c380e4
fix ns
gsheni Jun 3, 2025
21c4701
lint
gsheni Jun 3, 2025
b71f64b
use pyarrow everywhere
gsheni Jun 3, 2025
6384bc9
remove pyarrow dtypes
gsheni Jun 3, 2025
dac71d5
add readme instructions
gsheni Jun 3, 2025
e195d6e
fix manual
gsheni Jun 3, 2025
b4aca6d
cleanup
gsheni Jun 3, 2025
e40da58
fix manual
gsheni Jun 3, 2025
70d339e
fix manual
gsheni Jun 3, 2025
e0f728f
fix max_days
gsheni Jun 3, 2025
b8b9741
fix docs
gsheni Jun 3, 2025
473e19b
wip
gsheni Jun 4, 2025
4f75889
Merge branch 'main' of https://github.com/datacebo/download-analytics…
gsheni Jun 4, 2025
193b842
wip
gsheni Jun 5, 2025
80628e8
wip
gsheni Jun 6, 2025
c9372f7
pull main
gsheni Jun 6, 2025
e912721
pull main
gsheni Jun 6, 2025
3aa5abf
fix workflow
gsheni Jun 6, 2025
9511379
fix workflow
gsheni Jun 6, 2025
3b86fee
add message to workflow
gsheni Jun 6, 2025
9e3fc48
cleanup
gsheni Jun 6, 2025
aaa0c29
fix repo
gsheni Jun 6, 2025
7c7d615
fix slack msg
gsheni Jun 6, 2025
430b5f6
fix slack msg
gsheni Jun 6, 2025
1fc2573
use extensions
gsheni Jun 6, 2025
d386d47
summarize fix"
gsheni Jun 6, 2025
bc163eb
use uv
gsheni Jun 6, 2025
c1d3370
fix uv
gsheni Jun 6, 2025
3cc2fe6
use cache
gsheni Jun 6, 2025
8064477
change token
gsheni Jun 6, 2025
b53c6bf
add unit tests
gsheni Jun 6, 2025
a4001b4
add unit workflow
gsheni Jun 6, 2025
e06bf05
add dry-run
gsheni Jun 6, 2025
b11083c
remove unused arg
gsheni Jun 6, 2025
e8bffc9
fix dry run
gsheni Jun 6, 2025
77658fe
use uv in lint
gsheni Jun 6, 2025
21ad83a
add date
gsheni Jun 6, 2025
53a197b
cleanup readme
gsheni Jun 6, 2025
7a2c11e
Rename daily_collect.yaml to daily_collection.yaml
gsheni Jun 6, 2025
b518f22
Update daily_collection.yaml
gsheni Jun 6, 2025
3808ffa
Update daily_summarize.yaml
gsheni Jun 6, 2025
e6d0177
Update dryrun.yaml
gsheni Jun 6, 2025
ce1ed90
Update lint.yaml
gsheni Jun 6, 2025
4106bfe
Update manual.yaml
gsheni Jun 6, 2025
39c0abe
Update unit.yaml
gsheni Jun 6, 2025
c8ffbb4
wip
gsheni Jun 9, 2025
f18f836
Address feedback 2
gsheni Jun 10, 2025
5de3e1f
add version parse
gsheni Jun 10, 2025
cddcdce
use object dtype
gsheni Jun 10, 2025
8730c66
fix local write
gsheni Jun 10, 2025
2e8586a
lint
gsheni Jun 10, 2025
8803f0e
Merge branch '13-add-a-daily-workflow-to-summarize-download-counts-fo…
gsheni Jun 10, 2025
f357ed6
Update daily_summarize.yaml
gsheni Jun 10, 2025
74a95f1
cleanup
gsheni Jun 10, 2025
60e4cf8
wip
gsheni Jun 11, 2025
7dc033d
exclude pre-releases
gsheni Jun 11, 2025
3debe90
wip
gsheni Jun 11, 2025
ec7740c
cleanup
gsheni Jun 11, 2025
6bf1852
cleanup
gsheni Jun 11, 2025
80153c7
cleanup
gsheni Jun 11, 2025
860b5d5
cleanup
gsheni Jun 11, 2025
ff200e3
fix conflicts
gsheni Jun 11, 2025
0ada91e
update workflow
gsheni Jun 11, 2025
46e126a
rename workflow
gsheni Jun 11, 2025
5c7ffe1
fix unit tests
gsheni Jun 11, 2025
8f00d16
define cache break
gsheni Jun 11, 2025
6b9dc1a
force reinstall
gsheni Jun 11, 2025
84ce344
remove force install
gsheni Jun 11, 2025
140825e
lint
gsheni Jun 11, 2025
a806920
fix summarize config
gsheni Jun 11, 2025
d459539
cleanup
gsheni Jun 11, 2025
430b0b3
fix conflicts
gsheni Jun 11, 2025
2369a4f
fix dry run
gsheni Jun 11, 2025
ee9b714
fix conflicts
gsheni Jun 12, 2025
bf745df
Merge branch 'main' into 15-add-daily-workflow-to-export-anaconda-dow…
gsheni Jun 13, 2025
314b182
Update dryrun.yaml
gsheni Jun 13, 2025
aea12c4
Update dryrun.yaml
gsheni Jun 13, 2025
e20cfea
fix dry run
gsheni Jun 13, 2025
5ba5f8d
fix dry run
gsheni Jun 13, 2025
a14f53c
fix dry run
gsheni Jun 13, 2025
b408543
fix write
gsheni Jun 13, 2025
d023546
remove breakpoint
gsheni Jun 13, 2025
0e0d548
Update download_analytics/time_utils.py
gsheni Jun 16, 2025
70f93ab
fix tz
gsheni Jun 16, 2025
262994e
fix based on feedback
gsheni Jun 17, 2025
db48947
fix workflow
gsheni Jun 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions .github/workflows/daily_collection.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,55 @@ on:
description: Slack channel to post the error message to if the builds fail.
required: false
default: "sdv-alerts-debug"
max_days_pypi:
description: 'Maximum number of days to collect, starting from today for PyPI.'
required: false
type: number
default: 30
max_days_anaconda:
description: 'Maximum number of days to collect, starting from today for Anaconda'
required: false
type: number
default: 90
schedule:
- cron: '0 0 * * *'

jobs:
collect:
runs-on: ubuntu-latest-large
timeout-minutes: 20
timeout-minutes: 25
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
activate-environment: true
cache-dependency-glob: |
**/pyproject.toml
**/__main__.py
- name: Install pip and dependencies
run: |
uv pip install -U pip
uv pip install -e .
- name: Collect Downloads Data
uv pip install .
- name: Collect PyPI Downloads
run: |
uv run download-analytics collect \
uv run download-analytics collect-pypi \
--verbose \
--max-days 30 \
--max-days ${{ inputs.max_days_pypi || 30 }} \
--add-metrics \
--output-folder gdrive://10QHbqyvptmZX4yhu2Y38YJbVHqINRr0n
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
BIGQUERY_CREDENTIALS: ${{ secrets.BIGQUERY_CREDENTIALS }}

- name: Collect Anaconda Downloads
run: |
uv run download-analytics collect-anaconda \
--output-folder gdrive://1UnDYovLkL4gletOF5328BG1X59mSHF-Z \
--max-days ${{ inputs.max_days_anaconda || 90 }} \
--verbose
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
alert:
needs: [collect]
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/daily_summarize.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ jobs:
with:
enable-cache: true
activate-environment: true
cache-dependency-glob: |
**/pyproject.toml
**/__main__.py
- name: Install pip and dependencies
run: |
uv pip install -U pip
Expand Down
21 changes: 17 additions & 4 deletions .github/workflows/dryrun.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,25 @@ concurrency:
cancel-in-progress: true
jobs:
dry_run:
runs-on: ubuntu-latest
runs-on: ubuntu-latest-large
timeout-minutes: 25
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
activate-environment: true
cache-dependency-glob: |
**/pyproject.toml
**/__main__.py
- name: Install pip and dependencies
run: |
uv pip install -U pip
uv pip install .
- name: Collect Downloads Data - Dry Run
- name: Collect PyPI Downloads - Dry Run
run: |
uv run download-analytics collect \
uv run download-analytics collect-pypi \
--verbose \
--max-days 30 \
--add-metrics \
Expand All @@ -33,7 +37,16 @@ jobs:
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
BIGQUERY_CREDENTIALS: ${{ secrets.BIGQUERY_CREDENTIALS }}
- name: Run Summarize - Dry Run
- name: Collect Anaconda Downloads - Dry Run
run: |
uv run download-analytics collect-anaconda \
--output-folder gdrive://1UnDYovLkL4gletOF5328BG1X59mSHF-Z \
--max-days 90 \
--verbose \
--dry-run
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
- name: Summarize - Dry Run
run: |
uv run download-analytics summarize \
--verbose \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/manual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
uv pip install .
- name: Collect Downloads Data
run: |
uv run download-analytics collect \
uv run download-analytics collect-pypi \
--verbose \
--projects ${{ github.event.inputs.projects }} \
${{ github.event.inputs.max_days && '--max-days ' || '' }} \
Expand Down
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,24 @@ engagement metrics.

### Data sources

Currently the download data is coming from the following distributions:
Currently the download data is collected from the following distributions:

* [PyPI](https://pypi.org/): Information about the project downloads from [PyPI](https://pypi.org/)
obtained from the public Big Query dataset, equivalent to the information shown on
[pepy.tech](https://pepy.tech).
* [conda-forge](https://conda-forge.org/): Information about the project downloads from the
`conda-forge` channel on `conda`.
- The conda package download data provided by Anaconda. It includes package download counts
starting from January 2017. More information:
- https://github.com/anaconda/anaconda-package-data
- The conda package metadata provided by Anaconda. There is a public API which allows for
the retrieval of package information, including current number of downloads.
- https://api.anaconda.org/package/{username}/{package_name}
- Replace {username} with the Anaconda username (`conda-forge`) and {package_name} with
the specific package name (`sdv`).

In the future, we may also expand the source distributions to include:

* [conda-forge](https://conda-forge.org/): Information about the project downloads from the
`conda-forge` channel on `conda`.
* [github](https://github.com/): Information about the project downloads from github releases.

For more information about how to configure and use the software, or about the data that is being
Expand Down
8 changes: 4 additions & 4 deletions docs/DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ For development, run `make install-develop` instead.
## Command Line Interface

After the installation, a new `download-analytics` command will have been registered inside your
`virtualenv`. This command can be used in conjunction with the `collect` action to collect
`virtualenv`. This command can be used in conjunction with the `collect-pypi` action to collect
downloads data from BigQuery and store the output locally or in Google Drive.

Here is the entire list of arguments that the command line has:

```bash
$ download-analytics collect --help
usage: download-analytics collect [-h] [-v] [-l LOGFILE] [-o OUTPUT_FOLDER] [-a AUTHENTICATION_CREDENTIALS]
$ download-analytics collect-pypi --help
usage: download-analytics collect-pypi [-h] [-v] [-l LOGFILE] [-o OUTPUT_FOLDER] [-a AUTHENTICATION_CREDENTIALS]
[-c CONFIG_FILE] [-p [PROJECTS [PROJECTS ...]]] [-s START_DATE]
[-m MAX_DAYS] [-d] [-f] [-M]

Expand Down Expand Up @@ -73,7 +73,7 @@ and store the downloads data into a Google Drive folder alongside the correspond
metric spreadsheets would look like this:

```bash
$ download-analytics collect --verbose --projects sdv ctgan --start-date 2021-01-01 \
$ download-analytics collect-pypi --verbose --projects sdv ctgan --start-date 2021-01-01 \
--add-metrics --output-folder gdrive://10QHbqyvptmZX4yhu2Y38YJbVHqINRr0n
```

Expand Down
6 changes: 3 additions & 3 deletions docs/SETUP.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ if contains the application KEY which should never be made public.

Once the file is created, you can follow these steps:

1. Run the `download-analytics collect` command. If the `settings.yaml` file has been properly
1. Run the `download-analytics collect-pypi` command. If the `settings.yaml` file has been properly
created, this will **open a new tab on your web browser**, where you need to authenticate.

| ![pydrive-collect](imgs/pydrive-collect.png "Run the `download-analytics collect` Command") |
| ![pydrive-collect](imgs/pydrive-collect.png "Run the `download-analytics collect-pypi` Command") |
| - |

2. Click on the Google account which you wish to authenticate with. Notice that the account that
Expand Down Expand Up @@ -67,7 +67,7 @@ be provided to you by a privileged admin.
Once you have this JSON file, you have two options:

1. Pass the path to the authentication file with the `-a` or `--authentication-credentials`
argument to the `download-analytics collect` command.
argument to the `download-analytics collect-pypi` command.

| ![bigquery-a](imgs/bigquery-a.png "Pass the credentials on command line") |
| - |
Expand Down
86 changes: 63 additions & 23 deletions download_analytics/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import yaml

from download_analytics.anaconda import collect_anaconda_downloads
from download_analytics.main import collect_downloads
from download_analytics.summarize import summarize_downloads

Expand Down Expand Up @@ -44,7 +45,7 @@ def _load_config(config_path):
return config


def _collect(args):
def _collect_pypi(args):
config = _load_config(args.config_file)
projects = args.projects or config['projects']
output_folder = args.output_folder or config.get('output-folder', '.')
Expand All @@ -62,6 +63,19 @@ def _collect(args):
)


def _collect_anaconda(args):
    """Entry point for the ``collect-anaconda`` CLI action.

    Loads the configuration file, resolves the output destination
    (the command-line flag wins over the configured value, falling
    back to the current directory) and delegates the actual work to
    ``collect_anaconda_downloads``.

    Args:
        args: Parsed ``argparse`` namespace for the ``collect-anaconda``
            sub-command (``config_file``, ``output_folder``, ``max_days``,
            ``dry_run``, ``verbose``).
    """
    config = _load_config(args.config_file)
    destination = args.output_folder
    if not destination:
        destination = config.get('output-folder', '.')

    collect_anaconda_downloads(
        projects=config['projects'],
        output_folder=destination,
        max_days=args.max_days,
        dry_run=args.dry_run,
        verbose=args.verbose,
    )


def _summarize(args):
config = _load_config(args.config_file)
projects = config['projects']
Expand Down Expand Up @@ -98,7 +112,12 @@ def _get_parser():
logging_args.add_argument(
'-l', '--logfile', help='If given, file where the logs will be written.'
)

logging_args.add_argument(
'-d',
'--dry-run',
action='store_true',
help='Do not upload the results. Just calculate them.',
)
parser = argparse.ArgumentParser(
prog='download-analytics',
description='Download Analytics Command Line Interface',
Expand All @@ -109,10 +128,12 @@ def _get_parser():
action.required = True

# collect
collect = action.add_parser('collect', help='Collect downloads data.', parents=[logging_args])
collect.set_defaults(action=_collect)
collect_pypi = action.add_parser(
'collect-pypi', help='Collect download data from PyPi.', parents=[logging_args]
)
collect_pypi.set_defaults(action=_collect_pypi)

collect.add_argument(
collect_pypi.add_argument(
'-o',
'--output-folder',
type=str,
Expand All @@ -122,54 +143,48 @@ def _get_parser():
' Google Drive folder path in the format gdrive://<folder-id>'
),
)
collect.add_argument(
collect_pypi.add_argument(
'-a',
'--authentication-credentials',
type=str,
required=False,
help='Path to the GCP (BigQuery) credentials file to use.',
)
collect.add_argument(
collect_pypi.add_argument(
'-c',
'--config-file',
type=str,
default='config.yaml',
help='Path to the configuration file.',
)
collect.add_argument(
collect_pypi.add_argument(
'-p',
'--projects',
nargs='*',
help='List of projects to collect. If not given use the configured ones.',
default=None,
)
collect.add_argument(
collect_pypi.add_argument(
'-s',
'--start-date',
type=_valid_date,
required=False,
help='Date from which to start pulling data.',
)
collect.add_argument(
collect_pypi.add_argument(
'-m',
'--max-days',
type=int,
required=False,
help='Max days of data to pull if start-date is not given.',
)
collect.add_argument(
'-d',
'--dry-run',
action='store_true',
help='Do not run the actual query, only simulate it.',
)
collect.add_argument(
collect_pypi.add_argument(
'-f',
'--force',
action='store_true',
help='Force the download even if the data already exists or there is a gap',
)
collect.add_argument(
collect_pypi.add_argument(
'-M',
'--add-metrics',
action='store_true',
Expand Down Expand Up @@ -205,11 +220,36 @@ def _get_parser():
' Google Drive folder path in the format gdrive://<folder-id>'
),
)
summarize.add_argument(
'-d',
'--dry-run',
action='store_true',
help='Do not upload the summary results. Just calculate them.',

# collect
collect_anaconda = action.add_parser(
'collect-anaconda', help='Collect download data from Anaconda.', parents=[logging_args]
)
collect_anaconda.set_defaults(action=_collect_anaconda)
collect_anaconda.add_argument(
'-c',
'--config-file',
type=str,
default='config.yaml',
help='Path to the configuration file.',
)
collect_anaconda.add_argument(
'-o',
'--output-folder',
type=str,
required=False,
help=(
'Path to the folder where data will be outputted. It can be a local path or a'
' Google Drive folder path in the format gdrive://<folder-id>'
),
)
collect_anaconda.add_argument(
'-m',
'--max-days',
type=int,
required=False,
default=90,
help='Max days of data to pull.',
)
return parser

Expand Down
Loading