Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
name: Daily Collection

on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *'

jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/dryrun.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
23 changes: 23 additions & 0 deletions .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Runs the project's `invoke lint` task on every push and whenever a pull
# request is opened or reopened.
name: Style Checks

on:
  push:
  pull_request:
    types: [opened, reopened]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.13
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string.
          python-version: '3.13'
      - name: Install dependencies
        # The extras spec is quoted so the shell can never interpret `[dev]`
        # as a glob character class and expand it to a matching file name.
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke '.[dev]'
      - name: Run lint checks
        run: invoke lint

6 changes: 3 additions & 3 deletions .github/workflows/manual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,15 @@ install: clean-build clean-pyc ## install the package to the active Python's sit

# Development dependencies come from the package's `dev` extra. The extras
# spec is quoted so the shell does not treat `[dev]` as a glob pattern.
.PHONY: install-develop
install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development
	pip install -e '.[dev]'


# LINT TARGETS

.PHONY: lint
lint: ## check style with flake8 and isort
	invoke lint

# Annotated with `##` like the other targets so it has a help description too.
.PHONY: fix-lint
fix-lint: ## fix lint issues using invoke fix-lint
	invoke fix-lint
26 changes: 0 additions & 26 deletions dev-requirements.txt

This file was deleted.

83 changes: 60 additions & 23 deletions download_analytics/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,21 @@ def _valid_date(arg):
def _get_parser():
# Logging
logging_args = argparse.ArgumentParser(add_help=False)
logging_args.add_argument('-v', '--verbose', action='count', default=0,
help='Be verbose. Use `-vv` for increased verbosity.')
logging_args.add_argument('-l', '--logfile',
help='If given, file where the logs will be written.')
logging_args.add_argument(
'-v',
'--verbose',
action='count',
default=0,
help='Be verbose. Use `-vv` for increased verbosity.',
)
logging_args.add_argument(
'-l', '--logfile', help='If given, file where the logs will be written.'
)

parser = argparse.ArgumentParser(
prog='download-analytics',
description='Download Analytics Command Line Interface',
parents=[logging_args]
parents=[logging_args],
)
parser.set_defaults(action=None)
action = parser.add_subparsers(title='action')
Expand All @@ -90,36 +96,67 @@ def _get_parser():
collect.set_defaults(action=_collect)

collect.add_argument(
'-o', '--output-folder', type=str, required=False,
'-o',
'--output-folder',
type=str,
required=False,
help=(
'Path to the folder where data will be stored. It can be a local path or a'
' Google Drive folder path in the format gdrive://<folder-id>'
)
),
)
collect.add_argument(
'-a', '--authentication-credentials', type=str, required=False,
help='Path to the GCP (BigQuery) credentials file to use.')
'-a',
'--authentication-credentials',
type=str,
required=False,
help='Path to the GCP (BigQuery) credentials file to use.',
)
collect.add_argument(
'-c', '--config-file', type=str, default='config.yaml',
help='Path to the configuration file.')
'-c',
'--config-file',
type=str,
default='config.yaml',
help='Path to the configuration file.',
)
collect.add_argument(
'-p', '--projects', nargs='*',
help='List of projects to collect. If not given use the configured ones.')
'-p',
'--projects',
nargs='*',
help='List of projects to collect. If not given use the configured ones.',
)
collect.add_argument(
'-s', '--start-date', type=_valid_date, required=False,
help='Date from which to start pulling data.')
'-s',
'--start-date',
type=_valid_date,
required=False,
help='Date from which to start pulling data.',
)
collect.add_argument(
'-m', '--max-days', type=int, required=False,
help='Max days of data to pull if start-date is not given.')
'-m',
'--max-days',
type=int,
required=False,
help='Max days of data to pull if start-date is not given.',
)
collect.add_argument(
'-d', '--dry-run', action='store_true',
help='Do not run the actual query, only simulate it.')
'-d',
'--dry-run',
action='store_true',
help='Do not run the actual query, only simulate it.',
)
collect.add_argument(
'-f', '--force', action='store_true',
help='Force the download even if the data already exists or there is a gap')
'-f',
'--force',
action='store_true',
help='Force the download even if the data already exists or there is a gap',
)
collect.add_argument(
'-M', '--add-metrics', action='store_true',
help='Compute the aggregation metrics and create the corresponding spreadsheets.')
'-M',
'--add-metrics',
action='store_true',
help='Compute the aggregation metrics and create the corresponding spreadsheets.',
)

return parser

Expand Down
11 changes: 7 additions & 4 deletions download_analytics/bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ def _get_bq_client(credentials_file):
scopes=['https://www.googleapis.com/auth/cloud-platform'],
)

return bigquery.Client(credentials=credentials, project=credentials.project_id,)
return bigquery.Client(
credentials=credentials,
project=credentials.project_id,
)


def run_query(query, dry_run=False, credentials_file=None):
Expand All @@ -41,14 +44,14 @@ def run_query(query, dry_run=False, credentials_file=None):

job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
dry_run_job = client.query(query, job_config=job_config)
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024 ** 3)
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)

if dry_run:
return None

query_job = client.query(query)
data = query_job.to_dataframe()
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024 ** 3)
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024 ** 3)
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)

return data
6 changes: 1 addition & 5 deletions download_analytics/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,7 @@ def upload(content, filename, folder, convert=False):
except FileNotFoundError:
file_config = {
'title': filename,
'parents': [
{
'id': folder
}
],
'parents': [{'id': folder}],
}
drive_file = drive.CreateFile(file_config)

Expand Down
14 changes: 11 additions & 3 deletions download_analytics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,16 @@
LOGGER = logging.getLogger(__name__)


def collect_downloads(projects, output_folder, start_date=None, max_days=1, credentials_file=None,
dry_run=False, force=False, add_metrics=True):
def collect_downloads(
projects,
output_folder,
start_date=None,
max_days=1,
credentials_file=None,
dry_run=False,
force=False,
add_metrics=True,
):
"""Pull data about the downloads of a list of projects.

Args:
Expand Down Expand Up @@ -49,7 +57,7 @@ def collect_downloads(projects, output_folder, start_date=None, max_days=1, cred
max_days=max_days,
credentials_file=credentials_file,
dry_run=dry_run,
force=force
force=force,
)

if pypi_downloads.empty:
Expand Down
19 changes: 8 additions & 11 deletions download_analytics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import logging
import re

import pandas as pd

from download_analytics.output import create_spreadsheet

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -35,24 +37,21 @@ def _historical_groupby(downloads, groupbys=None):

for groupby in groupbys:
grouped = downloads.groupby([year_month, groupby])
grouped_sizes = grouped.size().unstack(-1)
grouped_sizes = grouped.size().unstack(-1) # noqa: PD010
if len(groupbys) > 1:
grouped_sizes.columns = f"{groupby}='" + grouped_sizes.columns + "'"

base[grouped_sizes.columns] = grouped_sizes.fillna(0)

totals = base.sum()
totals.name = 'total'
base = base.append(totals)
base = pd.concat([base, totals], ignore_index=True)

return base.reset_index().iloc[::-1]


def _get_sheet_name(column):
    """Build a sheet title from an underscore-separated column name.

    The first character of every underscore-separated segment is upper-cased,
    the rest of the segment is left untouched, and the segments are joined
    with single spaces after a leading ``By``. For example,
    ``'python_version'`` becomes ``'By Python Version'``.

    Args:
        column (str):
            Column name whose words are separated by underscores.

    Returns:
        str:
            The sheet title derived from ``column``.
    """
    # Slice rather than index the first character so that empty segments
    # (leading, trailing or doubled underscores) do not raise an IndexError.
    words = [word[:1].upper() + word[1:] for word in column.split('_')]
    return ' '.join(['By', *words])


Expand Down Expand Up @@ -121,7 +120,7 @@ def _version_element_order_key(version):
# while it shouldn't enter the `if`.
pass

components.append(last_component[len(last_numeric):])
components.append(last_component[len(last_numeric) :])

return components

Expand All @@ -133,7 +132,7 @@ def _version_order_key(version_column):
def _mangle_columns(downloads):
downloads = downloads.rename(columns=RENAME_COLUMNS)
downloads['full_python_version'] = downloads['python_version']
downloads['python_version'] = downloads['python_version'].str.rsplit('.', 1).str[0]
downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0]
downloads['project_version'] = downloads['project'] + '-' + downloads['version']
downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
Expand All @@ -150,9 +149,7 @@ def compute_metrics(downloads, output_path=None):
downloads = _mangle_columns(downloads)

LOGGER.debug('Aggregating by month')
sheets = {
'By Month': _by_month(downloads)
}
sheets = {'By Month': _by_month(downloads)}

for column in GROUPBY_COLUMNS:
name = _get_sheet_name(column)
Expand Down
Loading