diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index eddf8d3..2771008 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -12,7 +12,8 @@ on:
 
 jobs:
   collect:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-large
+    timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/dryrun.yaml b/.github/workflows/dryrun.yaml
index a6a8350..b1260c5 100644
--- a/.github/workflows/dryrun.yaml
+++ b/.github/workflows/dryrun.yaml
@@ -1,5 +1,4 @@
 name: Health-check Dry Run
-
 on:
   workflow_dispatch:
     inputs:
@@ -7,10 +6,14 @@ on:
        description: Slack channel to post the error message to if the builds fail.
        required: false
        default: "sdv-alerts-debug"
-
-  push:
   pull_request:
-
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   dry_run:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index e84f94b..e35cf0a 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -1,10 +1,13 @@
 name: Style Checks
-
 on:
-  push:
   pull_request:
-    types: [opened, reopened]
-
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   lint:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml
index 6d9b8d2..7a7d91b 100644
--- a/.github/workflows/manual.yaml
+++ b/.github/workflows/manual.yaml
@@ -22,7 +22,7 @@ on:
 
 jobs:
   collect:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-large
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -38,7 +38,8 @@ jobs:
           download-analytics collect \
             --verbose \
             --projects ${{ github.event.inputs.projects }} \
-            --max-days ${{ github.event.inputs.max_days }} \
+            ${{ github.event.inputs.max_days && '--max-days ' || '' }} \
+            ${{ github.event.inputs.max_days && github.event.inputs.max_days || '' }} \
             --output-folder gdrive://${{ github.event.inputs.output_folder }} \
             ${{ github.event.inputs.extras }}
         env:
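Note on the `--max-days` change above: GitHub Actions expressions have no ternary operator, so the `condition && value || ''` chaining is the usual idiom for emitting a CLI flag only when a workflow input is provided. For example, with `max_days` set to `7`, `github.event.inputs.max_days && '--max-days '` evaluates to `--max-days ` and the second expression to `7`, so the command receives `--max-days 7`; with `max_days` left empty, both expressions evaluate to the empty string and no flag is passed at all.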
diff --git a/README.md b/README.md
index 6b2e0ff..f649867 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,18 @@ In the future, these sources may also be added:
 For more information about how to configure and use the software, or about
 the data that is being collected check the resources below.
 
+### Add new libraries
+In order to add new libraries, it is important to follow these steps to ensure that data is backfilled.
+1. Update `config.yaml` with the new libraries (PyPI project names only for now)
+2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch.
+   - Use workflow from **your branch name**.
+   - List all project names from config.yaml
+   - Remove `7` from max days to indicate you want all data
+   - Pass any extra arguments (for example `--dry-run` to test your changes)
+3. Let the workflow finish and check that pypi.csv contains the right data.
+4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill the data for the last 30 days and future days.
+   - Note: The collection script looks at timestamps and avoids adding overlapping data.
+
 ## Resources
 
 | | Document | Description |
diff --git a/config.yaml b/config.yaml
index 5fd787a..1986c9f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,14 +8,16 @@ projects:
   - deepecho
   - sdmetrics
   - sdgym
-  - gretel-synthetics
-  - ydata-synthetic
   - synthesized
   - datomize
-  - gretel-trainer
-  - ydata-sdk
-  - mostlyai
   - synthcity
   - smartnoise-synth
   - realtabformer
   - be-great
+  - ydata-synthetic
+  - ydata-sdk
+  - gretel-synthetics
+  - gretel-trainer
+  - gretel-client
+  - mostlyai
+  - mostlyai-mock
diff --git a/download_analytics/__main__.py b/download_analytics/__main__.py
index 312184c..fef2f4a 100644
--- a/download_analytics/__main__.py
+++ b/download_analytics/__main__.py
@@ -124,6 +124,7 @@ def _get_parser():
         '--projects',
         nargs='*',
         help='List of projects to collect. If not given use the configured ones.',
+        default=None,
     )
     collect.add_argument(
         '-s',
diff --git a/download_analytics/bq.py b/download_analytics/bq.py
index 7e00a09..0d56588 100644
--- a/download_analytics/bq.py
+++ b/download_analytics/bq.py
@@ -24,11 +24,19 @@ def _get_bq_client(credentials_file):
 
     LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar')
 
-    service_account_info = json.loads(credentials_contents)
-    credentials = service_account.Credentials.from_service_account_info(
-        service_account_info,
-        scopes=['https://www.googleapis.com/auth/cloud-platform'],
-    )
+    if os.path.exists(credentials_contents):
+        LOGGER.info('Loading BigQuery credentials from service account file')
+        credentials = service_account.Credentials.from_service_account_file(
+            credentials_contents,
+            scopes=['https://www.googleapis.com/auth/cloud-platform'],
+        )
+    else:
+        LOGGER.info('Loading BigQuery credentials from service account info')
+        service_account_info = json.loads(credentials_contents)
+        credentials = service_account.Credentials.from_service_account_info(
+            service_account_info,
+            scopes=['https://www.googleapis.com/auth/cloud-platform'],
+        )
 
     return bigquery.Client(
         credentials=credentials,
@@ -44,7 +52,14 @@ def run_query(query, dry_run=False, credentials_file=None):
     job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
     dry_run_job = client.query(query, job_config=job_config)
 
-    LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)
+    data_processed_gbs = dry_run_job.total_bytes_processed / 1024**3
+    LOGGER.info('Estimated data processed in query (GBs): %.2f', data_processed_gbs)
+    # https://cloud.google.com/bigquery/pricing#on_demand_pricing
+    # assuming we have already hit 1 terabyte processed this month
+    cost_per_terabyte = 6.15
+    bytes = dry_run_job.total_bytes_processed
+    cost = cost_per_terabyte * bytes_to_terabytes(bytes)
+    LOGGER.info('Estimated cost for query: $%.2f', cost)
 
     if dry_run:
         return None
@@ -53,5 +68,21 @@
     data = query_job.to_dataframe()
     LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
     LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)
-
+    cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed)
+    LOGGER.info('Total cost for query: $%.2f', cost)
     return data
+
+
+def bytes_to_megabytes(bytes):
+    """Convert bytes to megabytes."""
+    return bytes / 1024 / 1024
+
+
+def bytes_to_gigabytes(bytes):
+    """Convert bytes to gigabytes."""
+    return bytes_to_megabytes(bytes) / 1024
+
+
+def bytes_to_terabytes(bytes):
+    """Convert bytes to terabytes."""
+    return bytes_to_gigabytes(bytes) / 1024
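Note on the cost logging added to bq.py: the estimate is a plain unit conversion from bytes to terabytes multiplied by the on-demand rate hard-coded above. A quick worked example (illustrative numbers only, not real query output):

    # Sketch of the arithmetic behind run_query()'s cost estimate.
    total_bytes_processed = 250 * 1024**3        # pretend the dry run reports 250 GB
    terabytes = total_bytes_processed / 1024**4  # 250 GB is about 0.244 TB
    cost = 6.15 * terabytes                      # about $1.50 at $6.15 per TB
    print(f'{terabytes:.3f} TB -> ${cost:.2f}')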
diff --git a/download_analytics/main.py b/download_analytics/main.py
index f58b19f..0f4cbaa 100644
--- a/download_analytics/main.py
+++ b/download_analytics/main.py
@@ -47,8 +49,10 @@ def collect_downloads(
     if not projects:
         raise ValueError('No projects have been passed')
 
+    LOGGER.info(f'Collecting downloads for projects={projects}')
+
     csv_path = get_path(output_folder, 'pypi.csv')
-    previous = load_csv(csv_path)
+    previous = load_csv(csv_path, dry_run=dry_run)
 
     pypi_downloads = get_pypi_downloads(
         projects=projects,
@@ -63,7 +65,11 @@
     if pypi_downloads.empty:
         LOGGER.info('Not creating empty CSV file %s', csv_path)
     elif pypi_downloads.equals(previous):
-        LOGGER.info('Skipping update of unmodified CSV file %s', csv_path)
+        msg = f'Skipping update of unmodified CSV file {csv_path}'
+        if dry_run:
+            msg += f' because dry_run={dry_run}, meaning no downloads were returned from BigQuery'
+        LOGGER.info(msg)
+
     else:
         create_csv(csv_path, pypi_downloads)
 
diff --git a/download_analytics/metrics.py b/download_analytics/metrics.py
index 3b21667..dd2a050 100644
--- a/download_analytics/metrics.py
+++ b/download_analytics/metrics.py
@@ -134,6 +134,16 @@ def _version_order_key(version_column):
 def _mangle_columns(downloads):
     downloads = downloads.rename(columns=RENAME_COLUMNS)
 
+    for col in [
+        'python_version',
+        'project',
+        'version',
+        'distro_name',
+        'distro_version',
+        'distro_kernel',
+    ]:
+        downloads[col] = downloads[col].astype('string')
+
     downloads['full_python_version'] = downloads['python_version']
     downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0]
     downloads['project_version'] = downloads['project'] + '-' + downloads['version']
diff --git a/download_analytics/output.py b/download_analytics/output.py
index 3e031a9..f9756c2 100644
--- a/download_analytics/output.py
+++ b/download_analytics/output.py
@@ -145,7 +145,7 @@ def load_spreadsheet(spreadsheet):
     return sheets
 
 
-def load_csv(csv_path):
+def load_csv(csv_path, dry_run=False):
     """Load a CSV previously created by download-analytics.
 
     Args:
@@ -161,13 +161,33 @@
     LOGGER.info('Trying to load CSV file %s', csv_path)
 
     try:
+        read_csv_kwargs = {
+            'parse_dates': ['timestamp'],
+            'dtype': {
+                'country_code': pd.CategoricalDtype(),
+                'project': pd.CategoricalDtype(),
+                'version': pd.CategoricalDtype(),
+                'type': pd.CategoricalDtype(),
+                'installer_name': pd.CategoricalDtype(),
+                'implementation_name': pd.CategoricalDtype(),
+                'implementation_version': pd.CategoricalDtype(),
+                'distro_name': pd.CategoricalDtype(),
+                'distro_version': pd.CategoricalDtype(),
+                'system_name': pd.CategoricalDtype(),
+                'system_release': pd.CategoricalDtype(),
+                'cpu': pd.CategoricalDtype(),
+            },
+        }
+        if dry_run:
+            nrows = 1_000_000
+            LOGGER.info('Only reading first 1 million rows because dry-run')
+            read_csv_kwargs['nrows'] = nrows
         if drive.is_drive_path(csv_path):
             folder, filename = drive.split_drive_path(csv_path)
             stream = drive.download(folder, filename)
-            data = pd.read_csv(stream, parse_dates=['timestamp'])
+            data = pd.read_csv(stream, **read_csv_kwargs)
         else:
-            data = pd.read_csv(csv_path, parse_dates=['timestamp'])
-
+            data = pd.read_csv(csv_path, **read_csv_kwargs)
     except FileNotFoundError:
         LOGGER.info('Failed to load CSV file %s: not found', csv_path)
         return None
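Note on the load_csv() changes above: mapping the low-cardinality columns to pandas categorical dtypes stores each repeated string once plus small integer codes, which keeps a large pypi.csv loadable in memory, while `nrows` simply caps the read during dry runs. A rough sketch of the memory effect (hypothetical data, not this project's schema):

    import pandas as pd

    # Hypothetical column with heavily repeated values: categorical storage is much smaller.
    s = pd.Series(['linux', 'windows', 'macos'] * 1_000_000)
    print(s.memory_usage(deep=True))                     # object dtype: roughly 200 MB
    print(s.astype('category').memory_usage(deep=True))  # category: a few MB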
diff --git a/download_analytics/pypi.py b/download_analytics/pypi.py
index dd834c7..e50da07 100644
--- a/download_analytics/pypi.py
+++ b/download_analytics/pypi.py
@@ -1,7 +1,7 @@
 """Functions to get PyPI downloads from Google Big Query."""
 
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 
 import pandas as pd
 
@@ -67,7 +67,7 @@ def _get_query(projects, start_date, end_date):
 
 
 def _get_query_dates(start_date, min_date, max_date, max_days, force=False):
-    end_date = datetime.utcnow().date()
+    end_date = datetime.now(timezone.utc).date()
     if start_date is None:
         start_date = end_date - timedelta(days=max_days)
 
@@ -131,8 +131,8 @@
             projects = (projects,)
 
         previous_projects = previous[previous.project.isin(projects)]
-        min_date = previous_projects.timestamp.min()
-        max_date = previous_projects.timestamp.max()
+        min_date = previous_projects.timestamp.min().date()
+        max_date = previous_projects.timestamp.max().date()
     else:
         previous = pd.DataFrame(columns=OUTPUT_COLUMNS)
         min_date = None
@@ -150,7 +150,7 @@
     if max_date is None:
        all_downloads = new_downloads
    else:
-        if pd.Timestamp(max_date) < pd.Timestamp(end_date):
+        if max_date <= end_date:
            before = previous[previous.timestamp < new_downloads.timestamp.min()]
            after = new_downloads
        else:
@@ -160,5 +160,4 @@
     all_downloads = pd.concat([before, after], ignore_index=True)
 
     LOGGER.info('Obtained %s new downloads', len(all_downloads) - len(previous))
-
     return all_downloads
diff --git a/pyproject.toml b/pyproject.toml
index ae5c259..4351af0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "PyYAML",
     "PyDrive",
     "google-cloud-bigquery",
+    "google-cloud-bigquery-storage",
     "db-dtypes",
     "httplib2==0.15.0", # https://stackoverflow.com/questions/59815620/gcloud-upload-httplib2-redirectmissinglocation-redirected-but-the-response-is-m
 ]
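Note on the datetime change in pypi.py: `datetime.utcnow()` returns a naive datetime (no tzinfo) and is deprecated as of Python 3.12, while `datetime.now(timezone.utc)` is timezone-aware; calling `.date()` on either yields the same UTC calendar date, so the behaviour of `_get_query_dates()` is unchanged. A small sketch:

    from datetime import datetime, timezone

    naive = datetime.utcnow()           # naive, tzinfo is None (deprecated in Python 3.12+)
    aware = datetime.now(timezone.utc)  # timezone-aware equivalent
    print(naive.tzinfo, aware.tzinfo)   # None UTC
    print(aware.date())                 # today's date in UTC, as used for end_date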