
Commit 0177847

Add gretel-client and mostlyai-mock (#19)
* wip
* lint
* fix start date
* add project print
* fix print
* update message
* update to use pyarrow dtypes
* fix string
* update to ubuntu-latest-largeA
* update to ubuntu
* fix engine
* docstring
* use category dtype
* remove pyarrow
* fix ns
* lint
* use pyarrow everywhere
* remove pyarrow dtypes
* add readme instructions
* fix manual
* cleanup
* fix manual
* fix manual
* fix max_days
* fix docs
1 parent 2188976 commit 0177847

File tree

13 files changed: +125 -35 lines

.github/workflows/daily.yaml

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ on:

 jobs:
   collect:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-large
+    timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

.github/workflows/dryrun.yaml

Lines changed: 7 additions & 4 deletions
@@ -1,16 +1,19 @@
 name: Health-check Dry Run
-
 on:
   workflow_dispatch:
     inputs:
       slack_channel:
         description: Slack channel to post the error message to if the builds fail.
         required: false
         default: "sdv-alerts-debug"
-
-  push:
   pull_request:
-
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   dry_run:
     runs-on: ubuntu-latest

.github/workflows/lint.yaml

Lines changed: 7 additions & 4 deletions
@@ -1,10 +1,13 @@
 name: Style Checks
-
 on:
-  push:
   pull_request:
-    types: [opened, reopened]
-
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   lint:
     runs-on: ubuntu-latest

.github/workflows/manual.yaml

Lines changed: 3 additions & 2 deletions
@@ -22,7 +22,7 @@ on:

 jobs:
   collect:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-large
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
@@ -38,7 +38,8 @@ jobs:
           download-analytics collect \
             --verbose \
             --projects ${{ github.event.inputs.projects }} \
-            --max-days ${{ github.event.inputs.max_days }} \
+            ${{ github.event.inputs.max_days && '--max-days ' || '' }} \
+            ${{ github.event.inputs.max_days && github.event.inputs.max_days || '' }} \
             --output-folder gdrive://${{ github.event.inputs.output_folder }} \
             ${{ github.event.inputs.extras }}
         env:
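The paired `${{ ... && ... || ... }}` lines exist because GitHub Actions expressions have no ternary operator; the `&&`/`||` short-circuit idiom stands in for one, so the `--max-days` flag is emitted only when the `max_days` input is non-empty. A minimal Python sketch of the intended behavior (the helper name is hypothetical):

```python
# Hypothetical helper mimicking the two workflow expressions above.
# GitHub Actions has no ternary operator, so `cond && a || b` is used
# the way `cond and a or b` works in Python.
def render_max_days_args(max_days: str) -> list[str]:
    if max_days:  # input provided: emit the flag and its value
        return ['--max-days', max_days]
    return []  # input left empty: omit the flag, so all data is collected


assert render_max_days_args('7') == ['--max-days', '7']
assert render_max_days_args('') == []
```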

README.md

Lines changed: 12 additions & 0 deletions
@@ -25,6 +25,18 @@ In the future, these sources may also be added:
 For more information about how to configure and use the software, or about the data that is being
 collected check the resources below.

+### Add new libraries
+In order to add new libraries, it is important to follow these steps to ensure that data is backfilled.
+1. Update `config.yaml` with the new libraries (PyPI project names only for now).
+2. Run the [Manual collection workflow](https://github.com/datacebo/download-analytics/actions/workflows/manual.yaml) on your branch.
+    - Use the workflow from **your branch name**.
+    - List all project names from `config.yaml`.
+    - Remove `7` from max days to indicate that you want all available data.
+    - Pass any extra arguments (for example `--dry-run` to test your changes).
+3. Let the workflow finish and check that `pypi.csv` contains the right data.
+4. Get your pull request reviewed and merged into `main`. The daily collection workflow will fill in the data for the last 30 days and all future days.
+    - Note: The collection script looks at timestamps and avoids adding overlapping data.
+
 ## Resources

 | | Document | Description |
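The note in step 4 is the key to safe backfills: because the collection script compares timestamps, re-running collection over an overlapping window does not duplicate rows. A minimal pandas sketch of that idea (the `timestamp` column name is assumed for illustration; the real logic lives in the collection code):

```python
import pandas as pd

# Keep only rows newer than the latest timestamp already stored, so daily
# runs and manual backfills never insert overlapping data.
previous = pd.DataFrame({'timestamp': pd.to_datetime(['2024-01-01', '2024-01-02'])})
incoming = pd.DataFrame({'timestamp': pd.to_datetime(['2024-01-02', '2024-01-03'])})

cutoff = previous['timestamp'].max()
new_rows = incoming[incoming['timestamp'] > cutoff]
combined = pd.concat([previous, new_rows], ignore_index=True)
print(combined['timestamp'].tolist())  # no duplicated 2024-01-02
```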

config.yaml

Lines changed: 7 additions & 5 deletions
@@ -8,14 +8,16 @@ projects:
   - deepecho
   - sdmetrics
   - sdgym
-  - gretel-synthetics
-  - ydata-synthetic
   - synthesized
   - datomize
-  - gretel-trainer
-  - ydata-sdk
-  - mostlyai
   - synthcity
   - smartnoise-synth
   - realtabformer
   - be-great
+  - ydata-synthetic
+  - ydata-sdk
+  - gretel-synthetics
+  - gretel-trainer
+  - gretel-client
+  - mostlyai
+  - mostlyai-mock
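For reference, the projects listed above sit under a top-level `projects:` key (visible in the hunk header), so reading them back is a one-liner. A minimal sketch, assuming PyYAML is available:

```python
import yaml  # PyYAML, assumed to be installed

# Read the `projects` list from the file shown above.
with open('config.yaml') as f:
    config = yaml.safe_load(f)

print(config['projects'])  # ends with 'gretel-client', 'mostlyai', 'mostlyai-mock'
```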

download_analytics/__main__.py

Lines changed: 1 addition & 0 deletions
@@ -124,6 +124,7 @@ def _get_parser():
         '--projects',
         nargs='*',
         help='List of projects to collect. If not given use the configured ones.',
+        default=None,
     )
     collect.add_argument(
         '-s',
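The explicit `default=None` makes the documented fallback unambiguous: with `nargs='*'`, an omitted `--projects` flag yields `None` (fall back to the configured projects), while a flag given with no values yields an empty list. A small standalone illustration:

```python
import argparse

# With nargs='*', three cases stay distinguishable once default=None is set.
parser = argparse.ArgumentParser()
parser.add_argument('--projects', nargs='*', default=None)

print(parser.parse_args([]).projects)                     # None -> use configured projects
print(parser.parse_args(['--projects']).projects)         # []   -> explicitly empty
print(parser.parse_args(['--projects', 'sdv']).projects)  # ['sdv']
```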

download_analytics/bq.py

Lines changed: 38 additions & 7 deletions
@@ -24,11 +24,19 @@ def _get_bq_client(credentials_file):

     LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar')

-    service_account_info = json.loads(credentials_contents)
-    credentials = service_account.Credentials.from_service_account_info(
-        service_account_info,
-        scopes=['https://www.googleapis.com/auth/cloud-platform'],
-    )
+    if os.path.exists(credentials_contents):
+        LOGGER.info('Loading BigQuery credentials from service account file')
+        credentials = service_account.Credentials.from_service_account_file(
+            credentials_contents,
+            scopes=['https://www.googleapis.com/auth/cloud-platform'],
+        )
+    else:
+        LOGGER.info('Loading BigQuery credentials from service account info')
+        service_account_info = json.loads(credentials_contents)
+        credentials = service_account.Credentials.from_service_account_info(
+            service_account_info,
+            scopes=['https://www.googleapis.com/auth/cloud-platform'],
+        )

     return bigquery.Client(
         credentials=credentials,
@@ -44,7 +52,14 @@ def run_query(query, dry_run=False, credentials_file=None):

     job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
     dry_run_job = client.query(query, job_config=job_config)
-    LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)
+    data_processed_gbs = dry_run_job.total_bytes_processed / 1024**3
+    LOGGER.info('Estimated data processed in query (GBs): %.2f', data_processed_gbs)
+    # https://cloud.google.com/bigquery/pricing#on_demand_pricing
+    # assuming the 1 TB of free monthly processing has already been used
+    cost_per_terabyte = 6.15
+    bytes = dry_run_job.total_bytes_processed
+    cost = cost_per_terabyte * bytes_to_terabytes(bytes)
+    LOGGER.info('Estimated cost for query: $%.2f', cost)

     if dry_run:
         return None
@@ -53,5 +68,21 @@
     data = query_job.to_dataframe()
     LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
     LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)
-
+    cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed)
+    LOGGER.info('Total cost for query: $%.2f', cost)
     return data
+
+
+def bytes_to_megabytes(bytes):
+    """Convert bytes to megabytes."""
+    return bytes / 1024 / 1024
+
+
+def bytes_to_gigabytes(bytes):
+    """Convert bytes to gigabytes."""
+    return bytes_to_megabytes(bytes) / 1024
+
+
+def bytes_to_terabytes(bytes):
+    """Convert bytes to terabytes."""
+    return bytes_to_gigabytes(bytes) / 1024
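The added cost estimate is straight unit conversion against the $6.15 per terabyte on-demand rate cited in the linked pricing page. A self-contained sketch of the same arithmetic, with a made-up 250 GB query:

```python
# Same arithmetic as the logging added above, using the new helpers.
def bytes_to_megabytes(bytes):
    return bytes / 1024 / 1024


def bytes_to_gigabytes(bytes):
    return bytes_to_megabytes(bytes) / 1024


def bytes_to_terabytes(bytes):
    return bytes_to_gigabytes(bytes) / 1024


cost_per_terabyte = 6.15  # on-demand rate from the pricing page linked above
total_bytes_processed = 250 * 1024**3  # made-up dry-run figure: 250 GB
cost = cost_per_terabyte * bytes_to_terabytes(total_bytes_processed)
print(f'Estimated cost for query: ${cost:.2f}')  # 250/1024 TB * $6.15 ~= $1.50
```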

download_analytics/main.py

Lines changed: 8 additions & 2 deletions
@@ -47,8 +47,10 @@ def collect_downloads(
     if not projects:
         raise ValueError('No projects have been passed')

+    LOGGER.info(f'Collecting downloads for projects={projects}')
+
     csv_path = get_path(output_folder, 'pypi.csv')
-    previous = load_csv(csv_path)
+    previous = load_csv(csv_path, dry_run=dry_run)

     pypi_downloads = get_pypi_downloads(
         projects=projects,
@@ -63,7 +65,11 @@
     if pypi_downloads.empty:
         LOGGER.info('Not creating empty CSV file %s', csv_path)
     elif pypi_downloads.equals(previous):
-        LOGGER.info('Skipping update of unmodified CSV file %s', csv_path)
+        msg = f'Skipping update of unmodified CSV file {csv_path}'
+        if dry_run:
+            msg += f' because dry_run={dry_run}, meaning no downloads were returned from BigQuery'
+        LOGGER.info(msg)
+
     else:
         create_csv(csv_path, pypi_downloads)
download_analytics/metrics.py

Lines changed: 10 additions & 0 deletions
@@ -134,6 +134,16 @@ def _version_order_key(version_column):

 def _mangle_columns(downloads):
     downloads = downloads.rename(columns=RENAME_COLUMNS)
+    for col in [
+        'python_version',
+        'project',
+        'version',
+        'distro_name',
+        'distro_version',
+        'distro_kernel',
+    ]:
+        downloads[col] = downloads[col].astype('string')
+
     downloads['full_python_version'] = downloads['python_version']
     downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0]
     downloads['project_version'] = downloads['project'] + '-' + downloads['version']
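Casting to the nullable pandas `string` dtype, rather than leaving plain `object` columns, keeps missing values as `<NA>` and makes the `.str` operations used just below behave predictably. A small illustration:

```python
import pandas as pd

# The nullable 'string' dtype keeps missing values as <NA> and works
# cleanly with the .str accessor used on python_version.
versions = pd.Series(['3.10.4', '3.9.7', None], dtype='string')
print(versions.dtype)                        # string
print(versions.str.rsplit('.', n=1).str[0])  # 3.10, 3.9, <NA>
```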
