Skip to content

Commit 7564a82

Browse files
committed
wip
1 parent 2188976 commit 7564a82

File tree

4 files changed

+45
-18
lines changed

4 files changed

+45
-18
lines changed

config.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,16 @@ projects:
88
- deepecho
99
- sdmetrics
1010
- sdgym
11-
- gretel-synthetics
12-
- ydata-synthetic
1311
- synthesized
1412
- datomize
15-
- gretel-trainer
16-
- ydata-sdk
17-
- mostlyai
1813
- synthcity
1914
- smartnoise-synth
2015
- realtabformer
2116
- be-great
17+
- ydata-synthetic
18+
- ydata-sdk
19+
- gretel-synthetics
20+
- gretel-trainer
21+
- gretel-client
22+
- mostlyai
23+
- mostlyai-mock

download_analytics/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def _get_parser():
124124
'--projects',
125125
nargs='*',
126126
help='List of projects to collect. If not given use the configured ones.',
127+
default=None,
127128
)
128129
collect.add_argument(
129130
'-s',

download_analytics/bq.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import logging
77
import os
88
import pathlib
9+
import pandas as pd
910

1011
from google.cloud import bigquery
1112
from google.oauth2 import service_account
@@ -24,11 +25,17 @@ def _get_bq_client(credentials_file):
2425

2526
LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar')
2627

27-
service_account_info = json.loads(credentials_contents)
28-
credentials = service_account.Credentials.from_service_account_info(
29-
service_account_info,
30-
scopes=['https://www.googleapis.com/auth/cloud-platform'],
31-
)
28+
if os.path.exists(credentials_contents):
29+
credentials = service_account.Credentials.from_service_account_file(
30+
credentials_contents,
31+
scopes=['https://www.googleapis.com/auth/cloud-platform'],
32+
)
33+
else:
34+
service_account_info = json.loads(credentials_contents)
35+
credentials = service_account.Credentials.from_service_account_info(
36+
service_account_info,
37+
scopes=['https://www.googleapis.com/auth/cloud-platform'],
38+
)
3239

3340
return bigquery.Client(
3441
credentials=credentials,
@@ -44,7 +51,13 @@ def run_query(query, dry_run=False, credentials_file=None):
4451

4552
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
4653
dry_run_job = client.query(query, job_config=job_config)
47-
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)
54+
LOGGER.info('Estimated data processed in query (GBs): %.2f', dry_run_job.total_bytes_processed / 1024**3)
55+
# https://cloud.google.com/bigquery/pricing#on_demand_pricing
56+
# assuming the free 1 TB of monthly processing has already been used
57+
cost_per_terabyte = 6.15
58+
bytes = dry_run_job.total_bytes_processed
59+
cost = cost_per_terabyte * bytes_to_terabytes(bytes)
60+
LOGGER.info('Estimated cost for query: $%.2f', cost)
4861

4962
if dry_run:
5063
return None
@@ -53,5 +66,15 @@ def run_query(query, dry_run=False, credentials_file=None):
5366
data = query_job.to_dataframe()
5467
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
5568
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)
56-
69+
cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed)
70+
LOGGER.info('Total cost for query: $%.2f', cost)
5771
return data
72+
73+
def bytes_to_megabytes(num_bytes):
    """Convert a byte count to megabytes (base-1024, i.e. MiB).

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in megabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # 1024 ** 2 bytes per megabyte; division by a power of two is exact for floats.
    return num_bytes / 1024**2
75+
76+
def bytes_to_gigabytes(num_bytes):
    """Convert a byte count to gigabytes (base-1024, i.e. GiB).

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in gigabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # Computed directly instead of chaining through bytes_to_megabytes;
    # both forms divide by exact powers of two, so results are identical.
    return num_bytes / 1024**3
78+
79+
def bytes_to_terabytes(num_bytes):
    """Convert a byte count to terabytes (base-1024, i.e. TiB).

    Used to estimate BigQuery on-demand query cost, which is billed per
    terabyte processed.

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in terabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # Computed directly instead of chaining through bytes_to_gigabytes;
    # both forms divide by exact powers of two, so results are identical.
    return num_bytes / 1024**4

download_analytics/pypi.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Functions to get PyPI downloads from Google Big Query."""
22

33
import logging
4-
from datetime import datetime, timedelta
4+
from datetime import datetime, timedelta, timezone
55

66
import pandas as pd
77

@@ -67,10 +67,12 @@ def _get_query(projects, start_date, end_date):
6767

6868

6969
def _get_query_dates(start_date, min_date, max_date, max_days, force=False):
70-
end_date = datetime.utcnow().date()
70+
end_date = datetime.now(timezone.utc).date()
7171
if start_date is None:
7272
start_date = end_date - timedelta(days=max_days)
7373

74+
start_date = start_date.date()
75+
7476
if pd.notna(min_date):
7577
min_date = pd.Timestamp(min_date).date()
7678
if min_date > start_date:
@@ -131,8 +133,8 @@ def get_pypi_downloads(
131133
projects = (projects,)
132134

133135
previous_projects = previous[previous.project.isin(projects)]
134-
min_date = previous_projects.timestamp.min()
135-
max_date = previous_projects.timestamp.max()
136+
min_date = previous_projects.timestamp.min().date()
137+
max_date = previous_projects.timestamp.max().date()
136138
else:
137139
previous = pd.DataFrame(columns=OUTPUT_COLUMNS)
138140
min_date = None
@@ -150,7 +152,7 @@ def get_pypi_downloads(
150152
if max_date is None:
151153
all_downloads = new_downloads
152154
else:
153-
if pd.Timestamp(max_date) < pd.Timestamp(end_date):
155+
if max_date <= end_date:
154156
before = previous[previous.timestamp < new_downloads.timestamp.min()]
155157
after = new_downloads
156158
else:
@@ -160,5 +162,4 @@ def get_pypi_downloads(
160162
all_downloads = pd.concat([before, after], ignore_index=True)
161163

162164
LOGGER.info('Obtained %s new downloads', len(all_downloads) - len(previous))
163-
164165
return all_downloads

0 commit comments

Comments
 (0)