Skip to content

Commit 7564a82

Browse files
committed
wip
1 parent 2188976 commit 7564a82

File tree

4 files changed

+45
-18
lines changed

4 files changed

+45
-18
lines changed

config.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,16 @@ projects:
88
- deepecho
99
- sdmetrics
1010
- sdgym
11-
- gretel-synthetics
12-
- ydata-synthetic
1311
- synthesized
1412
- datomize
15-
- gretel-trainer
16-
- ydata-sdk
17-
- mostlyai
1813
- synthcity
1914
- smartnoise-synth
2015
- realtabformer
2116
- be-great
17+
- ydata-synthetic
18+
- ydata-sdk
19+
- gretel-synthetics
20+
- gretel-trainer
21+
- gretel-client
22+
- mostlyai
23+
- mostlyai-mock

download_analytics/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def _get_parser():
124124
'--projects',
125125
nargs='*',
126126
help='List of projects to collect. If not given use the configured ones.',
127+
default=None,
127128
)
128129
collect.add_argument(
129130
'-s',

download_analytics/bq.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import logging
77
import os
88
import pathlib
9+
import pandas as pd
910

1011
from google.cloud import bigquery
1112
from google.oauth2 import service_account
@@ -24,11 +25,17 @@ def _get_bq_client(credentials_file):
2425

2526
LOGGER.info('Loading BigQuery credentials from BIGQUERY_CREDENTIALS envvar')
2627

27-
service_account_info = json.loads(credentials_contents)
28-
credentials = service_account.Credentials.from_service_account_info(
29-
service_account_info,
30-
scopes=['https://www.googleapis.com/auth/cloud-platform'],
31-
)
28+
if os.path.exists(credentials_contents):
29+
credentials = service_account.Credentials.from_service_account_file(
30+
credentials_contents,
31+
scopes=['https://www.googleapis.com/auth/cloud-platform'],
32+
)
33+
else:
34+
service_account_info = json.loads(credentials_contents)
35+
credentials = service_account.Credentials.from_service_account_info(
36+
service_account_info,
37+
scopes=['https://www.googleapis.com/auth/cloud-platform'],
38+
)
3239

3340
return bigquery.Client(
3441
credentials=credentials,
@@ -44,7 +51,13 @@ def run_query(query, dry_run=False, credentials_file=None):
4451

4552
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
4653
dry_run_job = client.query(query, job_config=job_config)
47-
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)
54+
LOGGER.info('Estimated data processed in query (GBs): %.2f', dry_run_job.total_bytes_processed / 1024**3)
55+
# https://cloud.google.com/bigquery/pricing#on_demand_pricing
56+
# assuming the free 1 TB of monthly processing has already been used
57+
cost_per_terabyte = 6.15
58+
bytes = dry_run_job.total_bytes_processed
59+
cost = cost_per_terabyte * bytes_to_terabytes(bytes)
60+
LOGGER.info('Estimated cost for query: $%.2f', cost)
4861

4962
if dry_run:
5063
return None
@@ -53,5 +66,15 @@ def run_query(query, dry_run=False, credentials_file=None):
5366
data = query_job.to_dataframe()
5467
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
5568
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)
56-
69+
cost = cost_per_terabyte * bytes_to_terabytes(query_job.total_bytes_billed)
70+
LOGGER.info('Total cost for query: $%.2f', cost)
5771
return data
72+
73+
def bytes_to_megabytes(num_bytes):
    """Convert a byte count to megabytes (base-1024, i.e. MiB).

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in megabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # 1024 ** 2 bytes per megabyte; division by a power of two is exact for floats.
    return num_bytes / 1024**2
75+
76+
def bytes_to_gigabytes(num_bytes):
    """Convert a byte count to gigabytes (base-1024, i.e. GiB).

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in gigabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # Computed directly instead of chaining through bytes_to_megabytes;
    # both forms divide by exact powers of two, so results are identical.
    return num_bytes / 1024**3
78+
79+
def bytes_to_terabytes(num_bytes):
    """Convert a byte count to terabytes (base-1024, i.e. TiB).

    Used to estimate BigQuery on-demand query cost, which is billed per
    terabyte processed.

    Args:
        num_bytes (int or float): Size in bytes.

    Returns:
        float: Equivalent size in terabytes.
    """
    # Renamed parameter from ``bytes`` to avoid shadowing the builtin.
    # Computed directly instead of chaining through bytes_to_gigabytes;
    # both forms divide by exact powers of two, so results are identical.
    return num_bytes / 1024**4

download_analytics/pypi.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Functions to get PyPI downloads from Google Big Query."""
22

33
import logging
4-
from datetime import datetime, timedelta
4+
from datetime import datetime, timedelta, timezone
55

66
import pandas as pd
77

@@ -67,10 +67,12 @@ def _get_query(projects, start_date, end_date):
6767

6868

6969
def _get_query_dates(start_date, min_date, max_date, max_days, force=False):
70-
end_date = datetime.utcnow().date()
70+
end_date = datetime.now(timezone.utc).date()
7171
if start_date is None:
7272
start_date = end_date - timedelta(days=max_days)
7373

74+
start_date = start_date.date()
75+
7476
if pd.notna(min_date):
7577
min_date = pd.Timestamp(min_date).date()
7678
if min_date > start_date:
@@ -131,8 +133,8 @@ def get_pypi_downloads(
131133
projects = (projects,)
132134

133135
previous_projects = previous[previous.project.isin(projects)]
134-
min_date = previous_projects.timestamp.min()
135-
max_date = previous_projects.timestamp.max()
136+
min_date = previous_projects.timestamp.min().date()
137+
max_date = previous_projects.timestamp.max().date()
136138
else:
137139
previous = pd.DataFrame(columns=OUTPUT_COLUMNS)
138140
min_date = None
@@ -150,7 +152,7 @@ def get_pypi_downloads(
150152
if max_date is None:
151153
all_downloads = new_downloads
152154
else:
153-
if pd.Timestamp(max_date) < pd.Timestamp(end_date):
155+
if max_date <= end_date:
154156
before = previous[previous.timestamp < new_downloads.timestamp.min()]
155157
after = new_downloads
156158
else:
@@ -160,5 +162,4 @@ def get_pypi_downloads(
160162
all_downloads = pd.concat([before, after], ignore_index=True)
161163

162164
LOGGER.info('Obtained %s new downloads', len(all_downloads) - len(previous))
163-
164165
return all_downloads

0 commit comments

Comments
 (0)