Skip to content

Commit 250548c

Browse files
test: add a bigquery usage report to notebook test session (#604)
* test: add a bigquery usage report to notebook test session
* filter out mocks
* remove pointless type hint
* fix replace statement
* account for dry runs
* ipynb only
* use env var via nox
* don't import bigframes from noxfile
* address comments
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent bc82804 commit 250548c

File tree

2 files changed

+80
-9
lines changed

2 files changed

+80
-9
lines changed

bigframes/session/_io/bigquery.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import itertools
21+
import os
2122
import textwrap
2223
import types
2324
from typing import Dict, Iterable, Optional, Sequence, Tuple, Union
@@ -34,6 +35,8 @@
3435
MAX_LABELS_COUNT = 64
3536
TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}"
3637

38+
LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"
39+
3740

3841
def create_job_configs_labels(
3942
job_configs_labels: Optional[Dict[str, str]],
@@ -243,4 +246,32 @@ def start_query_with_client(
243246
)
244247
else:
245248
results_iterator = query_job.result(max_results=max_results)
249+
250+
if LOGGING_NAME_ENV_VAR in os.environ:
251+
# when running notebooks via pytest nbmake
252+
pytest_log_job(query_job)
253+
246254
return results_iterator, query_job
255+
256+
257+
def pytest_log_job(query_job: bigquery.QueryJob):
    """Append the bytes processed by ``query_job`` to a per-test log file.

    Meant for pytest runs only. The file name is the value of the
    environment variable named by ``LOGGING_NAME_ENV_VAR`` with a
    ``.bytesprocessed`` suffix, placed in the current working directory.
    One line is appended per logged job so the lines can later be summed
    into a performance report.

    Raises:
        EnvironmentError: if the environment variable is not set.
    """
    log_name = os.environ.get(LOGGING_NAME_ENV_VAR)
    if log_name is None:
        raise EnvironmentError(
            "Environment variable {env_var} is not set".format(
                env_var=LOGGING_NAME_ENV_VAR
            )
        )

    report_dir = os.getcwd()
    processed = query_job.total_bytes_processed
    if not isinstance(processed, int):
        # Mocked jobs do not expose an integer byte count; skip them.
        return

    if query_job.configuration.dry_run:
        # A dry run does not actually process the bytes it reports,
        # so it is recorded as zero.
        processed = 0

    report_path = os.path.join(report_dir, log_name + ".bytesprocessed")
    with open(report_path, "a") as report_file:
        report_file.write(f"{processed}\n")

noxfile.py

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -764,6 +764,8 @@ def notebook(session: nox.Session):
764764
"--nbmake-timeout=900", # 15 minutes
765765
]
766766

767+
logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME"
768+
767769
try:
768770
# Populate notebook parameters and make a backup so that the notebooks
769771
# are runnable.
@@ -773,13 +775,21 @@ def notebook(session: nox.Session):
773775
*notebooks,
774776
)
775777

776-
# Run self-contained notebooks in single session.run
777-
# achieve parallelization via -n
778-
session.run(
779-
*pytest_command,
780-
"-nauto",
781-
*notebooks,
782-
)
778+
# Run notebooks in parallel session.run's, since each notebook
779+
# takes an environment variable for performance logging
780+
processes = []
781+
for notebook in notebooks:
782+
session.env[logging_name_env_var] = os.path.basename(notebook)
783+
process = Process(
784+
target=session.run,
785+
args=(*pytest_command, notebook),
786+
)
787+
process.start()
788+
processes.append(process)
789+
790+
for process in processes:
791+
process.join()
792+
783793
finally:
784794
# Prevent our notebook changes from getting checked in to git
785795
# accidentally.
@@ -789,11 +799,12 @@ def notebook(session: nox.Session):
789799
*notebooks,
790800
)
791801

792-
# Run regionalized notebooks in parallel session.run's, since each notebook
793-
# takes a different region via env param.
802+
# Additionally run regionalized notebooks in parallel session.run's.
803+
# Each notebook takes a different region via env param.
794804
processes = []
795805
for notebook, regions in notebooks_reg.items():
796806
for region in regions:
807+
session.env[logging_name_env_var] = os.path.basename(notebook)
797808
process = Process(
798809
target=session.run,
799810
args=(*pytest_command, notebook),
@@ -805,6 +816,35 @@ def notebook(session: nox.Session):
805816
for process in processes:
806817
process.join()
807818

819+
# when run via pytest, notebooks output a .bytesprocessed report
820+
# collect those reports and print a summary
821+
_print_bytes_processed_report()
822+
823+
824+
def _print_bytes_processed_report():
    """Print a summary of the ``.bytesprocessed`` reports under ``notebooks/``.

    Every file matching ``notebooks/*/*.bytesprocessed`` (relative to the
    current working directory) is read. Each non-blank line is parsed as an
    integer byte count and counted as one query. One line is printed per
    report file, followed by the cumulative totals, so that
    bigquery-related performance changes can be spotted in the test log.

    Raises:
        ValueError: if a non-blank line in a report is not an integer.
    """
    print("---BIGQUERY USAGE REPORT---")
    cumulative_queries = 0
    cumulative_bytes = 0
    # glob yields paths in a filesystem-dependent order; sort them so the
    # report reads the same from run to run.
    for report in sorted(Path("notebooks/").glob("*/*.bytesprocessed")):
        with open(report, "r") as f:
            # Skip blank lines (e.g. a stray trailing newline) instead of
            # crashing on int("").
            byte_counts = [int(line) for line in f if line.strip()]
        query_count = len(byte_counts)
        total_bytes = sum(byte_counts)
        print(
            f"{report.stem} - query count: {query_count}, "
            f"bytes processed sum: {total_bytes}"
        )
        cumulative_bytes += total_bytes
        cumulative_queries += query_count
    print(
        "---total queries: {total_queries}, total bytes: {total_bytes}---".format(
            total_queries=cumulative_queries, total_bytes=cumulative_bytes
        )
    )
847+
808848

809849
@nox.session(python="3.10")
810850
def release_dry_run(session):

0 commit comments

Comments
 (0)