Skip to content

Commit 7488afb

Browse files
authored
Feat: Add performance metrics and mb/s tracking (#356)
1 parent 31b6eeb commit 7488afb

File tree

3 files changed

+112
-2
lines changed

3 files changed

+112
-2
lines changed

airbyte/_connector_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838

3939
from airbyte._executors.base import Executor
4040
from airbyte._message_iterators import AirbyteMessageIterator
41+
from airbyte.progress import ProgressTracker
4142

4243

4344
MAX_LOG_LINES = 20
@@ -352,6 +353,8 @@ def _execute(
352353
self,
353354
args: list[str],
354355
stdin: IO[str] | AirbyteMessageIterator | None = None,
356+
*,
357+
progress_tracker: ProgressTracker | None = None,
355358
) -> Generator[AirbyteMessage, None, None]:
356359
"""Execute the connector with the given arguments.
357360
@@ -371,6 +374,11 @@ def _execute(
371374
for line in self.executor.execute(args, stdin=stdin):
372375
try:
373376
message: AirbyteMessage = AirbyteMessage.model_validate_json(json_data=line)
377+
if progress_tracker and message.record:
378+
progress_tracker.tally_bytes_read(
379+
len(line),
380+
stream_name=message.record.stream,
381+
)
374382
self._peek_airbyte_message(message)
375383
yield message
376384

airbyte/progress.py

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import importlib
21+
import json
2122
import math
2223
import os
2324
import sys
@@ -207,6 +208,7 @@ def __init__(
207208
self.stream_read_counts: dict[str, int] = defaultdict(int)
208209
self.stream_read_start_times: dict[str, float] = {}
209210
self.stream_read_end_times: dict[str, float] = {}
211+
self.stream_bytes_read: dict[str, int] = defaultdict(int)
210212

211213
# Cache Writes
212214
self.total_records_written = 0
@@ -238,6 +240,27 @@ def _print_info_message(
238240
if self._file_logger:
239241
self._file_logger.info(message)
240242

243+
@property
def bytes_tracking_enabled(self) -> bool:
    """Report whether any per-stream byte counts have been recorded.

    True once `stream_bytes_read` holds at least one entry; False while it is empty.
    """
    return len(self.stream_bytes_read) > 0
247+
248+
@property
def total_bytes_read(self) -> int:
    """Return the number of bytes read, summed across every stream.

    The result is 0 when no bytes have been tallied.
    """
    byte_total = 0
    for stream_bytes in self.stream_bytes_read.values():
        byte_total += stream_bytes
    return byte_total
255+
256+
@property
def total_megabytes_read(self) -> float:
    """Return the total amount of data read, expressed in megabytes.

    One megabyte is counted as 1,000,000 bytes. The result is 0.0 when no bytes have
    been tallied.
    """
    byte_total = self.total_bytes_read
    return byte_total / 1_000_000
263+
241264
def tally_records_read(
242265
self,
243266
messages: Iterable[AirbyteMessage],
@@ -351,6 +374,13 @@ def tally_confirmed_writes(
351374

352375
self._update_display(force_refresh=True)
353376

377+
def tally_bytes_read(self, bytes_read: int, stream_name: str) -> None:
    """Add `bytes_read` to the running byte count kept for `stream_name`.

    This method returns nothing; it does not yield messages.
    """
    per_stream_bytes = self.stream_bytes_read
    per_stream_bytes[stream_name] = per_stream_bytes[stream_name] + bytes_read
383+
354384
# Logging methods
355385

356386
@property
@@ -393,6 +423,72 @@ def _log_stream_read_end(self, stream_name: str) -> None:
393423
)
394424
self.stream_read_end_times[stream_name] = time.time()
395425

426+
def _log_read_metrics(self) -> None:
    """Log read performance metrics to the file logger as one JSON line.

    The logged object has the shape `{"read_performance_metrics": {...}}` and holds:

    - `job_description`: the job description plus, when set, the source name, the
      cache class name and the destination name.
    - Overall totals: `records_read`, `read_time_seconds`, `read_start_time` and
      `read_end_time`; `records_per_second` when the elapsed read time is positive;
      and `mb_read` / `mb_per_second` when bytes are also being tracked.
    - `stream_metrics`: one entry per stream in `stream_read_counts`, with the
      record count and the start/end times (None when not recorded). Streams that
      have both timestamps and a positive count also get `read_time_seconds`, and,
      when that duration is positive, `records_per_second` plus `mb_read` /
      `mb_per_second` if bytes are being tracked.

    Nothing is logged when no records were read or when there is no file logger.
    """
    # Nothing to report without records, and nowhere to report without a file logger.
    if not self.total_records_read or not self._file_logger:
        return

    job_info: dict[str, Any] = {"description": self.job_description}
    if self._source:
        job_info["source"] = self._source.name
    if self._cache:
        job_info["cache"] = type(self._cache).__name__
    if self._destination:
        job_info["destination"] = self._destination.name

    # Read the elapsed time once so the logged duration and every rate derived from
    # it are computed from the same value.
    elapsed_seconds = self.elapsed_read_seconds
    bytes_tracked = self.bytes_tracking_enabled

    perf_metrics: dict[str, Any] = {
        "job_description": job_info,
        "records_read": self.total_records_read,
        "read_time_seconds": elapsed_seconds,
        "read_start_time": self.read_start_time,
        "read_end_time": self.read_end_time,
    }
    # Rates are only meaningful (and only safe to divide) for a positive duration.
    if elapsed_seconds > 0:
        perf_metrics["records_per_second"] = round(
            self.total_records_read / elapsed_seconds, 4
        )
        if bytes_tracked:
            total_mb_read = self.total_megabytes_read
            perf_metrics["mb_read"] = total_mb_read
            perf_metrics["mb_per_second"] = round(total_mb_read / elapsed_seconds, 4)

    stream_metrics: dict[str, dict[str, Any]] = {}
    for stream_name, count in self.stream_read_counts.items():
        entry: dict[str, Any] = {
            "records_read": count,
            "read_start_time": self.stream_read_start_times.get(stream_name),
            "read_end_time": self.stream_read_end_times.get(stream_name),
        }
        stream_metrics[stream_name] = entry

        has_timing = (
            stream_name in self.stream_read_end_times
            and stream_name in self.stream_read_start_times
        )
        if not has_timing or not count > 0:
            continue

        duration: float = (
            self.stream_read_end_times[stream_name]
            - self.stream_read_start_times[stream_name]
        )
        entry["read_time_seconds"] = duration
        if duration > 0:
            entry["records_per_second"] = round(count / duration, 4)
            if bytes_tracked:
                # `.get` keeps this a pure read: indexing a defaultdict would insert
                # a zero entry for a stream that never had bytes tallied.
                stream_mb_read = self.stream_bytes_read.get(stream_name, 0) / 1_000_000
                entry["mb_read"] = stream_mb_read
                entry["mb_per_second"] = round(stream_mb_read / duration, 4)

    perf_metrics["stream_metrics"] = stream_metrics

    self._file_logger.info(json.dumps({"read_performance_metrics": perf_metrics}))
491+
396492
@property
397493
def _unclosed_stream_names(self) -> list[str]:
398494
"""Return a list of streams that have not yet been fully read."""
@@ -416,6 +512,7 @@ def log_success(
416512
self._print_info_message(
417513
f"Completed `{self.job_description}` sync at `{pendulum.now().format('HH:mm:ss')}`."
418514
)
515+
self._log_read_metrics()
419516
send_telemetry(
420517
source=self._source,
421518
cache=self._cache,
@@ -663,8 +760,12 @@ def _get_status_message(self) -> str:
663760
# Format start time as a friendly string in local timezone:
664761
start_time_str = _to_time_str(self.read_start_time)
665762
records_per_second: float = 0.0
763+
mb_per_second_str = ""
666764
if self.elapsed_read_seconds > 0:
667765
records_per_second = self.total_records_read / self.elapsed_read_seconds
766+
if self.bytes_tracking_enabled:
767+
mb_per_second = self.total_megabytes_read / self.elapsed_read_seconds
768+
mb_per_second_str = f", {mb_per_second:,.2f} MB/s"
668769

669770
status_message = HORIZONTAL_LINE + f"\n### Sync Progress: `{self.job_description}`\n\n"
670771

@@ -680,7 +781,7 @@ def join_streams_strings(streams_list: list[str]) -> str:
680781
f"**Started reading from source at `{start_time_str}`:**\n\n"
681782
f"- Read **{self.total_records_read:,}** records "
682783
f"over **{self.elapsed_read_time_string}** "
683-
f"({records_per_second:,.1f} records / second).\n\n"
784+
f"({records_per_second:,.1f} records/s{mb_per_second_str}).\n\n"
684785
)
685786

686787
if self.stream_read_counts:
@@ -747,7 +848,7 @@ def join_streams_strings(streams_list: list[str]) -> str:
747848
status_message += (
748849
f"- Sent **{self.total_destination_records_delivered:,} records** "
749850
f"to destination over **{self.total_destination_write_time_str}** "
750-
f"({self.destination_records_delivered_per_second:,.1f} records per second)."
851+
f"({self.destination_records_delivered_per_second:,.1f} records/s)."
751852
"\n\n"
752853
)
753854
status_message += (

airbyte/sources/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ def _read_with_catalog(
552552
"--state",
553553
state_file,
554554
],
555+
progress_tracker=progress_tracker,
555556
)
556557
yield from progress_tracker.tally_records_read(message_generator)
557558
progress_tracker.log_read_complete()

0 commit comments

Comments
 (0)