1818
1919import datetime
2020import importlib
21+ import json
2122import math
2223import os
2324import sys
@@ -207,6 +208,7 @@ def __init__(
207208 self .stream_read_counts : dict [str , int ] = defaultdict (int )
208209 self .stream_read_start_times : dict [str , float ] = {}
209210 self .stream_read_end_times : dict [str , float ] = {}
211+ self .stream_bytes_read : dict [str , int ] = defaultdict (int )
210212
211213 # Cache Writes
212214 self .total_records_written = 0
@@ -238,6 +240,27 @@ def _print_info_message(
238240 if self ._file_logger :
239241 self ._file_logger .info (message )
240242
@property
def bytes_tracking_enabled(self) -> bool:
    """Whether any per-stream byte counts have been recorded.

    True as soon as `stream_bytes_read` holds at least one entry.
    """
    return len(self.stream_bytes_read) > 0
247+
@property
def total_bytes_read(self) -> int:
    """Sum of the byte counts recorded across all streams.

    Evaluates to 0 when no byte counts have been recorded.
    """
    running_total = 0
    for byte_count in self.stream_bytes_read.values():
        running_total += byte_count
    return running_total
255+
@property
def total_megabytes_read(self) -> float:
    """Total bytes read, expressed in megabytes (1 MB = 1,000,000 bytes).

    Evaluates to 0.0 when `total_bytes_read` is 0.
    """
    bytes_per_megabyte = 1_000_000
    return self.total_bytes_read / bytes_per_megabyte
263+
241264 def tally_records_read (
242265 self ,
243266 messages : Iterable [AirbyteMessage ],
@@ -351,6 +374,13 @@ def tally_confirmed_writes(
351374
352375 self ._update_display (force_refresh = True )
353376
def tally_bytes_read(self, bytes_read: int, stream_name: str) -> None:
    """Add `bytes_read` to the running byte count kept for `stream_name`.

    Unlike the other tally methods, nothing is yielded; the per-stream
    counter in `stream_bytes_read` is simply incremented.
    """
    previous_total = self.stream_bytes_read[stream_name]
    self.stream_bytes_read[stream_name] = previous_total + bytes_read
383+
354384 # Logging methods
355385
356386 @property
@@ -393,6 +423,72 @@ def _log_stream_read_end(self, stream_name: str) -> None:
393423 )
394424 self .stream_read_end_times [stream_name ] = time .time ()
395425
def _log_read_metrics(self) -> None:
    """Log read performance metrics to the file logger as a single JSON line.

    Does nothing when no records have been read or when no file logger is
    set. Otherwise emits ``{"read_performance_metrics": {...}}`` containing
    job-level totals plus one ``stream_metrics`` entry per stream found in
    `stream_read_counts`.

    Rate figures (``records_per_second``, ``mb_per_second``) and ``mb_read``
    are only included when the matching elapsed time is positive, so no
    division by zero can occur. Megabyte figures additionally require byte
    tracking to be enabled.
    """
    if not self.total_records_read or not self._file_logger:
        return

    job_info: dict[str, Any] = {"description": self.job_description}
    if self._source:
        job_info["source"] = self._source.name
    if self._cache:
        job_info["cache"] = type(self._cache).__name__
    if self._destination:
        job_info["destination"] = self._destination.name

    # Snapshot these once so every derived figure below is computed from
    # the same values.
    elapsed_seconds = self.elapsed_read_seconds
    track_bytes = self.bytes_tracking_enabled

    perf_metrics: dict[str, Any] = {
        "job_description": job_info,
        "records_read": self.total_records_read,
        "read_time_seconds": elapsed_seconds,
        "read_start_time": self.read_start_time,
        "read_end_time": self.read_end_time,
    }
    if elapsed_seconds > 0:
        perf_metrics["records_per_second"] = round(
            self.total_records_read / elapsed_seconds, 4
        )
        if track_bytes:
            total_mb = self.total_megabytes_read
            perf_metrics["mb_read"] = total_mb
            perf_metrics["mb_per_second"] = round(total_mb / elapsed_seconds, 4)

    stream_metrics: dict[str, dict[str, Any]] = {}
    for stream_name, count in self.stream_read_counts.items():
        start_time = self.stream_read_start_times.get(stream_name)
        end_time = self.stream_read_end_times.get(stream_name)
        entry: dict[str, Any] = {
            "records_read": count,
            "read_start_time": start_time,
            "read_end_time": end_time,
        }
        stream_metrics[stream_name] = entry

        # Timing-derived figures need both timestamps and at least one record.
        if start_time is None or end_time is None or count <= 0:
            continue

        duration: float = end_time - start_time
        entry["read_time_seconds"] = duration
        if duration <= 0:
            continue

        entry["records_per_second"] = round(count / duration, 4)
        if track_bytes:
            # Use .get() rather than indexing: `stream_bytes_read` is a
            # defaultdict, and indexing would insert a zero entry for any
            # stream that never reported bytes.
            stream_mb = self.stream_bytes_read.get(stream_name, 0) / 1_000_000
            entry["mb_read"] = stream_mb
            entry["mb_per_second"] = round(stream_mb / duration, 4)

    perf_metrics["stream_metrics"] = stream_metrics

    self._file_logger.info(json.dumps({"read_performance_metrics": perf_metrics}))
491+
396492 @property
397493 def _unclosed_stream_names (self ) -> list [str ]:
398494 """Return a list of streams that have not yet been fully read."""
@@ -416,6 +512,7 @@ def log_success(
416512 self ._print_info_message (
417513 f"Completed `{ self .job_description } ` sync at `{ pendulum .now ().format ('HH:mm:ss' )} `."
418514 )
515+ self ._log_read_metrics ()
419516 send_telemetry (
420517 source = self ._source ,
421518 cache = self ._cache ,
@@ -663,8 +760,12 @@ def _get_status_message(self) -> str:
663760 # Format start time as a friendly string in local timezone:
664761 start_time_str = _to_time_str (self .read_start_time )
665762 records_per_second : float = 0.0
763+ mb_per_second_str = ""
666764 if self .elapsed_read_seconds > 0 :
667765 records_per_second = self .total_records_read / self .elapsed_read_seconds
766+ if self .bytes_tracking_enabled :
767+ mb_per_second = self .total_megabytes_read / self .elapsed_read_seconds
768+ mb_per_second_str = f", { mb_per_second :,.2f} MB/s"
668769
669770 status_message = HORIZONTAL_LINE + f"\n ### Sync Progress: `{ self .job_description } `\n \n "
670771
@@ -680,7 +781,7 @@ def join_streams_strings(streams_list: list[str]) -> str:
680781 f"**Started reading from source at `{ start_time_str } `:**\n \n "
681782 f"- Read **{ self .total_records_read :,} ** records "
682783 f"over **{ self .elapsed_read_time_string } ** "
683- f"({ records_per_second :,.1f} records / second ).\n \n "
784+ f"({ records_per_second :,.1f} records/s { mb_per_second_str } ).\n \n "
684785 )
685786
686787 if self .stream_read_counts :
@@ -747,7 +848,7 @@ def join_streams_strings(streams_list: list[str]) -> str:
747848 status_message += (
748849 f"- Sent **{ self .total_destination_records_delivered :,} records** "
749850 f"to destination over **{ self .total_destination_write_time_str } ** "
750- f"({ self .destination_records_delivered_per_second :,.1f} records per second )."
851+ f"({ self .destination_records_delivered_per_second :,.1f} records/s )."
751852 "\n \n "
752853 )
753854 status_message += (
0 commit comments