Skip to content

Commit 7aa5ba2

Browse files
authored
Merge pull request #344 from Backblaze/download-stats
Add stats for parallel downloader
2 parents ca98eb6 + 1e5c054 commit 7aa5ba2

File tree

6 files changed

+141
-1
lines changed

6 files changed

+141
-1
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
### Added
10+
* Logging performance summary of parallel download threads
11+
912
### Fixed
1013
* Replace `ReplicationScanResult.source_has_sse_c_enabled` with `source_encryption_mode`
1114
* Fix `B2Api.get_key()` and `RawSimulator.delete_key()`
15+
* Fix calling `CopySourceTooBig` exception
1216

1317
### Infrastructure
1418
* Fix nox's deprecated `session.install()` calls

b2sdk/exception.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ def interpret_b2_error(
566566
matcher = COPY_SOURCE_TOO_BIG_ERROR_MESSAGE_RE.match(message)
567567
if matcher is not None:
568568
size = int(matcher.group('size'))
569-
return CopySourceTooBig(size)
569+
return CopySourceTooBig(message, code, size)
570570

571571
return BadRequest(message, code)
572572
elif status == 400:

b2sdk/stream/progress.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ def _progress_update(self, delta):
3636
self.bytes_completed += delta
3737
self.progress_listener.bytes_completed(self.bytes_completed + self.offset)
3838

39+
def __str__(self):
    """Return the wrapped stream's string form so log lines identify the data source."""
    return str(self.stream)
41+
3942

4043
class ReadingStreamWithProgress(AbstractStreamWithProgress):
4144
"""

b2sdk/transfer/inbound/downloaded_file.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ def __exit__(self, exc_type, exc_val, exc_tb):
7979
self.file.close()
8080
set_file_mtime(self.path_, self.mod_time_to_set)
8181

82+
def __str__(self):
    """Return the local target path as a string, for use in download stats log lines."""
    return str(self.path_)
84+
8285

8386
class DownloadedFile:
8487
"""

b2sdk/transfer/inbound/downloader/parallel.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from concurrent import futures
1212
from io import IOBase
13+
from time import perf_counter_ns
1314
from typing import Optional
1415
import logging
1516
import queue
@@ -18,6 +19,7 @@
1819
from requests.models import Response
1920

2021
from .abstract import AbstractDownloader
22+
from .stats_collector import StatsCollector
2123
from b2sdk.encryption.setting import EncryptionSetting
2224
from b2sdk.file_version import DownloadVersion
2325
from b2sdk.session import B2Session
@@ -118,7 +120,15 @@ def download(
118120
if self._check_hash:
119121
# we skip hashing if we would not check it - hasher object is actually a EmptyHasher instance
120122
# but we avoid here reading whole file (except for the first part) from disk again
123+
before_hash = perf_counter_ns()
121124
self._finish_hashing(first_part, file, hasher, download_version.content_length)
125+
after_hash = perf_counter_ns()
126+
logger.info(
127+
'download stats | %s | %s total: %.3f ms',
128+
file,
129+
'finish_hash',
130+
(after_hash - before_hash) / 1000000,
131+
)
122132

123133
return bytes_written, hasher.hexdigest()
124134

@@ -203,18 +213,33 @@ def __init__(self, file, max_queue_depth):
203213
self.file = file
204214
self.queue = queue.Queue(max_queue_depth)
205215
self.total = 0
216+
self.stats_collector = StatsCollector(str(self.file), 'writer', 'seek')
206217
super(WriterThread, self).__init__()
207218

208219
def run(self):
    """Consume (shutdown, offset, data) tuples from the queue and write each chunk
    at its offset in the target file, until a shutdown tuple arrives.

    Per-chunk timings are accumulated in ``self.stats_collector``:
    queue waits in ``read``, seeks in ``other``, writes in ``write``;
    the whole loop duration goes to ``stats_collector.total``.
    """
    file = self.file
    # Hot loop: bind bound-methods to locals to avoid repeated attribute lookups.
    queue_get = self.queue.get
    stats_collector_read_append = self.stats_collector.read.append
    stats_collector_other_append = self.stats_collector.other.append
    stats_collector_write_append = self.stats_collector.write.append
    start = perf_counter_ns()
    while 1:
        before_read = perf_counter_ns()
        shutdown, offset, data = queue_get()
        stats_collector_read_append(perf_counter_ns() - before_read)
        if shutdown:
            break
        before_seek = perf_counter_ns()
        file.seek(offset)
        after_seek = perf_counter_ns()
        file.write(data)
        after_write = perf_counter_ns()
        # seek time is the "other" statistic for the writer thread (see __init__).
        stats_collector_other_append(after_seek - before_seek)
        stats_collector_write_append(after_write - after_seek)
        self.total += len(data)
    self.stats_collector.total = perf_counter_ns() - start
218243

219244
def __enter__(self):
220245
self.start()
@@ -223,6 +248,7 @@ def __enter__(self):
223248
def __exit__(self, exc_type, exc_val, exc_tb):
    """Signal shutdown to the writer loop, wait for it to drain, then log its stats."""
    # (True, None, None) is the sentinel `run()` breaks on.
    self.queue.put((True, None, None))
    self.join()
    self.stats_collector.report()
226252

227253

228254
def download_first_part(
@@ -243,6 +269,19 @@ def download_first_part(
243269
:param chunk_size: size (in bytes) of read data chunks
244270
:param encryption: encryption mode, algorithm and key
245271
"""
272+
# This function contains a loop that has heavy impact on performance.
273+
# It has not been broken down to several small functions due to fear of
274+
# performance overhead of calling a python function. Advanced performance optimization
275+
# techniques are in use here, for example avoiding internal python getattr calls by
276+
# caching function signatures in local variables. Most of this code was written in
277+
# times where python 2.7 (or maybe even 2.6) had to be supported, so maybe some
278+
# of those optimizations could be removed without affecting performance.
279+
#
280+
# Due to reports of hard to debug performance issues, this code has also been riddled
281+
# with performance measurements. A known issue is GCP VMs which have more network speed
282+
# than storage speed, but end users have different issues with network and storage.
283+
# Basic tools to figure out where the time is being spent is a must for long-term
284+
# maintainability.
246285

247286
writer_queue_put = writer.queue.put
248287
hasher_update = hasher.update
@@ -253,14 +292,29 @@ def download_first_part(
253292

254293
bytes_read = 0
255294
stop = False
295+
296+
stats_collector = StatsCollector(response.url, f'{first_offset}:{last_offset}', 'hash')
297+
stats_collector_read_append = stats_collector.read.append
298+
stats_collector_other_append = stats_collector.other.append
299+
stats_collector_write_append = stats_collector.write.append
300+
start = before_read = perf_counter_ns()
256301
for data in response.iter_content(chunk_size=chunk_size):
302+
stats_collector_read_append(perf_counter_ns() - before_read)
257303
if first_offset + bytes_read + len(data) >= last_offset:
258304
to_write = data[:last_offset - bytes_read]
259305
stop = True
260306
else:
261307
to_write = data
308+
before_put = perf_counter_ns()
262309
writer_queue_put((False, first_offset + bytes_read, to_write))
310+
311+
before_hash = perf_counter_ns()
263312
hasher_update(to_write)
313+
after_hash = perf_counter_ns()
314+
315+
stats_collector_write_append(before_hash - before_put)
316+
stats_collector_other_append(after_hash - before_hash)
317+
264318
bytes_read += len(to_write)
265319
if stop:
266320
break
@@ -284,11 +338,24 @@ def download_first_part(
284338
cloud_range.as_tuple(),
285339
encryption=encryption,
286340
) as response:
341+
before_read = perf_counter_ns()
287342
for to_write in response.iter_content(chunk_size=chunk_size):
343+
stats_collector_read_append(perf_counter_ns() - before_read)
344+
345+
before_put = perf_counter_ns()
288346
writer_queue_put((False, first_offset + bytes_read, to_write))
347+
before_hash = perf_counter_ns()
289348
hasher_update(to_write)
349+
after_hash = perf_counter_ns()
350+
351+
stats_collector_write_append(before_hash - before_put)
352+
stats_collector_other_append(after_hash - before_hash)
353+
290354
bytes_read += len(to_write)
355+
before_read = perf_counter_ns()
291356
tries_left -= 1
357+
stats_collector.total = perf_counter_ns() - start
358+
stats_collector.report()
292359

293360

294361
def download_non_first_part(
@@ -321,15 +388,27 @@ def download_non_first_part(
321388
'download attempts remaining: %i, bytes read already: %i. Getting range %s now.',
322389
retries_left, bytes_read, cloud_range
323390
)
391+
stats_collector = StatsCollector(url, f'{cloud_range.start}:{cloud_range.end}', 'none')
392+
stats_collector_read_append = stats_collector.read.append
393+
stats_collector_write_append = stats_collector.write.append
394+
start = before_read = perf_counter_ns()
324395
with session.download_file_from_url(
325396
url,
326397
cloud_range.as_tuple(),
327398
encryption=encryption,
328399
) as response:
400+
before_read = perf_counter_ns()
329401
for to_write in response.iter_content(chunk_size=chunk_size):
402+
after_read = perf_counter_ns()
330403
writer_queue_put((False, start_range + bytes_read, to_write))
404+
after_write = perf_counter_ns()
405+
stats_collector_read_append(after_read - before_read)
406+
stats_collector_write_append(after_write - after_read)
331407
bytes_read += len(to_write)
408+
before_read = perf_counter_ns()
332409
retries_left -= 1
410+
stats_collector.total = perf_counter_ns() - start
411+
stats_collector.report()
333412

334413

335414
class PartToDownload:
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
######################################################################
#
# File: b2sdk/transfer/inbound/downloader/stats_collector.py
#
# Copyright 2020 Backblaze Inc. All Rights Reserved.
#
# License https://www.backblaze.com/using_b2_code.html
#
######################################################################

import logging
from dataclasses import dataclass, field
from typing import List  # 3.7 doesn't understand `list` vs `List`
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class StatsCollector:
    """Accumulator for per-thread download timing samples (all values in nanoseconds).

    Each sample list is appended to by the hot download/write loops and
    summarized once at the end via :meth:`report`.
    """

    name: str  # file name or object url
    detail: str  # description of the thread, ex. "10000000:20000000" or "writer"
    other_name: str  # label of the "other" statistic, typically "seek" or "hash"
    total: Optional[int] = None  # wall time of the whole loop; set by the caller
    other: List[int] = field(default_factory=list)  # samples of the "other" operation
    write: List[int] = field(default_factory=list)  # samples of write()/queue-put time
    read: List[int] = field(default_factory=list)  # samples of read/queue-get time

    def report(self):
        """Log a summary of the collected samples at INFO level.

        The first ``read`` sample is reported separately as TTFB (time to
        first byte).  ``overhead`` is whatever part of ``total`` is not
        accounted for by the read/other/write samples.
        """
        if self.read:
            logger.info('download stats | %s | TTFB: %.3f ms', self, self.read[0] / 1000000)
            logger.info(
                'download stats | %s | read() without TTFB: %.3f ms', self,
                sum(self.read[1:]) / 1000000
            )
        if self.other:
            logger.info(
                'download stats | %s | %s total: %.3f ms', self, self.other_name,
                sum(self.other) / 1000000
            )
        if self.write:
            logger.info(
                'download stats | %s | write() total: %.3f ms', self,
                sum(self.write) / 1000000
            )
        if self.total is not None:
            overhead = self.total - sum(self.write) - sum(self.other) - sum(self.read)
            logger.info('download stats | %s | overhead: %.3f ms', self, overhead / 1000000)

    def __str__(self):
        return f'{self.name}[{self.detail}]'

0 commit comments

Comments
 (0)