1010
1111from concurrent import futures
1212from io import IOBase
13+ from time import perf_counter_ns
1314from typing import Optional
1415import logging
1516import queue
1819from requests .models import Response
1920
2021from .abstract import AbstractDownloader
22+ from .stats_collector import StatsCollector
2123from b2sdk .encryption .setting import EncryptionSetting
2224from b2sdk .file_version import DownloadVersion
2325from b2sdk .session import B2Session
@@ -118,7 +120,15 @@ def download(
118120 if self ._check_hash :
119121 # we skip hashing if we would not check it - hasher object is actually a EmptyHasher instance
120122 # but we avoid here reading whole file (except for the first part) from disk again
123+ before_hash = perf_counter_ns ()
121124 self ._finish_hashing (first_part , file , hasher , download_version .content_length )
125+ after_hash = perf_counter_ns ()
126+ logger .info (
127+ 'download stats | %s | %s total: %.3f ms' ,
128+ file ,
129+ 'finish_hash' ,
130+ (after_hash - before_hash ) / 1000000 ,
131+ )
122132
123133 return bytes_written , hasher .hexdigest ()
124134
@@ -203,18 +213,33 @@ def __init__(self, file, max_queue_depth):
203213 self .file = file
204214 self .queue = queue .Queue (max_queue_depth )
205215 self .total = 0
216+ self .stats_collector = StatsCollector (str (self .file ), 'writer' , 'seek' )
206217 super (WriterThread , self ).__init__ ()
207218
def run(self):
    """Drain the queue, writing every received chunk to the file at its offset.

    Each queue item is a ``(shutdown, offset, data)`` tuple; the loop ends
    as soon as an item with a truthy ``shutdown`` flag is received.

    Timings (nanoseconds, from ``perf_counter_ns``) are recorded on
    ``self.stats_collector``:

    * ``read``  - time spent waiting on the queue (one entry per item,
      including the shutdown item),
    * ``other`` - duration of ``file.seek`` (one entry per written chunk),
    * ``write`` - duration of ``file.write`` (one entry per written chunk),
    * ``total`` - wall time of the whole loop.

    ``self.total`` is increased by the length of every chunk written.
    """
    # Bind attribute lookups to locals once; this loop runs per chunk.
    out = self.file
    next_item = self.queue.get
    stats = self.stats_collector
    record_wait = stats.read.append
    record_seek = stats.other.append
    record_write = stats.write.append

    loop_started = perf_counter_ns()
    while True:
        wait_started = perf_counter_ns()
        shutdown, offset, data = next_item()
        record_wait(perf_counter_ns() - wait_started)

        if shutdown:
            break

        seek_started = perf_counter_ns()
        out.seek(offset)
        seek_done = perf_counter_ns()
        out.write(data)
        write_done = perf_counter_ns()

        # Bookkeeping happens after both I/O calls so it does not
        # inflate the measured seek/write durations.
        record_seek(seek_done - seek_started)
        record_write(write_done - seek_done)
        self.total += len(data)

    stats.total = perf_counter_ns() - loop_started
218243
219244 def __enter__ (self ):
220245 self .start ()
@@ -223,6 +248,7 @@ def __enter__(self):
def __exit__(self, exc_type, exc_val, exc_tb):
    """Shut the writer down when leaving the ``with`` block.

    Enqueues the shutdown marker, waits for the thread to finish via
    ``join`` and then asks the stats collector to report. The exception
    arguments are not inspected and nothing is returned, so an exception
    raised inside the ``with`` body is not suppressed.
    """
    # (shutdown, offset, data) - only the shutdown flag matters here.
    shutdown_marker = (True, None, None)
    self.queue.put(shutdown_marker)
    self.join()
    self.stats_collector.report()
226252
227253
228254def download_first_part (
@@ -243,6 +269,19 @@ def download_first_part(
243269 :param chunk_size: size (in bytes) of read data chunks
244270 :param encryption: encryption mode, algorithm and key
245271 """
272+ # This function contains a loop that has a heavy impact on performance.
273+ # It has not been broken down into several small functions due to fear of the
274+ # performance overhead of calling a Python function. Advanced performance optimization
275+ # techniques are in use here, for example avoiding internal Python getattr calls by
276+ # caching bound methods in local variables. Most of this code was written at a
277+ # time when Python 2.7 (or maybe even 2.6) had to be supported, so maybe some
278+ # of those optimizations could be removed without affecting performance.
279+ #
280+ # Due to reports of hard-to-debug performance issues, this code has also been riddled
281+ # with performance measurements. A known issue is GCP VMs, which have more network speed
282+ # than storage speed, but end users have different issues with network and storage.
283+ # Basic tools to figure out where the time is being spent are a must for long-term
284+ # maintainability.
246285
247286 writer_queue_put = writer .queue .put
248287 hasher_update = hasher .update
@@ -253,14 +292,29 @@ def download_first_part(
253292
254293 bytes_read = 0
255294 stop = False
295+
296+ stats_collector = StatsCollector (response .url , f'{ first_offset } :{ last_offset } ' , 'hash' )
297+ stats_collector_read_append = stats_collector .read .append
298+ stats_collector_other_append = stats_collector .other .append
299+ stats_collector_write_append = stats_collector .write .append
300+ start = before_read = perf_counter_ns ()
256301 for data in response .iter_content (chunk_size = chunk_size ):
302+ stats_collector_read_append (perf_counter_ns () - before_read )
257303 if first_offset + bytes_read + len (data ) >= last_offset :
258304 to_write = data [:last_offset - bytes_read ]
259305 stop = True
260306 else :
261307 to_write = data
308+ before_put = perf_counter_ns ()
262309 writer_queue_put ((False , first_offset + bytes_read , to_write ))
310+
311+ before_hash = perf_counter_ns ()
263312 hasher_update (to_write )
313+ after_hash = perf_counter_ns ()
314+
315+ stats_collector_write_append (before_hash - before_put )
316+ stats_collector_other_append (after_hash - before_hash )
317+
264318 bytes_read += len (to_write )
265319 if stop :
266320 break
@@ -284,11 +338,24 @@ def download_first_part(
284338 cloud_range .as_tuple (),
285339 encryption = encryption ,
286340 ) as response :
341+ before_read = perf_counter_ns ()
287342 for to_write in response .iter_content (chunk_size = chunk_size ):
343+ stats_collector_read_append (perf_counter_ns () - before_read )
344+
345+ before_put = perf_counter_ns ()
288346 writer_queue_put ((False , first_offset + bytes_read , to_write ))
347+ before_hash = perf_counter_ns ()
289348 hasher_update (to_write )
349+ after_hash = perf_counter_ns ()
350+
351+ stats_collector_write_append (before_hash - before_put )
352+ stats_collector_other_append (after_hash - before_hash )
353+
290354 bytes_read += len (to_write )
355+ before_read = perf_counter_ns ()
291356 tries_left -= 1
357+ stats_collector .total = perf_counter_ns () - start
358+ stats_collector .report ()
292359
293360
294361def download_non_first_part (
@@ -321,15 +388,27 @@ def download_non_first_part(
321388 'download attempts remaining: %i, bytes read already: %i. Getting range %s now.' ,
322389 retries_left , bytes_read , cloud_range
323390 )
391+ stats_collector = StatsCollector (url , f'{ cloud_range .start } :{ cloud_range .end } ' , 'none' )
392+ stats_collector_read_append = stats_collector .read .append
393+ stats_collector_write_append = stats_collector .write .append
394+ start = before_read = perf_counter_ns ()
324395 with session .download_file_from_url (
325396 url ,
326397 cloud_range .as_tuple (),
327398 encryption = encryption ,
328399 ) as response :
400+ before_read = perf_counter_ns ()
329401 for to_write in response .iter_content (chunk_size = chunk_size ):
402+ after_read = perf_counter_ns ()
330403 writer_queue_put ((False , start_range + bytes_read , to_write ))
404+ after_write = perf_counter_ns ()
405+ stats_collector_read_append (after_read - before_read )
406+ stats_collector_write_append (after_write - after_read )
331407 bytes_read += len (to_write )
408+ before_read = perf_counter_ns ()
332409 retries_left -= 1
410+ stats_collector .total = perf_counter_ns () - start
411+ stats_collector .report ()
333412
334413
335414class PartToDownload :
0 commit comments