1111import http
1212import io
1313import functools
14+ import itertools
1415import logging
1516import time
1617import warnings
@@ -1333,14 +1334,16 @@ def _accept_all(key):
13331334
13341335
13351336def iter_bucket (
1336- bucket_name ,
1337- prefix = '' ,
1338- accept_key = None ,
1339- key_limit = None ,
1340- workers = 16 ,
1341- retries = 3 ,
1342- max_threads_per_fileobj = 4 ,
1343- ** session_kwargs ):
1337+ bucket_name ,
1338+ prefix = '' ,
1339+ accept_key = None ,
1340+ key_limit = None ,
1341+ workers = 16 ,
1342+ retries = 3 ,
1343+ max_threads_per_fileobj = 4 ,
1344+ client_kwargs = None ,
1345+ ** session_kwargs , # double star notation for backwards compatibility
1346+ ):
13441347 """
13451348 Iterate and download all S3 objects under `s3://bucket_name/prefix`.
13461349
@@ -1364,6 +1367,10 @@ def iter_bucket(
13641367 max_threads_per_fileobj: int, optional
13651368 The maximum number of download threads per worker. The maximum size of the
13661369 connection pool will be `workers * max_threads_per_fileobj + 1`. Default: 4
1370+ client_kwargs: dict, optional
1371+        Keyword arguments to pass when creating a new client.
1372+ For a list of available names and values, see:
1373+ https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client
13671374 session_kwargs: dict, optional
13681375 Keyword arguments to pass when creating a new session.
13691376 For a list of available names and values, see:
@@ -1411,17 +1418,20 @@ def iter_bucket(
14111418 if bucket_name is None :
14121419 raise ValueError ('bucket_name may not be None' )
14131420
1414- total_size , key_no = 0 , - 1
1421+ total_size , key_no = 0 , 0
14151422
14161423 # thread-safe client to share across _list_bucket and _download_key calls
14171424 # https://github.com/boto/boto3/blob/1.38.41/docs/source/guide/clients.rst?plain=1#L111
14181425 session = boto3 .session .Session (** session_kwargs )
1419- config = botocore .client .Config (
1420- max_pool_connections = workers * max_threads_per_fileobj + 1 , # 1 thread for _list_bucket
1421- tcp_keepalive = True ,
1422- retries = {"max_attempts" : retries * 2 , "mode" : "adaptive" },
1423- )
1424- client = session .client ('s3' , config = config )
1426+ if client_kwargs is None :
1427+ client_kwargs = {}
1428+ if 'config' not in client_kwargs :
1429+ client_kwargs ['config' ] = botocore .client .Config (
1430+ max_pool_connections = workers * max_threads_per_fileobj + 1 , # 1 thread for _list_bucket
1431+ tcp_keepalive = True ,
1432+ retries = {'max_attempts' : retries * 2 , 'mode' : 'adaptive' },
1433+ )
1434+ client = session .client ('s3' , ** client_kwargs )
14251435
14261436 transfer_config = boto3 .s3 .transfer .TransferConfig (max_concurrency = max_threads_per_fileobj )
14271437
@@ -1439,29 +1449,29 @@ def iter_bucket(
14391449 transfer_config = transfer_config ,
14401450 )
14411451
1442- with smart_open .concurrency .create_pool (workers ) as pool :
1443- result_iterator = pool .imap_unordered (download_key , key_iterator )
1444- key_no = 0
1445- while True :
1446- try :
1447- (key , content ) = result_iterator .__next__ ()
1448- except StopIteration :
1449- break
1452+ # Limit the iterator ('infinite' iterators are supported, key_limit=None is supported)
1453+ key_iterator = itertools .islice (key_iterator , key_limit )
1454+
1455+ with smart_open .concurrency .ThreadPoolExecutor (workers ) as executor :
1456+ result_iterator = executor .imap (download_key , key_iterator )
1457+ for key_no , (key , content ) in enumerate (result_iterator , start = 1 ):
14501458 # Skip deleted objects (404 responses)
14511459 if key is None :
14521460 continue
1461+
14531462 if key_no % 1000 == 0 :
14541463 logger .info (
1455- "yielding key #%i: %s, size %i (total %.1fMB )" ,
1464+ "yielding key #%i: %s, size %i (total %.1f MB )" ,
14561465 key_no , key , len (content ), total_size / 1024.0 ** 2
14571466 )
1467+
14581468 yield key , content
14591469 total_size += len (content )
1460- if key_limit is not None and key_no + 1 >= key_limit :
1461- # we were asked to output only a limited number of keys => we're done
1462- break
1463- key_no += 1
1464- logger . info ( "processed %i keys, total size %i" % ( key_no + 1 , total_size ) )
1470+ logger . info (
1471+ "processed %i keys, total size %.1f MB" ,
1472+ key_no ,
1473+ total_size / 1024.0 ** 2 ,
1474+ )
14651475
14661476
14671477def _list_bucket (
0 commit comments