Skip to content

Commit be4c78a

Browse files
authored
[SYNPY-1548] Patch sync code for durability of downloads and log message format along with tqdm progress bar formatting (#1147)
* Patch sync code for durability of downloads and log message format along with tqdm progress bar formatting
1 parent b784b85 commit be4c78a

File tree

7 files changed

+229
-106
lines changed

7 files changed

+229
-106
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545

4646
strategy:
4747
matrix:
48-
os: [ubuntu-20.04, macos-12, windows-2022]
48+
os: [ubuntu-20.04, macos-13, windows-2022]
4949

5050
# if changing the below change the run-integration-tests versions and the check-deploy versions
5151
# Make sure that we are running the integration tests on the first and last versions of the matrix
@@ -399,7 +399,7 @@ jobs:
399399

400400
strategy:
401401
matrix:
402-
os: [ubuntu-20.04, macos-12, windows-2022]
402+
os: [ubuntu-20.04, macos-13, windows-2022]
403403

404404
# python versions should be consistent with the strategy matrix and the runs-integration-tests versions
405405
python: ['3.9', '3.10', '3.11', '3.12']

synapseclient/core/download/download_async.py

Lines changed: 24 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,7 @@
1616

1717
import httpx
1818

19-
from synapseclient.api.file_services import (
20-
get_file_handle_for_download,
21-
get_file_handle_for_download_async,
22-
)
19+
from synapseclient.api.file_services import get_file_handle_for_download
2320
from synapseclient.core.exceptions import (
2421
SynapseDownloadAbortedException,
2522
_raise_for_status_httpx,
@@ -29,6 +26,7 @@
2926
RETRYABLE_CONNECTION_ERRORS,
3027
RETRYABLE_CONNECTION_EXCEPTIONS,
3128
with_retry_time_based,
29+
with_retry_time_based_async,
3230
)
3331
from synapseclient.core.transfer_bar import get_or_create_download_progress_bar
3432

@@ -110,24 +108,6 @@ class PresignedUrlProvider:
110108
# offset parameter used to buffer url expiration checks, time in seconds
111109
_TIME_BUFFER: datetime.timedelta = datetime.timedelta(seconds=5)
112110

113-
async def get_info_async(self) -> PresignedUrlInfo:
114-
"""
115-
Using async, returns the cached info if it's not expired, otherwise
116-
retrieves a new pre-signed url and returns that.
117-
118-
Returns:
119-
Information about a retrieved presigned-url from either the cache or a
120-
new request
121-
"""
122-
if not self._cached_info or (
123-
datetime.datetime.now(tz=datetime.timezone.utc)
124-
+ PresignedUrlProvider._TIME_BUFFER
125-
>= self._cached_info.expiration_utc
126-
):
127-
self._cached_info = await self._get_pre_signed_info_async()
128-
129-
return self._cached_info
130-
131111
def get_info(self) -> PresignedUrlInfo:
132112
"""
133113
Using a thread lock, returns the cached info if it's not expired, otherwise
@@ -168,27 +148,6 @@ def _get_pre_signed_info(self) -> PresignedUrlInfo:
168148
expiration_utc=_pre_signed_url_expiration_time(pre_signed_url),
169149
)
170150

171-
async def _get_pre_signed_info_async(self) -> PresignedUrlInfo:
172-
"""
173-
Make an HTTP request to get a pre-signed url to download a file.
174-
175-
Returns:
176-
Information about a retrieved presigned-url from a new request.
177-
"""
178-
response = await get_file_handle_for_download_async(
179-
file_handle_id=self.request.file_handle_id,
180-
synapse_id=self.request.object_id,
181-
entity_type=self.request.object_type,
182-
synapse_client=self.client,
183-
)
184-
file_name = response["fileHandle"]["fileName"]
185-
pre_signed_url = response["preSignedURL"]
186-
return PresignedUrlInfo(
187-
file_name=file_name,
188-
url=pre_signed_url,
189-
expiration_utc=_pre_signed_url_expiration_time(pre_signed_url),
190-
)
191-
192151

193152
def _generate_chunk_ranges(
194153
file_size: int,
@@ -232,40 +191,47 @@ def _pre_signed_url_expiration_time(url: str) -> datetime:
232191
return return_data
233192

234193

235-
async def _get_file_size_wrapper(syn: "Synapse", url: str, debug: bool) -> int:
194+
async def _get_file_size_wrapper(
195+
syn: "Synapse", url_provider: PresignedUrlProvider, debug: bool
196+
) -> int:
236197
"""
237198
Gets the size of the file located at url
238199
239200
Arguments:
240201
syn: The synapseclient
241-
url: The pre-signed url of the file
202+
url_provider: A URL provider for the presigned urls
242203
debug: A boolean to specify if debug mode is on
243204
244205
Returns:
245206
The size of the file in bytes
246207
"""
208+
247209
loop = asyncio.get_running_loop()
248210
return await loop.run_in_executor(
249211
syn._get_thread_pool_executor(asyncio_event_loop=loop),
250212
_get_file_size,
251213
syn,
252-
url,
214+
url_provider,
253215
debug,
254216
)
255217

256218

257-
def _get_file_size(syn: "Synapse", url: str, debug: bool) -> int:
219+
def _get_file_size(
220+
syn: "Synapse", presigned_url_provider: PresignedUrlProvider, debug: bool
221+
) -> int:
258222
"""
259223
Gets the size of the file located at url
260224
261225
Arguments:
262-
url: The pre-signed url of the file
226+
url_provider: A URL provider for the presigned urls
263227
debug: A boolean to specify if debug mode is on
264228
265229
Returns:
266230
The size of the file in bytes
267231
"""
268-
with syn._requests_session_storage.stream("GET", url) as response:
232+
with syn._requests_session_storage.stream(
233+
method="GET", url=presigned_url_provider.get_info().url
234+
) as response:
269235
_raise_for_status_httpx(
270236
response=response,
271237
logger=syn.logger,
@@ -306,9 +272,15 @@ async def download_file(self) -> None:
306272
"""
307273
url_provider = PresignedUrlProvider(self._syn, request=self._download_request)
308274

309-
url_info = await url_provider.get_info_async()
310-
file_size = await _get_file_size_wrapper(
311-
syn=self._syn, url=url_info.url, debug=self._download_request.debug
275+
file_size = await with_retry_time_based_async(
276+
function=lambda: _get_file_size_wrapper(
277+
syn=self._syn,
278+
url_provider=url_provider,
279+
debug=self._download_request.debug,
280+
),
281+
retry_status_codes=[403],
282+
retry_max_wait_before_failure=30,
283+
read_response_content=False,
312284
)
313285
self._progress_bar = get_or_create_download_progress_bar(
314286
file_size=file_size,

synapseclient/core/download/download_functions.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ async def download_file_entity(
146146
if_collision=if_collision,
147147
synapse_cache_location=synapse_cache_location,
148148
cached_file_path=cached_file_path,
149+
entity_id=getattr(entity, "id", None),
149150
synapse_client=client,
150151
)
151152
if download_path is None:
@@ -157,9 +158,15 @@ async def download_file_entity(
157158
if not os.path.exists(download_location):
158159
os.makedirs(download_location)
159160
client.logger.info(
160-
f"Copying existing file from {cached_file_path} to {download_path}"
161+
f"[{getattr(entity, 'id', None)}:{file_name}]: Copying existing "
162+
f"file from {cached_file_path} to {download_path}"
161163
)
162164
shutil.copy(cached_file_path, download_path)
165+
else:
166+
client.logger.info(
167+
f"[{getattr(entity, 'id', None)}:{file_name}]: Found existing file "
168+
f"at {download_path}, skipping download."
169+
)
163170

164171
else: # download the file from URL (could be a local file)
165172
object_type = "FileEntity" if submission is None else "SubmissionAttachment"
@@ -257,6 +264,7 @@ async def download_file_entity_model(
257264
if_collision=if_collision,
258265
synapse_cache_location=synapse_cache_location,
259266
cached_file_path=cached_file_path,
267+
entity_id=file.id,
260268
synapse_client=client,
261269
)
262270
if download_path is None:
@@ -268,9 +276,13 @@ async def download_file_entity_model(
268276
if not os.path.exists(download_location):
269277
os.makedirs(download_location)
270278
client.logger.info(
271-
f"Copying existing file from {cached_file_path} to {download_path}"
279+
f"[{file.id}:{file_name}]: Copying existing file from {cached_file_path} to {download_path}"
272280
)
273281
shutil.copy(cached_file_path, download_path)
282+
else:
283+
client.logger.info(
284+
f"[{file.id}:{file_name}]: Found existing file at {download_path}, skipping download."
285+
)
274286

275287
else: # download the file from URL (could be a local file)
276288
object_type = "FileEntity" if submission is None else "SubmissionAttachment"
@@ -526,7 +538,7 @@ def download_fn(
526538
),
527539
)
528540

529-
syn.logger.info(f"Downloaded {synapse_id} to {downloaded_path}")
541+
syn.logger.info(f"[{synapse_id}]: Downloaded to {downloaded_path}")
530542
syn.cache.add(
531543
file_handle["id"], downloaded_path, file_handle.get("contentMd5", None)
532544
)
@@ -541,7 +553,8 @@ def download_fn(
541553
exc_info = sys.exc_info()
542554
ex.progress = 0 if not hasattr(ex, "progress") else ex.progress
543555
syn.logger.debug(
544-
f"\nRetrying download on error: [{exc_info[0]}] after progressing {ex.progress} bytes",
556+
f"\n[{synapse_id}]: Retrying "
557+
f"download on error: [{exc_info[0]}] after progressing {ex.progress} bytes",
545558
exc_info=True,
546559
) # this will include stack trace
547560
if ex.progress == 0: # No progress was made reduce remaining retries.
@@ -669,7 +682,7 @@ def download_from_url(
669682
actual_md5 = None
670683
redirect_count = 0
671684
delete_on_md5_mismatch = True
672-
client.logger.debug(f"Downloading from {url} to {destination}")
685+
client.logger.debug(f"[{entity_id}]: Downloading from {url} to {destination}")
673686
while redirect_count < REDIRECT_LIMIT:
674687
redirect_count += 1
675688
scheme = urllib_urlparse.urlparse(url).scheme
@@ -854,7 +867,8 @@ def _ftp_report_hook(
854867
)
855868
increment_progress_bar(n=transferred, progress_bar=progress_bar)
856869
client.logger.debug(
857-
f"Resuming partial download to {temp_destination}. "
870+
f"[{entity_id}]: Resuming "
871+
f"partial download to {temp_destination}. "
858872
f"{previously_transferred}/{to_be_transferred} bytes already "
859873
"transferred."
860874
)
@@ -894,7 +908,8 @@ def _ftp_report_hook(
894908
# verify that the file was completely downloaded and retry if it is not complete
895909
if to_be_transferred > 0 and transferred < to_be_transferred:
896910
client.logger.warning(
897-
"\nRetrying download because the connection ended early.\n"
911+
f"\n[{entity_id}]: "
912+
"Retrying download because the connection ended early.\n"
898913
)
899914
continue
900915

@@ -903,7 +918,9 @@ def _ftp_report_hook(
903918
shutil.move(temp_destination, destination)
904919
break
905920
else:
906-
client.logger.error(f"Unable to download URLs of type {scheme}")
921+
client.logger.error(
922+
f"[{entity_id}]: Unable to download URLs of type {scheme}"
923+
)
907924
return None
908925

909926
else: # didn't break out of loop
@@ -949,6 +966,7 @@ def resolve_download_path_collisions(
949966
if_collision: str,
950967
synapse_cache_location: str,
951968
cached_file_path: str,
969+
entity_id: str,
952970
*,
953971
synapse_client: Optional["Synapse"] = None,
954972
) -> Union[str, None]:
@@ -964,6 +982,7 @@ def resolve_download_path_collisions(
964982
May be "overwrite.local", "keep.local", or "keep.both".
965983
synapse_cache_location: The location in .synapseCache where the file would be
966984
corresponding to its FileHandleId.
985+
entity_id: The entity id
967986
cached_file_path: The file path of the cached copy
968987
969988
Raises:
@@ -1000,7 +1019,8 @@ def resolve_download_path_collisions(
10001019
pass # Let the download proceed and overwrite the local file.
10011020
elif if_collision == COLLISION_KEEP_LOCAL:
10021021
client.logger.info(
1003-
f"Found existing file at {download_path}, skipping download."
1022+
f"[{entity_id}]: Found existing "
1023+
f"file at {download_path}, skipping download."
10041024
)
10051025

10061026
# Don't want to overwrite the local file.

synapseclient/core/transfer_bar.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,17 @@ def increment_progress_bar(n: int, progress_bar: Union[tqdm, None]) -> None:
6666

6767
@contextmanager
6868
def shared_download_progress_bar(
69-
file_size: int, *, synapse_client: Optional["Synapse"] = None
69+
file_size: int,
70+
custom_message: str = None,
71+
*,
72+
synapse_client: Optional["Synapse"] = None,
7073
):
7174
"""An outside process that will eventually trigger a download through this module
7275
can configure a shared Progress Bar by running its code within this context manager.
7376
7477
Arguments:
7578
file_size: The size of the file being downloaded.
79+
custom_message: A custom message to display on the progress bar instead of default.
7680
synapse_client: If not passed in and caching was not disabled by
7781
`Synapse.allow_client_caching(False)` this will use the last created
7882
instance from the Synapse class constructor.
@@ -86,22 +90,28 @@ def shared_download_progress_bar(
8690

8791
syn = Synapse.get_client(synapse_client=synapse_client)
8892
with logging_redirect_tqdm(loggers=[syn.logger]):
89-
get_or_create_download_progress_bar(file_size=file_size, synapse_client=syn)
93+
get_or_create_download_progress_bar(
94+
file_size=file_size, custom_message=custom_message, synapse_client=syn
95+
)
9096
try:
9197
yield
9298
finally:
9399
_thread_local.progress_bar_download_context_managed = False
94-
if _thread_local.progress_bar_download:
95-
_thread_local.progress_bar_download.close()
96-
_thread_local.progress_bar_download.refresh()
97-
del _thread_local.progress_bar_download
100+
close_download_progress_bar()
98101

99102

100-
def close_download_progress_bar() -> None:
101-
"""Handle closing the download progress bar if it is not context managed."""
102-
if not _is_context_managed_download_bar():
103+
def close_download_progress_bar(force_close: bool = False) -> None:
104+
"""Handle closing the download progress bar if it is not context managed. This will
105+
also only close the progress bar if there are no other downloads sharing it."""
106+
if force_close or not _is_context_managed_download_bar():
103107
progress_bar: tqdm = getattr(_thread_local, "progress_bar_download", None)
104-
if progress_bar is not None:
108+
transfer_count: int = getattr(_thread_local, "transfer_count", 0)
109+
transfer_count -= 1
110+
if transfer_count < 0:
111+
transfer_count = 0
112+
113+
_thread_local.transfer_count = transfer_count
114+
if progress_bar is not None and not transfer_count:
105115
progress_bar.close()
106116
progress_bar.refresh()
107117
del _thread_local.progress_bar_download
@@ -113,7 +123,11 @@ def _is_context_managed_download_bar() -> bool:
113123

114124

115125
def get_or_create_download_progress_bar(
116-
file_size: int, postfix: str = None, *, synapse_client: Optional["Synapse"] = None
126+
file_size: int,
127+
postfix: str = None,
128+
custom_message: str = None,
129+
*,
130+
synapse_client: Optional["Synapse"] = None,
117131
) -> Union[tqdm, None]:
118132
"""Return the existing progress bar if it exists, otherwise create a new one.
119133
@@ -132,11 +146,15 @@ def get_or_create_download_progress_bar(
132146
if syn.silent:
133147
return None
134148

149+
transfer_count: int = getattr(_thread_local, "transfer_count", 0)
150+
transfer_count += 1
151+
_thread_local.transfer_count = transfer_count
152+
135153
progress_bar: tqdm = getattr(_thread_local, "progress_bar_download", None)
136154
if progress_bar is None:
137155
progress_bar = tqdm(
138156
total=file_size,
139-
desc="Downloading files",
157+
desc=custom_message or "Downloading files",
140158
unit="B",
141159
unit_scale=True,
142160
smoothing=0,

0 commit comments

Comments
 (0)