Commit 3671917
Merge pull request #1509 from dandi/enh-log2
download: more consistent and exhaustive logging, new `DANDI_DEVEL_AGGRESSIVE_RETRY` mode, respect (?) Retry-After
2 parents 06c10cc + 7f97394

7 files changed: +240 -28 lines changed

DEVELOPMENT.md

Lines changed: 4 additions & 0 deletions

@@ -85,6 +85,10 @@ development command line options.
   function will patch `requests` to log the results of calls to
   `requests.utils.super_len()`
 
+- `DANDI_DOWNLOAD_AGGRESSIVE_RETRY` -- When set, makes `download()` retry very
+  aggressively: it keeps trying as long as at least some bytes were downloaded
+  on each attempt. Typically not needed; could be a sign of network issues.
+
 ## Sourcegraph
 
 The [Sourcegraph](https://sourcegraph.com) browser extension can be used to
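
For context on what this flag changes, here is a minimal standalone sketch (not dandi's actual implementation; `fetch_range` is an assumed chunk-yielding callable): a failed attempt that still downloaded some bytes earns an extra attempt, so a slow but progressing transfer is not abandoned after the fixed budget of three tries.

    import os
    import random
    import time

    def fetch_with_progress_retries(fetch_range):
        """Sketch of the DANDI_DOWNLOAD_AGGRESSIVE_RETRY idea."""
        downloaded = 0
        attempt, nattempts = 0, 3
        while attempt <= nattempts:
            attempt += 1
            got = 0  # bytes obtained during this attempt
            try:
                for chunk in fetch_range(start_at=downloaded):
                    downloaded += len(chunk)
                    got += len(chunk)
                return downloaded  # finished without error
            except OSError:
                if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY") and got:
                    nattempts += 1  # progress was made: allow one more attempt
                if attempt >= nattempts:
                    raise
                time.sleep(random.random() * 5 * attempt)  # jittered backoff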

dandi/cli/command.py

Lines changed: 11 additions & 5 deletions

@@ -114,14 +114,20 @@ def main(ctx, log_level, pdb=False):
         lambda r: r.name != "pyout" and not r.name.startswith("pyout.")
     )
     root.addHandler(handler)
+    exts = (
+        "dandischema",
+        "h5py",
+        "hdmf",
+        "pynwb",
+        "requests",
+        "urllib3",
+    )
 
     lgr.info(
-        "dandi v%s, dandischema v%s, hdmf v%s, pynwb v%s, h5py v%s",
+        "python %s, dandi %s, "
+        + ", ".join("%s %s" % (e, get_module_version(e)) for e in sorted(exts)),
+        sys.version.split()[0],
         __version__,
-        get_module_version("dandischema"),
-        get_module_version("hdmf"),
-        get_module_version("pynwb"),
-        get_module_version("h5py"),
         extra={"file_only": True},
    )
    lgr.info("sys.argv = %r", sys.argv, extra={"file_only": True})

dandi/cli/tests/test_service_scripts.py

Lines changed: 6 additions & 2 deletions

@@ -11,7 +11,6 @@
 from click.testing import CliRunner
 from dandischema.consts import DANDI_SCHEMA_VERSION
 import pytest
-import vcr
 
 from dandi import __version__
 from dandi.tests.fixtures import SampleDandiset
@@ -76,9 +75,14 @@ def test_update_dandiset_from_doi(
     dandiset_id = new_dandiset.dandiset_id
     repository = new_dandiset.api.instance.gui
     monkeypatch.setenv("DANDI_API_KEY", new_dandiset.api.api_key)
-    if os.environ.get("DANDI_TESTS_NO_VCR", ""):
+    if os.environ.get("DANDI_TESTS_NO_VCR", "") or sys.version_info <= (3, 10):
+        # Older vcrpy has an issue with Python 3.9 and newer urllib3 >= 2,
+        # but we require newer urllib3 for more correct operation and still
+        # support 3.9.  Remove when 3.9 support is dropped.
         ctx = nullcontext()
     else:
+        import vcr
+
         ctx = vcr.use_cassette(
             str(DATA_DIR / "update_dandiset_from_doi" / f"{name}.vcr.yaml"),
             before_record_request=record_only_doi_requests,
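
The pattern above defers `import vcr` until it is certain to be used, so the test module still imports on interpreters where vcrpy is broken or absent. A distilled sketch (cassette path illustrative, not the real test):

    import os
    import sys
    from contextlib import nullcontext

    if os.environ.get("DANDI_TESTS_NO_VCR", "") or sys.version_info <= (3, 10):
        ctx = nullcontext()  # hit the live service; vcrpy is never imported
    else:
        import vcr

        ctx = vcr.use_cassette("example.vcr.yaml")  # record/replay HTTP here

    with ctx:
        pass  # body of the HTTP-dependent test goes here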

dandi/dandiapi.py

Lines changed: 14 additions & 1 deletion

@@ -1592,10 +1592,23 @@ def downloader(start_at: int = 0) -> Iterator[bytes]:
             # TODO: apparently we might need retries here as well etc
             # if result.status_code not in (200, 201):
             result.raise_for_status()
+            nbytes, nchunks = 0, 0
             for chunk in result.iter_content(chunk_size=chunk_size):
+                nchunks += 1
                 if chunk:  # could be some "keep alive"?
+                    nbytes += len(chunk)
                     yield chunk
-            lgr.info("Asset %s successfully downloaded", self.identifier)
+                else:
+                    lgr.debug("'Empty' chunk downloaded for %s", url)
+            lgr.info(
+                "Asset %s (%d bytes in %d chunks starting from %d) successfully "
+                "downloaded from %s",
+                self.identifier,
+                nbytes,
+                nchunks,
+                start_at,
+                url,
+            )
 
         return downloader
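
A self-contained sketch of the resumable, chunk-counting streaming that `downloader(start_at=...)` performs (illustrative requests code, not dandi's API; `url` and `chunk_size` are assumptions). It also shows why empty keep-alive chunks are counted but not yielded:

    import requests

    def stream_bytes(url: str, start_at: int = 0, chunk_size: int = 1 << 16):
        headers = {"Range": f"bytes={start_at}-"} if start_at else {}
        with requests.get(url, headers=headers, stream=True, timeout=60) as r:
            r.raise_for_status()  # expect 206 Partial Content when resuming
            nbytes = nchunks = 0
            for chunk in r.iter_content(chunk_size=chunk_size):
                nchunks += 1
                if chunk:  # keep-alive heartbeats arrive as empty chunks
                    nbytes += len(chunk)
                    yield chunk
            # nbytes/nchunks now describe the transfer, as in the new log line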

dandi/download.py

Lines changed: 112 additions & 19 deletions

@@ -13,6 +13,7 @@
 from pathlib import Path
 import random
 from shutil import rmtree
+import sys
 from threading import Lock
 import time
 from types import TracebackType
@@ -692,17 +693,26 @@ def _download_file(
     # TODO: how do we discover the total size????
     # TODO: do not do it in-place, but rather into some "hidden" file
     resuming = False
-    for attempt in range(3):
+    attempt = 0
+    nattempts = 3  # number to do, could be incremented if we downloaded a little
+    while attempt <= nattempts:
+        attempt += 1
         try:
             if digester:
                 downloaded_digest = digester()  # start empty
             warned = False
             # I wonder if we could make writing async with downloader
             with DownloadDirectory(path, digests or {}) as dldir:
                 assert dldir.offset is not None
+                downloaded_in_attempt = 0
                 downloaded = dldir.offset
                 resuming = downloaded > 0
                 if size is not None and downloaded == size:
+                    lgr.debug(
+                        "%s - downloaded size matches target size of %d, exiting the loop",
+                        path,
+                        size,
+                    )
                     # Exit early when downloaded == size, as making a Range
                     # request in such a case results in a 416 error from S3.
                     # Problems will result if `size` is None but we've already
@@ -713,6 +723,7 @@
                         assert downloaded_digest is not None
                         downloaded_digest.update(block)
                     downloaded += len(block)
+                    downloaded_in_attempt += len(block)
                     # TODO: yield progress etc
                     out: dict[str, Any] = {"done": downloaded}
                     if size:
@@ -738,30 +749,83 @@
         # Catching RequestException lets us retry on timeout & connection
         # errors (among others) in addition to HTTP status errors.
         except requests.RequestException as exc:
+            sleep_amount = random.random() * 5 * attempt
+            if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY"):
+                # in such a case if we downloaded a little more --
+                # consider it a successful attempt
+                if downloaded_in_attempt > 0:
+                    lgr.debug(
+                        "%s - download failed on attempt #%d: %s, "
+                        "but did download %d bytes, so considering "
+                        "it a success and incrementing number of allowed attempts.",
+                        path,
+                        attempt,
+                        exc,
+                        downloaded_in_attempt,
+                    )
+                    nattempts += 1
             # TODO: actually we should probably retry only on selected codes,
-            # and also respect Retry-After
-            if attempt >= 2 or (
-                exc.response is not None
-                and exc.response.status_code
-                not in (
+            if exc.response is not None:
+                if exc.response.status_code not in (
                     400,  # Bad Request, but happened with girder:
                     # https://github.com/dandi/dandi-cli/issues/87
                     *RETRY_STATUSES,
+                ):
+                    lgr.debug(
+                        "%s - download failed due to response %d: %s",
+                        path,
+                        exc.response.status_code,
+                        exc,
+                    )
+                    yield {"status": "error", "message": str(exc)}
+                    return
+                elif retry_after := exc.response.headers.get("Retry-After"):
+                    # playing safe
+                    if not str(retry_after).isdigit():
+                        # our code is wrong, do not crash but issue warning so
+                        # we might get report/fix it up
+                        lgr.warning(
+                            "%s - download failed due to response %d with non-integer"
+                            " Retry-After=%r: %s",
+                            path,
+                            exc.response.status_code,
+                            retry_after,
+                            exc,
+                        )
+                        yield {"status": "error", "message": str(exc)}
+                        return
+                    sleep_amount = int(retry_after)
+                    lgr.debug(
+                        "%s - download failed due to response %d with "
+                        "Retry-After=%d: %s, will sleep and retry",
+                        path,
+                        exc.response.status_code,
+                        sleep_amount,
+                        exc,
+                    )
+                else:
+                    lgr.debug("%s - download failed: %s", path, exc)
+                    yield {"status": "error", "message": str(exc)}
+                    return
+            elif attempt >= nattempts:
+                lgr.debug(
+                    "%s - download failed after %d attempts: %s", path, attempt, exc
                 )
-            ):
-                lgr.debug("%s - download failed: %s", path, exc)
                 yield {"status": "error", "message": str(exc)}
                 return
             # if is_access_denied(exc) or attempt >= 2:
             #     raise
             # sleep a little and retry
-            lgr.debug(
-                "%s - failed to download on attempt #%d: %s, will sleep a bit and retry",
-                path,
-                attempt,
-                exc,
-            )
-            time.sleep(random.random() * 5)
+            else:
+                lgr.debug(
+                    "%s - download failed on attempt #%d: %s, will sleep a bit and retry",
+                    path,
+                    attempt,
+                    exc,
+                )
+            time.sleep(sleep_amount)
+    else:
+        lgr.warning("downloader logic: We should not be here!")
 
     if downloaded_digest and not resuming:
         assert downloaded_digest is not None
@@ -829,16 +893,22 @@ def __enter__(self) -> DownloadDirectory:
         ):
             # Pick up where we left off, writing to the end of the file
             lgr.debug(
-                "Download directory exists and has matching checksum; resuming download"
+                "%s - download directory exists and has matching checksum(s) %s; resuming download",
+                self.dirpath,
+                matching_algs,
             )
             self.fp = self.writefile.open("ab")
         else:
             # Delete the file (if it even exists) and start anew
             if not chkpath.exists():
-                lgr.debug("Starting new download in new download directory")
+                lgr.debug(
+                    "%s - no prior digests found; starting new download", self.dirpath
+                )
             else:
                 lgr.debug(
-                    "Download directory found, but digests do not match; starting new download"
+                    "%s - download directory found, but digests do not match;"
+                    " starting new download",
+                    self.dirpath,
                 )
             try:
                 self.writefile.unlink()
@@ -857,12 +927,35 @@ def __exit__(
         exc_tb: TracebackType | None,
     ) -> None:
         assert self.fp is not None
+        if exc_type is not None or exc_val is not None or exc_tb is not None:
+            lgr.debug(
+                "%s - entered __exit__ with position %d with exception: %s, %s",
+                self.dirpath,
+                self.fp.tell(),
+                exc_type,
+                exc_val,
+            )
+        else:
+            lgr.debug(
+                "%s - entered __exit__ with position %d without any exception",
+                self.dirpath,
+                self.fp.tell(),
+            )
         self.fp.close()
         try:
             if exc_type is None:
                 try:
                     self.writefile.replace(self.filepath)
-                except IsADirectoryError:
+                except (IsADirectoryError, PermissionError) as exc:
+                    if isinstance(exc, PermissionError):
+                        if not (
+                            sys.platform.startswith("win") and self.filepath.is_dir()
+                        ):
+                            raise
+                    lgr.debug(
+                        "Destination path %s is a directory; removing it and retrying",
+                        self.filepath,
+                    )
                     rmtree(self.filepath)
                     self.writefile.replace(self.filepath)
         finally:
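
The retry logic above only trusts integer Retry-After values and otherwise falls back to jittered backoff that grows with the attempt number. A distilled standalone sketch of that decision (not the dandi function itself):

    from __future__ import annotations

    import random

    def next_sleep(attempt: int, retry_after: str | None) -> float | None:
        """Seconds to sleep before the next attempt, or None to give up."""
        if retry_after is not None:
            if not str(retry_after).isdigit():
                return None  # HTTP-date or malformed: bail out rather than guess
            return int(retry_after)  # server-specified delay wins
        return random.random() * 5 * attempt  # jitter grows with each attempt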

dandi/tests/test_download.py

Lines changed: 90 additions & 0 deletions

@@ -1,7 +1,10 @@
 from __future__ import annotations
 
 from collections.abc import Callable
+from glob import glob
 import json
+import logging
+from multiprocessing import Manager, Process
 import os
 import os.path
 from pathlib import Path
@@ -21,6 +24,7 @@
 from ..consts import DRAFT, dandiset_metadata_file
 from ..dandiarchive import DandisetURL
 from ..download import (
+    DownloadDirectory,
     Downloader,
     DownloadExisting,
     DownloadFormat,
@@ -1038,3 +1042,89 @@ def test_pyouthelper_time_remaining_1339():
         assert len(done) == 2
     else:
         assert done[-1] == f"ETA: {10 - i} seconds<"
+
+
+@mark.skipif_on_windows  # https://github.com/pytest-dev/pytest/issues/12964
+def test_DownloadDirectory_basic(tmp_path: Path) -> None:
+    with DownloadDirectory(tmp_path, digests={}) as dl:
+        assert dl.dirpath.exists()
+        assert dl.writefile.exists()
+        assert dl.writefile.stat().st_size == 0
+        assert dl.offset == 0
+
+        dl.append(b"123")
+        assert dl.fp is not None
+        dl.fp.flush()  # appends are not flushed automatically
+        assert dl.writefile.stat().st_size == 3
+        assert dl.offset == 0  # doesn't change
+
+        dl.append(b"456")
+        inode_number = dl.writefile.stat().st_ino
+        assert inode_number != tmp_path.stat().st_ino
+
+    # but after we are done - should be a full file!
+    assert tmp_path.stat().st_size == 6
+    assert tmp_path.read_bytes() == b"123456"
+    # we moved the file, didn't copy (expensive)
+    assert inode_number == tmp_path.stat().st_ino
+
+    # no problem with overwriting with new content
+    with DownloadDirectory(tmp_path, digests={}) as dl:
+        dl.append(b"789")
+    assert tmp_path.read_bytes() == b"789"
+
+    # even if path is a directory which we "overwrite"
+    tmp_path.unlink()
+    tmp_path.mkdir()
+    (tmp_path / "somedata.dat").write_text("content")
+    with DownloadDirectory(tmp_path, digests={}) as dl:
+        assert set(glob(f"{tmp_path}*")) == {str(tmp_path), str(dl.dirpath)}
+        dl.append(b"123")
+    assert tmp_path.read_bytes() == b"123"
+
+    # no temp .dandidownload folder is left behind
+    assert set(glob(f"{tmp_path}*")) == {str(tmp_path)}
+
+    # test locking
+    with Manager() as manager:
+        results = manager.list()
+        with DownloadDirectory(tmp_path, digests={}) as dl:
+            dl.append(b"123")
+            p1 = Process(target=_download_directory_subproc, args=(tmp_path, results))
+            p1.start()
+            p1.join()
+        assert len(results) == 1
+        assert results[0] == f"Could not acquire download lock for {tmp_path}"
+        assert tmp_path.read_bytes() == b"123"
+
+
+# needs to be a top-level function for pickling
+def _download_directory_subproc(path, results):
+    try:
+        with DownloadDirectory(path, digests={}):
+            results.append("re-entered fine")
+    except Exception as exc:
+        results.append(str(exc))
+
+
+def test_DownloadDirectory_exc(
+    tmp_path: Path, caplog: pytest.LogCaptureFixture
+) -> None:
+    caplog.set_level(logging.DEBUG, logger="dandi")
+    # and now let's exit with exception
+    with pytest.raises(RuntimeError):
+        with DownloadDirectory(tmp_path, digests={}) as dl:
+            dl.append(b"456")
+            raise RuntimeError("Boom")
+    assert (
+        "dandi",
+        10,
+        f"{dl.dirpath} - entered __exit__ with position 3 with exception: "
+        "<class 'RuntimeError'>, Boom",
+    ) == caplog.record_tuples[-1]
+    # and we left without cleanup but closed things up after ourselves
+    assert tmp_path.exists()
+    assert tmp_path.is_dir()
+    assert dl.dirpath.exists()
+    assert dl.fp is None
+    assert dl.writefile.read_bytes() == b"456"
