Skip to content

Commit 6f2671d

Browse files
committed
FIX: Python batch download for existing files
1 parent e34c724 commit 6f2671d

File tree

3 files changed

+174
-17
lines changed

3 files changed

+174
-17
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
- Added `pip-system-certs` dependency for Windows platforms to prevent a connection issue in `requests` when behind a proxy
77
- Iteration of the `Live` client will now automatically call `Live.stop` when the iterator is destroyed, such as when a for loop is escaped with an exception or `break` statement.
88

9+
#### Bug fixes
10+
- Fixed an issue where `batch.download` and `batch.download_async` would fail if requested files already existed in the output directory
11+
912
## 0.33.0 - 2024-04-16
1013

1114
#### Enhancements

databento/historical/api/batch.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -394,8 +394,16 @@ def _download_batch_file(
394394
headers: dict[str, str] = self._headers.copy()
395395
if output_path.exists():
396396
existing_size = output_path.stat().st_size
397-
headers["Range"] = f"bytes={existing_size}-{batch_download_file.size - 1}"
398-
mode = "ab"
397+
if existing_size < batch_download_file.size:
398+
headers["Range"] = f"bytes={existing_size}-{batch_download_file.size - 1}"
399+
mode = "ab"
400+
elif existing_size == batch_download_file.size:
401+
# File exists and is complete
402+
break
403+
else:
404+
raise FileExistsError(
405+
f"Batch file {output_path.name} already exists and has a larger than expected size.",
406+
)
399407
else:
400408
mode = "wb"
401409
try:
@@ -424,24 +432,26 @@ def _download_batch_file(
424432
attempts += 1
425433
continue # try again
426434
raise BentoError(f"Error downloading file: {exc}") from None
435+
else:
436+
break
427437

428-
logger.debug("Download of %s completed", output_path.name)
429-
hash_algo, _, hash_hex = batch_download_file.hash_str.partition(":")
438+
logger.debug("Download of %s completed", output_path.name)
439+
hash_algo, _, hash_hex = batch_download_file.hash_str.partition(":")
430440

431-
if hash_algo == "sha256":
432-
output_hash = hashlib.sha256(output_path.read_bytes())
433-
if output_hash.hexdigest() != hash_hex:
434-
warn_msg = f"Downloaded file failed checksum validation: {output_path.name}"
435-
logger.warning(warn_msg)
436-
warnings.warn(warn_msg, category=BentoWarning)
437-
else:
438-
logger.warning(
439-
"Skipping %s checksum because %s is not supported",
440-
output_path.name,
441-
hash_algo,
442-
)
441+
if hash_algo == "sha256":
442+
output_hash = hashlib.sha256(output_path.read_bytes())
443+
if output_hash.hexdigest() != hash_hex:
444+
warn_msg = f"Downloaded file failed checksum validation: {output_path.name}"
445+
logger.warning(warn_msg)
446+
warnings.warn(warn_msg, category=BentoWarning)
447+
else:
448+
logger.warning(
449+
"Skipping %s checksum because %s is not supported",
450+
output_path.name,
451+
hash_algo,
452+
)
443453

444-
return output_path
454+
return output_path
445455

446456

447457
@dataclass

tests/test_historical_batch.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,3 +276,147 @@ def test_batch_download_rate_limit_429(
276276
assert mocked_batch_list_files.call_args.args == (job_id,)
277277
assert len(downloaded_files) == 1
278278
assert downloaded_files[0].read_bytes() == file_content
279+
280+
281+
def test_batch_download_file_exists(
282+
monkeypatch: pytest.MonkeyPatch,
283+
historical_client: Historical,
284+
tmp_path: Path,
285+
) -> None:
286+
"""
287+
Tests batch download by setting up a MagicMock which will return the
288+
content "unittest".
289+
290+
A subsequent call to batch.download should not fail.
291+
292+
"""
293+
# Arrange
294+
job_id = "GLBX-20220610-5DEFXVTMSM"
295+
filename = "glbx-mdp3-20220610.mbo.csv.zst"
296+
file_content = b"unittest"
297+
file_hash = f"sha256:{hashlib.sha256(file_content).hexdigest()}"
298+
file_size = len(file_content)
299+
300+
# Mock the call to list files so it returns a test manifest
301+
monkeypatch.setattr(
302+
historical_client.batch,
303+
"list_files",
304+
mocked_batch_list_files := MagicMock(
305+
return_value=[
306+
{
307+
"filename": filename,
308+
"hash": file_hash,
309+
"size": file_size,
310+
"urls": {
311+
"https": f"localhost:442/v0/batch/download/TESTUSER/{job_id}/{filename}",
312+
"ftp": "",
313+
},
314+
},
315+
],
316+
),
317+
)
318+
319+
# Mock the call for get, so we can simulate a 429 response
320+
ok_response = MagicMock()
321+
ok_response.__enter__.return_value = MagicMock(
322+
status_code=200,
323+
iter_content=MagicMock(return_value=iter([file_content])),
324+
)
325+
monkeypatch.setattr(
326+
requests,
327+
"get",
328+
MagicMock(
329+
side_effect=[ok_response],
330+
),
331+
)
332+
333+
# Act
334+
historical_client.batch.download(
335+
job_id=job_id,
336+
output_dir=tmp_path,
337+
filename_to_download=filename,
338+
)
339+
340+
downloaded_files = historical_client.batch.download(
341+
job_id=job_id,
342+
output_dir=tmp_path,
343+
filename_to_download=filename,
344+
)
345+
346+
# Assert
347+
assert mocked_batch_list_files.call_args.args == (job_id,)
348+
assert len(downloaded_files) == 1
349+
assert downloaded_files[0].read_bytes() == file_content
350+
351+
352+
def test_batch_download_file_larger_than_expected(
353+
monkeypatch: pytest.MonkeyPatch,
354+
historical_client: Historical,
355+
tmp_path: Path,
356+
) -> None:
357+
"""
358+
Tests batch download by setting up a MagicMock which will return the
359+
content "unittest".
360+
361+
Then, write some extra bytes to that file, and ensure a subsequent
362+
call to batch.download will raise an exception.
363+
364+
"""
365+
# Arrange
366+
job_id = "GLBX-20220610-5DEFXVTMSM"
367+
filename = "glbx-mdp3-20220610.mbo.csv.zst"
368+
file_content = b"unittest"
369+
file_hash = f"sha256:{hashlib.sha256(file_content).hexdigest()}"
370+
file_size = len(file_content)
371+
372+
# Mock the call to list files so it returns a test manifest
373+
monkeypatch.setattr(
374+
historical_client.batch,
375+
"list_files",
376+
MagicMock(
377+
return_value=[
378+
{
379+
"filename": filename,
380+
"hash": file_hash,
381+
"size": file_size,
382+
"urls": {
383+
"https": f"localhost:442/v0/batch/download/TESTUSER/{job_id}/{filename}",
384+
"ftp": "",
385+
},
386+
},
387+
],
388+
),
389+
)
390+
391+
# Mock the call for get, so we can simulate a 429 response
392+
ok_response = MagicMock()
393+
ok_response.__enter__.return_value = MagicMock(
394+
status_code=200,
395+
iter_content=MagicMock(return_value=iter([file_content])),
396+
)
397+
monkeypatch.setattr(
398+
requests,
399+
"get",
400+
MagicMock(
401+
side_effect=[ok_response],
402+
),
403+
)
404+
405+
# Act
406+
downloaded_files = historical_client.batch.download(
407+
job_id=job_id,
408+
output_dir=tmp_path,
409+
filename_to_download=filename,
410+
)
411+
412+
# Increase the existing file size with some junk
413+
with downloaded_files[-1].open(mode="ab") as out:
414+
out.write(b"junk")
415+
416+
# Assert
417+
with pytest.raises(FileExistsError):
418+
historical_client.batch.download(
419+
job_id=job_id,
420+
output_dir=tmp_path,
421+
filename_to_download=filename,
422+
)

0 commit comments

Comments
 (0)