Skip to content

Commit 8b9f446

Browse files
Fix bad total size after resuming download (#3234) (#3248)
* Fix bad total size after resuming download (#3234)
* Apply suggestions from code review

Co-authored-by: célina <[email protected]>

---------

Co-authored-by: célina <[email protected]>
1 parent 206ff8d commit 8b9f446

File tree

2 files changed

+12
-8
lines changed

2 files changed

+12
-8
lines changed

src/huggingface_hub/file_download.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -317,9 +317,6 @@ def _get_file_length_from_http_response(response: requests.Response) -> Optional
317317
318318
This function extracts the file size from the HTTP response headers, either from the
319319
`Content-Range` or `Content-Length` header, if available (in that order).
320-
The HTTP response object containing the headers.
321-
`int` or `None`: The length of the file in bytes if the information is available,
322-
otherwise `None`.
323320
324321
Args:
325322
response (`requests.Response`):
@@ -329,6 +326,15 @@ def _get_file_length_from_http_response(response: requests.Response) -> Optional
329326
`int` or `None`: The length of the file in bytes, or None if not available.
330327
"""
331328

329+
# If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
330+
# contain the length of the compressed body, not the uncompressed file size.
331+
# And at the start of transmission there's no way to know the uncompressed file size for gzip,
332+
# thus we return None in that case.
333+
content_encoding = response.headers.get("Content-Encoding", "identity").lower()
334+
if content_encoding != "identity":
335+
# gzip/br/deflate/zstd etc
336+
return None
337+
332338
content_range = response.headers.get("Content-Range")
333339
if content_range is not None:
334340
return int(content_range.rsplit("/")[-1])
@@ -422,11 +428,7 @@ def http_get(
422428
)
423429

424430
hf_raise_for_status(r)
425-
content_length = _get_file_length_from_http_response(r)
426-
427-
# NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
428-
# If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
429-
total = resume_size + int(content_length) if content_length is not None else None
431+
total: Optional[int] = _get_file_length_from_http_response(r)
430432

431433
if displayed_filename is None:
432434
displayed_filename = url

tests/test_file_download.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,6 +1205,7 @@ def test_etag_timeout_set_as_env_variable_parameter_ignored(self):
12051205
@with_production_testing
12061206
class TestExtraLargeFileDownloadPaths(unittest.TestCase):
12071207
@patch("huggingface_hub.file_download.constants.HF_HUB_ENABLE_HF_TRANSFER", False)
1208+
@patch("huggingface_hub.file_download.constants.HF_HUB_DISABLE_XET", True)
12081209
def test_large_file_http_path_error(self):
12091210
with SoftTemporaryDirectory() as cache_dir:
12101211
with self.assertRaises(
@@ -1226,6 +1227,7 @@ def test_large_file_http_path_error(self):
12261227
"hf_transfer not installed, so skipping large file download with hf_transfer check.",
12271228
)
12281229
@patch("huggingface_hub.file_download.constants.HF_HUB_ENABLE_HF_TRANSFER", True)
1230+
@patch("huggingface_hub.file_download.constants.HF_HUB_DISABLE_XET", True)
12291231
@patch("huggingface_hub.file_download.constants.MAX_HTTP_DOWNLOAD_SIZE", 44)
12301232
@patch("huggingface_hub.file_download.constants.DOWNLOAD_CHUNK_SIZE", 2) # make sure hf_download is used
12311233
def test_large_file_download_with_hf_transfer(self):

0 commit comments

Comments
 (0)