
Commit e32f265

Author: Andrei Neagu (committed)

refactor to use size instead of items count as progress

1 parent c129672 · commit e32f265

File tree

8 files changed: +122 -86 lines changed


packages/aws-library/src/aws_library/s3/_client.py

Lines changed: 36 additions & 24 deletions
@@ -3,7 +3,7 @@
 import functools
 import logging
 import urllib.parse
-from collections.abc import AsyncGenerator, Callable, Sequence
+from collections.abc import AsyncGenerator, Sequence
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Final, Protocol, cast
@@ -18,7 +18,12 @@
 from pydantic import AnyUrl, ByteSize, TypeAdapter
 from servicelib.logging_utils import log_catch, log_context
 from servicelib.utils import limited_gather
-from servicelib.zip_stream import DEFAULT_CHUNK_SIZE, FileStream
+from servicelib.zip_stream import (
+    DEFAULT_READ_CHUNK_SIZE,
+    FileSize,
+    FileStream,
+    FileStreamCallable,
+)
 from settings_library.s3 import S3Settings
 from types_aiobotocore_s3 import S3Client
 from types_aiobotocore_s3.literals import BucketLocationConstraintType
@@ -484,36 +489,43 @@ async def get_object_file_stream(
         bucket_name: S3BucketName,
         object_key: S3ObjectKey,
         *,
-        chunk_size: int = DEFAULT_CHUNK_SIZE,
-    ) -> FileStream:
-        response = await self._client.head_object(Bucket=bucket_name, Key=object_key)
-        file_size = response["ContentLength"]
-
-        # Download the file in chunks
-        position = 0
-        while position < file_size:
-            # Calculate the range for this chunk
-            end = min(position + chunk_size - 1, file_size - 1)
-            range_header = f"bytes={position}-{end}"
-
-            # Download the chunk
-            response = await self._client.get_object(
-                Bucket=bucket_name, Key=object_key, Range=range_header
-            )
+        chunk_size: int = DEFAULT_READ_CHUNK_SIZE,
+    ) -> tuple[FileSize, FileStreamCallable]:
+
+        # below is a quick call
+        head_response = await self._client.head_object(
+            Bucket=bucket_name, Key=object_key
+        )
+        file_size = FileSize(head_response["ContentLength"])
+
+        async def _() -> FileStream:
+            # Download the file in chunks
+            position = 0
+            while position < file_size:
+                # Calculate the range for this chunk
+                end = min(position + chunk_size - 1, file_size - 1)
+                range_header = f"bytes={position}-{end}"
+
+                # Download the chunk
+                response = await self._client.get_object(
+                    Bucket=bucket_name, Key=object_key, Range=range_header
+                )
+
+                chunk = await response["Body"].read()

-            chunk = await response["Body"].read()
+                # Yield the chunk for processing
+                yield chunk

-            # Yield the chunk for processing
-            yield chunk
+                position += chunk_size

-            position += chunk_size
+        return file_size, _

     @s3_exception_handler(_logger)
     async def upload_object_from_file_stream(
         self,
         bucket_name: S3BucketName,
         object_key: S3ObjectKey,
-        file_stream: Callable[[], FileStream],
+        file_stream: FileStream,
     ) -> None:
         # Create a multipart upload
         multipart_response = await self._client.create_multipart_upload(
@@ -525,7 +537,7 @@ async def upload_object_from_file_stream(
         parts = []
         part_number = 1

-        async for chunk in file_stream():
+        async for chunk in file_stream:
             part_response = await self._client.upload_part(
                 Bucket=bucket_name,
                 Key=object_key,
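
Taken together, the two refactored methods change the calling convention: get_object_file_stream is awaited and now returns the object's size plus a stream factory, while upload_object_from_file_stream consumes the stream itself rather than a zero-argument callable. A minimal consumer sketch (not part of the commit; simcore_s3_api is an already-configured client as in the test diff below, and the bucket/key arguments are placeholders):

async def copy_object_via_stream(simcore_s3_api, bucket, src_key, dst_key) -> None:
    # returns (FileSize, FileStreamCallable): the size is known up front,
    # the stream itself is only opened when the callable is invoked
    file_size, stream_callable = await simcore_s3_api.get_object_file_stream(
        bucket, src_key
    )
    print(f"streaming {file_size} bytes from {src_key!r} to {dst_key!r}")

    # the upload now takes the FileStream directly (an async iterable of bytes)
    await simcore_s3_api.upload_object_from_file_stream(
        bucket, dst_key, stream_callable()
    )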

packages/aws-library/tests/test_s3_client.py

Lines changed: 24 additions & 20 deletions
@@ -1502,8 +1502,13 @@ async def extracted_archive_path(tmp_path: Path, faker: Faker) -> AsyncIterator[
     assert not path.is_dir()


-def _get_s3_object_keys(files: set[Path]) -> set[S3ObjectKey]:
-    return {f.name for f in files}
+@pytest.fixture
+async def archive_s3_object_key(
+    with_s3_bucket: S3BucketName, simcore_s3_api: SimcoreS3API
+) -> AsyncIterator[S3ObjectKey]:
+    s3_object_key = "read_from_s3_write_to_s3"
+    yield s3_object_key
+    await simcore_s3_api.delete_object(bucket=with_s3_bucket, object_key=s3_object_key)


 @pytest.fixture
@@ -1514,13 +1519,8 @@ def _progress_cb(*args, **kwargs) -> None:
     return mocker.Mock(side_effect=_progress_cb)


-@pytest.fixture
-async def archive_s3_object_key(
-    with_s3_bucket: S3BucketName, simcore_s3_api: SimcoreS3API
-) -> AsyncIterator[S3ObjectKey]:
-    s3_object_key = "read_from_s3_write_to_s3"
-    yield s3_object_key
-    await simcore_s3_api.delete_object(bucket=with_s3_bucket, object_key=s3_object_key)
+def _get_s3_object_keys(files: set[Path]) -> set[S3ObjectKey]:
+    return {f.name for f in files}


 @pytest.mark.parametrize(
@@ -1540,8 +1540,8 @@ async def test_workflow_compress_s3_objects_and_local_files_in_a_single_archive_
     simcore_s3_api: SimcoreS3API,
     with_s3_bucket: S3BucketName,
     s3_client: S3Client,
-    mocked_progress_bar_cb: Mock,
     archive_s3_object_key: S3ObjectKey,
+    mocked_progress_bar_cb: Mock,
 ):
     # In this test:
     # - files are read form disk and S3
@@ -1556,15 +1556,18 @@ async def test_workflow_compress_s3_objects_and_local_files_in_a_single_archive_
         archive_file_entries.append(
             (
                 file.name,
-                DiskStreamReader(file).get_stream,
+                DiskStreamReader(file).get_stream_data(),
             )
         )

-    for s3_object_key in _get_s3_object_keys(files_stored_in_s3):
+    s3_object_keys = _get_s3_object_keys(files_stored_in_s3)
+    assert len(s3_object_keys) == len(files_stored_in_s3)
+
+    for s3_object_key in s3_object_keys:
         archive_file_entries.append(
             (
                 s3_object_key,
-                lambda: simcore_s3_api.get_object_file_stream(
+                await simcore_s3_api.get_object_file_stream(
                     with_s3_bucket, s3_object_key
                 ),
             )
@@ -1574,23 +1577,24 @@ async def test_workflow_compress_s3_objects_and_local_files_in_a_single_archive_
     # some will be read from S3 and some from the disk
     random.shuffle(archive_file_entries)

+    started = time.time()
+
     async with ProgressBarData(
         num_steps=1,
         progress_report_cb=mocked_progress_bar_cb,
         description="root_bar",
-    ) as root:
-        started = time.time()
+    ) as progress_bar:
         await simcore_s3_api.upload_object_from_file_stream(
             with_s3_bucket,
             archive_s3_object_key,
-            lambda: get_zip_archive_stream(
+            get_zip_archive_stream(
                 archive_file_entries,
-                progress_bar=root,
+                progress_bar=progress_bar,
                 chunk_size=MIN_MULTIPART_UPLOAD_CHUNK_SIZE,
             ),
         )
-        duration = time.time() - started
-        print(f"Zip created on S3 in {duration:.2f} seconds")
+    duration = time.time() - started
+    print(f"Zip created on S3 in {duration:.2f} seconds")

     # 2. download zip archive form S3
     print(f"downloading {archive_download_path}")
@@ -1606,7 +1610,7 @@ async def test_workflow_compress_s3_objects_and_local_files_in_a_single_archive_
     all_files_in_zip = get_files_info_from_itrable(
         files_stored_locally
     ) | get_files_info_from_itrable(files_stored_in_s3)
-    assert len(all_files_in_zip) == 20
+
     await assert_same_contents(
         all_files_in_zip, get_files_info_from_path(extracted_archive_path)
     )

packages/service-library/src/servicelib/zip_stream/__init__.py

Lines changed: 11 additions & 3 deletions

@@ -1,15 +1,23 @@
-from ._constants import DEFAULT_CHUNK_SIZE
+from ._constants import DEFAULT_READ_CHUNK_SIZE
 from ._input import DiskStreamReader
 from ._output import DiskStreamWriter
-from ._types import ArchiveEntries, ArchiveFileEntry, FileStream
+from ._types import (
+    ArchiveEntries,
+    ArchiveFileEntry,
+    FileSize,
+    FileStream,
+    FileStreamCallable,
+)
 from ._zipper import get_zip_archive_stream

 __all__: tuple[str, ...] = (
     "ArchiveEntries",
     "ArchiveFileEntry",
-    "DEFAULT_CHUNK_SIZE",
+    "DEFAULT_READ_CHUNK_SIZE",
     "DiskStreamReader",
     "DiskStreamWriter",
+    "FileSize",
     "FileStream",
+    "FileStreamCallable",
     "get_zip_archive_stream",
 )

packages/service-library/src/servicelib/zip_stream/_constants.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@

 from pydantic import ByteSize, TypeAdapter

-DEFAULT_CHUNK_SIZE: Final[int] = TypeAdapter(ByteSize).validate_python("1MiB")
+DEFAULT_READ_CHUNK_SIZE: Final[int] = TypeAdapter(ByteSize).validate_python("1MiB")

packages/service-library/src/servicelib/zip_stream/_input.py

Lines changed: 13 additions & 10 deletions
@@ -2,19 +2,22 @@

 import aiofiles

-from ._constants import DEFAULT_CHUNK_SIZE
-from ._types import FileStream
+from ._constants import DEFAULT_READ_CHUNK_SIZE
+from ._types import FileSize, FileStream, StreamData


 class DiskStreamReader:
-    def __init__(self, file_path: Path, *, chunk_size=DEFAULT_CHUNK_SIZE):
+    def __init__(self, file_path: Path, *, chunk_size=DEFAULT_READ_CHUNK_SIZE):
         self.file_path = file_path
         self.chunk_size = chunk_size

-    async def get_stream(self) -> FileStream:
-        async with aiofiles.open(self.file_path, "rb") as f:
-            while True:
-                chunk = await f.read(self.chunk_size)
-                if not chunk:
-                    break
-                yield chunk
+    def get_stream_data(self) -> StreamData:
+        async def _() -> FileStream:
+            async with aiofiles.open(self.file_path, "rb") as f:
+                while True:
+                    chunk = await f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+
+        return FileSize(self.file_path.stat().st_size), _

packages/service-library/src/servicelib/zip_stream/_types.py

Lines changed: 8 additions & 1 deletion

@@ -1,8 +1,15 @@
 from collections.abc import AsyncIterable, Callable
 from typing import TypeAlias

+from pydantic import ByteSize
+
 FileNameInArchive: TypeAlias = str
 FileStream: TypeAlias = AsyncIterable[bytes]

-ArchiveFileEntry: TypeAlias = tuple[FileNameInArchive, Callable[[], FileStream]]
+FileStreamCallable: TypeAlias = Callable[[], FileStream]
+FileSize: TypeAlias = ByteSize
+
+StreamData: TypeAlias = tuple[FileSize, FileStreamCallable]
+
+ArchiveFileEntry: TypeAlias = tuple[FileNameInArchive, StreamData]
 ArchiveEntries: TypeAlias = list[ArchiveFileEntry]
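
The new StreamData alias is what ties size-based progress together: every archive entry now carries its byte size next to its stream factory. A small sketch of building ArchiveEntries with the servicelib.zip_stream exports from this commit (some_dir is a placeholder path, not from the commit):

from pathlib import Path

from servicelib.zip_stream import ArchiveEntries, DiskStreamReader

some_dir = Path("some_dir")  # placeholder: any directory containing regular files

# get_stream_data() returns a StreamData, i.e. (FileSize, FileStreamCallable),
# so each ArchiveFileEntry is (name_in_archive, (size, stream_factory))
archive_entries: ArchiveEntries = [
    (file.name, DiskStreamReader(file).get_stream_data())
    for file in some_dir.rglob("*")
    if file.is_file()
]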

packages/service-library/src/servicelib/zip_stream/_zipper.py

Lines changed: 27 additions & 25 deletions
@@ -5,41 +5,43 @@
 from stream_zip import ZIP_32, AsyncMemberFile, async_stream_zip

 from ..progress_bar import ProgressBarData
-from ._constants import DEFAULT_CHUNK_SIZE
+from ._constants import DEFAULT_READ_CHUNK_SIZE
 from ._types import ArchiveEntries, FileStream


-async def _iter_files(
-    file_streams: ArchiveEntries, progress_bar: ProgressBarData
+async def _member_files_iter(
+    file_streams: ArchiveEntries, progress: ProgressBarData
 ) -> AsyncIterable[AsyncMemberFile]:
-    async with progress_bar.sub_progress(
-        steps=len(file_streams), description="..."
-    ) as sub_progress:
-        for file_name, file_stream_handler in file_streams:
-            yield (
-                file_name,
-                datetime.now(UTC),
-                S_IFREG | 0o600,
-                ZIP_32,
-                file_stream_handler(),
-            )
-            await sub_progress.update(1)
+    for file_name, (stream_size, file_stream_handler) in file_streams:
+        yield (
+            file_name,
+            datetime.now(UTC),
+            S_IFREG | 0o600,
+            ZIP_32,
+            file_stream_handler(),
+        )
+        await progress.update(stream_size)


 async def get_zip_archive_stream(
     archive_files: ArchiveEntries,
     *,
     progress_bar: ProgressBarData | None = None,
-    chunk_size: int = DEFAULT_CHUNK_SIZE
+    chunk_size: int = DEFAULT_READ_CHUNK_SIZE
 ) -> FileStream:
     # NOTE: this is CPU bound task, even though the loop is not blocked,
-    # the CPU is still used for compressing the content
+    # the CPU is still used for compressing the content.
     if progress_bar is None:
-        progress_bar = ProgressBarData(num_steps=1, description="stream archiver")
-
-    # NOTE: do not disable compression or the streams will be
-    # loaded fully in memory before yielding their content
-    async for chunk in async_stream_zip(
-        _iter_files(archive_files, progress_bar), chunk_size=chunk_size
-    ):
-        yield chunk
+        progress_bar = ProgressBarData(num_steps=1, description="zip archive stream")
+
+    total_stream_lenth = sum(file_size for _, (file_size, _) in archive_files)
+
+    async with progress_bar.sub_progress(
+        steps=total_stream_lenth, description="streams_reader", progress_unit="Byte"
+    ) as sub_progress:
+        # NOTE: do not disable compression or the streams will be
+        # loaded fully in memory before yielding their content
+        async for chunk in async_stream_zip(
+            _member_files_iter(archive_files, sub_progress), chunk_size=chunk_size
+        ):
+            yield chunk
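
With sizes attached to every entry, get_zip_archive_stream can size its internal sub progress bar in bytes rather than in file count, which is the point of this commit. A wiring sketch mirroring the test below (archive_entries, archive_path and report_cb are assumed to exist; not part of the commit):

from servicelib.progress_bar import ProgressBarData
from servicelib.zip_stream import DiskStreamWriter, get_zip_archive_stream


async def write_zip_to_disk(archive_entries, archive_path, report_cb) -> None:
    writer = DiskStreamWriter(archive_path)
    async with ProgressBarData(
        num_steps=1, progress_report_cb=report_cb, description="root_bar"
    ) as progress_bar:
        # get_zip_archive_stream opens a sub progress bar sized to the sum of all
        # entry sizes and advances it by each member's byte count while zipping
        await writer.write_stream(
            get_zip_archive_stream(archive_entries, progress_bar=progress_bar)
        )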

packages/service-library/tests/test_zip_stream.py

Lines changed: 2 additions & 2 deletions
@@ -104,7 +104,7 @@ async def test_get_zip_archive_stream(
     for file in (x for x in local_files_dir.rglob("*") if x.is_file()):
         archive_name = get_relative_to(local_files_dir, file)

-        archive_files.append((archive_name, DiskStreamReader(file).get_stream))
+        archive_files.append((archive_name, DiskStreamReader(file).get_stream_data()))

     writer = DiskStreamWriter(local_archive_path)

@@ -114,7 +114,7 @@ async def test_get_zip_archive_stream(
         description="root_bar",
     ) as root:
         await writer.write_stream(
-            get_zip_archive_stream(archive_files, progress_bar=root)
+            get_zip_archive_stream(archive_files, progress_bar=root, chunk_size=1024)
         )

     # 2. extract archive using exiting tools
