
Commit 99c479e

Improve file download performance (#319)
## Changes

The change to use a chunked iterator for streaming responses caused a performance regression when streaming raw responses, as the default chunk size in the requests library is 1 byte. To identify an appropriate chunk size, I added a benchmark integration test that uploads a 50 MB file and downloads it with a range of chunk sizes. Based on this benchmarking, it seems there is no real speed-up beyond chunks of roughly 100 KiB.

Additionally, I added a check to ensure that `read()` raises a `ValueError` if the response was ever closed, since reading more than once from a streaming response is not allowed.

## Tests

- [x] Added an integration test to verify `read()` raises `ValueError` on a second attempt to read.
- [x] Added a benchmark to stress-test file download performance.

```
[chunk size 1kb] Average time to download: 129.83376684188843
[chunk size 2kb] Average time to download: 69.17960963249206
[chunk size 5kb] Average time to download: 34.08896443843842
[chunk size 10kb] Average time to download: 19.392758226394655
[chunk size 20kb] Average time to download: 10.74389090538025
[chunk size 50kb] Average time to download: 5.658655261993408
[chunk size 100kb] Average time to download: 3.982270860671997
[chunk size 200kb] Average time to download: 4.485624170303344
[chunk size 500kb] Average time to download: 4.236340761184692
[chunk size 1000kb] Average time to download: 4.695496129989624
[chunk size 2000kb] Average time to download: 4.6709349155426025
[chunk size 5000kb] Average time to download: 4.5816244840621945
[chunk size 10000kb] Average time to download: 4.32698233127594
[chunk size 20000kb] Average time to download: 4.625458240509033
[chunk size 50000kb] Average time to download: 4.405146503448487

Fastest chunk size: 100 kb, 3.982270860671997 seconds
```
1 parent ca4aeb5 commit 99c479e
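
For context, here is a minimal sketch of the behavior this change produces, assuming a configured `WorkspaceClient` and an existing volume file at `path` (both values are hypothetical, not from this commit):

```python
# A hedged sketch of the new download behavior; `path` is a hypothetical file.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
path = "/Volumes/main/my_schema/my_volume/data.txt"  # hypothetical

resp = w.files.download(path).contents
resp.set_chunk_size(100 * 1024)  # ~100 KiB was fastest in the benchmark above
with resp:
    data = resp.read()  # streams the body chunk by chunk

# The response was closed when the `with` block exited; a second read
# now raises ValueError instead of silently misbehaving.
try:
    with resp:
        resp.read()
except ValueError:
    print("cannot re-read a closed streaming response")
```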

File tree

5 files changed: +92 −10


Makefile

Lines changed: 5 additions & 2 deletions

```diff
@@ -21,10 +21,13 @@ lint:
 	autoflake --check-diff --quiet --recursive databricks
 
 test:
-	pytest -m 'not integration' --cov=databricks --cov-report html tests
+	pytest -m 'not integration and not benchmark' --cov=databricks --cov-report html tests
 
 integration:
-	pytest -n auto -m 'integration' --cov=databricks --cov-report html tests
+	pytest -n auto -m 'integration and not benchmark' --cov=databricks --cov-report html tests
+
+benchmark:
+	pytest -m 'benchmark' tests
 
 coverage: test
 	open htmlcov/index.html
```

databricks/sdk/core.py

Lines changed: 21 additions & 6 deletions

```diff
@@ -1131,26 +1131,41 @@ def _redacted_dump(self, prefix: str, body: str) -> str:
 class StreamingResponse(BinaryIO):
     _response: requests.Response
     _buffer: bytes
-    _content: Iterator[bytes]
+    _content: Union[Iterator[bytes], None]
+    _chunk_size: Union[int, None]
+    _closed: bool = False
 
-    def __init__(self, response: requests.Response):
+    def fileno(self) -> int:
+        pass
+
+    def flush(self) -> int:
+        pass
+
+    def __init__(self, response: requests.Response, chunk_size: Union[int, None] = None):
         self._response = response
         self._buffer = b''
         self._content = None
+        self._chunk_size = chunk_size
 
     def __enter__(self) -> BinaryIO:
-        self._content = self._response.iter_content()
+        self._content = self._response.iter_content(chunk_size=self._chunk_size)
         return self
 
+    def set_chunk_size(self, chunk_size: Union[int, None]) -> None:
+        self._chunk_size = chunk_size
+
     def close(self) -> None:
         self._response.close()
+        self._closed = True
 
     def isatty(self) -> bool:
         return False
 
     def read(self, n: int = -1) -> bytes:
-        if self._content is None:
-            self._content = self._response.iter_content()
+        if self._closed:
+            raise ValueError("I/O operation on closed file")
+        if not self._content:
+            self._content = self._response.iter_content(chunk_size=self._chunk_size)
         read_everything = n < 0
         remaining_bytes = n
         res = b''
@@ -1191,7 +1206,7 @@ def truncate(self, __size: Union[int, None] = ...) -> int:
     def writable(self) -> bool:
         return False
 
-    def write(self, s: bytes) -> int:
+    def write(self, s: Union[bytes, bytearray]) -> int:
         raise NotImplementedError()
 
     def writelines(self, lines: Iterable[bytes]) -> None:
```
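
The heart of the change is how `read()` buffers data pulled from `iter_content`. Below is a self-contained sketch of that buffering pattern, including the new closed-file guard; `ChunkedReader` is my own simplified stand-in, not the SDK's exact code:

```python
# Simplified sketch: serve read(n) from an iterator of byte chunks,
# carrying leftover bytes in a buffer between calls.
from typing import Iterator


class ChunkedReader:

    def __init__(self, chunks: Iterator[bytes]):
        self._chunks = chunks
        self._buffer = b''
        self._closed = False

    def read(self, n: int = -1) -> bytes:
        if self._closed:
            raise ValueError("I/O operation on closed file")
        res = b''
        read_everything = n < 0
        while read_everything or len(res) < n:
            if not self._buffer:
                chunk = next(self._chunks, None)
                if chunk is None:
                    break  # stream exhausted
                self._buffer = chunk
            # Take as much of the buffer as the caller still wants.
            take = len(self._buffer) if read_everything else min(n - len(res), len(self._buffer))
            res += self._buffer[:take]
            self._buffer = self._buffer[take:]
        return res

    def close(self) -> None:
        self._closed = True


# usage: ChunkedReader(iter([b'abc', b'def'])).read(4) == b'abcd'
```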

tests/integration/conftest.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -23,6 +23,8 @@ def pytest_configure(config):
 
     config.addinivalue_line('markers',
                             'integration: marks tests as those requiring a real Databricks backend')
+    config.addinivalue_line('markers',
+                            'benchmark: marks tests as benchmarks which should not be run by default')
 
 
 def pytest_collection_modifyitems(items):
```
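
A quick illustration of how the marker is consumed (the test name here is hypothetical): tests tagged `@pytest.mark.benchmark` are filtered out by the `-m '... not benchmark'` expressions in the Makefile and are only collected by the new `make benchmark` target.

```python
import pytest


@pytest.mark.benchmark
def test_expensive_download():  # hypothetical example test
    ...  # skipped by `make test` and `make integration`; run via `make benchmark`
```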

tests/integration/test_files.py

Lines changed: 63 additions & 1 deletion

```diff
@@ -1,6 +1,8 @@
 import io
+import logging
 import pathlib
-from typing import Callable, List
+import time
+from typing import Callable, List, Tuple, Union
 
 import pytest
 
@@ -227,3 +229,63 @@ def test_files_api_upload_download(ucws, random):
                 assert f.read() == b"some text data"
 
             w.files.delete(target_file)
+
+
+def test_files_api_read_twice_from_one_download(ucws, random):
+    w = ucws
+    schema = 'filesit-' + random()
+    volume = 'filesit-' + random()
+    with ResourceWithCleanup.create_schema(w, 'main', schema):
+        with ResourceWithCleanup.create_volume(w, 'main', schema, volume):
+            f = io.BytesIO(b"some text data")
+            target_file = f'/Volumes/main/{schema}/{volume}/filesit-{random()}.txt'
+            w.files.upload(target_file, f)
+
+            res = w.files.download(target_file).contents
+
+            with res:
+                assert res.read() == b"some text data"
+
+            with pytest.raises(ValueError):
+                with res:
+                    res.read()
+
+
+@pytest.mark.benchmark
+def test_files_api_download_benchmark(ucws, random):
+    w = ucws
+    schema = 'filesit-' + random()
+    volume = 'filesit-' + random()
+    with ResourceWithCleanup.create_schema(w, 'main', schema):
+        with ResourceWithCleanup.create_volume(w, 'main', schema, volume):
+            # Create a 50 MB file
+            f = io.BytesIO(bytes(range(256)) * 200000)
+            target_file = f'/Volumes/main/{schema}/{volume}/filesit-benchmark-{random()}.txt'
+            w.files.upload(target_file, f)
+
+            totals = {}
+            for chunk_size_kb in [20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, None]:
+                chunk_size = chunk_size_kb * 1024 if chunk_size_kb else None
+                total = 0
+                count = 10
+                for i in range(count):
+                    start = time.time()
+                    f = w.files.download(target_file).contents
+                    f.set_chunk_size(chunk_size)
+                    with f as vf:
+                        vf.read()
+                    end = time.time()
+                    total += end - start
+                avg_time = total / count
+                logging.info("[chunk_size=%s] Average time to download: %f seconds",
+                             str(chunk_size_kb) + 'kb' if chunk_size_kb else 'None', avg_time)
+                totals[chunk_size_kb] = avg_time
+            logging.info("Benchmark results:")
+            best: Tuple[Union[int, None], Union[float, None]] = (None, None)
+            for k, v in totals.items():
+                if best[1] is None or v < best[1]:
+                    best = (k, v)
+                logging.info("[chunk_size=%s] Average time to download: %f seconds",
+                             str(k) + 'kb' if k else 'None', v)
+            min_str = str(best[0]) + "kb" if best[0] else "None"
+            logging.info("Fastest chunk size: %s in %f seconds", min_str, best[1])
```

tests/test_core.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -264,7 +264,7 @@ class DummyResponse:
     def __init__(self, content: List[bytes]) -> None:
         self._content = iter(content)
 
-    def iter_content(self) -> Iterator[bytes]:
+    def iter_content(self, chunk_size: int = 1) -> Iterator[bytes]:
         return self._content
 
     def close(self):
```
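
With the widened `iter_content` signature, this dummy can stand in for `requests.Response` when unit-testing `StreamingResponse` directly. A hedged sketch of such a test; the test name and chunk contents are mine:

```python
from typing import Iterator, List

from databricks.sdk.core import StreamingResponse


class DummyResponse:  # mirrors the test double in the diff above
    def __init__(self, content: List[bytes]) -> None:
        self._content = iter(content)

    def iter_content(self, chunk_size: int = 1) -> Iterator[bytes]:
        return self._content

    def close(self):
        pass


def test_read_spans_chunk_boundaries():
    with StreamingResponse(DummyResponse([b'0123', b'4567'])) as stream:
        assert stream.read(6) == b'012345'  # buffered across the two chunks
        assert stream.read() == b'67'       # leftover bytes served from the buffer
```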
