Commit 5406382

zstdfile: ensure we do not read more than size / IO_BLOCK_SIZE
In the previous implementation, _ZtsdFileReader.read could produce output of arbitrary size, which can cause memory spikes while decompressing a file. Instead, we should use a ZstdDecompressor.stream_reader, which decompresses incrementally into a fixed-size output buffer.
1 parent b9e21b2 commit 5406382
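
For context, a minimal sketch (not part of this commit) of the bounded-read behaviour the commit message describes, using the zstandard package directly; the sample payload and the 64 KiB chunk size are illustrative values, not anything taken from rohmu:

import io
import zstandard as zstd

# Compress a payload much larger than the chunks we will read back.
raw = b"example payload " * 100_000
compressed = zstd.ZstdCompressor().compress(raw)

# stream_reader decompresses incrementally: each read(n) returns at most n
# bytes, so peak memory stays bounded regardless of the decompressed size.
reader = zstd.ZstdDecompressor().stream_reader(io.BytesIO(compressed), read_size=1024 * 1024)
out = bytearray()
while True:
    chunk = reader.read(64 * 1024)  # never more than 64 KiB per call
    if not chunk:
        break
    out.extend(chunk)
assert bytes(out) == raw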

File tree

2 files changed: +39 −16 lines


rohmu/zstdfile.py

Lines changed: 13 additions & 16 deletions
@@ -43,31 +43,28 @@ def writable(self) -> bool:
 
 class _ZtsdFileReader(FileWrap):
     def __init__(self, next_fp: FileLike) -> None:
-        self._zstd = zstd.ZstdDecompressor().decompressobj()
+        self._stream = zstd.ZstdDecompressor().stream_reader(
+            next_fp,  # type: ignore[arg-type]
+            read_size=IO_BLOCK_SIZE,
+            read_across_frames=True,
+        )
         super().__init__(next_fp)
-        self._done = False
 
     def close(self) -> None:
         if self.closed:
             return
+        self._stream.close()
         super().close()
 
     def read(self, size: Optional[int] = -1) -> bytes:
-        # NOTE: size arg is ignored, random size output is returned
         self._check_not_closed()
-        while not self._done:
-            compressed = self.next_fp.read(IO_BLOCK_SIZE)
-            if not compressed:
-                self._done = True
-                output = self._zstd.flush() or b""
-            else:
-                output = self._zstd.decompress(compressed)
-
-            if output:
-                self.offset += len(output)
-                return output
-
-        return b""
+        if size == 0:
+            return b""
+
+        read_size = size if size and size > 0 else IO_BLOCK_SIZE
+        data = self._stream.read(read_size)
+        self.offset += len(data)
+        return data
 
     def readable(self) -> bool:
         return True
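
An aside on the two parameters passed above: read_size controls how many compressed bytes are pulled from the source file per internal read, and read_across_frames=True makes the reader continue past zstd frame boundaries instead of reporting EOF after the first frame. A small illustrative sketch of the latter (not rohmu code; the sample data is invented):

import io
import zstandard as zstd

# Two independent zstd frames concatenated in one stream, as can happen when a
# file is produced by several separate compression passes.
frames = zstd.ZstdCompressor().compress(b"first ") + zstd.ZstdCompressor().compress(b"second")

reader = zstd.ZstdDecompressor().stream_reader(io.BytesIO(frames), read_across_frames=True)
out = b""
while True:
    chunk = reader.read(4)  # small reads, still bounded per call
    if not chunk:
        break
    out += chunk
assert out == b"first second"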

test/test_zstdfile.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 Aiven, Helsinki, Finland. https://aiven.io/
+# See LICENSE for details
+from rohmu import zstdfile
+
+import io
+
+
+def test_compress_and_decompress() -> None:
+    """Test basic compression and decompression"""
+    original_data = b"Hello, World! " * 10_000
+
+    compressed_buffer = io.BytesIO()
+    with zstdfile.open(compressed_buffer, "wb", level=3) as zf:
+        written = zf.write(original_data)
+    assert written == len(original_data)
+
+    compressed_buffer.seek(0)
+    decompressed_data = b""
+    with zstdfile.open(compressed_buffer, "rb") as zf:
+        chunk = zf.read(512)
+        assert len(chunk) <= 512
+        while chunk:
+            decompressed_data += chunk
+            chunk = zf.read(512)
+
+    assert decompressed_data == original_data
