Buffer more data in object store writer (#4619)

AdamGS · web-flow · commit b89c1f85d220 · 2025-09-24T11:47:34.000+01:00
I realized that buffering more data before uploading makes more sense:
1. The buffer (128MB) is now bigger than the chunk size (16MB), so we don't
submit a part immediately when we pass the chunk size; several parts are
submitted together, which should give us some more concurrency.
2. We upload parts in sizes that are aligned to 16MB, which the AWS
perf guide seems to indicate is a good idea when reading the data back.
3. Also fixes two typos.

Signed-off-by: Adam Gutglick <adam@spiraldb.com>
diff --git a/vortex-io/src/object_store.rs b/vortex-io/src/object_store.rs
@@ -13,23 +13,24 @@ use vortex_error::VortexResult;
 
 use crate::{IoBuf, VortexWrite};
 
-/// Adapter type to write data through a [`ObjectStore`] instace.
+/// Adapter type to write data through a [`ObjectStore`] instance.
 ///
-/// After writing, the caller must make sure to call `shutdonw`, in order to ensure the data is actually persisted.
+/// After writing, the caller must make sure to call `shutdown`, in order to ensure the data is actually persisted.
 pub struct ObjectStoreWriter {
     upload: Box<dyn MultipartUpload>,
     buffer: BytesMut,
     put_result: Option<PutResult>,
 }
 
-const CHUNKS_SIZE: usize = 25 * 1024 * 1024;
+const CHUNK_SIZE: usize = 16 * 1024 * 1024;
+const BUFFER_SIZE: usize = 128 * 1024 * 1024;
 
 impl ObjectStoreWriter {
     pub async fn new(object_store: Arc<dyn ObjectStore>, location: &Path) -> VortexResult<Self> {
         let upload = object_store.put_multipart(location).await?;
         Ok(Self {
             upload,
-            buffer: BytesMut::with_capacity(CHUNKS_SIZE),
+            buffer: BytesMut::with_capacity(CHUNK_SIZE),
             put_result: None,
         })
     }
@@ -44,12 +45,15 @@ impl VortexWrite for ObjectStoreWriter {
         self.buffer.extend_from_slice(buffer.as_slice());
         let parts = FuturesUnordered::new();
 
-        // Split off chunks while buffer is larger than CHUNKS_SIZE
-        while self.buffer.len() > CHUNKS_SIZE {
-            let payload = self.buffer.split_to(CHUNKS_SIZE).freeze();
-            let part_fut = self.upload.put_part(PutPayload::from_bytes(payload));
+        // If the buffer is full
+        if self.buffer.len() > BUFFER_SIZE {
+            // Split off chunks while buffer is larger than CHUNK_SIZE
+            while self.buffer.len() > CHUNK_SIZE {
+                let payload = self.buffer.split_to(CHUNK_SIZE).freeze();
+                let part_fut = self.upload.put_part(PutPayload::from_bytes(payload));
 
-            parts.push(part_fut);
+                parts.push(part_fut);
+            }
         }
 
         parts.try_collect::<Vec<_>>().await?;
@@ -60,8 +64,8 @@ impl VortexWrite for ObjectStoreWriter {
     async fn flush(&mut self) -> io::Result<()> {
         let parts = FuturesUnordered::new();
 
-        while self.buffer.len() > CHUNKS_SIZE {
-            let payload = self.buffer.split_to(CHUNKS_SIZE).freeze();
+        while self.buffer.len() > CHUNK_SIZE {
+            let payload = self.buffer.split_to(CHUNK_SIZE).freeze();
             let part_fut = self.upload.put_part(PutPayload::from_bytes(payload));
 
             parts.push(part_fut);