Commit a33dc9a

[SYNPY-1584] Add a buffer.tell when truncating bytes during dataframe upload and drop writing header to csv (#1193)

1 parent a64023d, commit a33dc9a
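
The title names two related changes to the chunked DataFrame upload path: every part is now appended with header=False, so the column-name line is no longer re-emitted inside get_partial_dataframe_chunk (the header line is still captured separately in _chunk_and_upload_df), and the cached byte count is refreshed with buffer.tell() after the buffer is trimmed. As a rough illustration of the first point (a hedged sketch, not the library's code), pandas appends only data rows when header=False:

import pandas as pd
from io import BytesIO

# Minimal sketch, assuming a pandas version that accepts a binary buffer for to_csv
# (the same pattern get_partial_dataframe_chunk uses). With header=False, repeated
# appends to the shared buffer never re-emit the "a,b" column-name line.
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
buffer = BytesIO()
df.to_csv(buffer, mode="a", header=False, index=False, float_format="%.12g")
df.to_csv(buffer, mode="a", header=False, index=False, float_format="%.12g")
print(buffer.getvalue())  # only data rows, e.g. b'1,3\n2,4\n1,3\n2,4\n'; no header line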

File tree

3 files changed: +70 -15 lines

synapseclient/core/upload/upload_utils.py
synapseclient/models/mixins/table_components.py
tests/integration/synapseclient/models/async/test_table_async.py

synapseclient/core/upload/upload_utils.py (2 additions, 3 deletions)

@@ -55,7 +55,6 @@ def get_partial_dataframe_chunk(
         (total_size_of_chunks_being_uploaded - ((part_number - 1) * part_size)),
         part_size,
     )
-    header_written = False
     # TODO: This is an area for optimization. It is possible to avoid writing the entire
     # dataframe to a buffer and then reading the buffer to get the bytes. Instead, we
     # might be able to do something like keeping markers at each 100 row increment how
@@ -68,12 +67,11 @@ def get_partial_dataframe_chunk(
         df.iloc[offset_start:end].to_csv(
             buffer,
             mode="a",
-            header=(part_number == 1 and not header_written),
+            header=False,
             index=False,
             float_format="%.12g",
             **(to_csv_kwargs or {}),
         )
-        header_written = True
         number_of_bytes_in_buffer = buffer.tell()
         # Drop data from the front of the buffer until total_offset is 0
         if total_offset > 0 and total_offset >= number_of_bytes_in_buffer:
@@ -89,6 +87,7 @@ def get_partial_dataframe_chunk(
             buffer.truncate(0)
             buffer.write(copy_of_data)
             total_offset = 0
+            number_of_bytes_in_buffer = buffer.tell()

         if number_of_bytes_in_buffer >= max_bytes_to_read:
             # Return maximum number of bytes that can be read from the buffer
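
The last hunk is the buffer.tell fix named in the commit title: after the already-uploaded prefix is dropped and the remaining bytes are written back, the cached number_of_bytes_in_buffer has to be re-read, otherwise the number_of_bytes_in_buffer >= max_bytes_to_read check that follows runs against the stale, pre-truncate size. A minimal standalone sketch of that pattern (illustrative values, simplified from the function above):

from io import BytesIO

# Hedged, simplified version of the trimming step in get_partial_dataframe_chunk.
buffer = BytesIO(b"0123456789")                      # pretend these are CSV bytes for a part
number_of_bytes_in_buffer = len(buffer.getvalue())   # 10 bytes before trimming
total_offset = 4                                     # bytes consumed by earlier parts (made up)

buffer.seek(total_offset)
copy_of_data = buffer.read()                         # keep only the unread tail: b"456789"
buffer.seek(0)
buffer.truncate(0)
buffer.write(copy_of_data)
total_offset = 0
number_of_bytes_in_buffer = buffer.tell()            # the added line: now 6, not the stale 10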

synapseclient/models/mixins/table_components.py (1 addition, 9 deletions)

@@ -3468,10 +3468,8 @@ async def _chunk_and_upload_df(

         chunks_to_upload = []
         size_of_chunk = 0
-        previous_chunk_byte_offset = 0
         buffer = BytesIO()
         total_df_bytes = 0
-        size_of_header = 0
         header_line = None
         md5_hashlib = hashlib.new("md5", usedforsecurity=False)  # nosec
         line_start_index_for_chunk = 0
@@ -3494,28 +3492,23 @@ async def _chunk_and_upload_df(
             if start == 0:
                 buffer.seek(0)
                 header_line = buffer.readline()
-                size_of_header = len(header_line)
-                previous_chunk_byte_offset = size_of_header
             md5_hashlib.update(buffer.getvalue())

             if size_of_chunk >= insert_size_bytes:
                 chunks_to_upload.append(
                     (
-                        previous_chunk_byte_offset,
                         size_of_chunk,
                         md5_hashlib.hexdigest(),
                         line_start_index_for_chunk,
                         line_end_index_for_chunk,
                     )
                 )
-                previous_chunk_byte_offset = size_of_header
                 size_of_chunk = 0
                 line_start_index_for_chunk = line_end_index_for_chunk
                 md5_hashlib = hashlib.new("md5", usedforsecurity=False)  # nosec
         if size_of_chunk > 0:
             chunks_to_upload.append(
                 (
-                    previous_chunk_byte_offset,
                     size_of_chunk,
                     md5_hashlib.hexdigest(),
                     line_start_index_for_chunk,
@@ -3558,7 +3551,6 @@ async def _chunk_and_upload_df(
         wait_for_update_semaphore = asyncio.Semaphore(value=1)
         part = 0
         for (
-            byte_chunk_offset,
             size_of_chunk,
             md5,
             line_start,
@@ -3569,7 +3561,7 @@ async def _chunk_and_upload_df(
             self._stream_and_update_from_df(
                 client=client,
                 size_of_chunk=size_of_chunk,
-                byte_chunk_offset=byte_chunk_offset,
+                byte_chunk_offset=0,
                 md5=md5,
                 csv_table_descriptor=csv_table_descriptor,
                 job_timeout=job_timeout,
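
With the header no longer written into each part, every chunk's CSV payload starts at byte zero, so _chunk_and_upload_df can drop the previous_chunk_byte_offset / size_of_header bookkeeping and pass byte_chunk_offset=0 directly. Below is a self-contained sketch of the bookkeeping that remains in this hunk, with made-up rows and threshold (the real code hashes the whole buffer, and hashlib's usedforsecurity flag needs Python 3.9+):

import hashlib

# Hypothetical, stripped-down version of the chunk bookkeeping in _chunk_and_upload_df:
# accumulate CSV row bytes, hash them per chunk, and queue
# (size_of_chunk, md5, line_start, line_end) tuples -- no byte offset any more.
rows = [b"a,1\n", b"b,2\n", b"c,3\n"]   # stand-in for CSV-encoded DataFrame rows
insert_size_bytes = 8                   # made-up threshold; the real one is configurable

chunks_to_upload = []
size_of_chunk = 0
line_start_index_for_chunk = 0
line_end_index_for_chunk = 0
md5_hashlib = hashlib.new("md5", usedforsecurity=False)  # nosec

for line_end_index_for_chunk, row_bytes in enumerate(rows, start=1):
    md5_hashlib.update(row_bytes)
    size_of_chunk += len(row_bytes)
    if size_of_chunk >= insert_size_bytes:
        chunks_to_upload.append(
            (
                size_of_chunk,
                md5_hashlib.hexdigest(),
                line_start_index_for_chunk,
                line_end_index_for_chunk,
            )
        )
        size_of_chunk = 0
        line_start_index_for_chunk = line_end_index_for_chunk
        md5_hashlib = hashlib.new("md5", usedforsecurity=False)  # nosec

if size_of_chunk > 0:  # flush whatever is left after the loop
    chunks_to_upload.append(
        (
            size_of_chunk,
            md5_hashlib.hexdigest(),
            line_start_index_for_chunk,
            line_end_index_for_chunk,
        )
    )

# chunks_to_upload now holds two tuples: lines 0-2 and lines 2-3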

tests/integration/synapseclient/models/async/test_table_async.py (67 additions, 3 deletions)

@@ -1,8 +1,11 @@
 import json
 import os
+import random
+import string
 import tempfile
 import uuid
 from typing import Callable
+from unittest import skip

 import pandas as pd
 import pytest
@@ -901,9 +904,6 @@ async def test_store_rows_as_df_being_split_and_uploaded(
                 "large_string": [large_string_a for _ in range(200)],
             }
         )
-        filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv"
-        self.schedule_for_cleanup(filepath)
-        data_for_table.to_csv(filepath, index=False, float_format="%.12g")

         # WHEN I store the rows to the table
         await table.store_rows_async(
@@ -937,6 +937,70 @@ async def test_store_rows_as_df_being_split_and_uploaded(
         # Note: DataFrames have a minimum of 100 rows per batch
         assert spy_send_job.call_count == 2

+    @skip("Skip in normal testing because the large size makes it slow")
+    async def test_store_rows_as_large_df_being_split_and_uploaded(
+        self, project_model: Project, mocker: MockerFixture
+    ) -> None:
+        # GIVEN a table in Synapse
+        table_name = str(uuid.uuid4())
+        table = Table(
+            name=table_name,
+            parent_id=project_model.id,
+            columns=[
+                Column(name="column_string", column_type=ColumnType.STRING),
+                Column(name="column_to_order_on", column_type=ColumnType.INTEGER),
+                Column(
+                    name="large_string",
+                    column_type=ColumnType.LARGETEXT,
+                ),
+            ],
+        )
+        table = await table.store_async(synapse_client=self.syn)
+        self.schedule_for_cleanup(table.id)
+        spy_send_job = mocker.spy(asynchronous_job_module, "send_job_async")
+
+        # AND data that will be split into multiple parts
+        rows_in_table = 20
+        random_string = "".join(random.choices(string.ascii_uppercase, k=500000))
+        data_for_table = pd.DataFrame(
+            {
+                "column_string": [f"value{i}" for i in range(rows_in_table)],
+                "column_to_order_on": [i for i in range(rows_in_table)],
+                "large_string": [random_string for _ in range(rows_in_table)],
+            }
+        )
+
+        # WHEN I store the rows to the table
+        await table.store_rows_async(
+            values=data_for_table,
+            schema_storage_strategy=None,
+            synapse_client=self.syn,
+            insert_size_bytes=1 * utils.KB,
+        )
+
+        # AND I query the table
+        results = await query_async(
+            f"SELECT * FROM {table.id} ORDER BY column_to_order_on ASC",
+            synapse_client=self.syn,
+        )
+
+        # THEN the data in the columns should match
+        pd.testing.assert_series_equal(
+            results["column_string"], data_for_table["column_string"]
+        )
+        pd.testing.assert_series_equal(
+            results["column_to_order_on"], data_for_table["column_to_order_on"]
+        )
+        pd.testing.assert_series_equal(
+            results["large_string"], data_for_table["large_string"]
+        )
+
+        # AND `rows_in_table` rows exist on the table
+        assert len(results) == rows_in_table
+
+        # AND The spy should have been called in multiple batches
+        assert spy_send_job.call_count == 1
+

 class TestUpsertRows:
     @pytest.fixture(autouse=True, scope="function")
