
Commit b9ccee6

[python] Support read blob row by offsets in with_shard feature (#6863)
1 parent c222436 commit b9ccee6

10 files changed: +366 -194 lines changed


paimon-python/pypaimon/read/reader/concat_batch_reader.py

Lines changed: 68 additions & 31 deletions
@@ -53,36 +53,6 @@ def close(self) -> None:
         self.queue.clear()
 
 
-class ShardBatchReader(ConcatBatchReader):
-
-    def __init__(self, readers, split_start_row, split_end_row):
-        super().__init__(readers)
-        self.split_start_row = split_start_row
-        self.split_end_row = split_end_row
-        self.cur_end = 0
-
-    def read_arrow_batch(self) -> Optional[RecordBatch]:
-        batch = super().read_arrow_batch()
-        if batch is None:
-            return None
-        if self.split_start_row is not None or self.split_end_row is not None:
-            cur_begin = self.cur_end  # begin idx of current batch based on the split
-            self.cur_end += batch.num_rows
-            # shard the first batch and the last batch
-            if self.split_start_row <= cur_begin < self.cur_end <= self.split_end_row:
-                return batch
-            elif cur_begin <= self.split_start_row < self.cur_end:
-                return batch.slice(self.split_start_row - cur_begin,
-                                   min(self.split_end_row, self.cur_end) - self.split_start_row)
-            elif cur_begin < self.split_end_row <= self.cur_end:
-                return batch.slice(0, self.split_end_row - cur_begin)
-            else:
-                # return empty RecordBatch if the batch size has not reached split_start_row
-                return pa.RecordBatch.from_arrays([], [])
-        else:
-            return batch
-
-
 class MergeAllBatchReader(RecordBatchReader):
     """
     A reader that accepts multiple reader suppliers and concatenates all their arrow batches
@@ -98,13 +68,18 @@ def __init__(self, reader_suppliers: List[Callable], batch_size: int = 4096):
 
     def read_arrow_batch(self) -> Optional[RecordBatch]:
         if self.reader:
-            return self.reader.read_next_batch()
+            try:
+                return self.reader.read_next_batch()
+            except StopIteration:
+                return None
 
         all_batches = []
 
         # Read all batches from all reader suppliers
         for supplier in self.reader_suppliers:
             reader = supplier()
+            if reader is None:
+                continue
             try:
                 while True:
                     batch = reader.read_arrow_batch()
@@ -149,3 +124,65 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
     def close(self) -> None:
         self.merged_batch = None
         self.reader = None
+
+
+class DataEvolutionMergeReader(RecordBatchReader):
+    """
+    A union reader that contains multiple inner readers; each inner reader is responsible for reading one file.
+
+    This reader assembles the inner readers into a single reader and merges the batches from all of them.
+
+    For example, if rowOffsets is {0, 2, 0, 1, 2, 1} and fieldOffsets is {0, 0, 1, 1, 1, 0}, it means:
+    - The first field comes from batch0, and it is at offset 0 in batch0.
+    - The second field comes from batch2, and it is at offset 0 in batch2.
+    - The third field comes from batch0, and it is at offset 1 in batch0.
+    - The fourth field comes from batch1, and it is at offset 1 in batch1.
+    - The fifth field comes from batch2, and it is at offset 1 in batch2.
+    - The sixth field comes from batch1, and it is at offset 0 in batch1.
+    """
+
+    def __init__(self, row_offsets: List[int], field_offsets: List[int], readers: List[Optional[RecordBatchReader]]):
+        if row_offsets is None:
+            raise ValueError("Row offsets must not be null")
+        if field_offsets is None:
+            raise ValueError("Field offsets must not be null")
+        if len(row_offsets) != len(field_offsets):
+            raise ValueError("Row offsets and field offsets must have the same length")
+        if not row_offsets:
+            raise ValueError("Row offsets must not be empty")
+        if not readers or len(readers) < 1:
+            raise ValueError("Readers should be more than 0")
+        self.row_offsets = row_offsets
+        self.field_offsets = field_offsets
+        self.readers = readers
+
+    def read_arrow_batch(self) -> Optional[RecordBatch]:
+        batches: List[Optional[RecordBatch]] = [None] * len(self.readers)
+        for i, reader in enumerate(self.readers):
+            if reader is not None:
+                batch = reader.read_arrow_batch()
+                if batch is None:
+                    # all readers are aligned; as soon as one returns None, the others have no data left either
+                    return None
+                batches[i] = batch
+        # Assemble the output record batch from the per-reader batches based on row_offsets and field_offsets
+        columns = []
+        names = []
+        for i in range(len(self.row_offsets)):
+            batch_index = self.row_offsets[i]
+            field_index = self.field_offsets[i]
+            if batches[batch_index] is not None:
+                column = batches[batch_index].column(field_index)
+                columns.append(column)
+                names.append(batches[batch_index].schema.names[field_index])
+        if columns:
+            return pa.RecordBatch.from_arrays(columns, names)
+        return None
+
+    def close(self) -> None:
+        try:
+            for reader in self.readers:
+                if reader is not None:
+                    reader.close()
+        except Exception as e:
+            raise IOError("Failed to close inner readers") from e
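To make the row_offsets / field_offsets mapping concrete, here is a minimal standalone sketch (plain pyarrow, not pypaimon code; the toy batches and field names are invented for illustration) that applies the same assembly loop to three aligned batches, using the offsets from the docstring example above:

import pyarrow as pa

# Three aligned batches, e.g. one per data file, all with the same row count.
batch0 = pa.RecordBatch.from_pydict({"id": [1, 2], "name": ["a", "b"]})
batch1 = pa.RecordBatch.from_pydict({"price": [9.5, 7.0], "qty": [3, 4]})
batch2 = pa.RecordBatch.from_pydict({"color": ["red", "blue"], "size": ["S", "M"]})
batches = [batch0, batch1, batch2]

# Output field i is column field_offsets[i] of batch row_offsets[i],
# mirroring the example values in the docstring.
row_offsets = [0, 2, 0, 1, 2, 1]
field_offsets = [0, 0, 1, 1, 1, 0]

columns, names = [], []
for batch_index, field_index in zip(row_offsets, field_offsets):
    columns.append(batches[batch_index].column(field_index))
    names.append(batches[batch_index].schema.names[field_index])

merged = pa.RecordBatch.from_arrays(columns, names)
print(merged.schema.names)  # ['id', 'color', 'name', 'qty', 'size', 'price']

The resulting batch interleaves columns from all three inputs in exactly the order the offsets dictate.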

paimon-python/pypaimon/read/reader/data_evolution_merge_reader.py

Lines changed: 0 additions & 85 deletions
This file was deleted.

paimon-python/pypaimon/read/reader/data_file_batch_reader.py

Lines changed: 7 additions & 3 deletions
@@ -22,14 +22,15 @@
 from pyarrow import RecordBatch
 
 from pypaimon.read.partition_info import PartitionInfo
+from pypaimon.read.reader.format_blob_reader import FormatBlobReader
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
 from pypaimon.schema.data_types import DataField, PyarrowFieldParser
 from pypaimon.table.special_fields import SpecialFields
 
 
 class DataFileBatchReader(RecordBatchReader):
     """
-    Reads record batch from data files.
+    Reads record batches from data files of different formats.
     """
 
     def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], partition_info: PartitionInfo,
@@ -48,8 +49,11 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self.max_sequence_number = max_sequence_number
         self.system_fields = system_fields
 
-    def read_arrow_batch(self) -> Optional[RecordBatch]:
-        record_batch = self.format_reader.read_arrow_batch()
+    def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
+        if isinstance(self.format_reader, FormatBlobReader):
+            record_batch = self.format_reader.read_arrow_batch(start_idx, end_idx)
+        else:
+            record_batch = self.format_reader.read_arrow_batch()
         if record_batch is None:
             return None
 
paimon-python/pypaimon/read/reader/format_blob_reader.py

Lines changed: 13 additions & 3 deletions
@@ -63,7 +63,11 @@ def __init__(self, file_io: FileIO, file_path: str, read_fields: List[str],
         self._blob_iterator = None
         self._current_batch = None
 
-    def read_arrow_batch(self) -> Optional[RecordBatch]:
+    def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
+        """
+        start_idx: index of the first blob record to read
+        end_idx: end index (exclusive) of the blob records to read
+        """
         if self._blob_iterator is None:
             if self.returned:
                 return None
@@ -73,7 +77,13 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
                 self.blob_offsets, self._fields[0]
             )
             self._blob_iterator = iter(batch_iterator)
-
+        read_size = self._batch_size
+        if start_idx is not None and end_idx is not None:
+            if self._blob_iterator.current_position >= end_idx:
+                return None
+            if self._blob_iterator.current_position < start_idx:
+                self._blob_iterator.current_position = start_idx
+            read_size = min(end_idx - self._blob_iterator.current_position, self._batch_size)
         # Collect records for this batch
         pydict_data = {name: [] for name in self._fields}
         records_in_batch = 0
@@ -93,7 +103,7 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
                     pydict_data[field_name].append(blob_data)
 
                     records_in_batch += 1
-                    if records_in_batch >= self._batch_size:
+                    if records_in_batch >= read_size:
                         break
 
         except StopIteration:
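The window logic added above boils down to a small amount of arithmetic. The following standalone sketch (with a hypothetical integer position cursor standing in for the real blob iterator) shows the same clamping: skip forward to start_idx, read nothing at or past end_idx, and cap the batch at whatever remains of the requested range.

def clamp_read_window(position, batch_size, start_idx=None, end_idx=None):
    # Unsharded read: keep the full batch size.
    if start_idx is None or end_idx is None:
        return position, batch_size
    # The shard is exhausted: nothing left to read.
    if position >= end_idx:
        return position, 0
    # Skip records that fall before the shard.
    if position < start_idx:
        position = start_idx
    # Never read past the end of the shard.
    return position, min(end_idx - position, batch_size)

# e.g. a blob file read with batch_size=4096 and shard [100, 150):
print(clamp_read_window(0, 4096, 100, 150))    # (100, 50)
print(clamp_read_window(150, 4096, 100, 150))  # (150, 0)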
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Optional
+
+from pyarrow import RecordBatch
+from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
+from pypaimon.read.reader.format_blob_reader import FormatBlobReader
+
+
+class ShardBatchReader(RecordBatchReader):
+    """
+    A reader that reads a subset of rows from a data file.
+    """
+    def __init__(self, reader, start_row, end_row):
+        self.reader = reader
+        self.start_row = start_row
+        self.end_row = end_row
+        self.current_row = 0
+
+    def read_arrow_batch(self) -> Optional[RecordBatch]:
+        # Check if the wrapped reader is backed by a FormatBlobReader (blob type)
+        if isinstance(self.reader.format_reader, FormatBlobReader):
+            # For a blob reader, pass start_idx and end_idx parameters
+            return self.reader.read_arrow_batch(start_idx=self.start_row, end_idx=self.end_row)
+        else:
+            # For a non-blob reader (DataFileBatchReader), use the standard read_arrow_batch
+            batch = self.reader.read_arrow_batch()
+
+            if batch is None:
+                return None
+
+            # Apply row range filtering for non-blob readers
+            batch_begin = self.current_row
+            self.current_row += batch.num_rows
+
+            # Check whether the batch falls within the desired range
+            if self.start_row <= batch_begin < self.current_row <= self.end_row:  # batch is within the desired range
+                return batch
+            elif batch_begin < self.start_row < self.current_row:  # batch starts before the desired range
+                return batch.slice(self.start_row - batch_begin, self.end_row - self.start_row)
+            elif batch_begin < self.end_row < self.current_row:  # batch ends after the desired range
+                return batch.slice(0, self.end_row - batch_begin)
+            else:  # batch is outside the desired range
+                return self.read_arrow_batch()
+
+    def close(self):
+        self.reader.close()
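For the non-blob path, the row-range filtering can be exercised on its own with a brief standalone sketch (plain pyarrow, not pypaimon code; the toy batch is invented): batches fully inside [start_row, end_row) pass through unchanged, boundary batches are sliced, and batches outside the range are skipped.

import pyarrow as pa

def shard_batch(batch, batch_begin, start_row, end_row):
    batch_end = batch_begin + batch.num_rows
    if start_row <= batch_begin and batch_end <= end_row:
        return batch                                    # fully inside the shard
    if batch_begin < start_row < batch_end:
        return batch.slice(start_row - batch_begin,     # starts before the shard
                           min(end_row, batch_end) - start_row)
    if batch_begin < end_row < batch_end:
        return batch.slice(0, end_row - batch_begin)    # ends after the shard
    return None                                         # entirely outside the shard

batch = pa.RecordBatch.from_pydict({"id": list(range(10))})
# A shard covering rows [3, 8) of a 10-row batch that starts at global row 0:
print(shard_batch(batch, 0, 3, 8).column(0).to_pylist())  # [3, 4, 5, 6, 7]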
