@@ -53,36 +53,6 @@ def close(self) -> None:
5353 self .queue .clear ()
5454
5555
class ShardBatchReader(ConcatBatchReader):
    """
    A ConcatBatchReader that restricts the concatenated row stream to the
    half-open global row range [split_start_row, split_end_row).

    Batches fully inside the range pass through untouched, batches that
    straddle a boundary are sliced, and batches entirely outside the range
    are replaced by an empty RecordBatch.
    """

    def __init__(self, readers, split_start_row, split_end_row):
        super().__init__(readers)
        # Either bound may be None, meaning "unbounded" on that side.
        self.split_start_row = split_start_row
        self.split_end_row = split_end_row
        # Global row count emitted by the underlying readers so far; the
        # next batch begins at this index.
        self.cur_end = 0

    def read_arrow_batch(self) -> Optional[RecordBatch]:
        batch = super().read_arrow_batch()
        if batch is None:
            return None
        if self.split_start_row is None and self.split_end_row is None:
            # No shard bounds configured: pass everything through.
            return batch

        cur_begin = self.cur_end  # global index of this batch's first row
        self.cur_end += batch.num_rows
        # Normalize each bound independently. The original code guarded with
        # `or` but then compared BOTH bounds to ints, so a single-None bound
        # raised TypeError; a missing start means 0 and a missing end means
        # "no upper bound for this batch".
        start = 0 if self.split_start_row is None else self.split_start_row
        end = self.cur_end if self.split_end_row is None else self.split_end_row

        if start <= cur_begin < self.cur_end <= end:
            # Batch lies entirely inside the shard range.
            return batch
        elif cur_begin <= start < self.cur_end:
            # Batch straddles the start (and possibly also the end).
            return batch.slice(start - cur_begin,
                               min(end, self.cur_end) - start)
        elif cur_begin < end <= self.cur_end:
            # Batch straddles only the end of the range.
            return batch.slice(0, end - cur_begin)
        else:
            # Batch is completely outside the range.
            # NOTE(review): this empty batch has no schema; batch.slice(0, 0)
            # would preserve it — confirm downstream expectations before changing.
            return pa.RecordBatch.from_arrays([], [])
84-
85-
8656class MergeAllBatchReader (RecordBatchReader ):
8757 """
8858 A reader that accepts multiple reader suppliers and concatenates all their arrow batches
@@ -98,13 +68,18 @@ def __init__(self, reader_suppliers: List[Callable], batch_size: int = 4096):
9868
9969 def read_arrow_batch (self ) -> Optional [RecordBatch ]:
10070 if self .reader :
101- return self .reader .read_next_batch ()
71+ try :
72+ return self .reader .read_next_batch ()
73+ except StopIteration :
74+ return None
10275
10376 all_batches = []
10477
10578 # Read all batches from all reader suppliers
10679 for supplier in self .reader_suppliers :
10780 reader = supplier ()
81+ if reader is None :
82+ continue
10883 try :
10984 while True :
11085 batch = reader .read_arrow_batch ()
@@ -149,3 +124,65 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
def close(self) -> None:
    """Drop cached state so the merged batch and reader can be reclaimed."""
    self.reader = None
    self.merged_batch = None
127+
128+
class DataEvolutionMergeReader(RecordBatchReader):
    """
    A union reader that stitches one output batch out of several aligned
    inner readers, each responsible for reading one file.

    ``row_offsets[i]`` names the inner batch the i-th output field comes
    from, and ``field_offsets[i]`` is that field's column index inside that
    batch. For example, if row_offsets is {0, 2, 0, 1, 2, 1} and
    field_offsets is {0, 0, 1, 1, 1, 0}:
    - output field 0 is column 0 of batch 0,
    - output field 1 is column 0 of batch 2,
    - output field 2 is column 1 of batch 0,
    - output field 3 is column 1 of batch 1,
    - output field 4 is column 1 of batch 2,
    - output field 5 is column 0 of batch 1.
    """

    def __init__(self, row_offsets: List[int], field_offsets: List[int],
                 readers: List[Optional[RecordBatchReader]]):
        """
        :param row_offsets: per output field, index of the source batch.
        :param field_offsets: per output field, column index in that batch.
        :param readers: aligned inner readers; individual entries may be None.
        :raises ValueError: if either offset list is missing, the lists differ
            in length, the offsets are empty, or no readers are given.
        """
        if row_offsets is None:
            raise ValueError("Row offsets must not be null")
        if field_offsets is None:
            raise ValueError("Field offsets must not be null")
        if len(row_offsets) != len(field_offsets):
            raise ValueError("Row offsets and field offsets must have the same length")
        if not row_offsets:
            raise ValueError("Row offsets must not be empty")
        # `not readers` already covers both None and an empty list; the
        # original extra `len(readers) < 1` test was redundant.
        if not readers:
            raise ValueError("Readers should be more than 0")
        self.row_offsets = row_offsets
        self.field_offsets = field_offsets
        self.readers = readers

    def read_arrow_batch(self) -> Optional[RecordBatch]:
        """Read one aligned batch from every inner reader and merge them."""
        batches: List[Optional[RecordBatch]] = [None] * len(self.readers)
        for i, reader in enumerate(self.readers):
            if reader is not None:
                batch = reader.read_arrow_batch()
                if batch is None:
                    # All readers are row-aligned: once one is exhausted,
                    # the others have no matching rows either.
                    return None
                batches[i] = batch
        # Assemble the output columns in the order given by the offsets.
        columns = []
        names = []
        for batch_index, field_index in zip(self.row_offsets, self.field_offsets):
            source = batches[batch_index]
            if source is not None:
                columns.append(source.column(field_index))
                names.append(source.schema.names[field_index])
            # NOTE(review): offsets pointing at a None reader are silently
            # skipped — confirm callers never map fields to absent readers.
        if columns:
            return pa.RecordBatch.from_arrays(columns, names)
        return None

    def close(self) -> None:
        """
        Close every inner reader.

        A failure to close one reader no longer aborts the loop (which leaked
        every reader after the failing one); all readers are attempted and
        the first failure is re-raised afterwards.
        :raises IOError: if closing any inner reader fails.
        """
        first_error = None
        for reader in self.readers:
            if reader is None:
                continue
            try:
                reader.close()
            except Exception as e:
                if first_error is None:
                    first_error = e
        if first_error is not None:
            raise IOError("Failed to close inner readers") from first_error
0 commit comments