Skip to content

Commit b0cd965

Browse files
[Batched Storage] Reset index after slicing dataframe in storage.py (#444)
* [Batched Storage] Reset index after slicing dataframe in storage.py

  Fix indexing to reset the index after slicing the dataframe.

* Add comments for reset_index

---------

Co-authored-by: Ma, Xiaochen <mxch1122@126.com>
1 parent 0bc7e82 commit b0cd965

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

dataflow/utils/storage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,7 @@ def read(self, output_type: Literal["dataframe", "dict"]="dataframe") -> Any:
10181018
if self.batch_size:
10191019
dataframe = dataframe.iloc[
10201020
self.batch_step * self.batch_size : (self.batch_step + 1) * self.batch_size
1021-
]
1021+
].reset_index(drop=True) # Reset the index so that every batch's index starts from 0 instead of keeping its original index.
10221022
return self._convert_output(dataframe, output_type)
10231023

10241024
def write(self, data: Any) -> Any:
@@ -1181,4 +1181,4 @@ def iter_chunks(self) -> Generator[pd.DataFrame, None, None]:
11811181
for chunk in reader:
11821182
yield chunk
11831183
else:
1184-
yield reader # 如果不支持流式,就产出整个 DF
1184+
yield reader # 如果不支持流式,就产出整个 DF

0 commit comments

Comments
 (0)