Skip to content

Commit 979e743

Browse files
authored
REF: split big method in pyarrow csv wrapper (#62087)
1 parent 1bd1830 commit 979e743

File tree

1 file changed

+47 -30 lines changed

1 file changed

+47 -30 lines changed

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 47 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -140,20 +140,29 @@ def handle_warning(invalid_row) -> str:
140140
"encoding": self.encoding,
141141
}
142142

143-
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
144-
"""
145-
Processes data read in based on kwargs.
143+
def _get_convert_options(self):
144+
pyarrow_csv = import_optional_dependency("pyarrow.csv")
146145

147-
Parameters
148-
----------
149-
frame: DataFrame
150-
The DataFrame to process.
146+
try:
147+
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
148+
except TypeError as err:
149+
include = self.convert_options.get("include_columns", None)
150+
if include is not None:
151+
self._validate_usecols(include)
151152

152-
Returns
153-
-------
154-
DataFrame
155-
The processed DataFrame.
156-
"""
153+
nulls = self.convert_options.get("null_values", set())
154+
if not lib.is_list_like(nulls) or not all(
155+
isinstance(x, str) for x in nulls
156+
):
157+
raise TypeError(
158+
"The 'pyarrow' engine requires all na_values to be strings"
159+
) from err
160+
161+
raise
162+
163+
return convert_options
164+
165+
def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
157166
num_cols = len(frame.columns)
158167
multi_index_named = True
159168
if self.header is None:
@@ -169,8 +178,9 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
169178
self.names = columns_prefix + self.names
170179
multi_index_named = False
171180
frame.columns = self.names
181+
return frame, multi_index_named
172182

173-
frame = self._do_date_conversions(frame.columns, frame)
183+
def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
174184
if self.index_col is not None:
175185
index_to_set = self.index_col.copy()
176186
for i, item in enumerate(self.index_col):
@@ -196,6 +206,9 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
196206
if self.header is None and not multi_index_named:
197207
frame.index.names = [None] * len(frame.index.names)
198208

209+
return frame
210+
211+
def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
199212
if self.dtype is not None:
200213
# Ignore non-existent columns from dtype mapping
201214
# like other parsers do
@@ -214,6 +227,26 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
214227
raise ValueError(str(err)) from err
215228
return frame
216229

230+
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
231+
"""
232+
Processes data read in based on kwargs.
233+
234+
Parameters
235+
----------
236+
frame: DataFrame
237+
The DataFrame to process.
238+
239+
Returns
240+
-------
241+
DataFrame
242+
The processed DataFrame.
243+
"""
244+
frame, multi_index_named = self._adjust_column_names(frame)
245+
frame = self._do_date_conversions(frame.columns, frame)
246+
frame = self._finalize_index(frame, multi_index_named)
247+
frame = self._finalize_dtype(frame)
248+
return frame
249+
217250
def _validate_usecols(self, usecols) -> None:
218251
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
219252
raise ValueError(
@@ -239,23 +272,7 @@ def read(self) -> DataFrame:
239272
pa = import_optional_dependency("pyarrow")
240273
pyarrow_csv = import_optional_dependency("pyarrow.csv")
241274
self._get_pyarrow_options()
242-
243-
try:
244-
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
245-
except TypeError as err:
246-
include = self.convert_options.get("include_columns", None)
247-
if include is not None:
248-
self._validate_usecols(include)
249-
250-
nulls = self.convert_options.get("null_values", set())
251-
if not lib.is_list_like(nulls) or not all(
252-
isinstance(x, str) for x in nulls
253-
):
254-
raise TypeError(
255-
"The 'pyarrow' engine requires all na_values to be strings"
256-
) from err
257-
258-
raise
275+
convert_options = self._get_convert_options()
259276

260277
try:
261278
table = pyarrow_csv.read_csv(

0 commit comments

Comments (0)