Skip to content

Commit 35db9a9

Browse files
authored
[SYNPY-1521] Fixes asDataFrame kwarg Collision Issue (#1132)
* fixes kwarg collision * pre-commit * adds typehints to _csv_to_pandas_df * adds unit tests for _csv_to_pandas_df * pre-commit fix * fix line separator behavior * splits comment lines * pre-commit whitespaces * removes extra **kwargs * updates docstrings * pre-commit
1 parent 857154e commit 35db9a9

File tree

2 files changed

+237
-21
lines changed

2 files changed

+237
-21
lines changed

synapseclient/table.py

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
import re
3535
import tempfile
3636
from builtins import zip
37-
from typing import Dict, List, Tuple, TypeVar, Union
37+
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
3838

3939
from synapseclient.core.constants import concrete_types
4040
from synapseclient.core.exceptions import SynapseError
@@ -397,16 +397,16 @@ def _convert_df_date_cols_to_datetime(
397397

398398

399399
def _csv_to_pandas_df(
400-
filepath,
401-
separator=DEFAULT_SEPARATOR,
402-
quote_char=DEFAULT_QUOTE_CHARACTER,
403-
escape_char=DEFAULT_ESCAPSE_CHAR,
404-
contain_headers=True,
405-
lines_to_skip=0,
406-
date_columns=None,
407-
list_columns=None,
408-
rowIdAndVersionInIndex=True,
409-
dtype=None,
400+
filepath: str,
401+
separator: str = DEFAULT_SEPARATOR,
402+
quote_char: str = DEFAULT_QUOTE_CHARACTER,
403+
escape_char: str = DEFAULT_ESCAPSE_CHAR,
404+
contain_headers: bool = True,
405+
lines_to_skip: int = 0,
406+
date_columns: Optional[List[str]] = None,
407+
list_columns: Optional[List[str]] = None,
408+
rowIdAndVersionInIndex: bool = True,
409+
dtype: Optional[Dict[str, Any]] = None,
410410
**kwargs,
411411
):
412412
"""
@@ -415,14 +415,20 @@ def _csv_to_pandas_df(
415415
Arguments:
416416
filepath: The path to the file.
417417
separator: The separator for the file, Defaults to `DEFAULT_SEPARATOR`.
418+
Passed as `sep` to pandas. If `sep` is supplied as a `kwarg`
419+
it will be used instead of this `separator` argument.
418420
quote_char: The quote character for the file,
419-
Defaults to `DEFAULT_QUOTE_CHARACTER`.
421+
Defaults to `DEFAULT_QUOTE_CHARACTER`.
422+
Passed as `quotechar` to pandas. If `quotechar` is supplied as a `kwarg`
423+
it will be used instead of this `quote_char` argument.
420424
escape_char: The escape character for the file,
421425
Defaults to `DEFAULT_ESCAPSE_CHAR`.
422-
contain_headers: Whether the file contains headers,
426+
contain_headers: Whether the file contains headers,
423427
Defaults to `True`.
424428
lines_to_skip: The number of lines to skip at the beginning of the file,
425-
Defaults to `0`.
429+
Defaults to `0`. Passed as `skiprows` to pandas.
430+
If `skiprows` is supplied as a `kwarg`
431+
it will be used instead of this `lines_to_skip` argument.
426432
date_columns: The names of the date columns in the file
427433
list_columns: The names of the list columns in the file
428434
rowIdAndVersionInIndex: Whether the file contains rowId and
@@ -440,21 +446,26 @@ def _csv_to_pandas_df(
440446

441447
line_terminator = str(os.linesep)
442448

449+
pandas_args = {
450+
"dtype": dtype,
451+
"sep": separator,
452+
"quotechar": quote_char,
453+
"escapechar": escape_char,
454+
"header": 0 if contain_headers else None,
455+
"skiprows": lines_to_skip,
456+
}
457+
pandas_args.update(kwargs)
458+
443459
# assign line terminator only if for single character
444460
# line terminators (e.g. not '\r\n') 'cause pandas doesn't
445461
# longer line terminators. See: <https://github.com/pydata/pandas/issues/3501>
446462
# "ValueError: Only length-1 line terminators supported"
447463
df = pd.read_csv(
448464
filepath,
449-
dtype=dtype,
450-
sep=separator,
451465
lineterminator=line_terminator if len(line_terminator) == 1 else None,
452-
quotechar=quote_char,
453-
escapechar=escape_char,
454-
header=0 if contain_headers else None,
455-
skiprows=lines_to_skip,
456-
**kwargs,
466+
**pandas_args,
457467
)
468+
458469
# parse date columns if exists
459470
if date_columns:
460471
df = _convert_df_date_cols_to_datetime(df, date_columns)
@@ -2425,6 +2436,12 @@ def __init__(
24252436
includeRowIdAndRowVersion=None,
24262437
headers=None,
24272438
):
2439+
"""Initialize a CsvFileTable object.
2440+
2441+
Note: Some arguments provided to this constructor are passed to pandas.read_csv()
2442+
including `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`.
2443+
These arguments can be overwritten by passing `pandas.read_csv` kwargs to `asDataFrame`.
2444+
"""
24282445
self.filepath = filepath
24292446

24302447
self.includeRowIdAndRowVersion = includeRowIdAndRowVersion
@@ -2481,6 +2498,11 @@ def asDataFrame(
24812498
):
24822499
"""Convert query result to a Pandas DataFrame.
24832500
2501+
Note: Class attributes `quoteCharacter`, `escapeCharacter`, `lineEnd`, and `separator`
2502+
are used as `quotechar`, `escapechar`, `lineterminator`, and `sep` in `pandas.read_csv`
2503+
within this method. If you want to override these values, you can do so by passing the
2504+
appropriate keyword arguments to this method.
2505+
24842506
Arguments:
24852507
rowIdAndVersionInIndex: Make the dataframe index consist of the
24862508
row_id and row_version (and row_etag if it exists)

tests/unit/synapseclient/unit_test_tables.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Unit test for synapseclient.table"""
22
import csv
33
import io
4+
import json
45
import math
56
import os
67
import shutil
@@ -40,6 +41,7 @@
4041
Table,
4142
TableQueryResult,
4243
_convert_df_date_cols_to_datetime,
44+
_csv_to_pandas_df,
4345
_get_view_type_mask,
4446
_get_view_type_mask_for_deprecated_type,
4547
as_table_columns,
@@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
251253
assert_frame_equal(test_df2, expected_date_df)
252254

253255

256+
def test_csv_to_pandas_df_no_kwargs():
257+
# GIVEN a pandas DataFrame (CSV file stand-in)
258+
expected_df = pd.DataFrame(
259+
{"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
260+
)
261+
262+
with patch.object(
263+
pd, "read_csv", return_value=expected_df
264+
) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
265+
# WHEN I call _csv_to_pandas_df with default parameters
266+
df = _csv_to_pandas_df(
267+
filepath="dummy_path.csv",
268+
separator=synapseclient.table.DEFAULT_SEPARATOR,
269+
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
270+
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
271+
contain_headers=True,
272+
lines_to_skip=0,
273+
date_columns=None,
274+
list_columns=None,
275+
rowIdAndVersionInIndex=True,
276+
dtype=None,
277+
)
278+
279+
# THEN I expect pandas.read_csv was called with default arguments
280+
mock_read_csv.assert_called_once_with(
281+
"dummy_path.csv",
282+
dtype=None,
283+
sep=synapseclient.table.DEFAULT_SEPARATOR,
284+
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
285+
escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
286+
header=0,
287+
skiprows=0,
288+
lineterminator=None,
289+
)
290+
291+
# AND I expect the returned DataFrame to be
292+
# the same as the original DataFrame (file)
293+
pd.testing.assert_frame_equal(df, expected_df)
294+
295+
296+
def test_csv_to_pandas_df_with_kwargs() -> None:
297+
# GIVEN a pandas DataFrame (CSV file stand-in)
298+
expected_df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
299+
300+
with patch.object(
301+
pd, "read_csv", return_value=expected_df
302+
) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
303+
# WHEN I call _csv_to_pandas_df with custom keyword arguments
304+
kwargs = {"escapechar": "\\", "keep_default_na": False}
305+
df = _csv_to_pandas_df(
306+
filepath="dummy_path.csv",
307+
separator=synapseclient.table.DEFAULT_SEPARATOR,
308+
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
309+
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
310+
contain_headers=True,
311+
lines_to_skip=0,
312+
date_columns=None,
313+
list_columns=None,
314+
rowIdAndVersionInIndex=True,
315+
dtype=None,
316+
**kwargs,
317+
)
318+
319+
# THEN I expect pandas.read_csv was called with the keyword arguments
320+
mock_read_csv.assert_called_once_with(
321+
"dummy_path.csv",
322+
dtype=None,
323+
sep=synapseclient.table.DEFAULT_SEPARATOR,
324+
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
325+
escapechar="\\",
326+
header=0,
327+
skiprows=0,
328+
keep_default_na=False,
329+
lineterminator=None,
330+
)
331+
332+
# AND I expect the returned DataFrame to match the expected DataFrame
333+
pd.testing.assert_frame_equal(df, expected_df)
334+
335+
336+
def test_csv_to_pandas_df_calls_convert_date_cols():
337+
# GIVEN a pandas DataFrame (CSV file stand-in) with a date column
338+
expected_df = pd.DataFrame(
339+
{"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
340+
)
341+
342+
with patch.object(pd, "read_csv", return_value=expected_df), patch.object(
343+
synapseclient.table, "_convert_df_date_cols_to_datetime"
344+
) as mock_convert_dates:
345+
# WHEN I call _csv_to_pandas_df with date_columns specified
346+
_csv_to_pandas_df(
347+
filepath="dummy_path.csv",
348+
separator=synapseclient.table.DEFAULT_SEPARATOR,
349+
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
350+
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
351+
contain_headers=True,
352+
lines_to_skip=0,
353+
date_columns=["date_col"], # Specify date column
354+
list_columns=None,
355+
rowIdAndVersionInIndex=True,
356+
dtype=None,
357+
)
358+
359+
# THEN I expect _convert_df_date_cols_to_datetime to be
360+
# called with the expected DataFrame and date columns
361+
mock_convert_dates.assert_called_once_with(expected_df, ["date_col"])
362+
363+
364+
def test_csv_to_pandas_df_handles_list_columns():
365+
# GIVEN a pandas DataFrame (CSV file stand-in) with a list column
366+
initial_df = pd.DataFrame(
367+
{"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
368+
)
369+
370+
# AND a pandas DataFrame (expected result) with the list column converted to a list
371+
expected_final_df = pd.DataFrame(
372+
{"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
373+
)
374+
375+
with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
376+
synapseclient.table, "_convert_df_date_cols_to_datetime"
377+
), patch.object(
378+
pd.Series, "apply", return_value=expected_final_df["list_col"]
379+
) as mock_apply:
380+
# WHEN I call _csv_to_pandas_df with list_columns specified
381+
result_df = synapseclient.table._csv_to_pandas_df(
382+
filepath="dummy_path.csv",
383+
separator=synapseclient.table.DEFAULT_SEPARATOR,
384+
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
385+
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
386+
contain_headers=True,
387+
lines_to_skip=0,
388+
date_columns=None,
389+
list_columns=["list_col"], # Specify list column
390+
rowIdAndVersionInIndex=True,
391+
dtype=None,
392+
)
393+
394+
# THEN I expect json.loads to be applied to the list column
395+
mock_apply.assert_called_once_with(json.loads)
396+
397+
# AND I expect the returned DataFrame to match the expected DataFrame
398+
pd.testing.assert_frame_equal(result_df, expected_final_df)
399+
400+
401+
def test_csv_to_pandas_df_handles_row_id_and_version():
402+
# GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
403+
initial_df = pd.DataFrame(
404+
{
405+
"ROW_ID": [1, 2, 3],
406+
"ROW_VERSION": [1, 1, 2],
407+
"col1": ["a", "b", "c"],
408+
"col2": [10, 20, 30],
409+
}
410+
)
411+
412+
# AND a pandas DataFrame (expected result)
413+
# with the ROW_ID and ROW_VERSION columns removed
414+
expected_final_df = pd.DataFrame(
415+
{"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=["1_1", "2_1", "3_2"]
416+
) # Index format: ROW_ID_ROW_VERSION
417+
418+
with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
419+
synapseclient.table,
420+
"row_labels_from_id_and_version",
421+
return_value=["1_1", "2_1", "3_2"],
422+
) as mock_row_labels:
423+
# WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
424+
result_df = synapseclient.table._csv_to_pandas_df(
425+
filepath="dummy_path.csv",
426+
separator=synapseclient.table.DEFAULT_SEPARATOR,
427+
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
428+
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
429+
contain_headers=True,
430+
lines_to_skip=0,
431+
date_columns=None,
432+
list_columns=None,
433+
rowIdAndVersionInIndex=True,
434+
dtype=None,
435+
)
436+
437+
# THEN I expect row_labels_from_id_and_version to be called once
438+
mock_row_labels.assert_called_once()
439+
440+
# AND I expect the returned DataFrame to match the expected
441+
# DataFrame with the ROW_ID and ROW_VERSION columns removed
442+
pd.testing.assert_frame_equal(result_df, expected_final_df)
443+
444+
# AND I expect the index of the result_df to be as expected
445+
assert list(result_df.index) == ["1_1", "2_1", "3_2"]
446+
447+
254448
def test_schema() -> None:
255449
schema = Schema(name="My Table", parent="syn1000001")
256450

0 commit comments

Comments
 (0)