|
1 | 1 | """Unit test for synapseclient.table"""
|
2 | 2 | import csv
|
3 | 3 | import io
|
| 4 | +import json |
4 | 5 | import math
|
5 | 6 | import os
|
6 | 7 | import shutil
|
|
40 | 41 | Table,
|
41 | 42 | TableQueryResult,
|
42 | 43 | _convert_df_date_cols_to_datetime,
|
| 44 | + _csv_to_pandas_df, |
43 | 45 | _get_view_type_mask,
|
44 | 46 | _get_view_type_mask_for_deprecated_type,
|
45 | 47 | as_table_columns,
|
@@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
|
251 | 253 | assert_frame_equal(test_df2, expected_date_df)
|
252 | 254 |
|
253 | 255 |
|
def test_csv_to_pandas_df_no_kwargs():
    """Check what _csv_to_pandas_df passes to pandas.read_csv with no extra kwargs."""
    table_module = synapseclient.table

    # GIVEN a pandas DataFrame that stands in for the CSV file contents
    stub_frame = pd.DataFrame(
        {"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
    )
    read_csv_patch = patch.object(pd, "read_csv", return_value=stub_frame)
    # AND os.linesep replaced by a two-character separator; the assertion
    # below expects lineterminator=None to reach read_csv under this patch
    linesep_patch = patch.object(os, "linesep", "\r\n")

    with read_csv_patch as mock_read_csv, linesep_patch:
        # WHEN _csv_to_pandas_df is called with default parameters
        call_arguments = {
            "filepath": "dummy_path.csv",
            "separator": table_module.DEFAULT_SEPARATOR,
            "quote_char": table_module.DEFAULT_QUOTE_CHARACTER,
            "escape_char": table_module.DEFAULT_ESCAPSE_CHAR,
            "contain_headers": True,
            "lines_to_skip": 0,
            "date_columns": None,
            "list_columns": None,
            "rowIdAndVersionInIndex": True,
            "dtype": None,
        }
        loaded_frame = _csv_to_pandas_df(**call_arguments)

        # THEN pandas.read_csv was called exactly once with the default arguments
        expected_read_csv_kwargs = {
            "dtype": None,
            "sep": table_module.DEFAULT_SEPARATOR,
            "quotechar": table_module.DEFAULT_QUOTE_CHARACTER,
            "escapechar": table_module.DEFAULT_ESCAPSE_CHAR,
            "header": 0,
            "skiprows": 0,
            "lineterminator": None,
        }
        mock_read_csv.assert_called_once_with(
            "dummy_path.csv", **expected_read_csv_kwargs
        )

        # AND the returned DataFrame equals the stand-in DataFrame
        pd.testing.assert_frame_equal(loaded_frame, stub_frame)
| 294 | + |
| 295 | + |
def test_csv_to_pandas_df_with_kwargs() -> None:
    """Check that extra keyword arguments are forwarded to pandas.read_csv."""
    table_module = synapseclient.table

    # GIVEN a pandas DataFrame that stands in for the CSV file contents
    stub_frame = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
    read_csv_patch = patch.object(pd, "read_csv", return_value=stub_frame)
    # AND os.linesep replaced by a two-character separator; the assertion
    # below expects lineterminator=None to reach read_csv under this patch
    linesep_patch = patch.object(os, "linesep", "\r\n")

    with read_csv_patch as mock_read_csv, linesep_patch:
        # WHEN _csv_to_pandas_df is called with custom keyword arguments
        extra_read_csv_options = {"escapechar": "\\", "keep_default_na": False}
        call_arguments = {
            "filepath": "dummy_path.csv",
            "separator": table_module.DEFAULT_SEPARATOR,
            "quote_char": table_module.DEFAULT_QUOTE_CHARACTER,
            "escape_char": table_module.DEFAULT_ESCAPSE_CHAR,
            "contain_headers": True,
            "lines_to_skip": 0,
            "date_columns": None,
            "list_columns": None,
            "rowIdAndVersionInIndex": True,
            "dtype": None,
            **extra_read_csv_options,
        }
        loaded_frame = _csv_to_pandas_df(**call_arguments)

        # THEN pandas.read_csv was called exactly once, keyword arguments included
        expected_read_csv_kwargs = {
            "dtype": None,
            "sep": table_module.DEFAULT_SEPARATOR,
            "quotechar": table_module.DEFAULT_QUOTE_CHARACTER,
            "escapechar": "\\",
            "header": 0,
            "skiprows": 0,
            "keep_default_na": False,
            "lineterminator": None,
        }
        mock_read_csv.assert_called_once_with(
            "dummy_path.csv", **expected_read_csv_kwargs
        )

        # AND the returned DataFrame equals the stand-in DataFrame
        pd.testing.assert_frame_equal(loaded_frame, stub_frame)
| 334 | + |
| 335 | + |
def test_csv_to_pandas_df_calls_convert_date_cols():
    """Check that date_columns triggers _convert_df_date_cols_to_datetime."""
    table_module = synapseclient.table

    # GIVEN a pandas DataFrame (CSV file stand-in) that has a date column
    stub_frame = pd.DataFrame(
        {"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
    )
    read_csv_patch = patch.object(pd, "read_csv", return_value=stub_frame)
    converter_patch = patch.object(table_module, "_convert_df_date_cols_to_datetime")

    with read_csv_patch, converter_patch as mock_convert_dates:
        # WHEN _csv_to_pandas_df is called with date_columns specified
        call_arguments = {
            "filepath": "dummy_path.csv",
            "separator": table_module.DEFAULT_SEPARATOR,
            "quote_char": table_module.DEFAULT_QUOTE_CHARACTER,
            "escape_char": table_module.DEFAULT_ESCAPSE_CHAR,
            "contain_headers": True,
            "lines_to_skip": 0,
            "date_columns": ["date_col"],  # the date column under test
            "list_columns": None,
            "rowIdAndVersionInIndex": True,
            "dtype": None,
        }
        _csv_to_pandas_df(**call_arguments)

        # THEN the date converter was called exactly once with the stand-in
        # DataFrame and the list of date columns
        mock_convert_dates.assert_called_once_with(stub_frame, ["date_col"])
| 362 | + |
| 363 | + |
def test_csv_to_pandas_df_handles_list_columns():
    """Check that list_columns are run through json.loads via Series.apply."""
    table_module = synapseclient.table

    # GIVEN a pandas DataFrame (CSV file stand-in) whose list column is still text
    raw_frame = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
    )

    # AND the DataFrame expected once that column holds real lists
    parsed_frame = pd.DataFrame(
        {"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
    )

    read_csv_patch = patch.object(pd, "read_csv", return_value=raw_frame)
    converter_patch = patch.object(table_module, "_convert_df_date_cols_to_datetime")
    # Series.apply is replaced by a mock that returns the already-parsed column
    apply_patch = patch.object(
        pd.Series, "apply", return_value=parsed_frame["list_col"]
    )

    with read_csv_patch, converter_patch, apply_patch as mock_apply:
        # WHEN _csv_to_pandas_df is called with list_columns specified
        call_arguments = {
            "filepath": "dummy_path.csv",
            "separator": table_module.DEFAULT_SEPARATOR,
            "quote_char": table_module.DEFAULT_QUOTE_CHARACTER,
            "escape_char": table_module.DEFAULT_ESCAPSE_CHAR,
            "contain_headers": True,
            "lines_to_skip": 0,
            "date_columns": None,
            "list_columns": ["list_col"],  # the list column under test
            "rowIdAndVersionInIndex": True,
            "dtype": None,
        }
        result_df = synapseclient.table._csv_to_pandas_df(**call_arguments)

        # THEN json.loads was applied to the list column exactly once
        mock_apply.assert_called_once_with(json.loads)

        # AND the returned DataFrame equals the expected DataFrame
        pd.testing.assert_frame_equal(result_df, parsed_frame)
| 399 | + |
| 400 | + |
def test_csv_to_pandas_df_handles_row_id_and_version():
    """Check that ROW_ID / ROW_VERSION columns are folded into the index."""
    table_module = synapseclient.table
    # Index labels in ROW_ID_ROW_VERSION format
    row_labels = ["1_1", "2_1", "3_2"]

    # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
    raw_frame = pd.DataFrame(
        {
            "ROW_ID": [1, 2, 3],
            "ROW_VERSION": [1, 1, 2],
            "col1": ["a", "b", "c"],
            "col2": [10, 20, 30],
        }
    )

    # AND the expected DataFrame, with the ROW_ID and ROW_VERSION
    # columns removed and the labels used as its index
    indexed_frame = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=list(row_labels)
    )

    read_csv_patch = patch.object(pd, "read_csv", return_value=raw_frame)
    labels_patch = patch.object(
        table_module,
        "row_labels_from_id_and_version",
        return_value=list(row_labels),
    )

    with read_csv_patch, labels_patch as mock_row_labels:
        # WHEN _csv_to_pandas_df is called with rowIdAndVersionInIndex=True
        call_arguments = {
            "filepath": "dummy_path.csv",
            "separator": table_module.DEFAULT_SEPARATOR,
            "quote_char": table_module.DEFAULT_QUOTE_CHARACTER,
            "escape_char": table_module.DEFAULT_ESCAPSE_CHAR,
            "contain_headers": True,
            "lines_to_skip": 0,
            "date_columns": None,
            "list_columns": None,
            "rowIdAndVersionInIndex": True,
            "dtype": None,
        }
        result_df = synapseclient.table._csv_to_pandas_df(**call_arguments)

        # THEN row_labels_from_id_and_version was called once
        mock_row_labels.assert_called_once()

        # AND the returned DataFrame equals the expected DataFrame,
        # with the ROW_ID and ROW_VERSION columns removed
        pd.testing.assert_frame_equal(result_df, indexed_frame)

        # AND the index of the returned DataFrame holds the expected labels
        assert list(result_df.index) == row_labels
| 446 | + |
| 447 | + |
254 | 448 | def test_schema() -> None:
|
255 | 449 | schema = Schema(name="My Table", parent="syn1000001")
|
256 | 450 |
|
|
0 commit comments