Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- Added support for dict values in `Series.str.get`, `Series.str.slice`, and `Series.str.__getitem__` (`Series.str[...]`).
- Added support for `DataFrame.to_html`.
- Added support for `DataFrame.to_string` and `Series.to_string`.
- Added support for reading files from S3 buckets using `pd.read_csv`.

#### Improvements

Expand Down
25 changes: 24 additions & 1 deletion src/snowflake/snowpark/modin/plugin/io/snow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
)
from pandas.core.dtypes.common import is_list_like

from snowflake.snowpark._internal.utils import (
STAGE_PREFIX,
TempObjectType,
random_name_for_temp_object,
)
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.mock._stage_registry import extract_stage_name_and_prefix
from snowflake.snowpark.modin.plugin._internal.io_utils import (
Expand Down Expand Up @@ -391,8 +396,26 @@ def read_csv(
"filepath_or_buffer must be a path to a file or folder stored locally or on a Snowflake stage."
)

if kwargs["engine"] != "snowflake" and is_local_filepath(filepath_or_buffer):
if (
filepath_or_buffer is not None
and isinstance(filepath_or_buffer, str)
and any(
filepath_or_buffer.lower().startswith(prefix)
for prefix in ["s3://", "s3china://", "s3gov://"]
)
):
session = get_active_session()
temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
dirname = os.path.dirname(filepath_or_buffer)
basename = os.path.basename(filepath_or_buffer)
session.sql(
f"CREATE OR REPLACE TEMPORARY STAGE {temp_stage_name} URL='{dirname}'"
).collect()
filepath_or_buffer = (
f"{STAGE_PREFIX}{os.path.join(temp_stage_name, basename)}"
)

if kwargs["engine"] != "snowflake" and is_local_filepath(filepath_or_buffer):
return cls.query_compiler_cls.from_file_with_pandas("csv", **kwargs)

WarningMessage.mismatch_with_pandas(
Expand Down
11 changes: 11 additions & 0 deletions tests/integ/modin/io/test_read_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,3 +837,14 @@ def test_read_csv_dateparse_multiple_columns():
assert_frame_equal(expected_df, got_df, check_dtype=False, check_index_type=False)

os.remove(temp_file_name)


def test_read_csv_s3():
    # S3 ingestion via a temporary external stage only works on AWS
    # deployments, so bail out when the session is connected to a GCP or
    # Azure Snowflake host (the deployment shows up as a host-name segment).
    deployment_host = pd.session.connection.host
    host_segments = deployment_host.split(".")
    if "gcp" in host_segments or "azure" in host_segments:
        pytest.skip(reason="Skipping test for Azure and GCP deployment")
    # Stage creation plus the read itself account for the expected 9 queries.
    with SqlCounter(query_count=9):
        result = pd.read_csv(
            "s3://sfquickstarts/frostbyte_tastybytes/analytics/menu_item_aggregate_v.csv"
        )
        # The public sample CSV has a fixed 12-column schema; verifying the
        # column count confirms the file was actually fetched and parsed.
        assert len(result.columns) == 12
Loading