
Commit 918498e

Merge branch 'refs/heads/main' into feature/aherrera/SNOW-2432059-StringAndBinary-part1

# Conflicts:
#	CHANGELOG.md

2 parents: 2a5afac + b4fd165

29 files changed: +956 −242 lines

CHANGELOG.md

Lines changed: 9 additions & 2 deletions

```diff
@@ -58,7 +58,7 @@
   - `st_geometryfromwkt`
   - `try_to_geography`
   - `try_to_geometry`
-
+
 - String and Binary functions:
   - `base64_decode_binary`
   - `compress`
@@ -70,7 +70,8 @@
   - `sha1_binary`
   - `sha2_binary`
   - `soundex_p123`
-
+
+- Added a parameter to enable and disable automatic column name aliasing for the `interval_day_time_from_parts` and `interval_year_month_from_parts` functions.
 
 #### Bug Fixes
 
@@ -79,15 +80,21 @@
 - Fixed a bug where writing Snowpark pandas dataframes on the pandas backend with a column multiindex to Snowflake with `to_snowflake` would raise `KeyError`.
 - Fixed a bug where `DataFrameReader.dbapi` (PuPr) was not compatible with oracledb 3.4.0.
 
+#### Improvements
+
+- The default maximum length for inferred StringType columns during schema inference in `DataFrameReader.dbapi` is now increased from 16MB to 128MB in parquet-file-based ingestion.
+
 #### Dependency Updates
 
 - Updated dependency of `snowflake-connector-python>=3.17,<5.0.0`.
 
 ### Snowpark pandas API Updates
 
 #### New Features
+
 - Added support for the `dtypes` parameter of `pd.get_dummies`
 - Added support for `nunique` in `df.pivot_table`, `df.agg` and other places where aggregate functions can be used.
+- Added support for `DataFrame.interpolate` and `Series.interpolate` with the "linear", "ffill"/"pad", and "backfill"/"bfill" methods. These use the SQL `INTERPOLATE_LINEAR`, `INTERPOLATE_FFILL`, and `INTERPOLATE_BFILL` functions (PuPr).
 
 #### Improvements
```
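To make the Snowpark pandas changelog entries concrete, here is a minimal sketch of the new `nunique` aggregation support; the session setup and sample data are illustrative assumptions, not part of this commit:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowpark pandas backend
from snowflake.snowpark import Session

Session.builder.getOrCreate()  # assumes Snowflake connection parameters are configured

df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "y"], "c": [1.0, 2.0, 2.0]})

# Per the entry above, `nunique` is now accepted wherever aggregate functions are.
print(df.agg("nunique"))                                         # distinct count per column
print(df.pivot_table(index="a", values="c", aggfunc="nunique"))  # distinct count per group
```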
docs/source/modin/supported/dataframe_supported.rst

Lines changed: 5 additions & 1 deletion

```diff
@@ -227,7 +227,11 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``insert``                  | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``interpolate``             | N                               |                                  |                                                    |
+| ``interpolate``             | P                               |                                  | ``N`` if ``axis == 1``, ``limit`` is set,          |
+|                             |                                 |                                  | ``limit_area`` is "outside", or ``method`` is not  |
+|                             |                                 |                                  | "linear", "bfill", "backfill", "ffill", or "pad".  |
+|                             |                                 |                                  | ``limit_area="inside"`` is supported only when     |
+|                             |                                 |                                  | ``method`` is ``linear``.                          |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``isetitem``                | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
```

docs/source/modin/supported/series_supported.rst

Lines changed: 5 additions & 1 deletion

```diff
@@ -243,7 +243,11 @@ Methods
 | ``info``                    | D                               |                                  | Different Index types are used in pandas but not  |
 |                             |                                 |                                  | in Snowpark pandas                                 |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``interpolate``             | N                               |                                  |                                                    |
+| ``interpolate``             | P                               |                                  | ``N`` if ``limit`` is set,                         |
+|                             |                                 |                                  | ``limit_area`` is "outside", or ``method`` is not  |
+|                             |                                 |                                  | "linear", "bfill", "backfill", "ffill", or "pad".  |
+|                             |                                 |                                  | ``limit_area="inside"`` is supported only when     |
+|                             |                                 |                                  | ``method`` is ``linear``.                          |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``isin``                    | Y                               |                                  | Snowpark pandas deviates with respect to handling |
 |                             |                                 |                                  | NA values                                          |
```
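To illustrate the ``P`` (partial) entries above, a sketch of supported and unsupported `interpolate` calls. The printed values follow standard pandas semantics, and the `NotImplementedError` for unsupported arguments is the usual Snowpark pandas convention; both are assumptions here rather than output captured from this commit:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowpark pandas backend
from snowflake.snowpark import Session

Session.builder.getOrCreate()  # assumes Snowflake connection parameters are configured

s = pd.Series([1.0, None, 3.0, None])

print(s.interpolate())                # "linear" (default): 1.0, 2.0, 3.0, 3.0
print(s.interpolate(method="ffill"))  # forward fill:       1.0, 1.0, 3.0, 3.0

try:
    s.interpolate(limit=1)  # ``limit`` is marked unsupported in the tables above
except NotImplementedError as exc:
    print(exc)
```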

src/snowflake/snowpark/_internal/data_source/drivers/base_driver.py

Lines changed: 25 additions & 6 deletions

```diff
@@ -11,6 +11,7 @@
     Connection,
     Cursor,
 )
+from snowflake.snowpark._internal.server_connection import MAX_STRING_SIZE
 from snowflake.snowpark._internal.utils import (
     get_sorted_key_for_version,
     measure_time,
@@ -27,6 +28,7 @@
     BinaryType,
     DateType,
     BooleanType,
+    StringType,
 )
 import snowflake.snowpark
 import logging
@@ -103,7 +105,16 @@ def infer_schema_from_description(
         query_input_alias: str,
     ) -> StructType:
         self.get_raw_schema(table_or_query, cursor, is_query, query_input_alias)
-        return self.to_snow_type(self.raw_schema)
+        generated_schema = self.to_snow_type(self.raw_schema)
+        # Snowflake will default string length to 128MB in the BCR bundle that will be enabled in 2026-01:
+        # https://docs.snowflake.com/en/release-notes/bcr-bundles/2025_07_bundle
+        # We make the change ahead of time to:
+        # 1. align the string length with UDTF-based ingestion
+        # 2. avoid the BCR impact on the dbapi feature
+        for field in generated_schema.fields:
+            if isinstance(field.datatype, StringType) and field.datatype.length is None:
+                field.datatype.length = MAX_STRING_SIZE
+        return generated_schema
 
     def infer_schema_from_description_with_error_control(
         self, table_or_query: str, is_query: bool, query_input_alias: str
@@ -184,7 +195,10 @@ def udtf_ingestion(
         select * from {partition_table}, table({udtf_name}({PARTITION_TABLE_COLUMN_NAME}))
         """
         res = session.sql(call_udtf_sql, _emit_ast=_emit_ast)
-        return self.to_result_snowpark_df_udtf(res, schema, _emit_ast=_emit_ast)
+        return BaseDriver.keep_nullable_attributes(
+            self.to_result_snowpark_df_udtf(res, schema, _emit_ast=_emit_ast),
+            schema,
+        )
 
     def udtf_class_builder(
         self,
@@ -284,6 +298,14 @@ def to_result_snowpark_df(
     ) -> "DataFrame":
         return session.table(table_name, _emit_ast=_emit_ast)
 
+    @staticmethod
+    def keep_nullable_attributes(
+        selected_df: "DataFrame", schema: StructType
+    ) -> "DataFrame":
+        for attr, source_field in zip(selected_df._plan.attributes, schema.fields):
+            attr.nullable = source_field.nullable
+        return selected_df
+
     @staticmethod
     def to_result_snowpark_df_udtf(
         res_df: "DataFrame",
@@ -294,10 +316,7 @@ def to_result_snowpark_df_udtf(
             res_df[field.name].cast(field.datatype).alias(field.name)
             for field in schema.fields
         ]
-        selected_df = res_df.select(cols, _emit_ast=_emit_ast)
-        for attr, source_field in zip(selected_df._plan.attributes, schema.fields):
-            attr.nullable = source_field.nullable
-        return selected_df
+        return res_df.select(cols, _emit_ast=_emit_ast)
 
     def get_server_cursor_if_supported(self, conn: "Connection") -> "Cursor":
         """
```

src/snowflake/snowpark/_internal/server_connection.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -86,6 +86,7 @@
 PARAM_INTERNAL_APPLICATION_NAME = "internal_application_name"
 PARAM_INTERNAL_APPLICATION_VERSION = "internal_application_version"
 DEFAULT_STRING_SIZE = 16777216
+MAX_STRING_SIZE = 134217728
 
 
 def _build_target_path(stage_location: str, dest_prefix: str = "") -> str:
```
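For reference, the two constants above are exactly 16 MiB and 128 MiB:

```python
# Sanity check of the sizes above.
assert 16777216 == 16 * 1024 * 1024    # DEFAULT_STRING_SIZE, 16 MiB
assert 134217728 == 128 * 1024 * 1024  # MAX_STRING_SIZE, 128 MiB
```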

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 18 additions & 12 deletions

```diff
@@ -1707,18 +1707,24 @@ def dbapi(
         Reads data from a database table or query into a DataFrame using a DBAPI connection,
         with support for optional partitioning, parallel processing, and query customization.
 
-        There are multiple methods to partition data and accelerate ingestion.
-        These methods can be combined to achieve optimal performance:
-
-        1.Use column, lower_bound, upper_bound and num_partitions at the same time when you need to split large tables into smaller partitions for parallel processing.
-        These must all be specified together, otherwise error will be raised.
-        2.Set max_workers to a proper positive integer.
-        This defines the maximum number of processes and threads used for parallel execution.
-        3.Adjusting fetch_size can optimize performance by reducing the number of round trips to the database.
-        4.Use predicates to defining WHERE conditions for partitions,
-        predicates will be ignored if column is specified to generate partition.
-        5.Set custom_schema to avoid snowpark infer schema, custom_schema must have a matched
-        column name with table in external data source.
+        Usage Notes:
+            - Ingestion performance tuning:
+                - **Partitioning**: Use ``column``, ``lower_bound``, ``upper_bound``, and ``num_partitions``
+                  together to split large tables into smaller partitions for parallel processing.
+                  All four parameters must be specified together, otherwise an error will be raised.
+                - **Parallel execution**: Set ``max_workers`` to control the maximum number of processes
+                  and threads used for parallel execution.
+                - **Fetch optimization**: Adjust ``fetch_size`` to optimize performance by reducing
+                  the number of round trips to the database.
+                - **Partition filtering**: Use ``predicates`` to define WHERE conditions for partitions.
+                  Note that ``predicates`` will be ignored if ``column`` is specified for partitioning.
+                - **Schema specification**: Set ``custom_schema`` to skip schema inference. The custom schema
+                  must have matching column names with the table in the external data source.
+            - Execution timing and error handling:
+                - **UDTF Ingestion**: Uses lazy evaluation. Errors are reported as ``SnowparkSQLException``
+                  during DataFrame actions (e.g., ``DataFrame.collect()``).
+                - **Local Ingestion**: Uses eager execution. Errors are reported immediately as
+                  ``SnowparkDataFrameReaderException`` when this method is called.
 
         Args:
             create_connection: A callable that returns a DB-API compatible database connection.
```
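Following those usage notes, a minimal sketch of a partitioned read. The driver, database, table, and bounds are illustrative assumptions, and passing the source as ``table=`` is likewise an assumption; only the tuning parameters are quoted from the docstring above:

```python
import sqlite3  # any DB-API 2.0 compliant driver; sqlite3 is used purely for illustration

from snowflake.snowpark import Session


def create_connection():
    # Hypothetical local database; substitute your driver's connect() call.
    return sqlite3.connect("example.db")


session = Session.builder.getOrCreate()  # assumes Snowflake connection parameters are configured

# column, lower_bound, upper_bound, and num_partitions must be supplied together.
df = session.read.dbapi(
    create_connection,
    table="ORDERS",         # hypothetical source table
    column="ORDER_ID",      # column used to compute partition ranges
    lower_bound=0,
    upper_bound=1_000_000,
    num_partitions=8,
    max_workers=4,          # cap on processes/threads for parallel ingestion
    fetch_size=10_000,      # rows per round trip to the source database
)
df.collect()  # with UDTF ingestion, errors surface here as SnowparkSQLException
```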

src/snowflake/snowpark/functions.py

Lines changed: 30 additions & 14 deletions

```diff
@@ -11031,6 +11031,7 @@ def make_interval(
 def interval_year_month_from_parts(
     years: Optional[ColumnOrName] = None,
     months: Optional[ColumnOrName] = None,
+    _alias_column_name: Optional[bool] = True,
     _emit_ast: bool = True,
 ) -> Column:
     """
@@ -11042,6 +11043,7 @@ def interval_year_month_from_parts(
     Args:
         years: The number of years, positive or negative
        months: The number of months, positive or negative
+        _alias_column_name: If true, alias the column name to a cleaner value
 
     Returns:
         A Column representing a year-month interval
@@ -11091,15 +11093,21 @@ def interval_year_month_from_parts(
     )
     interval_string = concat(sign_prefix, normalized_years, lit("-"), normalized_months)
 
-    def get_col_name(col):
-        if isinstance(col._expr1, Literal):
-            return str(col._expr1.value)
-        else:
-            return col._expression.name
+    res = cast(interval_string, "INTERVAL YEAR TO MONTH")
+    if _alias_column_name:
+        # Aliasing the column when this function is used inside a CASE WHEN
+        # expression throws an error, so we only alias when necessary.
+
+        def get_col_name(col):
+            if isinstance(col._expr1, Literal):
+                return str(col._expr1.value)
+            else:
+                return col._expression.name
+
+        alias_name = f"interval_year_month_from_parts({get_col_name(years_col)}, {get_col_name(months_col)})"
 
-    alias_name = f"interval_year_month_from_parts({get_col_name(years_col)}, {get_col_name(months_col)})"
+        res = res.alias(alias_name)
 
-    res = cast(interval_string, "INTERVAL YEAR TO MONTH").alias(alias_name)
     res._ast = ast
     return res
 
@@ -11114,6 +11122,7 @@ def interval_day_time_from_parts(
     hours: Optional[ColumnOrName] = None,
     mins: Optional[ColumnOrName] = None,
     secs: Optional[ColumnOrName] = None,
+    _alias_column_name: Optional[bool] = True,
     _emit_ast: bool = True,
 ) -> Column:
     """
@@ -11127,6 +11136,7 @@ def interval_day_time_from_parts(
         hours: The number of hours, positive or negative
         mins: The number of minutes, positive or negative
         secs: The number of seconds, positive or negative
+        _alias_column_name: If true, alias the column name to a cleaner value
 
     Returns:
         A Column representing a day-time interval
@@ -11238,15 +11248,21 @@ def interval_day_time_from_parts(
         secs_formatted,
     )
 
-    def get_col_name(col):
-        if isinstance(col._expr1, Literal):
-            return str(col._expr1.value)
-        else:
-            return str(col._expr1)
+    res = cast(interval_value, "INTERVAL DAY TO SECOND")
+    if _alias_column_name:
+        # Aliasing the column when this function is used inside a CASE WHEN
+        # expression throws an error, so we only alias when necessary.
+
+        def get_col_name(col):
+            if isinstance(col._expr1, Literal):
+                return str(col._expr1.value)
+            else:
+                return str(col._expr1)
+
+        alias_name = f"interval_day_time_from_parts({get_col_name(days_col)}, {get_col_name(hours_col)}, {get_col_name(mins_col)}, {get_col_name(secs_col)})"
 
-    alias_name = f"interval_day_time_from_parts({get_col_name(days_col)}, {get_col_name(hours_col)}, {get_col_name(mins_col)}, {get_col_name(secs_col)})"
+        res = res.alias(alias_name)
 
-    res = cast(interval_value, "INTERVAL DAY TO SECOND").alias(alias_name)
     res._ast = ast
     return res
```

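To see the effect of the new flag from the changelog entry, a small sketch (the session and sample data are assumptions; note `_alias_column_name` is underscore-prefixed, i.e. a private parameter):

```python
from snowflake.snowpark import Session
from snowflake.snowpark.functions import interval_year_month_from_parts, lit

session = Session.builder.getOrCreate()  # assumes Snowflake connection parameters are configured
df = session.create_dataframe([[2, 3]], schema=["y", "m"])

# Default behavior: the result column gets the readable alias
# interval_year_month_from_parts(..., ...).
df.select(interval_year_month_from_parts(df["y"], df["m"])).show()

# In contexts where an alias is illegal (e.g. inside a CASE WHEN branch),
# pass _alias_column_name=False to leave the cast expression unaliased.
unaliased = interval_year_month_from_parts(lit(2), lit(3), _alias_column_name=False)
df.select(unaliased).show()
```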