
Commit 67964ff

Merge branch 'main' into aalam-SNOW-2257191-cte-join-bugfix

2 parents: 7209d87 + c5f1816

29 files changed: +969 −62 lines

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
@@ -10,10 +10,15 @@
 - Added support for PrPr feature `Session.client_telemetry`.
 - Added support for `Session.udf_profiler`.
 - Added support for `functions.ai_translate`.
+- Added support for the following `iceberg_config` options in `DataFrameWriter.save_as_table` and `DataFrame.copy_into_table`:
+  - `target_file_size`
+  - `partition_by`
 - Added support for the following functions in `functions.py`:
   - String and Binary functions:
     - `base64_decode_binary`
+    - `bucket`
     - `compress`
+    - `day`
     - `decompress_binary`
     - `decompress_string`
     - `md5_binary`
@@ -23,6 +28,7 @@
     - `sha2_binary`
     - `soundex_p123`
     - `strtok`
+    - `truncate`
     - `try_base64_decode_binary`
     - `try_base64_decode_string`
     - `try_hex_decode_binary`
@@ -43,6 +49,10 @@
     - `square`
     - `width_bucket`

+#### Bug Fixes
+
+- Fixed a bug where automatically-generated temporary objects were not properly cleaned up.
+
 #### Improvements

 - Enhanced `DataFrame.sort()` to support `ORDER BY ALL` when no columns are specified.
@@ -56,6 +66,7 @@
 #### Bug Fixes

 - Fixed a bug in SQL generation when joining two `DataFrame`s created using `DataFrame.alias` while CTE optimization is enabled.
+- Fixed a bug in `XMLReader` where finding the start position of a row tag could return an incorrect file position.

 ### Snowpark pandas API Updates

@@ -64,6 +75,9 @@
 - Added support for `Dataframe.groupby.rolling()`.
 - Added support for mapping `np.percentile` with DataFrame and Series inputs to `Series.quantile`.
 - Added support for setting the `random_state` parameter to an integer when calling `DataFrame.sample` or `Series.sample`.
+- Added support for the following `iceberg_config` options in `to_iceberg`:
+  - `target_file_size`
+  - `partition_by`

 #### Improvements

docs/source/snowpark/functions.rst

Lines changed: 3 additions & 0 deletions
@@ -120,6 +120,7 @@ Functions
     boolor
     boolxor
     boolxor_agg
+    bucket
     build_stage_file_url
     builtin
     bround
@@ -189,6 +190,7 @@ Functions
     datediff
     date_add
     date_sub
+    day
     daydiff
     dayname
     dayofmonth
@@ -555,6 +557,7 @@ Functions
     translate
     trim
     trunc
+    truncate
     try_cast
     try_parse_json
     try_to_binary

src/snowflake/snowpark/_internal/analyzer/analyzer.py

Lines changed: 49 additions & 3 deletions
@@ -4,7 +4,7 @@
 #
 import uuid
 from collections import Counter, defaultdict
-from typing import TYPE_CHECKING, DefaultDict, Dict, List, Union
+from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Union
 from logging import getLogger

 from snowflake.connector import IntegrityError
@@ -177,6 +177,7 @@
     ExprAliasUpdateDict,
 )
 from snowflake.snowpark.types import BooleanType, _NumericType
+from snowflake.snowpark.column import Column

 ARRAY_BIND_THRESHOLD = 512

@@ -904,6 +905,43 @@ def to_sql_try_avoid_cast(
             parse_local_name,
         )

+    def _process_partition_by_in_iceberg_config(
+        self,
+        iceberg_config: Optional[dict],
+        df_aliased_col_name_to_real_col_name: Union[
+            DefaultDict[str, Dict[str, str]], DefaultDict[str, ExprAliasUpdateDict]
+        ],
+    ) -> Optional[dict]:
+        """
+        Process partition_by expressions from iceberg_config, converting Column objects to SQL strings.
+        Returns a new iceberg_config dict with partition_by as a list of SQL strings, or the original config if no processing is needed.
+        """
+        if iceberg_config is None or iceberg_config.get("partition_by") is None:
+            return iceberg_config
+
+        iceberg_config = {k.lower(): v for k, v in iceberg_config.items()}
+        pb = iceberg_config["partition_by"]
+
+        # Convert to list and filter out empty expressions
+        partition_exprs = pb if isinstance(pb, (list, tuple)) else [pb]
+        partition_sqls = []
+        for expr in partition_exprs:
+            if isinstance(expr, Column):
+                partition_sqls.append(
+                    self.analyze(expr._expression, df_aliased_col_name_to_real_col_name)
+                )
+            elif isinstance(expr, str):
+                if expr:  # Ignore empty strings
+                    partition_sqls.append(str(expr))
+            else:
+                raise TypeError(
+                    f"partition_by in iceberg_config expected Column or str, got: {type(expr)}"
+                )
+
+        if partition_sqls:
+            return {**iceberg_config, "partition_by": partition_sqls}
+        return iceberg_config
+
     def resolve(self, logical_plan: LogicalPlan) -> SnowflakePlan:
         self.subquery_plans = []
         self.generated_alias_maps = (
@@ -1164,6 +1202,10 @@ def do_resolve_with_resolved_children(

         if isinstance(logical_plan, SnowflakeCreateTable):
             resolved_child = resolved_children[logical_plan.children[0]]
+            iceberg_config = self._process_partition_by_in_iceberg_config(
+                logical_plan.iceberg_config, df_aliased_col_name_to_real_col_name
+            )
+
             return self.plan_builder.save_as_table(
                 table_name=logical_plan.table_name,
                 column_names=logical_plan.column_names,
@@ -1184,7 +1226,7 @@ def do_resolve_with_resolved_children(
                 use_scoped_temp_objects=self.session._use_scoped_temp_objects,
                 creation_source=logical_plan.creation_source,
                 child_attributes=resolved_child.attributes,
-                iceberg_config=logical_plan.iceberg_config,
+                iceberg_config=iceberg_config,
                 table_exists=logical_plan.table_exists,
             )

@@ -1416,6 +1458,10 @@ def do_resolve_with_resolved_children(
         if format_name is not None:
             format_type_options["FORMAT_NAME"] = format_name
         assert logical_plan.file_format is not None
+        iceberg_config = self._process_partition_by_in_iceberg_config(
+            logical_plan.iceberg_config, df_aliased_col_name_to_real_col_name
+        )
+
         return self.plan_builder.copy_into_table(
             path=logical_plan.file_path,
             table_name=logical_plan.table_name,
@@ -1435,7 +1481,7 @@ def do_resolve_with_resolved_children(
             else None,
             user_schema=logical_plan.user_schema,
             create_table_from_infer_schema=logical_plan.create_table_from_infer_schema,
-            iceberg_config=logical_plan.iceberg_config,
+            iceberg_config=iceberg_config,
         )

         if isinstance(logical_plan, CopyIntoLocationNode):
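In practice this normalization means `partition_by` may be a single expression or a list/tuple mixing `Column` objects with raw SQL strings; a sketch of the accepted shapes (identifier and volume names are illustrative):

    from snowflake.snowpark.functions import col

    # Each entry is reduced to a SQL string by the analyzer: Column objects are
    # rendered via analyze(), non-empty strings pass through unchanged, empty
    # strings are dropped, and any other type raises TypeError.
    iceberg_config = {
        "external_volume": "my_volume",                       # hypothetical
        "base_location": "events/",
        "partition_by": [col("region"), "day(created_at)"],   # Column + raw SQL transform
    }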

src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py

Lines changed: 44 additions & 14 deletions
@@ -151,6 +151,7 @@
 EXTERNAL_VOLUME = " EXTERNAL_VOLUME "
 CATALOG = " CATALOG "
 BASE_LOCATION = " BASE_LOCATION "
+TARGET_FILE_SIZE = " TARGET_FILE_SIZE "
 CATALOG_SYNC = " CATALOG_SYNC "
 STORAGE_SERIALIZATION_POLICY = " STORAGE_SERIALIZATION_POLICY "
 REG_EXP = " REGEXP "
@@ -231,23 +232,34 @@ def format_uuid(uuid: Optional[str], with_new_line: bool = True) -> str:
     return f"{UUID_COMMENT.format(uuid)}"


-def validate_iceberg_config(iceberg_config: Optional[dict]) -> Dict[str, str]:
+def validate_iceberg_config(
+    iceberg_config: Optional[dict],
+) -> tuple[Dict[str, str], list]:
+    """
+    Validate and process iceberg config, returning (options_dict, partition_exprs_list).
+    """
     if iceberg_config is None:
-        return dict()
+        return dict(), []

     iceberg_config = {k.lower(): v for k, v in iceberg_config.items()}

-    return {
+    # Extract partition_by (already processed as SQL strings by the analyzer)
+    partition_exprs = iceberg_config.get("partition_by", [])
+
+    options = {
         EXTERNAL_VOLUME: iceberg_config.get("external_volume", None),
         CATALOG: iceberg_config.get("catalog", None),
         BASE_LOCATION: iceberg_config.get("base_location", None),
+        TARGET_FILE_SIZE: iceberg_config.get("target_file_size", None),
         CATALOG_SYNC: iceberg_config.get("catalog_sync", None),
         STORAGE_SERIALIZATION_POLICY: iceberg_config.get(
             "storage_serialization_policy", None
         ),
         ICEBERG_VERSION: iceberg_config.get("iceberg_version", None),
     }

+    return options, partition_exprs
+

 def result_scan_statement(uuid_place_holder: str) -> str:
     return (
@@ -311,6 +323,20 @@ def partition_spec(col_exprs: List[str]) -> str:
     return f"PARTITION BY {COMMA.join(col_exprs)}" if col_exprs else EMPTY_STRING


+def iceberg_partition_clause(partition_exprs: List[str]) -> str:
+    return (
+        (
+            SPACE
+            + PARTITION_BY
+            + LEFT_PARENTHESIS
+            + COMMA.join(partition_exprs)
+            + RIGHT_PARENTHESIS
+        )
+        if partition_exprs
+        else EMPTY_STRING
+    )
+
+
 def order_by_spec(col_exprs: List[str]) -> str:
     if not col_exprs:
         return EMPTY_STRING
@@ -1103,15 +1129,17 @@ def create_table_statement(
         CHANGE_TRACKING: change_tracking,
     }

-    iceberg_config = validate_iceberg_config(iceberg_config)
-    options.update(iceberg_config)
+    iceberg_options, partition_exprs = validate_iceberg_config(iceberg_config)
+    options.update(iceberg_options)
     options_statement = get_options_statement(options)

+    partition_by_clause = iceberg_partition_clause(partition_exprs)
+
     return (
         f"{CREATE}{(OR + REPLACE) if replace else EMPTY_STRING}"
         f" {(get_temp_type_for_object(use_scoped_temp_objects, is_generated) if table_type.lower() in TEMPORARY_STRING_SET else table_type).upper()} "
-        f"{ICEBERG if iceberg_config else EMPTY_STRING}{TABLE}{table_name}{(IF + NOT + EXISTS) if not replace and not error else EMPTY_STRING}"
-        f"{LEFT_PARENTHESIS}{schema}{RIGHT_PARENTHESIS}{cluster_by_clause}"
+        f"{ICEBERG if iceberg_options else EMPTY_STRING}{TABLE}{table_name}{(IF + NOT + EXISTS) if not replace and not error else EMPTY_STRING}"
+        f"{LEFT_PARENTHESIS}{schema}{RIGHT_PARENTHESIS}{partition_by_clause}{cluster_by_clause}"
         f"{options_statement}{COPY_GRANTS if copy_grants else EMPTY_STRING}{comment_sql}"
     )

@@ -1192,15 +1220,18 @@ def create_table_as_select_statement(
         MAX_DATA_EXTENSION_TIME_IN_DAYS: max_data_extension_time,
         CHANGE_TRACKING: change_tracking,
     }
-    iceberg_config = validate_iceberg_config(iceberg_config)
-    options.update(iceberg_config)
+    iceberg_options, partition_exprs = validate_iceberg_config(iceberg_config)
+    options.update(iceberg_options)
     options_statement = get_options_statement(options)
+
+    partition_by_clause = iceberg_partition_clause(partition_exprs)
+
     return (
         f"{CREATE}{OR + REPLACE if replace else EMPTY_STRING}"
         f" {(get_temp_type_for_object(use_scoped_temp_objects, is_generated) if table_type.lower() in TEMPORARY_STRING_SET else table_type).upper()} "
-        f"{ICEBERG if iceberg_config else EMPTY_STRING}{TABLE}"
+        f"{ICEBERG if iceberg_options else EMPTY_STRING}{TABLE}"
         f"{IF + NOT + EXISTS if not replace and not error else EMPTY_STRING} "
-        f"{table_name}{column_definition_sql}{cluster_by_clause}{options_statement}"
+        f"{table_name}{column_definition_sql}{partition_by_clause}{cluster_by_clause}{options_statement}"
         f"{COPY_GRANTS if copy_grants else EMPTY_STRING}{comment_sql} {AS}{project_statement([], child)}"
     )

@@ -1506,9 +1537,8 @@ def create_or_replace_dynamic_table_statement(
         }
     )

-    iceberg_options = get_options_statement(
-        validate_iceberg_config(iceberg_config)
-    ).strip()
+    iceberg_options, _ = validate_iceberg_config(iceberg_config)
+    iceberg_options = get_options_statement(iceberg_options).strip()

     return (
         f"{CREATE}{OR + REPLACE if replace else EMPTY_STRING}{TRANSIENT if is_transient else EMPTY_STRING}"

src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py

Lines changed: 2 additions & 0 deletions
@@ -1298,6 +1298,8 @@ def save_as_table(
             the Iceberg table stores its metadata files and data in Parquet format
         catalog: specifies either Snowflake or a catalog integration to use for this table
         base_location: the base directory that snowflake can write iceberg metadata and files to
+        target_file_size: specifies a target Parquet file size for the table.
+            Valid values: 'AUTO' (default), '16MB', '32MB', '64MB', '128MB'
         catalog_sync: optionally sets the catalog integration configured for Polaris Catalog
         storage_serialization_policy: specifies the storage serialization policy for the table
         iceberg_version: Overrides the version of iceberg to use. Defaults to 2 when unset.
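The same options flow through the copy-into path as well; a hedged sketch (stage, table, and volume names are hypothetical, and `session` is assumed to exist):

    from snowflake.snowpark.types import StructType, StructField, IntegerType, StringType

    schema = StructType([
        StructField("id", IntegerType()),
        StructField("region", StringType()),
    ])

    # Reader-based DataFrame over a staged path (hypothetical stage).
    df = session.read.schema(schema).csv("@my_stage/events/")
    df.copy_into_table(
        "events_iceberg",
        iceberg_config={
            "external_volume": "my_volume",
            "base_location": "events/",
            "target_file_size": "AUTO",   # 'AUTO' (default), '16MB', '32MB', '64MB', '128MB'
            "partition_by": ["region"],
        },
    )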

src/snowflake/snowpark/_internal/compiler/plan_compiler.py

Lines changed: 4 additions & 4 deletions
@@ -216,10 +216,10 @@ def replace_temp_obj_placeholders(
                     placeholder_name,
                     temp_obj_type,
                 ) = query.temp_obj_name_placeholder
-                placeholders[placeholder_name] = random_name_for_temp_object(
-                    temp_obj_type
-                )
-
+                if placeholder_name not in placeholders:
+                    placeholders[placeholder_name] = random_name_for_temp_object(
+                        temp_obj_type
+                    )
             copied_query = copy.copy(query)
             for placeholder_name, target_temp_name in placeholders.items():
                 # Copy the original query and replace all the placeholder names with the
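Why the added membership check matters: one plan can carry several queries that reference the same placeholder, and every occurrence must resolve to a single generated name. A dependency-free sketch of the idea (helper and placeholder names are illustrative):

    import uuid

    def random_name_for_temp_object(obj_type: str) -> str:  # stand-in for the real helper
        return f"SNOWPARK_TEMP_{obj_type}_{uuid.uuid4().hex.upper()}"

    placeholders: dict = {}
    # Two queries sharing one placeholder, e.g. a CREATE and a later SELECT.
    queries = [("PH_1", "TABLE"), ("PH_1", "TABLE")]
    for placeholder_name, temp_obj_type in queries:
        if placeholder_name not in placeholders:  # generate once, reuse afterwards
            placeholders[placeholder_name] = random_name_for_temp_object(temp_obj_type)

    assert len(set(placeholders.values())) == 1  # both queries hit the same temp table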

src/snowflake/snowpark/_internal/event_table_telemetry.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,6 @@
 from logging import getLogger
 from typing import Dict, Optional, Tuple
 from snowflake.connector.options import MissingOptionalDependency, ModuleLikeObject
-from snowflake.connector.wif_util import create_attestation

 import snowflake.snowpark
 import requests
@@ -320,6 +319,8 @@ def disable_event_table_telemetry_collection(self) -> None:
         self._disable_logger_provider()

     def _get_external_telemetry_auth_token(self) -> Dict:
+        from snowflake.connector.wif_util import create_attestation
+
         self._attestation = create_attestation(
             self.session.connection.auth_class.provider,
             self.session.connection.auth_class.entra_resource,
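Moving the import into the method defers the `wif_util` dependency until token-fetch time, so merely importing this module no longer fails on connector builds that lack it (my assumption about the motivation). The deferred-import pattern, sketched with a hypothetical fallback:

    def _get_token_sketch():
        # Imported lazily: a module-level import would raise ImportError at import
        # time on snowflake-connector-python versions without wif_util (assumption).
        try:
            from snowflake.connector.wif_util import create_attestation
        except ImportError as exc:
            raise RuntimeError("workload identity support unavailable") from exc
        return create_attestation  # caller invokes it with provider details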

src/snowflake/snowpark/_internal/xml_reader.py

Lines changed: 0 additions & 7 deletions
@@ -205,10 +205,6 @@ def find_next_opening_tag_pos(
         chunk = file_obj.read(current_chunk_size)
         if not chunk:
             raise EOFError("Reached end of file before finding opening tag")
-        # If the chunk is smaller than expected, we are near the end.
-        if len(chunk) < current_chunk_size:
-            if chunk.find(tag_start_1) == -1 and chunk.find(tag_start_2) == -1:
-                raise EOFError("Reached end of file before finding opening tag")

         # Combine leftover from previous read with the new chunk.
         data = overlap + chunk
@@ -233,9 +229,6 @@ def find_next_opening_tag_pos(
         # Update the overlap from the end of the combined data.
         overlap = data[-overlap_size:] if len(data) >= overlap_size else data

-        # Otherwise, rewind by the length of the overlap so that a tag spanning the boundary isn't missed.
-        file_obj.seek(-len(overlap), 1)
-
         # Check that progress is being made to avoid infinite loops.
         if file_obj.tell() <= pos_before:
             raise EOFError("No progress made while searching for opening tag")
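The removed `seek(-len(overlap), 1)` is what could leave `file_obj.tell()` pointing at the wrong byte, the `XMLReader` position bug noted in the changelog. Keeping the overlap purely in memory never moves the file pointer backwards; a simplified, single-tag sketch of that approach (not the reader's actual implementation):

    import io

    def find_tag_pos(file_obj: io.BufferedIOBase, tag: bytes, chunk_size: int = 8192) -> int:
        """Return the absolute offset of `tag`, carrying a byte overlap across
        chunks in memory instead of seeking the file backwards."""
        start = file_obj.tell()  # where the scan begins
        consumed = 0             # bytes read so far
        overlap = b""
        while True:
            chunk = file_obj.read(chunk_size)
            if not chunk:
                raise EOFError("Reached end of file before finding opening tag")
            data = overlap + chunk
            idx = data.find(tag)
            if idx != -1:
                # `data` begins at file offset start + consumed - len(overlap)
                return start + consumed - len(overlap) + idx
            consumed += len(chunk)
            # keep just enough trailing bytes to catch a tag split across the boundary
            overlap = data[-(len(tag) - 1):] if len(tag) > 1 else b""

    # e.g. find_tag_pos(io.BytesIO(b"abc<row>x</row>"), b"<row>") -> 3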
