SNOW-2148589: [Local Testing] Fix window indexing issue (#3462)

sfc-gh-jrose · web-flow · commit 41a1b907e418 · 2025-06-12T09:52:47.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,12 @@
 
 - Added support for row validation using XSD schema using `rowValidationXSDPath` option when reading XML files with a row tag using `rowTag` option.
 
+### Snowpark Local Testing Updates
+
+#### Bug Fixes
+
+- Fixed a bug when processing windowed functions that led to incorrect indexing in results.
+
 ## 1.33.0 (YYYY-MM-DD)
 
 ### Snowpark Python API Updates
diff --git a/src/snowflake/snowpark/mock/_plan.py b/src/snowflake/snowpark/mock/_plan.py
@@ -2596,6 +2596,7 @@ def _match_pattern(row) -> bool:
 
         # Process window frame specification
         # Reference: https://docs.snowflake.com/en/sql-reference/functions-analytic#window-frame-usage-notes
+        pd_index = res_index
         if not window_spec.frame_spec or not isinstance(
             window_spec.frame_spec, SpecifiedWindowFrame
         ):
@@ -2609,13 +2610,18 @@ def _match_pattern(row) -> bool:
                     True,
                     False,
                 )
+
+                # Pandas reindexes the data when generating rows in a RollingGroupby
+                # The resulting index is not exposed in the window groupings so calculate it here
+                if not isinstance(windows, list):
+                    pd_index = list(windows.count().index)
             else:
                 indexer = EntireWindowIndexer()
                 rolling = res.rolling(indexer)
                 windows = [ordered.loc[w.index] for w in rolling]
                 # rolling can unpredictably change the index of the data
                 # apply a trivial function to materialize the final index
-                res_index = list(rolling.count().index)
+                pd_index = list(rolling.count().index)
 
         elif isinstance(window_spec.frame_spec.frame_type, RowFrame):
             indexer = RowFrameIndexer(frame_spec=window_spec.frame_spec)
@@ -2663,14 +2669,16 @@ def get_bound(bound):
         # compute window function:
         if isinstance(window_function, (FunctionExpression,)):
             res_cols = []
-            for current_row, w in zip(res_index, windows):
-                res_cols.append(
-                    handle_function_expression(
-                        window_function, w, analyzer, expr_to_alias, current_row
-                    )
+
+            for current_row, w in zip(pd_index, windows):
+                result = handle_function_expression(
+                    window_function, w, analyzer, expr_to_alias, current_row
                 )
+                result.index = [current_row]
+                res_cols.append(result)
+
             res_col = pd.concat(res_cols) if res_cols else ColumnEmulator([])
-            res_col.index = res_index
+            res_col.reindex(res_index)
             if res_cols:
                 res_col.sf_type = res_cols[0].sf_type
             else:
diff --git a/tests/mock/test_functions.py b/tests/mock/test_functions.py
@@ -30,6 +30,7 @@
     min,
     rank,
     row_number,
+    sum,
     to_char,
     to_date,
 )
@@ -512,6 +513,36 @@ def test_rank(session):
     )
 
 
+def test_window_indexing(session):  # regression test for window result indexing
+    df = session.create_dataframe(
+        [  # VAL is always 1, so each sum equals its partition size
+            [1, 1, 1],
+            [2, 2, 1],
+            [2, 2, 1],
+            [2, 1, 1],
+        ],
+        ["A", "B", "VAL"],
+    )
+
+    window_a = Window.partition_by("A")  # partition on A only
+    window_both = Window.partition_by("B", "A")  # partition on the (B, A) pair
+
+    windowed = df.with_columns(  # both window sums are added in a single call
+        ["_A", "_BA"],
+        [sum("VAL").over(window_a), sum("VAL").over(window_both)],
+    )
+
+    Utils.check_answer(  # expected values are listed as A, B, VAL, _A, _BA
+        windowed,
+        [
+            Row(1, 1, 1, 1, 1),  # A=1 is the only row in both of its partitions
+            Row(2, 2, 1, 3, 2),  # A=2 has three rows; (B=2, A=2) has two
+            Row(2, 2, 1, 3, 2),  # duplicate of the previous input row
+            Row(2, 1, 1, 3, 1),  # (B=1, A=2) holds just this row
+        ],
+    )
+
+
 def test_get(session):
     data = [
         Row(101, 1, "cat"),