SNOW-2752334: Fix overlap handling when parsing XML file (#4008)

sfc-gh-aalam · web-flow · commit 48e715d8ff1d · 2025-11-21T17:19:55.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -53,6 +53,10 @@
 
 - Catalog API no longer uses types declared in `snowflake.core` and therefore this dependency was removed.
 
+#### Bug Fixes
+
+- Fixed a bug in `XMLReader` where finding the start position of a row tag could return an incorrect file position.
+
 ### Snowpark pandas API Updates
 
 #### New Features
diff --git a/src/snowflake/snowpark/_internal/xml_reader.py b/src/snowflake/snowpark/_internal/xml_reader.py
@@ -205,10 +205,6 @@ def find_next_opening_tag_pos(
         chunk = file_obj.read(current_chunk_size)
         if not chunk:
             raise EOFError("Reached end of file before finding opening tag")
-        # If the chunk is smaller than expected, we are near the end.
-        if len(chunk) < current_chunk_size:
-            if chunk.find(tag_start_1) == -1 and chunk.find(tag_start_2) == -1:
-                raise EOFError("Reached end of file before finding opening tag")
 
         # Combine leftover from previous read with the new chunk.
         data = overlap + chunk
@@ -233,9 +229,6 @@ def find_next_opening_tag_pos(
         # Update the overlap from the end of the combined data.
         overlap = data[-overlap_size:] if len(data) >= overlap_size else data
 
-        # Otherwise, rewind by the length of the overlap so that a tag spanning the boundary isn't missed.
-        file_obj.seek(-len(overlap), 1)
-
         # Check that progress is being made to avoid infinite loops.
         if file_obj.tell() <= pos_before:
             raise EOFError("No progress made while searching for opening tag")
diff --git a/tests/integ/scala/test_dataframe_aggregate_suite.py b/tests/integ/scala/test_dataframe_aggregate_suite.py
@@ -591,7 +591,7 @@ def test_group_by_grouping_sets(session):
             .with_column("medical_license", lit(None))
             .select("medical_license", "radio_license", "count")
         )
-        .sort(col("count"))
+        .sort(col("count"), col("radio_license"))
         .collect()
     )
 
@@ -601,16 +601,16 @@ def test_group_by_grouping_sets(session):
             GroupingSets([col("medical_license")], [col("radio_license")])
         )
         .agg(count(col("*")).as_("count"))
-        .sort(col("count"))
+        .sort(col("count"), col("radio_license"))
     )
 
     Utils.check_answer(grouping_sets, result, sort=False)
 
     Utils.check_answer(
         grouping_sets,
         [
-            Row(None, "General", 1),
             Row(None, "Amateur Extra", 1),
+            Row(None, "General", 1),
             Row("RN", None, 2),
             Row(None, "Technician", 2),
             Row(None, None, 3),
@@ -624,8 +624,8 @@ def test_group_by_grouping_sets(session):
         TestData.nurse(session)
         .group_by("medical_license", "radio_license")
         .agg(count(col("*")).as_("count"))
-        .sort(col("count"), col("medical_license"), col("radio_license"))
-        .select("count", "medical_license", "radio_license"),
+        .select("count", "medical_license", "radio_license")
+        .sort(col("count"), col("medical_license"), col("radio_license")),
         [
             Row(1, "LVN", "General"),
             Row(1, "RN", None),
@@ -775,11 +775,13 @@ def test_rel_grouped_dataframe_median(session):
 def test_builtin_functions(session):
     df = session.create_dataframe([(1, 11), (2, 12), (1, 13)]).to_df(["a", "b"])
 
-    assert df.group_by("a").builtin("max")(col("a"), col("b")).collect() == [
+    assert df.group_by("a").builtin("max")(col("a"), col("b")).sort(
+        col("a")
+    ).collect() == [
         Row(1, 1, 13),
         Row(2, 2, 12),
     ]
-    assert df.group_by("a").builtin("max")(col("b")).collect() == [
+    assert df.group_by("a").builtin("max")(col("b")).sort(col("a")).collect() == [
         Row(1, 13),
         Row(2, 12),
     ]
@@ -828,16 +830,16 @@ def test_non_empty_arg_functions(session):
 
 
 def test_null_count(session):
-    assert TestData.test_data3(session).group_by("a").agg(
-        count(col("b"))
+    assert TestData.test_data3(session).group_by("a").agg(count(col("b"))).sort(
+        col("a")
     ).collect() == [
         Row(1, 0),
         Row(2, 1),
     ]
 
     assert TestData.test_data3(session).group_by("a").agg(
         count(col("a") + col("b"))
-    ).collect() == [Row(1, 0), Row(2, 1)]
+    ).sort(col("a")).collect() == [Row(1, 0), Row(2, 1)]
 
     assert TestData.test_data3(session).agg(
         [
@@ -1147,9 +1149,12 @@ def test_ints_in_agg_exprs_are_taken_as_groupby_ordinal(session):
         [lit(6), lit(7), sum(col("b"))]
     ).collect() == [Row(3, 4, 6, 7, 9)]
 
-    assert TestData.test_data2(session).group_by([lit(3), lit(4)]).agg(
-        [lit(6), col("b"), sum(col("b"))]
-    ).collect() == [Row(3, 4, 6, 1, 3), Row(3, 4, 6, 2, 6)]
+    Utils.check_answer(
+        TestData.test_data2(session)
+        .group_by([lit(3), lit(4)])
+        .agg([lit(6), col("b"), sum(col("b"))]),
+        [Row(3, 4, 6, 1, 3), Row(3, 4, 6, 2, 6)],
+    )
 
 
 @pytest.mark.xfail(
diff --git a/tests/unit/test_xml_reader.py b/tests/unit/test_xml_reader.py
@@ -389,6 +389,44 @@ def test_find_next_opening_tag_pos_normal(chunk_size):
     assert pos == expected_pos
 
 
+@pytest.mark.parametrize("chunk_size", [10, 100, DEFAULT_CHUNK_SIZE])
+def test_find_next_opening_tag_pos_full_chunk_before_tag(chunk_size):
+    # This tests that the overlap logic works correctly when multiple chunks
+    # must be read before finding the tag.
+    prefix = b"x" * (chunk_size * 2 + 10)  # More than 2 full chunks
+    record = prefix + b"<row attr='value'> more content here </row>"
+    file_obj = io.BytesIO(record)
+    tag_start_1 = b"<row>"
+    tag_start_2 = b"<row "
+    end_limit = len(record)
+    pos = find_next_opening_tag_pos(
+        file_obj, tag_start_1, tag_start_2, end_limit, chunk_size=chunk_size
+    )
+    # Should find the first tag after all the prefix data
+    expected_pos = len(prefix)
+    assert pos == expected_pos
+    # Verify file pointer is at the correct position
+    assert file_obj.tell() == expected_pos
+
+
+@pytest.mark.parametrize("chunk_size", [10, 100, DEFAULT_CHUNK_SIZE])
+def test_find_next_opening_tag_pos_tag_spans_chunk_boundary(chunk_size):
+    # Position the tag so that it is split across a chunk boundary, which is
+    # the most challenging case for the overlap logic: the tag starts 2 bytes
+    # before the boundary, so only b"<r" falls in the first chunk_size bytes.
+    prefix = b"x" * (chunk_size - 2)
+    record = prefix + b"<row attr='value'> content </row>"
+    file_obj = io.BytesIO(record)
+    tag_start_1 = b"<row>"
+    tag_start_2 = b"<row "
+    end_limit = len(record)
+    pos = find_next_opening_tag_pos(
+        file_obj, tag_start_1, tag_start_2, end_limit, chunk_size=chunk_size
+    )
+    expected_pos = len(prefix)
+    assert pos == expected_pos
+
+
 @pytest.mark.parametrize("chunk_size", [3, 10, DEFAULT_CHUNK_SIZE])
 def test_find_next_opening_tag_pos_both_variants(chunk_size):
     # Test when both "<row>" and "<row " exist.