Skip to content

Commit 2824029

Browse files
SNOW-2866374: Enforce deterministic column ordering in infer_schema (#4021)
1 parent 6ffd9ea commit 2824029

File tree

7 files changed

+42
-1
lines changed

7 files changed

+42
-1
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
#### Improvements
1212

13+
- Improved `DataFrameReader` to return columns in deterministic order when using `INFER_SCHEMA`.
14+
1315
### Snowpark pandas API Updates
1416

1517
#### New Features

src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1299,8 +1299,13 @@ def create_file_format_statement(
12991299

13001300

13011301
def infer_schema_statement(
1302-
path: str, file_format_name: str, options: Optional[Dict[str, str]] = None
1302+
path: str,
1303+
file_format_name: str,
1304+
options: Optional[Dict[str, str]] = None,
13031305
) -> str:
1306+
"""
1307+
Note: Results are ordered by ORDER_ID and COLUMN_NAME for deterministic column ordering.
1308+
"""
13041309
return (
13051310
SELECT
13061311
+ STAR
@@ -1328,6 +1333,8 @@ def infer_schema_statement(
13281333
)
13291334
+ RIGHT_PARENTHESIS
13301335
+ RIGHT_PARENTHESIS
1336+
+ ORDER_BY
1337+
+ "ORDER_ID, COLUMN_NAME"
13311338
)
13321339

13331340

tests/integ/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ def connection(db_parameters, local_testing_mode):
237237
"user",
238238
"password",
239239
"private_key_file",
240+
"private_key_file_pwd",
240241
"host",
241242
"port",
242243
"database",

tests/integ/scala/test_dataframe_reader_suite.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
test_file_json = "testJson.json"
6868
test_file_json_same_schema = "testJsonSameSchema.json"
6969
test_file_json_new_schema = "testJsonNewSchema.json"
70+
test_file_json_dupe_column_id = "testJsonDupeColumnID.jsonl"
7071
test_file_avro = "test.avro"
7172
test_file_parquet = "test.parquet"
7273
test_file_all_data_types_parquet = "test_all_data_types.parquet"
@@ -194,6 +195,12 @@ def setup(session, resources_path, local_testing_mode):
194195
test_files.test_file_json,
195196
compress=False,
196197
)
198+
Utils.upload_to_stage(
199+
session,
200+
"@" + tmp_stage_name1,
201+
test_files.test_file_json_dupe_column_id,
202+
compress=False,
203+
)
197204
Utils.upload_to_stage(
198205
session,
199206
"@" + tmp_stage_only_json_file,
@@ -1255,6 +1262,22 @@ def test_read_json_with_infer_schema(session, mode):
12551262
]
12561263

12571264

1265+
@pytest.mark.skipif(
1266+
"config.getoption('local_testing_mode', default=False)",
1267+
reason="Local Testing does not support JSONL format or INFER_SCHEMA.",
1268+
)
1269+
@pytest.mark.parametrize("mode", ["select", "copy"])
1270+
def test_read_json_with_infer_schema_deterministic_column_order(session, mode):
1271+
"""Test that INFER_SCHEMA returns columns in deterministic order."""
1272+
json_path = f"@{tmp_stage_name1}/{test_file_json_dupe_column_id}"
1273+
1274+
# Run multiple times to verify deterministic ordering. Previously, column order would vary between runs:
1275+
# (a, b, c) vs (a, c, b), because columns b and c share the same order_id.
1276+
for _ in range(10):
1277+
df = get_reader(session, mode).option("INFER_SCHEMA", True).json(json_path)
1278+
assert df.columns == ['"a"', '"b"', '"c"']
1279+
1280+
12581281
@pytest.mark.skipif(
12591282
"config.getoption('local_testing_mode', default=False)",
12601283
reason="Local Testing does not support loading json with user specified schema.",
tests/resources/testJsonDupeColumnID.jsonl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"a":3.0, "b":3.5}
2+
{"a":4.0, "c":3.0}
3+
{"a":6, "c":7}

tests/unit/scala/test_utils_suite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@ def check_zip_files_and_close_stream(input_stream, expected_files):
345345
"resources/testCSVspecialFormat.csv",
346346
"resources/testJSONspecialFormat.json.gz",
347347
"resources/testJson.json",
348+
"resources/testJsonDupeColumnID.jsonl",
348349
"resources/testJsonNewSchema.json",
349350
"resources/testJsonSameSchema.json",
350351
"resources/test_all_data_types.parquet",

tests/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,10 @@ def test_file_json_same_schema(self):
15411541
def test_file_json_new_schema(self):
15421542
return os.path.join(self.resources_path, "testJsonNewSchema.json")
15431543

1544+
@property
1545+
def test_file_json_dupe_column_id(self):
1546+
return os.path.join(self.resources_path, "testJsonDupeColumnID.jsonl")
1547+
15441548
@property
15451549
def test_file_avro(self):
15461550
return os.path.join(self.resources_path, "test.avro")

0 commit comments

Comments
 (0)