Skip to content

Commit 7d1b222

Browse files
committed
Enforce deterministic column ordering from infer_schema
1 parent 4706f25 commit 7d1b222

File tree

5 files changed

+41
-2
lines changed

5 files changed

+41
-2
lines changed

src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,9 +1299,16 @@ def create_file_format_statement(
12991299

13001300

13011301
def infer_schema_statement(
1302-
path: str, file_format_name: str, options: Optional[Dict[str, str]] = None
1302+
path: str,
1303+
file_format_name: str,
1304+
options: Optional[Dict[str, str]] = None,
1305+
ordered: bool = True,
13031306
) -> str:
1304-
return (
1307+
"""
1308+
Note: If ordered is set to True (default), results are ordered by ORDER_ID and
1309+
COLUMN_NAME for deterministic column ordering. Set to False to disable ordering.
1310+
"""
1311+
base_query = (
13051312
SELECT
13061313
+ STAR
13071314
+ FROM
@@ -1330,6 +1337,11 @@ def infer_schema_statement(
13301337
+ RIGHT_PARENTHESIS
13311338
)
13321339

1340+
if ordered:
1341+
base_query += ORDER_BY + "ORDER_ID, COLUMN_NAME"
1342+
1343+
return base_query
1344+
13331345

13341346
def file_operation_statement(
13351347
command: str, file_name: str, stage_location: str, options: Dict[str, str]

tests/integ/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ def connection(db_parameters, local_testing_mode):
237237
"user",
238238
"password",
239239
"private_key_file",
240+
"private_key_file_pwd",
240241
"host",
241242
"port",
242243
"database",

tests/integ/scala/test_dataframe_reader_suite.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
test_file_json = "testJson.json"
6868
test_file_json_same_schema = "testJsonSameSchema.json"
6969
test_file_json_new_schema = "testJsonNewSchema.json"
70+
test_file_json_dupe_column_id = "testJsonDupeColumnID.jsonl"
7071
test_file_avro = "test.avro"
7172
test_file_parquet = "test.parquet"
7273
test_file_all_data_types_parquet = "test_all_data_types.parquet"
@@ -194,6 +195,12 @@ def setup(session, resources_path, local_testing_mode):
194195
test_files.test_file_json,
195196
compress=False,
196197
)
198+
Utils.upload_to_stage(
199+
session,
200+
"@" + tmp_stage_name1,
201+
test_files.test_file_json_dupe_column_id,
202+
compress=False,
203+
)
197204
Utils.upload_to_stage(
198205
session,
199206
"@" + tmp_stage_only_json_file,
@@ -1255,6 +1262,18 @@ def test_read_json_with_infer_schema(session, mode):
12551262
]
12561263

12571264

1265+
@pytest.mark.parametrize("mode", ["select", "copy"])
1266+
def test_read_json_with_infer_schema_deterministic_column_order(session, mode):
1267+
"""Test that INFER_SCHEMA returns columns in deterministic order."""
1268+
json_path = f"@{tmp_stage_name1}/{test_file_json_dupe_column_id}"
1269+
1270+
# Run multiple times to verify deterministic ordering. Previously, column order would vary between runs:
1271+
# (a, b, c) vs (a, c, b), because columns b and c share the same order_id.
1272+
for _ in range(10):
1273+
df = get_reader(session, mode).option("INFER_SCHEMA", True).json(json_path)
1274+
assert df.columns == ['"a"', '"b"', '"c"']
1275+
1276+
12581277
@pytest.mark.skipif(
12591278
"config.getoption('local_testing_mode', default=False)",
12601279
reason="Local Testing does not support loading json with user specified schema.",

testJsonDupeColumnID.jsonl (new test resource file; directory not captured in this extract)

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"a":3.0, "b":3.5}
2+
{"a":4.0, "c":3.0}
3+
{"a":6, "c":7}

tests/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,10 @@ def test_file_json_same_schema(self):
15411541
def test_file_json_new_schema(self):
15421542
return os.path.join(self.resources_path, "testJsonNewSchema.json")
15431543

1544+
@property
1545+
def test_file_json_dupe_column_id(self):
1546+
return os.path.join(self.resources_path, "testJsonDupeColumnID.jsonl")
1547+
15441548
@property
15451549
def test_file_avro(self):
15461550
return os.path.join(self.resources_path, "test.avro")

0 commit comments

Comments
 (0)