Skip to content

Commit 3b1be63

Browse files
authored
refactor!: Remove support for reading collection metadata from schema.json (#201)
1 parent a030b76 commit 3b1be63

File tree

3 files changed

+9
-62
lines changed

3 files changed

+9
-62
lines changed

dataframely/_storage/parquet.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -169,13 +169,6 @@ def _collection_from_parquet(
169169
_read_serialized_collection(f"{prefix}{file}")
170170
)
171171

172-
# Backward compatibility: If the parquets do not have schema information,
173-
# fall back to looking for schema.json
174-
if not any(collection_types) and fs.exists(
175-
schema_file := fs.sep.join([path, "schema.json"])
176-
):
177-
collection_types.append(fs.read_text(schema_file))
178-
179172
return data, collection_types
180173

181174
@classmethod

dataframely/collection/collection.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -899,11 +899,6 @@ def read_parquet(
899899
all required members.
900900
ValidationError: If the collection cannot be validated.
901901
902-
Note:
903-
This method is backward compatible with older versions of dataframely
904-
in which the schema metadata was saved to `schema.json` files instead of
905-
being encoded into the parquet files.
906-
907902
Attention:
908903
Be aware that this method suffers from the same limitations as
909904
:meth:`serialize`.
@@ -966,10 +961,6 @@ def scan_parquet(
966961
parquet file into memory if `"validation"` is `"warn"` or `"allow"`
967962
and validation is required.
968963
969-
Note: This method is backward compatible with older versions of dataframely
970-
in which the schema metadata was saved to `schema.json` files instead of
971-
being encoded into the parquet files.
972-
973964
Attention:
974965
Be aware that this method suffers from the same limitations as
975966
:meth:`serialize`.
@@ -1337,11 +1328,8 @@ def _deserialize_types(
13371328
for t in serialized_collection_types:
13381329
if t is None:
13391330
continue
1340-
try:
1341-
collection_type = deserialize_collection(t)
1342-
collection_types.append(collection_type)
1343-
except (JSONDecodeError, plexc.ComputeError):
1344-
pass
1331+
collection_type = deserialize_collection(t)
1332+
collection_types.append(collection_type)
13451333

13461334
return collection_types
13471335

tests/collection/test_storage.py

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -390,41 +390,6 @@ def test_reconcile_collection_types(
390390
# ---------------------------- PARQUET SPECIFICS ---------------------------------- #
391391

392392

393-
@pytest.mark.parametrize("validation", ["warn", "allow", "forbid", "skip"])
394-
@pytest.mark.parametrize("lazy", [True, False])
395-
@pytest.mark.parametrize(
396-
"any_tmp_path",
397-
["tmp_path", pytest.param("s3_tmp_path", marks=pytest.mark.s3)],
398-
indirect=True,
399-
)
400-
def test_read_write_parquet_fallback_schema_json_success(
401-
any_tmp_path: str, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
402-
) -> None:
403-
# In https://github.com/Quantco/dataframely/pull/107, the
404-
# mechanism for storing collection metadata was changed.
405-
# Prior to this change, the metadata was stored in a `schema.json` file.
406-
# After this change, the metadata was moved into the parquet files.
407-
# This test verifies that the change was implemented in a backward-compatible manner:
408-
# The new code can still read parquet files that do not contain the metadata,
409-
# and will not call `validate` if the `schema.json` file is present.
410-
411-
# Arrange
412-
tester = ParquetCollectionStorageTester()
413-
collection = MyCollection.create_empty()
414-
tester.write_untyped(collection, any_tmp_path, lazy)
415-
416-
fs: AbstractFileSystem = url_to_fs(any_tmp_path)[0]
417-
with fs.open(fs.sep.join([any_tmp_path, "schema.json"]), "w") as f:
418-
f.write(collection.serialize())
419-
420-
# Act
421-
spy = mocker.spy(MyCollection, "validate")
422-
tester.read(MyCollection, any_tmp_path, lazy, validation=validation)
423-
424-
# Assert
425-
spy.assert_not_called()
426-
427-
428393
@pytest.mark.parametrize("validation", ["allow", "warn"])
429394
@pytest.mark.parametrize("lazy", [True, False])
430395
@pytest.mark.parametrize(
@@ -435,16 +400,17 @@ def test_read_write_parquet_fallback_schema_json_success(
435400
def test_read_write_parquet_schema_json_fallback_corrupt(
436401
any_tmp_path: str, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
437402
) -> None:
438-
"""If the schema.json file is present, but corrupt, we should always fall back to
403+
"""If schema information is present, but corrupt, we should always fall back to
439404
validating."""
440405
# Arrange
441406
collection = MyCollection.create_empty()
442407
tester = ParquetCollectionStorageTester()
443-
tester.write_untyped(collection, any_tmp_path, lazy)
444-
445-
fs: AbstractFileSystem = url_to_fs(any_tmp_path)[0]
446-
with fs.open(fs.sep.join([any_tmp_path, "schema.json"]), "w") as f:
447-
f.write("} this is not a valid JSON {")
408+
tester.write_untyped(
409+
collection,
410+
any_tmp_path,
411+
lazy,
412+
metadata={COLLECTION_METADATA_KEY: "} this is not a valid JSON {"},
413+
)
448414

449415
# Act
450416
spy = mocker.spy(MyCollection, "validate")

0 commit comments

Comments
 (0)