Skip to content

Commit 3b1be63

Browse files
authored
refactor!: Remove support for reading collection metadata from schema.json (#201)
1 parent a030b76 commit 3b1be63

File tree

3 files changed

+9
-62
lines changed

3 files changed

+9
-62
lines changed

dataframely/_storage/parquet.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -169,13 +169,6 @@ def _collection_from_parquet(
169169
_read_serialized_collection(f"{prefix}{file}")
170170
)
171171

172-
# Backward compatibility: If the parquets do not have schema information,
173-
# fall back to looking for schema.json
174-
if not any(collection_types) and fs.exists(
175-
schema_file := fs.sep.join([path, "schema.json"])
176-
):
177-
collection_types.append(fs.read_text(schema_file))
178-
179172
return data, collection_types
180173

181174
@classmethod

dataframely/collection/collection.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -899,11 +899,6 @@ def read_parquet(
899899
all required members.
900900
ValidationError: If the collection cannot be validated.
901901
902-
Note:
903-
This method is backward compatible with older versions of dataframely
904-
in which the schema metadata was saved to `schema.json` files instead of
905-
being encoded into the parquet files.
906-
907902
Attention:
908903
Be aware that this method suffers from the same limitations as
909904
:meth:`serialize`.
@@ -966,10 +961,6 @@ def scan_parquet(
966961
parquet file into memory if `"validation"` is `"warn"` or `"allow"`
967962
and validation is required.
968963
969-
Note: This method is backward compatible with older versions of dataframely
970-
in which the schema metadata was saved to `schema.json` files instead of
971-
being encoded into the parquet files.
972-
973964
Attention:
974965
Be aware that this method suffers from the same limitations as
975966
:meth:`serialize`.
@@ -1337,11 +1328,8 @@ def _deserialize_types(
13371328
for t in serialized_collection_types:
13381329
if t is None:
13391330
continue
1340-
try:
1341-
collection_type = deserialize_collection(t)
1342-
collection_types.append(collection_type)
1343-
except (JSONDecodeError, plexc.ComputeError):
1344-
pass
1331+
collection_type = deserialize_collection(t)
1332+
collection_types.append(collection_type)
13451333

13461334
return collection_types
13471335

tests/collection/test_storage.py

Lines changed: 7 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -390,41 +390,6 @@ def test_reconcile_collection_types(
390390
# ---------------------------- PARQUET SPECIFICS ---------------------------------- #
391391

392392

393-
@pytest.mark.parametrize("validation", ["warn", "allow", "forbid", "skip"])
394-
@pytest.mark.parametrize("lazy", [True, False])
395-
@pytest.mark.parametrize(
396-
"any_tmp_path",
397-
["tmp_path", pytest.param("s3_tmp_path", marks=pytest.mark.s3)],
398-
indirect=True,
399-
)
400-
def test_read_write_parquet_fallback_schema_json_success(
401-
any_tmp_path: str, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
402-
) -> None:
403-
# In https://github.com/Quantco/dataframely/pull/107, the
404-
# mechanism for storing collection metadata was changed.
405-
# Prior to this change, the metadata was stored in a `schema.json` file.
406-
# After this change, the metadata was moved into the parquet files.
407-
# This test verifies that the change was implemented in a backward-compatible manner:
408-
# The new code can still read parquet files that do not contain the metadata,
409-
# and will not call `validate` if the `schema.json` file is present.
410-
411-
# Arrange
412-
tester = ParquetCollectionStorageTester()
413-
collection = MyCollection.create_empty()
414-
tester.write_untyped(collection, any_tmp_path, lazy)
415-
416-
fs: AbstractFileSystem = url_to_fs(any_tmp_path)[0]
417-
with fs.open(fs.sep.join([any_tmp_path, "schema.json"]), "w") as f:
418-
f.write(collection.serialize())
419-
420-
# Act
421-
spy = mocker.spy(MyCollection, "validate")
422-
tester.read(MyCollection, any_tmp_path, lazy, validation=validation)
423-
424-
# Assert
425-
spy.assert_not_called()
426-
427-
428393
@pytest.mark.parametrize("validation", ["allow", "warn"])
429394
@pytest.mark.parametrize("lazy", [True, False])
430395
@pytest.mark.parametrize(
@@ -435,16 +400,17 @@ def test_read_write_parquet_fallback_schema_json_success(
435400
def test_read_write_parquet_schema_json_fallback_corrupt(
436401
any_tmp_path: str, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
437402
) -> None:
438-
"""If the schema.json file is present, but corrupt, we should always fall back to
403+
"""If schema information is present, but corrupt, we should always fall back to
439404
validating."""
440405
# Arrange
441406
collection = MyCollection.create_empty()
442407
tester = ParquetCollectionStorageTester()
443-
tester.write_untyped(collection, any_tmp_path, lazy)
444-
445-
fs: AbstractFileSystem = url_to_fs(any_tmp_path)[0]
446-
with fs.open(fs.sep.join([any_tmp_path, "schema.json"]), "w") as f:
447-
f.write("} this is not a valid JSON {")
408+
tester.write_untyped(
409+
collection,
410+
any_tmp_path,
411+
lazy,
412+
metadata={COLLECTION_METADATA_KEY: "} this is not a valid JSON {"},
413+
)
448414

449415
# Act
450416
spy = mocker.spy(MyCollection, "validate")

0 commit comments

Comments
 (0)