Skip to content

Commit 551946c

Browse files
Feat 571 Basic metadata validators (#556)
* subject and data description validation * procedures * feat: removes unused imports * feat: remove comment * feat: uses by_alias when writing field names --------- Co-authored-by: jtyoung84 <[email protected]>
1 parent f9e61ec commit 551946c

File tree

4 files changed

+162
-47
lines changed

4 files changed

+162
-47
lines changed

src/aind_data_schema/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .imaging.acquisition import Acquisition, Axis
1313
from .imaging.instrument import Instrument
1414
from .imaging.mri_session import MriSession
15+
from .metadata import Metadata
1516
from .ophys.ophys_rig import OphysRig
1617
from .ophys.ophys_session import OphysSession
1718
from .procedures import Procedures
@@ -44,4 +45,5 @@
4445
"MriSession",
4546
"Rig",
4647
"Session",
48+
"Metadata",
4749
]

src/aind_data_schema/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def write_standard_file(self, output_directory: Optional[Path] = None, prefix=No
154154
filename = output_directory / filename
155155

156156
with open(filename, "w") as f:
157-
f.write(self.json(indent=3))
157+
f.write(self.json(indent=3, by_alias=True))
158158

159159

160160
class _TypeEnumSubset(object):

src/aind_data_schema/metadata.py

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,19 @@
22

33
from datetime import datetime
44
from enum import Enum
5-
from typing import Dict, List
5+
from typing import Dict, List, Optional
6+
from uuid import UUID, uuid4
67

7-
from pydantic import Extra, Field
8+
from pydantic import Field, root_validator, validate_model
89

910
from aind_data_schema.base import AindCoreModel
11+
from aind_data_schema.data_description import DataDescription
12+
from aind_data_schema.imaging.acquisition import Acquisition
13+
from aind_data_schema.imaging.instrument import Instrument
14+
from aind_data_schema.procedures import Procedures
15+
from aind_data_schema.processing import Processing
16+
from aind_data_schema.rig import Rig
17+
from aind_data_schema.session import Session
1018
from aind_data_schema.subject import Subject
1119

1220

@@ -29,8 +37,10 @@ class Metadata(AindCoreModel):
2937
"""The records in the Data Asset Collection needs to contain certain fields
3038
to easily query and index the data."""
3139

32-
id: str = Field(
33-
...,
40+
schema_version: str = Field("0.0.2", description="schema version", title="Version", const=True)
41+
42+
id: UUID = Field(
43+
default_factory=uuid4,
3444
alias="_id",
3545
title="Data Asset ID",
3646
description="The unique id of the data asset.",
@@ -40,31 +50,90 @@ class Metadata(AindCoreModel):
4050
description="Name of the data asset.",
4151
title="Data Asset Name",
4252
)
53+
# We'll set created and last_modified defaults using the root_validator
54+
# to ensure they're synced on creation
4355
created: datetime = Field(
44-
...,
56+
default_factory=datetime.utcnow,
4557
title="Created",
46-
description="The data and time the data asset created.",
58+
description="The utc date and time the data asset created.",
4759
)
4860
last_modified: datetime = Field(
49-
..., title="Last Modified", description="The date and time that the data asset was last modified."
61+
default_factory=datetime.utcnow,
62+
title="Last Modified",
63+
description="The utc date and time that the data asset was last modified.",
5064
)
5165
location: str = Field(
5266
...,
5367
title="Location",
5468
description="Current location of the data asset.",
5569
)
56-
metadata_status: MetadataStatus = Field(..., title=" Metadata Status", description="The status of the metadata.")
57-
schema_version: str = Field("0.0.1", title="Schema Version", const=True)
70+
metadata_status: MetadataStatus = Field(
71+
default=MetadataStatus.UNKNOWN, title=" Metadata Status", description="The status of the metadata."
72+
)
5873
external_links: List[Dict[ExternalPlatforms, str]] = Field(
59-
..., title="External Links", description="Links to the data asset on different platforms."
74+
default=[], title="External Links", description="Links to the data asset on different platforms."
6075
)
61-
subject: Subject = Field(
62-
...,
76+
# We can make the AindCoreModel fields optional for now and do more
77+
# granular validations using validators. We may have some older data
78+
# assets in S3 that don't have metadata attached. We'd still like to
79+
# index that data, but we can flag those instances as MISSING or UNKNOWN
80+
subject: Optional[Subject] = Field(
81+
None,
6382
title="Subject",
64-
description="Description of a subject of data collection.",
83+
description="Subject of data collection.",
6584
)
85+
data_description: Optional[DataDescription] = Field(
86+
None, title="Data Description", description="A logical collection of data files."
87+
)
88+
procedures: Optional[Procedures] = Field(
89+
None, title="Procedures", description="All procedures performed on a subject."
90+
)
91+
session: Optional[Session] = Field(None, title="Session", description="Description of a session.")
92+
rig: Optional[Rig] = Field(None, title="Rig", description="Rig.")
93+
processing: Optional[Processing] = Field(None, title="Processing", description="All processes run on data.")
94+
acquisition: Optional[Acquisition] = Field(None, title="Acquisition", description="Imaging acquisition session")
95+
instrument: Optional[Instrument] = Field(
96+
None, title="Instrument", description="Instrument, which is a collection of devices"
97+
)
98+
99+
@root_validator(pre=False)
100+
def validate_metadata(cls, values):
101+
"""Validator for metadata"""
66102

67-
class Config:
68-
"""Need to allow for additional fields to append to base model"""
103+
# There's a simpler way to do this if we drop support for py37
104+
all_model_fields = []
105+
for field_name in cls.__fields__:
106+
field_to_check = cls.__fields__[field_name]
107+
try:
108+
if issubclass(field_to_check.type_, AindCoreModel):
109+
all_model_fields.append(field_to_check)
110+
except TypeError:
111+
# Type errors in python3.7 when using issubclass on type
112+
# generics
113+
pass
69114

70-
extra = Extra.allow
115+
# For each model field, check that it is present and check if the model
116+
# is valid. If it isn't valid, still add it, but mark MetadataStatus
117+
# as INVALID
118+
metadata_status = MetadataStatus.VALID
119+
for model_field in all_model_fields:
120+
model_class = model_field.type_
121+
model_name = model_field.name
122+
if values.get(model_name) is not None:
123+
model = values[model_name]
124+
# Since pre=False, the dictionaries get converted to models
125+
# upstream
126+
model_contents = model.dict()
127+
*_, validation_error = validate_model(model_class, model_contents)
128+
if validation_error:
129+
model_instance = model_class.construct(**model_contents)
130+
metadata_status = MetadataStatus.INVALID
131+
else:
132+
model_instance = model_class(**model_contents)
133+
values[model_name] = model_instance
134+
# For certain required fields, like subject, if they are not present,
135+
# mark the metadata record as missing
136+
if values.get("subject") is None:
137+
metadata_status = MetadataStatus.MISSING
138+
values["metadata_status"] = metadata_status
139+
return values

tests/test_metadata.py

Lines changed: 74 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,91 @@
1-
"""Tests Metadata model"""
1+
"""Tests metadata module"""
22

3+
import json
34
import unittest
4-
from datetime import datetime
55

6-
from aind_data_schema.metadata import ExternalPlatforms, Metadata, MetadataStatus
6+
from pydantic import ValidationError
7+
8+
from aind_data_schema.metadata import Metadata, MetadataStatus
79
from aind_data_schema.procedures import Procedures
8-
from aind_data_schema.subject import Subject
10+
from aind_data_schema.subject import Sex, Species, Subject
911

1012

1113
class TestMetadata(unittest.TestCase):
1214
"""Class to test Metadata model"""
1315

14-
def test_constructors(self):
15-
"""test building from component parts"""
16+
def test_valid_subject_info(self):
17+
"""Tests that the record is marked as VALID if a valid subject model
18+
is present."""
19+
s1 = Subject(
20+
species=Species.MUS_MUSCULUS,
21+
subject_id="123345",
22+
sex=Sex.MALE,
23+
date_of_birth="2020-10-10",
24+
genotype="Emx1-IRES-Cre;Camk2a-tTA;Ai93(TITL-GCaMP6f)",
25+
)
26+
d1 = Metadata(name="ecephys_655019_2023-04-03_18-17-09", location="bucket", subject=s1)
27+
self.assertEqual("ecephys_655019_2023-04-03_18-17-09", d1.name)
28+
self.assertEqual("bucket", d1.location)
29+
self.assertEqual(MetadataStatus.VALID, d1.metadata_status)
30+
self.assertEqual(s1, d1.subject)
31+
32+
# Test construction via dictionary
33+
d2 = Metadata(name="ecephys_655019_2023-04-03_18-17-09", location="bucket", subject=s1.dict())
34+
self.assertEqual(MetadataStatus.VALID, d2.metadata_status)
35+
self.assertEqual(s1, d2.subject)
36+
37+
# Test round-trip serialization
38+
# We may want to override the default file writer to always use by_alias
39+
# when writing the Metadata records. This sets the field in the json
40+
# file to _id instead of id, which makes it easier to write to
41+
# MongoDB.
42+
d3 = Metadata.parse_obj(json.loads(d2.json(by_alias=True)))
43+
self.assertEqual(d2, d3)
44+
45+
def test_missing_subject_info(self):
46+
"""Marks the metadata status as MISSING if a Subject model is not
47+
present"""
1648

1749
d1 = Metadata(
18-
_id="00000",
19-
name="Test Name",
20-
created=datetime(2023, 9, 27, 0, 0, 0),
21-
last_modified=datetime(2023, 9, 28, 10, 20, 30),
22-
location="Test Location",
23-
metadata_status=MetadataStatus.VALID,
24-
subject=Subject.construct(),
25-
external_links=[{ExternalPlatforms.CODEOCEAN: "abc-1234"}],
50+
name="ecephys_655019_2023-04-03_18-17-09",
51+
location="bucket",
52+
)
53+
self.assertEqual(MetadataStatus.MISSING, d1.metadata_status)
54+
self.assertEqual("ecephys_655019_2023-04-03_18-17-09", d1.name)
55+
self.assertEqual("bucket", d1.location)
56+
57+
# Assert at least a name and location are required
58+
with self.assertRaises(ValidationError) as e:
59+
Metadata()
60+
expected_exception_message = (
61+
"2 validation errors for Metadata\n"
62+
"name\n"
63+
" field required (type=value_error.missing)\n"
64+
"location\n"
65+
" field required (type=value_error.missing)"
66+
)
67+
self.assertEqual(expected_exception_message, str(e.exception))
68+
69+
def test_invalid_core_models(self):
70+
"""Test that invalid models don't raise an error, but marks the
71+
metadata_status as INVALID"""
72+
73+
# Invalid subject model
74+
d1 = Metadata(name="ecephys_655019_2023-04-03_18-17-09", location="bucket", subject=Subject.construct())
75+
self.assertEqual(MetadataStatus.INVALID, d1.metadata_status)
76+
77+
# Valid subject model, but invalid procedures model
78+
s2 = Subject(
79+
species=Species.MUS_MUSCULUS,
80+
subject_id="123345",
81+
sex=Sex.MALE,
82+
date_of_birth="2020-10-10",
83+
genotype="Emx1-IRES-Cre;Camk2a-tTA;Ai93(TITL-GCaMP6f)",
2684
)
27-
self.assertIsNotNone(d1)
28-
self.assertEqual(d1.schema_version, "0.0.1")
29-
self.assertEqual(d1.location, "Test Location")
30-
self.assertTrue(hasattr(d1, "subject"))
3185
d2 = Metadata(
32-
_id="00000",
33-
name="Test Name",
34-
created=datetime(2023, 9, 27, 0, 0, 0),
35-
last_modified=datetime(2023, 9, 28, 10, 20, 30),
36-
location="Test Location",
37-
metadata_status=MetadataStatus.VALID,
38-
subject=Subject.construct(),
39-
procedures=Procedures.construct(),
40-
external_links=[{ExternalPlatforms.CODEOCEAN: "abc-1234"}],
86+
name="ecephys_655019_2023-04-03_18-17-09", location="bucket", subject=s2, procedures=Procedures.construct()
4187
)
42-
self.assertIsNotNone(d2)
43-
self.assertTrue(hasattr(d2, "procedures"))
44-
self.assertTrue(hasattr(d2, "external_links"))
88+
self.assertEqual(MetadataStatus.INVALID, d2.metadata_status)
4589

4690

4791
if __name__ == "__main__":

0 commit comments

Comments
 (0)