
Commit 464c588

Merge branch 'feat-344-slap' of https://github.com/AllenNeuralDynamics/aind-data-schema into feat-344-slap
2 parents 4cde7c3 + 7186c78

File tree: 13 files changed, +278 −57 lines

examples/data_description.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
    "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py",
-   "schema_version": "0.10.1",
+   "schema_version": "0.10.2",
    "license": "CC-BY-4.0",
    "creation_time": "2022-02-21T16:30:01",
    "name": "ecephys_12345_2022-02-21_16-30-01",

src/aind_data_schema/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,7 @@
 """ imports for AindModel subclasses
 """
 
-__version__ = "0.15.18"
+__version__ = "0.15.20"
 
 
 from .behavior.behavior_rig import BehaviorRig
@@ -12,6 +12,7 @@
 from .imaging.acquisition import Acquisition, Axis
 from .imaging.instrument import Instrument
 from .imaging.mri_session import MriSession
+from .metadata import Metadata
 from .ophys.ophys_rig import OphysRig
 from .ophys.ophys_session import OphysSession
 from .procedures import Procedures
@@ -44,4 +45,5 @@
     "MriSession",
     "Rig",
     "Session",
+    "Metadata",
 ]

src/aind_data_schema/base.py

Lines changed: 1 addition & 1 deletion
@@ -154,7 +154,7 @@ def write_standard_file(self, output_directory: Optional[Path] = None, prefix=No
             filename = output_directory / filename
 
         with open(filename, "w") as f:
-            f.write(self.json(indent=3))
+            f.write(self.json(indent=3, by_alias=True))
 
 
 class _TypeEnumSubset(object):
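For context, the by_alias=True change matters because the new Metadata model below aliases its id field to "_id"; without it, write_standard_file would emit the Python field name rather than the alias. A minimal pydantic v1 sketch (hypothetical toy model, not part of this commit) illustrating the difference:

# Hypothetical toy model (not from aind-data-schema) showing by_alias behavior
from pydantic import BaseModel, Field  # pydantic v1 API, as used by this repo


class AliasedExample(BaseModel):
    """Toy model with an aliased field, mirroring Metadata's alias="_id"."""

    record_id: str = Field("abc-123", alias="_id")

    class Config:
        allow_population_by_field_name = True


example = AliasedExample(record_id="abc-123")
print(example.json(indent=3))                 # serializes under "record_id"
print(example.json(indent=3, by_alias=True))  # serializes under "_id"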

src/aind_data_schema/data_description.py

Lines changed: 47 additions & 1 deletion
@@ -31,7 +31,12 @@ class DataRegex(Enum):
         f"^(?P<input>.+?_{RegexParts.DATE.value}_{RegexParts.TIME.value})_(?P<process_name>.+?)_(?P<c_date>"
         f"{RegexParts.DATE.value})_(?P<c_time>{RegexParts.TIME.value})"
     )
+    ANALYZED = (
+        f"^(?P<project_abbreviation>.+?)_(?P<analysis_name>.+?)_(?P<c_date>"
+        f"{RegexParts.DATE.value})_(?P<c_time>{RegexParts.TIME.value})$"
+    )
     NO_UNDERSCORES = "^[^_]+$"
+    NO_SPECIAL_CHARS = '^[^<>:;"/|? \\_]+$'
 
 
 class DataLevel(Enum):
@@ -198,7 +203,7 @@ class RelatedData(AindModel):
 class DataDescription(AindCoreModel):
     """Description of a logical collection of data files"""
 
-    schema_version: str = Field("0.10.1", title="Schema Version", const=True)
+    schema_version: str = Field("0.10.2", title="Schema Version", const=True)
     license: str = Field("CC-BY-4.0", title="License", const=True)
 
     creation_time: datetime = Field(
@@ -244,6 +249,7 @@ class DataDescription(AindCoreModel):
     )
     project_name: Optional[str] = Field(
         None,
+        regex=DataRegex.NO_SPECIAL_CHARS.value,
         description="A name for a set of coordinated activities intended to achieve one or more objectives.",
         title="Project Name",
     )
@@ -459,3 +465,43 @@ def parse_name(cls, name):
             subject_id=m.group("subject_id"),
             creation_time=creation_time,
         )
+
+
+class AnalysisDescription(DataDescription):
+    """A collection of data files as analyzed from an asset"""
+
+    data_level: DataLevel = Field(
+        DataLevel.DERIVED, description="Level of processing that data has undergone", title="Data Level", const=True
+    )
+    project_name: str = Field(
+        ...,
+        regex=DataRegex.NO_SPECIAL_CHARS.value,
+        description="Name of the project the analysis belongs to",
+        title="Project name",
+    )
+    analysis_name: str = Field(
+        ..., regex=DataRegex.NO_SPECIAL_CHARS.value, description="Name of the analysis performed", title="Analysis name"
+    )
+
+    @property
+    def label(self):
+        """returns the label of the file"""
+
+        return f"{self.project_name}_{self.analysis_name}"
+
+    @classmethod
+    def parse_name(cls, name):
+        """Decompose raw Analysis name into component parts"""
+
+        m = re.match(f"{DataRegex.ANALYZED.value}", name)
+
+        if m is None:
+            raise ValueError(f"name({name}) does not match pattern")
+
+        creation_time = datetime_from_name_string(m.group("c_date"), m.group("c_time"))
+
+        return dict(
+            project_abbreviation=m.group("project_abbreviation"),
+            analysis_name=m.group("analysis_name"),
+            creation_time=creation_time,
+        )
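Taken together, the new ANALYZED regex and AnalysisDescription define a {project}_{analysis}_{date}_{time} naming scheme for analysis assets, with NO_SPECIAL_CHARS forbidding underscores, spaces, and filesystem-unsafe characters in the project and analysis names. A rough usage sketch, mirroring the unit tests added to tests/test_data_description.py below:

from aind_data_schema.data_description import AnalysisDescription

# Decompose an analysis asset name into its parts (values taken from the tests below)
toks = AnalysisDescription.parse_name("project_analysis_3033-12-21_04-22-11")
print(toks["project_abbreviation"])  # "project"
print(toks["analysis_name"])         # "analysis"
print(toks["creation_time"])         # datetime(3033, 12, 21, 4, 22, 11)

# Constructing an AnalysisDescription with project_name="project" and
# analysis_name="analysis" gives label "project_analysis"; names containing
# underscores or spaces (e.g. "pro_ject", "ana lysis") fail the
# NO_SPECIAL_CHARS regex and raise a validation error.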

src/aind_data_schema/metadata.py

Lines changed: 85 additions & 16 deletions
@@ -2,11 +2,19 @@
 
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional
+from uuid import UUID, uuid4
 
-from pydantic import Extra, Field
+from pydantic import Field, root_validator, validate_model
 
 from aind_data_schema.base import AindCoreModel
+from aind_data_schema.data_description import DataDescription
+from aind_data_schema.imaging.acquisition import Acquisition
+from aind_data_schema.imaging.instrument import Instrument
+from aind_data_schema.procedures import Procedures
+from aind_data_schema.processing import Processing
+from aind_data_schema.rig import Rig
+from aind_data_schema.session import Session
 from aind_data_schema.subject import Subject
 
 
@@ -29,8 +37,10 @@ class Metadata(AindCoreModel):
     """The records in the Data Asset Collection needs to contain certain fields
     to easily query and index the data."""
 
-    id: str = Field(
-        ...,
+    schema_version: str = Field("0.0.2", description="schema version", title="Version", const=True)
+
+    id: UUID = Field(
+        default_factory=uuid4,
         alias="_id",
         title="Data Asset ID",
         description="The unique id of the data asset.",
@@ -40,31 +50,90 @@ class Metadata(AindCoreModel):
         description="Name of the data asset.",
         title="Data Asset Name",
     )
+    # We'll set created and last_modified defaults using the root_validator
+    # to ensure they're synced on creation
     created: datetime = Field(
-        ...,
+        default_factory=datetime.utcnow,
         title="Created",
-        description="The data and time the data asset created.",
+        description="The utc date and time the data asset created.",
     )
     last_modified: datetime = Field(
-        ..., title="Last Modified", description="The date and time that the data asset was last modified."
+        default_factory=datetime.utcnow,
+        title="Last Modified",
+        description="The utc date and time that the data asset was last modified.",
     )
     location: str = Field(
         ...,
         title="Location",
         description="Current location of the data asset.",
     )
-    metadata_status: MetadataStatus = Field(..., title=" Metadata Status", description="The status of the metadata.")
-    schema_version: str = Field("0.0.3", title="Schema Version", const=True)
+    metadata_status: MetadataStatus = Field(
+        default=MetadataStatus.UNKNOWN, title=" Metadata Status", description="The status of the metadata."
+    )
     external_links: List[Dict[ExternalPlatforms, str]] = Field(
-        ..., title="External Links", description="Links to the data asset on different platforms."
+        default=[], title="External Links", description="Links to the data asset on different platforms."
     )
-    subject: Subject = Field(
-        ...,
+    # We can make the AindCoreModel fields optional for now and do more
+    # granular validations using validators. We may have some older data
+    # assets in S3 that don't have metadata attached. We'd still like to
+    # index that data, but we can flag those instances as MISSING or UNKNOWN
+    subject: Optional[Subject] = Field(
+        None,
         title="Subject",
-        description="Description of a subject of data collection.",
+        description="Subject of data collection.",
     )
+    data_description: Optional[DataDescription] = Field(
+        None, title="Data Description", description="A logical collection of data files."
+    )
+    procedures: Optional[Procedures] = Field(
+        None, title="Procedures", description="All procedures performed on a subject."
+    )
+    session: Optional[Session] = Field(None, title="Session", description="Description of a session.")
+    rig: Optional[Rig] = Field(None, title="Rig", description="Rig.")
+    processing: Optional[Processing] = Field(None, title="Processing", description="All processes run on data.")
+    acquisition: Optional[Acquisition] = Field(None, title="Acquisition", description="Imaging acquisition session")
+    instrument: Optional[Instrument] = Field(
+        None, title="Instrument", description="Instrument, which is a collection of devices"
+    )
+
+    @root_validator(pre=False)
+    def validate_metadata(cls, values):
+        """Validator for metadata"""
 
-    class Config:
-        """Need to allow for additional fields to append to base model"""
+        # There's a simpler way to do this if we drop support for py37
+        all_model_fields = []
+        for field_name in cls.__fields__:
+            field_to_check = cls.__fields__[field_name]
+            try:
+                if issubclass(field_to_check.type_, AindCoreModel):
+                    all_model_fields.append(field_to_check)
+            except TypeError:
+                # Type errors in python3.7 when using issubclass on type
+                # generics
+                pass
 
-        extra = Extra.allow
+        # For each model field, check that is present and check if the model
+        # is valid. If it isn't valid, still add it, but mark MetadataStatus
+        # as INVALID
+        metadata_status = MetadataStatus.VALID
+        for model_field in all_model_fields:
+            model_class = model_field.type_
+            model_name = model_field.name
+            if values.get(model_name) is not None:
+                model = values[model_name]
+                # Since pre=False, the dictionaries get converted to models
+                # upstream
+                model_contents = model.dict()
+                *_, validation_error = validate_model(model_class, model_contents)
+                if validation_error:
+                    model_instance = model_class.construct(**model_contents)
+                    metadata_status = MetadataStatus.INVALID
+                else:
+                    model_instance = model_class(**model_contents)
+                values[model_name] = model_instance
+        # For certain required fields, like subject, if they are not present,
+        # mark the metadata record as missing
+        if values.get("subject") is None:
+            metadata_status = MetadataStatus.MISSING
+        values["metadata_status"] = metadata_status
+        return values
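In practice, the new root_validator downgrades metadata_status instead of raising: a record without a subject is marked MISSING, and a sub-model that fails its own validation is still attached via construct() but flags the record INVALID. A minimal sketch of the intended behavior (field values are illustrative only, and assume just the defaults shown above):

from aind_data_schema.metadata import Metadata, MetadataStatus

# Only the required name and location are supplied; id, created, and
# last_modified come from their default_factory functions.
record = Metadata(
    name="ecephys_12345_2022-02-21_16-30-01",
    location="s3://example-bucket/ecephys_12345_2022-02-21_16-30-01",  # illustrative location
)

# No subject was provided, so the validator marks the whole record MISSING.
assert record.metadata_status == MetadataStatus.MISSING
print(record.id, record.created, record.last_modified)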

tests/resources/ephys_data_description/data_description_0.6.2.json

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
    "ror_id": "04szwah67",
    "data_level": "raw",
    "group": "ephys",
-   "project_name": "MRI-Guided Elecrophysiology",
+   "project_name": "mri-guided-electrophysiology",
    "experiment_type": "ecephys",
    "subject_id": "661279",
    "data_summary": "This dataset was collected to evaluate the accuracy and feasibility of the AIND MRI-guided insertion pipeline. One probe targets the retinotopic center of LGN, with drifting grating for receptive field mapping to evaluate targeting. Other targets can be evaluated in histology."

tests/resources/ephys_data_description/data_description_0.6.2_wrong_field.json

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
    "ror_id": "04szwah67",
    "data_level": "raw",
    "group": "ephys",
-   "project_name": "MRI-Guided Elecrophysiology",
+   "project_name": "mri-guided-electrophysiology",
    "experiment_type": "ecephys",
    "subject_id": "661279",
    "data_summary": "This dataset was collected to evaluate the accuracy and feasibility of the AIND MRI-guided insertion pipeline. One probe targets the retinotopic center of LGN, with drifting grating for receptive field mapping to evaluate targeting. Other targets can be evaluated in histology."

tests/test_data_description.py

Lines changed: 63 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import List
 
 from aind_data_schema.data_description import (
+    AnalysisDescription,
     DataDescription,
     DerivedDataDescription,
     Funding,
@@ -38,6 +39,7 @@ def setUpClass(cls):
     BAD_NAME = "fizzbuzz"
     BASIC_NAME = "ecephys_1234_3033-12-21_04-22-11"
     DERIVED_NAME = "ecephys_1234_3033-12-21_04-22-11_spikesorted-ks25_2022-10-12_23-23-11"
+    ANALYSIS_NAME = "project_analysis_3033-12-21_04-22-11"
 
     def test_constructors(self):
         """test building from component parts"""
@@ -120,6 +122,59 @@ def test_constructors(self):
             investigators=["Jane Smith"],
         )
 
+        ad = AnalysisDescription(
+            analysis_name="analysis",
+            project_name="project",
+            creation_time=dt,
+            subject_id="1234",
+            modality=[Modality.SPIM],
+            platform="exaspim",
+            institution=Institution.AIND,
+            funding_source=[f],
+            investigators=["Jane Smith"],
+        )
+
+        self.assertEqual(ad.label, "project_analysis")
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="ana lysis",
+                project_name="pro_ject",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="",
+                project_name="project",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="analysis",
+                project_name="",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
     def test_round_trip(self):
         """make sure we can round trip from json"""
 
@@ -167,6 +222,14 @@ def test_parse_name(self):
         with self.assertRaises(ValueError):
             toks = DerivedDataDescription.parse_name(self.BAD_NAME)
 
+        toks = AnalysisDescription.parse_name(self.ANALYSIS_NAME)
+        assert toks["project_abbreviation"] == "project"
+        assert toks["analysis_name"] == "analysis"
+        assert toks["creation_time"] == datetime.datetime(3033, 12, 21, 4, 22, 11)
+
+        with self.assertRaises(ValueError):
+            toks = AnalysisDescription.parse_name(self.BAD_NAME)
+
     def test_abbreviation_enums(self):
         """Tests that BaseName enums can be constructed from abbreviations"""
         # Tests that Modality constructed as expected
