Merge pull request #535 from AllenNeuralDynamics/feat-500-add-AnalysisDescription

dyf · web-flow · commit 35eb87b00885 · 2023-11-01T08:08:03.000-07:00
Feat 500 add Analysis Description
diff --git a/examples/data_description.json b/examples/data_description.json
@@ -1,6 +1,6 @@
 {
    "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py",
-   "schema_version": "0.10.1",
+   "schema_version": "0.10.2",
    "license": "CC-BY-4.0",
    "creation_time": "2022-02-21T16:30:01",
    "name": "ecephys_12345_2022-02-21_16-30-01",
diff --git a/src/aind_data_schema/data_description.py b/src/aind_data_schema/data_description.py
@@ -31,7 +31,12 @@ class DataRegex(Enum):
         f"^(?P<input>.+?_{RegexParts.DATE.value}_{RegexParts.TIME.value})_(?P<process_name>.+?)_(?P<c_date>"
         f"{RegexParts.DATE.value})_(?P<c_time>{RegexParts.TIME.value})"
     )
+    ANALYZED = (
+        f"^(?P<project_abbreviation>.+?)_(?P<analysis_name>.+?)_(?P<c_date>"
+        f"{RegexParts.DATE.value})_(?P<c_time>{RegexParts.TIME.value})$"
+    )
     NO_UNDERSCORES = "^[^_]+$"
+    NO_SPECIAL_CHARS = '^[^<>:;"/|? \\_]+$'
 
 
 class DataLevel(Enum):
@@ -198,7 +203,7 @@ class RelatedData(AindModel):
 class DataDescription(AindCoreModel):
     """Description of a logical collection of data files"""
 
-    schema_version: str = Field("0.10.1", title="Schema Version", const=True)
+    schema_version: str = Field("0.10.2", title="Schema Version", const=True)
     license: str = Field("CC-BY-4.0", title="License", const=True)
 
     creation_time: datetime = Field(
@@ -244,6 +249,7 @@ class DataDescription(AindCoreModel):
     )
     project_name: Optional[str] = Field(
         None,
+        regex=DataRegex.NO_SPECIAL_CHARS.value,
         description="A name for a set of coordinated activities intended to achieve one or more objectives.",
         title="Project Name",
     )
@@ -459,3 +465,43 @@ def parse_name(cls, name):
             subject_id=m.group("subject_id"),
             creation_time=creation_time,
         )
+
+
+class AnalysisDescription(DataDescription):
+    """A collection of data files as analyzed from an asset; data_level is fixed to DERIVED"""
+
+    data_level: DataLevel = Field(
+        DataLevel.DERIVED, description="Level of processing that data has undergone", title="Data Level", const=True
+    )
+    project_name: str = Field(
+        ...,
+        regex=DataRegex.NO_SPECIAL_CHARS.value,
+        description="Name of the project the analysis belongs to",
+        title="Project name",
+    )
+    analysis_name: str = Field(
+        ..., regex=DataRegex.NO_SPECIAL_CHARS.value, description="Name of the analysis performed", title="Analysis name"
+    )
+
+    @property
+    def label(self):
+        """Return the label for this analysis, formatted as '<project_name>_<analysis_name>'"""
+
+        return f"{self.project_name}_{self.analysis_name}"
+
+    @classmethod
+    def parse_name(cls, name):
+        """Decompose a raw analysis name into a dict of its parts; raises ValueError if the pattern does not match"""
+
+        m = re.match(f"{DataRegex.ANALYZED.value}", name)
+
+        if m is None:
+            raise ValueError(f"name({name}) does not match pattern")
+
+        creation_time = datetime_from_name_string(m.group("c_date"), m.group("c_time"))
+
+        return dict(
+            project_abbreviation=m.group("project_abbreviation"),  # NOTE(review): model field is project_name
+            analysis_name=m.group("analysis_name"),
+            creation_time=creation_time,
+        )
diff --git a/tests/resources/ephys_data_description/data_description_0.6.2.json b/tests/resources/ephys_data_description/data_description_0.6.2.json
@@ -33,7 +33,7 @@
     "ror_id": "04szwah67",
     "data_level": "raw",
     "group": "ephys",
-    "project_name": "MRI-Guided Elecrophysiology",
+    "project_name": "mri-guided-electrophysiology",
     "experiment_type": "ecephys",
     "subject_id": "661279",
     "data_summary": "This dataset was collected to evaluate the accuracy and feasibility of the AIND MRI-guided insertion pipeline. One probe targets the retinotopic center of LGN, with drifting grating for receptive field mapping to evaluate targeting. Other targets can be evaluated in histology."
diff --git a/tests/resources/ephys_data_description/data_description_0.6.2_wrong_field.json b/tests/resources/ephys_data_description/data_description_0.6.2_wrong_field.json
@@ -33,7 +33,7 @@
     "ror_id": "04szwah67",
     "data_level": "raw",
     "group": "ephys",
-    "project_name": "MRI-Guided Elecrophysiology",
+    "project_name": "mri-guided-electrophysiology",
     "experiment_type": "ecephys",
     "subject_id": "661279",
     "data_summary": "This dataset was collected to evaluate the accuracy and feasibility of the AIND MRI-guided insertion pipeline. One probe targets the retinotopic center of LGN, with drifting grating for receptive field mapping to evaluate targeting. Other targets can be evaluated in histology."
diff --git a/tests/test_data_description.py b/tests/test_data_description.py
@@ -8,6 +8,7 @@
 from typing import List
 
 from aind_data_schema.data_description import (
+    AnalysisDescription,
     DataDescription,
     DerivedDataDescription,
     Funding,
@@ -38,6 +39,7 @@ def setUpClass(cls):
     BAD_NAME = "fizzbuzz"
     BASIC_NAME = "ecephys_1234_3033-12-21_04-22-11"
     DERIVED_NAME = "ecephys_1234_3033-12-21_04-22-11_spikesorted-ks25_2022-10-12_23-23-11"
+    ANALYSIS_NAME = "project_analysis_3033-12-21_04-22-11"
 
     def test_constructors(self):
         """test building from component parts"""
@@ -120,6 +122,59 @@ def test_constructors(self):
                 investigators=["Jane Smith"],
             )
 
+        ad = AnalysisDescription(
+            analysis_name="analysis",
+            project_name="project",
+            creation_time=dt,
+            subject_id="1234",
+            modality=[Modality.SPIM],
+            platform="exaspim",
+            institution=Institution.AIND,
+            funding_source=[f],
+            investigators=["Jane Smith"],
+        )
+
+        self.assertEqual(ad.label, "project_analysis")
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="ana lysis",
+                project_name="pro_ject",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="",
+                project_name="project",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
+        with self.assertRaises(ValueError):
+            AnalysisDescription(
+                analysis_name="analysis",
+                project_name="",
+                subject_id="1234",
+                modality=[Modality.SPIM],
+                platform="exaspim",
+                creation_time=dt,
+                institution=Institution.AIND,
+                funding_source=[f],
+                investigators=["Jane Smith"],
+            )
+
     def test_round_trip(self):
         """make sure we can round trip from json"""
 
@@ -167,6 +222,14 @@ def test_parse_name(self):
         with self.assertRaises(ValueError):
             toks = DerivedDataDescription.parse_name(self.BAD_NAME)
 
+        toks = AnalysisDescription.parse_name(self.ANALYSIS_NAME)
+        assert toks["project_abbreviation"] == "project"
+        assert toks["analysis_name"] == "analysis"
+        assert toks["creation_time"] == datetime.datetime(3033, 12, 21, 4, 22, 11)
+
+        with self.assertRaises(ValueError):
+            toks = AnalysisDescription.parse_name(self.BAD_NAME)
+
     def test_abbreviation_enums(self):
         """Tests that BaseName enums can be constructed from abbreviations"""
         # Tests that Modality constructed as expected
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -21,9 +21,6 @@ def test_constructors(self):
             processing_pipeline=PipelineProcess(processor_full_name="Processor", data_processes=[]),
         )
 
-        with self.assertRaises(pydantic.ValidationError):
-            DataProcess(name="Other")
-
         with self.assertRaises(pydantic.ValidationError):
             DataProcess(name="Other", notes="")
 
diff --git a/tests/test_schema_upgrade.py b/tests/test_schema_upgrade.py
@@ -205,7 +205,7 @@ def test_upgrades_0_6_2(self):
         self.assertEqual(DataLevel.RAW, new_data_description.data_level)
         self.assertEqual(Group.EPHYS, new_data_description.group)
         self.assertEqual(["John Doe", "Mary Smith"], new_data_description.investigators)
-        self.assertEqual("MRI-Guided Elecrophysiology", new_data_description.project_name)
+        self.assertEqual("mri-guided-electrophysiology", new_data_description.project_name)
         self.assertIsNone(new_data_description.restrictions)
         self.assertEqual([Modality.ECEPHYS], new_data_description.modality)
         self.assertEqual("661279", new_data_description.subject_id)
@@ -265,7 +265,7 @@ def test_upgrades_0_6_2_wrong_field(self):
         self.assertEqual(DataLevel.RAW, new_data_description.data_level)
         self.assertEqual(Group.EPHYS, new_data_description.group)
         self.assertEqual(["John Doe", "Mary Smith"], new_data_description.investigators)
-        self.assertEqual("MRI-Guided Elecrophysiology", new_data_description.project_name)
+        self.assertEqual("mri-guided-electrophysiology", new_data_description.project_name)
         self.assertIsNone(new_data_description.restrictions)
         self.assertEqual([Modality.ECEPHYS], new_data_description.modality)
         self.assertEqual("661279", new_data_description.subject_id)

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py",`
`3`		`- "schema_version": "0.10.1",`
	`3`	`+ "schema_version": "0.10.2",`
`4`	`4`	`"license": "CC-BY-4.0",`
`5`	`5`	`"creation_time": "2022-02-21T16:30:01",`
`6`	`6`	`"name": "ecephys_12345_2022-02-21_16-30-01",`
Original file line number	Diff line number	Diff line change
`@@ -21,9 +21,6 @@ def test_constructors(self):`
`21`	`21`	`processing_pipeline=PipelineProcess(processor_full_name="Processor", data_processes=[]),`
`22`	`22`	`)`
`23`	`23`
`24`		`- with self.assertRaises(pydantic.ValidationError):`
`25`		`- DataProcess(name="Other")`
`26`		`-`
`27`	`24`	`with self.assertRaises(pydantic.ValidationError):`
`28`	`25`	`DataProcess(name="Other", notes="")`
`29`	`26`