Skip to content

Commit d056bd6

Browse files
authored
Merge pull request #1980 from SilenNaihin/master
ALab Pydantic Schema and pipeline for provenance
2 parents e5f73ab + 67eda5a commit d056bd6

File tree

11 files changed

+748
-0
lines changed

11 files changed

+748
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# A-Lab Pipelines
2+
3+
Pipeline code for the A-Lab project is maintained in a separate repository:
4+
5+
**Repository:** [alab-pipeline](https://github.com/SilenNaihin/alab-pipeline.git)
6+
7+
## Getting Started
8+
9+
To clone the pipeline repository:
10+
11+
```bash
12+
git clone https://github.com/SilenNaihin/alab-pipeline.git
13+
```
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
A-Lab Pydantic Schemas
3+
4+
This package contains Pydantic schemas for all A-Lab parquet tables.
5+
These schemas are the source of truth for data validation.
6+
7+
Each schema corresponds to one parquet file.
8+
Integrates the team's validation patterns (constraints, Literal types) from results_schema.py.
9+
"""
10+
11+
from .base import ExcludeFromUpload
12+
from .experiments import Experiment
13+
from .experiment_elements import ExperimentElement
14+
from .powder_doses import PowderDose, PowderItem
15+
from .temperature_logs import TemperatureLog, TemperatureLogEntry
16+
from .workflow_tasks import WorkflowTask
17+
from .xrd_data_points import XRDDataPoint
18+
from .xrd_refinements import XRDRefinement
19+
from .xrd_phases import XRDPhase
20+
21+
__all__ = [
22+
# Base
23+
"ExcludeFromUpload",
24+
# Parquet table schemas (one per .parquet file)
25+
"Experiment",
26+
"ExperimentElement",
27+
"PowderDose",
28+
"PowderItem",
29+
"TemperatureLogEntry",
30+
"TemperatureLog",
31+
"WorkflowTask",
32+
"XRDDataPoint",
33+
"XRDRefinement",
34+
"XRDPhase",
35+
]
36+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Base utilities for A-Lab Pydantic schemas.
3+
4+
Provides common types, validators, and field utilities.
5+
"""
6+
7+
from typing import Any, Dict
8+
from pydantic import Field
9+
10+
11+
def ExcludeFromUpload(
    default: Any = None,
    description: str = "",
    **kwargs: Any,
) -> Any:
    """
    Field that should NOT be uploaded to MPContribs.

    Use this for sensitive data that must remain private until publication.
    Examples: weight_collected, mass measurements that are embargoed.

    Args:
        default: Default value forwarded to ``Field``.
        description: Field description forwarded to ``Field``.
        **kwargs: Any other keyword arguments forwarded to ``Field``.
            A dict passed as ``json_schema_extra`` is merged with the
            upload marker instead of colliding with it.

    Returns:
        Whatever ``Field`` returns, created with
        ``json_schema_extra["exclude_from_upload"]`` set to ``True``.

    Raises:
        TypeError: If ``json_schema_extra`` is supplied but is not a dict.
            The marker is stored as a dict key, so it cannot be attached
            to any other kind of value.

    Usage:
        weight_collected: float | None = ExcludeFromUpload(
            description="Weight of powder collected (embargoed)"
        )
    """
    # Pull out any caller-supplied extras first: forwarding them through
    # **kwargs next to our own json_schema_extra= would raise a
    # "multiple values for keyword argument" TypeError.
    caller_extra = kwargs.pop("json_schema_extra", None)
    if caller_extra is None:
        caller_extra = {}
    elif not isinstance(caller_extra, dict):
        raise TypeError(
            "ExcludeFromUpload requires json_schema_extra to be a dict, "
            f"got {type(caller_extra).__name__}"
        )

    # The marker is written last so a caller-provided key cannot switch
    # the exclusion off by accident.
    schema_extra = {**caller_extra, "exclude_from_upload": True}

    return Field(
        default=default,
        description=description,
        json_schema_extra=schema_extra,
        **kwargs,
    )
33+
34+
35+
def get_uploadable_fields(model_class) -> list[str]:
    """
    Get list of fields that should be uploaded to MPContribs.

    Args:
        model_class: Pydantic model class (anything exposing a
            ``model_fields`` mapping of field name -> field info).

    Returns:
        List of field names that are NOT marked with exclude_from_upload,
        in the order ``model_fields`` yields them.
    """
    uploadable = []
    for field_name, field_info in model_class.model_fields.items():
        extra = field_info.json_schema_extra
        # Only a dict can carry the marker. json_schema_extra may also be
        # None or a callable, and calling .get() on a callable would raise
        # AttributeError, so anything that is not a dict counts as unmarked.
        if isinstance(extra, dict) and extra.get("exclude_from_upload", False):
            continue
        uploadable.append(field_name)
    return uploadable
51+
52+
53+
def get_excluded_fields(model_class) -> list[str]:
    """
    Get list of fields that should NOT be uploaded to MPContribs.

    Args:
        model_class: Pydantic model class (anything exposing a
            ``model_fields`` mapping of field name -> field info).

    Returns:
        List of field names that ARE marked with exclude_from_upload,
        in the order ``model_fields`` yields them.
    """
    excluded = []
    for field_name, field_info in model_class.model_fields.items():
        extra = field_info.json_schema_extra
        # Only a dict can carry the marker. json_schema_extra may also be
        # None or a callable, and calling .get() on a callable would raise
        # AttributeError, so anything that is not a dict counts as unmarked.
        if isinstance(extra, dict) and extra.get("exclude_from_upload", False):
            excluded.append(field_name)
    return excluded
69+
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Experiment Elements Schema
3+
4+
Elements present in each experiment (1:N relationship).
5+
Maps to: experiment_elements.parquet
6+
"""
7+
8+
from pydantic import BaseModel, Field
9+
10+
11+
class ExperimentElement(BaseModel, extra="forbid"):
    """
    Element present in an experiment.

    Each experiment can have multiple elements (1:N relationship).
    """

    # Required: identifier of the experiment row this element belongs to.
    experiment_id: str = Field(description="Reference to parent experiment")

    # Required: symbol of the element.
    element_symbol: str = Field(description="Element symbol (e.g., Na, Mg, O)")

    # Optional: left as None when no target percentage is supplied.
    target_atomic_percent: float | None = Field(
        default=None, description="Target atomic percentage of this element"
    )
30+
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
"""
2+
Experiment Schema (Consolidated)
3+
4+
This is the main experiment table with ALL 1:1 data merged.
5+
Contains ~45 columns from: experiments + heating + recovery + xrd + finalization + dosing.
6+
7+
Maps to: experiments.parquet
8+
"""
9+
10+
from datetime import datetime
11+
from typing import Literal
12+
from pydantic import BaseModel, Field
13+
14+
# Import ExcludeFromUpload utility
15+
try:
16+
from .base import ExcludeFromUpload
17+
except ImportError:
18+
# When imported dynamically, use absolute import
19+
import sys
20+
from pathlib import Path
21+
sys.path.insert(0, str(Path(__file__).parent))
22+
from base import ExcludeFromUpload
23+
24+
25+
class Experiment(BaseModel, extra="forbid"):
    """
    Main experiment schema with all 1:1 data consolidated.

    This is the primary table - one row per experiment.
    All related 1:1 data (heating, recovery, xrd, finalization, dosing) is merged here.
    """

    # ------------------------------------------------------------------
    # Core experiment columns. Everything without a default is required.
    # ------------------------------------------------------------------
    experiment_id: str = Field(description="Unique experiment identifier (MongoDB _id)")
    name: str = Field(description="Experiment name (e.g., NSC_249, MINES_12)")
    experiment_type: str = Field(
        description="Root experiment type (NSC, Na, PG, MINES, TRI)"
    )
    experiment_subgroup: str | None = Field(
        default=None, description="Experiment subgroup (e.g., NSC_249, Na_123)"
    )
    target_formula: str = Field(description="Target chemical formula")
    last_updated: datetime = Field(description="Last modification timestamp")
    # Closed set of workflow states; any other value fails validation.
    status: Literal["completed", "error", "active", "unknown"] = Field(
        description="Workflow status"
    )
    notes: str | None = Field(
        default=None, description="Optional notes about the experiment"
    )

    # ------------------------------------------------------------------
    # Heating columns (all optional, names start with "heating_").
    # ------------------------------------------------------------------
    heating_method: Literal["standard", "atmosphere", "manual", "none"] | None = Field(
        default=None, description="Heating method used"
    )
    heating_temperature: float | None = Field(
        default=None, description="Target heating temperature in °C"
    )
    heating_time: float | None = Field(
        default=None, description="Heating duration in minutes"
    )
    heating_cooling_rate: float | None = Field(
        default=None, description="Cooling rate in °C/min"
    )
    heating_atmosphere: str | None = Field(
        default=None, description="Atmosphere used during heating (e.g., N2, Ar, Air)"
    )
    heating_flow_rate_ml_min: float | None = Field(
        default=None, description="Gas flow rate during heating in mL/min"
    )
    heating_low_temp_calcination: bool | None = Field(
        default=None, description="Whether low temperature calcination was used"
    )

    # ------------------------------------------------------------------
    # Recovery columns (all optional, names start with "recovery_").
    # ------------------------------------------------------------------
    recovery_total_dosed_mass_mg: float | None = Field(
        default=None, description="Total mass of all powders dosed in mg"
    )
    # Declared through ExcludeFromUpload, so this column is flagged as
    # not-for-upload (team request).
    recovery_weight_collected_mg: float | None = ExcludeFromUpload(
        description="Weight of powder collected after heating in mg (EMBARGOED)"
    )
    recovery_yield_percent: float | None = Field(
        default=None, description="Recovery yield (collected / dosed * 100)"
    )
    recovery_initial_crucible_weight_mg: float | None = Field(
        default=None, description="Initial crucible weight before experiment in mg"
    )
    recovery_failure_classification: str | None = Field(
        default=None, description="Classification of any failure during recovery"
    )

    # ------------------------------------------------------------------
    # XRD measurement columns (all optional, names start with "xrd_").
    # ------------------------------------------------------------------
    xrd_sampleid_in_aeris: str | None = Field(
        default=None, description="Sample ID in Aeris XRD system"
    )
    xrd_holder_index: int | None = Field(
        default=None, description="XRD sample holder position index"
    )
    # Declared through ExcludeFromUpload, so this column is flagged as
    # not-for-upload (team request).
    xrd_total_mass_dispensed_mg: float | None = ExcludeFromUpload(
        description="Mass dispensed for XRD measurement in mg (EMBARGOED)"
    )
    xrd_met_target_mass: bool | None = Field(
        default=None, description="Whether target mass was achieved for XRD"
    )

    # ------------------------------------------------------------------
    # Finalization columns (all optional, names start with "finalization_").
    # ------------------------------------------------------------------
    finalization_decoded_sample_id: str | None = Field(
        default=None, description="Decoded sample ID from barcode"
    )
    finalization_successful_labeling: bool | None = Field(
        default=None, description="Whether sample was successfully labeled"
    )
    finalization_storage_location: str | None = Field(
        default=None, description="Final storage location of sample"
    )

    # ------------------------------------------------------------------
    # Dosing columns (all optional, names start with "dosing_").
    # Numeric bounds below are enforced at validation time.
    # ------------------------------------------------------------------
    dosing_crucible_position: int | None = Field(
        default=None, description="Crucible position in rack", ge=1, le=4
    )
    dosing_crucible_sub_rack: (
        Literal["SubRackA", "SubRackB", "SubRackC", "SubRackD"] | None
    ) = Field(default=None, description="Sub-rack identifier")
    dosing_mixing_pot_position: int | None = Field(
        default=None, description="Mixing pot position", ge=1, le=16
    )
    dosing_ethanol_dispense_volume: int | None = Field(
        default=None, description="Volume of ethanol dispensed in µL", ge=0
    )
    dosing_target_transfer_volume: int | None = Field(
        default=None, description="Target transfer volume in µL", ge=0
    )
    dosing_actual_transfer_mass: float | None = Field(
        default=None, description="Actual mass transferred in g", ge=0
    )
    dosing_dac_duration: int | None = Field(
        default=None, description="DAC duration in seconds", ge=0
    )
    dosing_dac_speed: int | None = Field(
        default=None, description="DAC rotation speed in rpm", ge=0
    )
    dosing_actual_heat_duration: int | None = Field(
        default=None,
        description="Actual heating duration during dosing in seconds",
        ge=0,
    )
    dosing_end_reason: str | None = Field(
        default=None, description="Reason for ending dosing session"
    )
227+

0 commit comments

Comments
 (0)