Merged
25 commits:
- 0bdcdbd scaffold new namespace (esoteric-ephemera, Dec 2, 2025)
- 0ea6e25 port tests over from emmet (esoteric-ephemera, Dec 3, 2025)
- 10e6115 Update readme + add example schema (esoteric-ephemera, Dec 4, 2025)
- 51fd258 Update readme + add example schema (esoteric-ephemera, Dec 4, 2025)
- c9a000c add example + barebones schema autogenerator (esoteric-ephemera, Dec 4, 2025)
- 9ec8a5d run lux tests (esoteric-ephemera, Dec 8, 2025)
- fc9bae8 run precommit in ci for lux (esoteric-ephemera, Dec 8, 2025)
- ee19e89 review comments (esoteric-ephemera, Dec 8, 2025)
- 10773be fix testing install str (esoteric-ephemera, Dec 8, 2025)
- 105cb8e gen req file for lux (esoteric-ephemera, Dec 8, 2025)
- 026d278 undo auto dep upgrade (esoteric-ephemera, Dec 8, 2025)
- 02fd42c add auto arrow compatibility tests (tsmathis, Dec 8, 2025)
- f48d51e try to debug test (esoteric-ephemera, Dec 8, 2025)
- 3409215 add notebook examples, configure precommit correctly (esoteric-ephemera, Dec 8, 2025)
- 915b0c7 remove good ole ds store (esoteric-ephemera, Dec 8, 2025)
- 69640c3 test py ver (esoteric-ephemera, Dec 9, 2025)
- 8cb3175 f str correction (esoteric-ephemera, Dec 9, 2025)
- d03e8f5 regzip (esoteric-ephemera, Dec 9, 2025)
- 7e0c0d3 try different json? (esoteric-ephemera, Dec 9, 2025)
- 8224ad4 superstitious (esoteric-ephemera, Dec 9, 2025)
- ef5d86f bump action versions (esoteric-ephemera, Dec 9, 2025)
- fd4f532 lfs? (esoteric-ephemera, Dec 9, 2025)
- ebc4655 add schema writing to str + test (esoteric-ephemera, Dec 9, 2025)
- 04f8d11 precommit (esoteric-ephemera, Dec 9, 2025)
- 23390bc update readme (esoteric-ephemera, Dec 9, 2025)
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -35,3 +35,12 @@ repos:
rev: 25.9.0
hooks:
- id: black

- repo: https://github.com/kynan/nbstripout
rev: 0.8.1
hooks:
- id: nbstripout
args:
- --drop-empty-cells
- --strip-init-cells
- --extra-keys=metadata.kernelspec
1 change: 1 addition & 0 deletions mpcontribs-lux/.python-version
@@ -0,0 +1 @@
3.12
29 changes: 29 additions & 0 deletions mpcontribs-lux/README.md
@@ -0,0 +1,29 @@
## <span style="color:rgb(18, 180, 163)">MPContribs</span> <span style="color:goldenrod">LUX</span>

<span style="color:goldenrod"><i><b>Ego sum lux datorum</b></i></span> ("I am the light of data").

MPContribs-lux is a package which <i>sheds light</i> on data stored in the [Materials Project's AWS S3 OpenData bucket](https://materialsproject-contribs.s3.amazonaws.com/index.html#) by providing annotated schemas and, optionally, analysis tools to better explore user-submitted data.

Adding a schema to this database is a <span style="color:red"><b>prerequisite</b></span> for obtaining permission/IAM credentials to upload data to MP's OpenData bucket.
Once an MP staff member reviews and approves your data schema, your IAM role will be granted or updated (as appropriate).

<span style="color:red"><b>What if I don't want my schemas / data made public yet?</b></span>

To expedite the process of review, follow [these instructions](https://docs.github.com/en/repositories/creating-and-managing-repositories/duplicating-a-repository) to make a private copy (not a fork, which cannot be private) of the `MPContribs` repo.
Supposing you name your new repository `PrivateMPContribs` and your username is `<username>`, you would run these commands from a terminal:
```console
git clone --bare https://github.com/materialsproject/MPContribs.git
cd MPContribs.git
git push --mirror https://github.com/<username>/PrivateMPContribs.git
cd ..
rm -rf MPContribs.git
```

Then add your schemas to the private repo `PrivateMPContribs` and invite the maintainers of `MPContribs` to view it (you don't need to give us edit access).
We will then review your schemas.
When you're ready to make your data public, you will also have to make a public PR with your new schemas.

<span style="color:red"><b>But my CSV/JSON/YAML/etc. file isn't complicated. Why do I need to upload a schema?</b></span>

Schemas are important for ensuring accessibility, interoperability, and reproducibility, and for making you fully aware of possible errors in your dataset.
If you are not comfortable mimicking the example `pydantic` schemas in `mpcontribs.lux.projects.examples`, you can use the barebones schema autogenerator in `mpcontribs.lux.autogen` as a starting point.
126 changes: 126 additions & 0 deletions mpcontribs-lux/mpcontribs/lux/autogen.py
@@ -0,0 +1,126 @@
"""Automatically generate schemas from existing data using pandas."""

from enum import StrEnum
from typing import Any, Type, Annotated

from emmet.core.types.typing import NullableDateTimeType, DateTimeType
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field, model_validator, create_model, BeforeValidator

_complex_type_validator = BeforeValidator(lambda x : (x.real,x.imag) if isinstance(x,complex) else x)

ComplexType = Annotated[
tuple[float,float],
_complex_type_validator
]

NullableComplexType = Annotated[
tuple[float,float] | None,
_complex_type_validator
]
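The `BeforeValidator` above coerces Python `complex` values into `(real, imag)` tuples before pydantic validates them against `tuple[float, float]`. The coercion itself is plain Python and can be sketched without pydantic (the function name below is illustrative, not part of the package):

```python
def coerce_complex(x):
    """Stand-in for the BeforeValidator's lambda: turn a complex number
    into a (real, imag) tuple; pass every other value through unchanged."""
    return (x.real, x.imag) if isinstance(x, complex) else x
```

Storing complex values as float pairs keeps them round-trippable through formats like parquet and JSON, which have no native complex type.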

class FileFormat(StrEnum):
"""Define known file formats for autogeneration of schemas."""

CSV = "csv"
JSON = "json"
JSONL = "jsonl"

class SchemaGenerator(BaseModel):
"""Automatically infer a dataset schema and create a pydantic model from it."""

file_name : str | Path = Field(
description="The path to the dataset."
)

fmt : FileFormat | None = Field(
None, description = "The dataset file format. If no format is provided, it will be inferred."
)

@model_validator(mode="before")
def check_format(cls, config : dict[str,Any]) -> dict[str,Any]:

if isinstance(fp := config["file_name"],str):
config["file_name"] = Path(fp).resolve()

if config.get("fmt"):
if isinstance(config["fmt"],str):
if config["fmt"] in FileFormat.__members__:
config["fmt"] = FileFormat[config["fmt"]]
else:
try:
config["fmt"] = FileFormat(config["fmt"])
except ValueError:
raise ValueError(
f"Could not interpret submitted file format {config['fmt']}"
)
        else:
            # Key on the file suffix rather than substring matching, so that
            # e.g. ".jsonl" files are not mis-identified as JSON
            # ("json" is a substring of "jsonl").
            suffix = config["file_name"].suffix.lstrip(".").lower()
            try:
                config["fmt"] = FileFormat(suffix)
            except ValueError:
                raise ValueError(
                    f"Could not infer file format for {config['file_name']}"
                )
return config
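For illustration, the format-inference step can be sketched standalone with only the stdlib (`infer_format` is a hypothetical helper, not part of the package; `str, Enum` is used in place of 3.11+ `StrEnum` for portability). Keying on the full file suffix avoids mistaking `.jsonl` files for JSON, since `"json"` is a substring of `"jsonl"`:

```python
from enum import Enum
from pathlib import Path

class FileFormat(str, Enum):
    CSV = "csv"
    JSON = "json"
    JSONL = "jsonl"

def infer_format(file_name: str) -> FileFormat:
    # Use the full suffix, lowercased, so "data.JSONL" -> FileFormat.JSONL
    suffix = Path(file_name).suffix.lstrip(".").lower()
    try:
        return FileFormat(suffix)
    except ValueError:
        raise ValueError(f"Could not infer file format for {file_name}")
```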

@staticmethod
def _cast_dtype(dtype, assume_nullable : bool = True):
"""Cast input dtype to parquet-friendly dtypes.

Accounts for difficulties de-serializing datetimes
and complex numbers.

Assumes all fields are nullable by default.
"""
vname = getattr(dtype,"name",str(dtype)).lower()

if any(spec_type in vname for spec_type in ("datetime","complex")):
if "datetime" in vname:
return NullableDateTimeType if assume_nullable else DateTimeType
elif "complex" in vname:
return NullableComplexType if assume_nullable else ComplexType

inferred_type = str
if "float" in vname:
inferred_type = float
elif "int" in vname:
inferred_type = int

return inferred_type | None if assume_nullable else inferred_type
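`_cast_dtype` keys off substrings of the pandas dtype name. A self-contained stand-in (using stdlib `datetime` and `Optional` in place of the emmet nullable types; the function name is hypothetical) looks like:

```python
from datetime import datetime
from typing import Optional

def cast_dtype_name(vname: str, assume_nullable: bool = True):
    """Map a pandas dtype name (e.g. "float64", "datetime64[ns]") to a
    plain Python type; fall back to str, as _cast_dtype does."""
    vname = vname.lower()
    if "datetime" in vname:
        base = datetime
    elif "complex" in vname:
        base = tuple  # serialized as a (real, imag) pair
    elif "float" in vname:
        base = float
    elif "int" in vname:
        base = int
    else:
        base = str
    return Optional[base] if assume_nullable else base
```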

@property
def pydantic_schema(self) -> Type[BaseModel]:
"""Create the pydantic schema of the data structure."""

if self.fmt == "csv":
data = pd.read_csv(self.file_name)

elif self.fmt in {"json","jsonl"}:
# we exclude the "table" case for `orient` since the user
# presumably already knows what the schema is.
for orient in ("columns","index","records","split","values"):
try:
data = pd.read_json(self.file_name, orient=orient, lines = self.fmt == "jsonl")
break
                except Exception:
                    continue
else:
raise ValueError(
f"Could not load {self.fmt.value} data, please check manually."
)

model_fields = {
col_name : (
self._cast_dtype(data.dtypes[col_name]),
Field(default=None,)
)
for col_name in data.columns
}

        # Model name: the file name stripped of its extension(s). Avoid an
        # f-string here: nesting the same quote character inside an f-string
        # is a syntax error before Python 3.12, and no formatting is needed.
        return create_model(
            self.file_name.name.split(".", 1)[0],
            **model_fields,
        )
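`create_model` builds a pydantic class at runtime from the inferred column types. Stripped of pydantic, the underlying idea is dynamic class creation with `type()` (all names below are illustrative):

```python
def make_record_type(name: str, fields: dict) -> type:
    """Build a class whose annotations are the inferred column types and
    whose attributes all default to None, mirroring Field(default=None)."""
    namespace = {"__annotations__": dict(fields)}
    namespace.update({field: None for field in fields})
    return type(name, (), namespace)

# e.g. a dataset with a float "energy" column and an int "n_atoms" column:
Rec = make_record_type("example_dataset", {"energy": float, "n_atoms": int})
```

pydantic layers validation and serialization on top of this, but the generated schema is ultimately just such a class with typed, nullable fields.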
@@ -0,0 +1,24 @@
"""Define schemas for the MP-ALOE 2025 dataset."""
from pydantic import Field

from mpcontribs.lux.projects.esoteric_ephemera.schemas.MatPES_2025_1 import MatPESTrainDoc

class MPAloeTrainDoc(MatPESTrainDoc):
"""Schematize MP-ALOE data."""

mp_aloe_id: str | None = Field(
None, description="The identifier of this entry in MP-ALOE."
)
ionic_step_number: int | None = Field(
None, description="The ionic step index of this frame."
)
prototype_number: int | None = Field(
None, description="The index of the prototype structure used in generation."
)
is_charge_balanced: bool | None = Field(
None, description="Whether the structure is likely charge balanced."
)
has_overlapping_pseudo_cores: bool | None = Field(
None,
description="Whether the pseudopotential cores overlap for at least one set of nearest neighbors.",
)
@@ -0,0 +1,47 @@
"""Define schemas for the MPtrj v2022.9 dataset."""

from pydantic import BaseModel, Field

from emmet.core.types.typing import IdentifierType

from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc

class MPtrjProvenance(BaseModel):
"""Metadata for MPtrj entries."""

material_id: IdentifierType | None = Field(
None, description="The Materials Project (summary) ID for this material."
)
    task_id: IdentifierType | None = Field(
        None, description="The Materials Project task ID of the calculation from which this frame was taken."
    )
calcs_reversed_index: int | None = Field(
None, description="The index of the reversed calculations, if applicable."
)
ionic_step_index: int | None = Field(
None, description="The index of the ionic step, if applicable."
)


class MPtrjTrainDoc(MLTrainDoc):
"""Schematize MPtrj data."""

energy: float | None = Field(
None, description="The total uncorrected energy associated with this structure."
)

cohesive_energy_per_atom: float | None = Field(
None, description="The uncorrected cohesive energy per atom of this material."
)

corrected_cohesive_energy_per_atom: float | None = Field(
None,
description=(
"The corrected cohesive energy per atom of this material, "
"using the Materials Project GGA / GGA+U mixing scheme."
),
)

provenance: MPtrjProvenance | None = Field(
None, description="Metadata for this frame."
)
@@ -0,0 +1,68 @@
"""Define schemas for the MatPES 2025.1 dataset."""

from pydantic import BaseModel, Field

from emmet.core.types.typing import IdentifierType

from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc

class MatPESProvenanceDoc(BaseModel):
"""Information regarding the origins of a MatPES structure."""

    original_mp_id: IdentifierType | None = Field(
        None,
        description="The MP identifier of the Materials Project structure from which this entry was sourced.",
    )
    materials_project_version: str | None = Field(
        None,
        description="The version of the Materials Project from which the structure was sourced.",
    )
md_ensemble: str | None = Field(
None,
description="The molecular dynamics ensemble used to generate this structure.",
)
md_temperature: float | None = Field(
None,
description="If a float, the temperature in Kelvin at which MLMD was performed.",
)
md_pressure: float | None = Field(
None,
description="If a float, the pressure in atmosphere at which MLMD was performed.",
)
md_step: int | None = Field(
None,
description="The step in the MD simulation from which the structure was sampled.",
)
mlip_name: str | None = Field(
None, description="The name of the ML potential used to perform MLMD."
)


class MatPESTrainDoc(MLTrainDoc):
"""
Schema for VASP data in the Materials Potential Energy Surface (MatPES) effort.

    This schema is used in the data entries for MatPES v2025.1,
    which can be downloaded either:
    - from [MPContribs](https://materialsproject-contribs.s3.amazonaws.com/index.html#MatPES_2025_1/)
    - or from [the MatPES website](https://matpes.ai)
"""

matpes_id: str | None = Field(None, description="MatPES identifier.")

formation_energy_per_atom: float | None = Field(
None,
description="The uncorrected formation enthalpy per atom at zero pressure and temperature.",
)
cohesive_energy_per_atom: float | None = Field(
None, description="The uncorrected cohesive energy per atom."
)

provenance: MatPESProvenanceDoc | None = Field(
None, description="Information about the provenance of the structure."
)

@property
def pressure(self) -> float | None:
"""Return the pressure from the DFT stress tensor."""
return sum(self.stress[:3]) / 3.0 if self.stress else None
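The `pressure` property averages the first three components of the stress tensor, which in Voigt order are the normal components xx, yy, zz (the Voigt ordering of the `stress` field, defined on `MLTrainDoc` elsewhere, is an assumption here). As a standalone sketch:

```python
def pressure_from_stress(stress):
    """Mean of the three normal stress components (assumed Voigt order:
    xx, yy, zz, yz, xz, xy), or None when no stress is available."""
    if not stress:
        return None
    return sum(stress[:3]) / 3.0
```

Note that, like the property above, this returns `None` for both a missing and an empty stress vector.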
@@ -0,0 +1,6 @@
from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MatPES_2025_1 import MatPESTrainDoc
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MP_ALOE_2025 import MPAloeTrainDoc
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MPtrj_2022_9 import MPtrjTrainDoc

__all__ = ["MLTrainDoc","MatPESTrainDoc","MPAloeTrainDoc","MPtrjTrainDoc"]