Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions emmet-builders/emmet/builders/materials/provenance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""Build provenance collection."""

from functools import partial

from emmet.builders.base import BaseBuilderInput
from emmet.builders.settings import EmmetBuildSettings
from emmet.core.connectors.analysis import parse_cif
from emmet.core.connectors.icsd.client import IcsdClient
from emmet.core.connectors.icsd.enums import IcsdSubset
from emmet.core.provenance import DatabaseSNL, ProvenanceDoc

from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator

# Module-level build settings shared by all matching in this builder.
SETTINGS = EmmetBuildSettings()
# Single shared matcher so every SNL comparison uses identical tolerances.
# Element-only comparison (no oxidation states); structures are reduced to
# primitive cells and rescaled before matching; no supercell/subset matching.
structure_matcher = StructureMatcher(
    ltol=SETTINGS.LTOL,
    stol=SETTINGS.STOL,
    comparator=ElementComparator(),
    angle_tol=SETTINGS.ANGLE_TOL,
    primitive_cell=True,
    scale=True,
    attempt_supercell=False,
    allow_subset=False,
)


def _get_snl_from_cif(cif_str: str, **kwargs) -> DatabaseSNL | None:
    """Build a database SNL from a CIF plus its metadata.

    NB: Only takes the first structure from a CIF.
    While a CIF can technically contain many structures,
    the ICSD usually only distributes CIFs with one structure
    per file.

    Any failure while parsing or constructing the SNL, and any SNL that
    ends up carrying remarks (i.e. parsing warnings), yields None so the
    caller can filter it out.

    Parameters
    -----------
    cif_str : the CIF to parse
    **kwargs to pass to `DatabaseSNL`
    """
    try:
        parsed_structures, parser_remarks = parse_cif(cif_str)
        # Caller-supplied remarks take precedence over parser warnings.
        combined_remarks = kwargs.pop("remarks", None) or parser_remarks or None
        candidate = DatabaseSNL.from_structure(
            meta_structure=parsed_structures[0],
            structure=parsed_structures[0],
            remarks=combined_remarks,
            **kwargs,
        )
    except Exception:
        return None

    # Keep only SNLs that parsed cleanly (no remarks attached).
    if not candidate or candidate.remarks is not None:
        return None
    return candidate


def update_experimental_icsd_structures(**client_kwargs) -> list[DatabaseSNL]:
    """Update the collection of ICSD SNLs.

    Queries both the metal-organic and inorganic experimental ICSD
    subsets, parses each returned CIF into a DatabaseSNL, and drops any
    entry that fails to parse cleanly.

    Parameters
    -----------
    **client_kwargs to pass to `IcsdClient`

    Returns
    -----------
    List of DatabaseSNL, sorted by numeric ICSD collection code
    """
    data = []
    with IcsdClient(use_document_model=False, **client_kwargs) as client:
        for icsd_subset in (
            IcsdSubset.EXPERIMENTAL_METALORGANIC,
            IcsdSubset.EXPERIMENTAL_INORGANIC,
        ):
            # BUG FIX: the original always searched EXPERIMENTAL_INORGANIC,
            # fetching it twice and never fetching the metal-organic subset.
            data += client.search(
                subset=icsd_subset,
                space_group_number=(1, 230),
                include_cif=True,
                include_metadata=False,
            )

    parsed = [
        _get_snl_from_cif(
            doc["cif"],
            snl_id=f"icsd-{doc['collection_code']}",
            tags=doc["subset"].value,
            source="icsd",
        )
        for doc in data
    ]

    # Drop failed parses (None) and sort by the integer collection code
    # embedded in the snl_id ("icsd-<code>").
    return sorted(
        [doc for doc in parsed if doc],
        key=lambda doc: int(doc.snl_id.split("-", 1)[-1]),
    )


def match_against_snls(
    input_doc: BaseBuilderInput,
    snls: list[DatabaseSNL],
):
    """Match a single document against the SNL collection.

    Parameters
    -----------
    input_doc : builder input providing `structure` and `material_id`
    snls : the SNL collection to match against

    Returns
    -----------
    ProvenanceDoc aggregating the provenance of all matching SNLs
    """
    database_ids: dict[str, list[str]] = {}
    authors = [SETTINGS.DEFAULT_AUTHOR]
    history = [SETTINGS.DEFAULT_HISTORY]
    references = [SETTINGS.DEFAULT_REFERENCE]
    theoretical = True

    # Only run the (expensive) structure match against SNLs in the same
    # chemical system; the sort normalizes element ordering for comparison.
    target_chemsys = "-".join(
        sorted(input_doc.structure.composition.chemical_system.split("-"))
    )
    for snl in [doc for doc in snls if doc.chemsys == target_chemsys]:
        if structure_matcher.fit(input_doc.structure, snl.structure):

            if snl.source and snl.source in {"icsd", "pauling"}:
                theoretical = False
                # BUG FIX: the original `database_ids[snl.source].append(...)`
                # raised KeyError on the first match because database_ids
                # starts empty; setdefault creates the list on first use.
                database_ids.setdefault(snl.source, []).append(snl.snl_id)

            if snl.about:
                authors.extend(snl.about.authors or [])
                history.extend(snl.about.history or [])
                # `SNLAbout` uses string for `references`,
                # `ProvenanceDoc` uses list of str
                if snl.about.references:
                    references.append(snl.about.references)

    return ProvenanceDoc.from_structure(
        meta_structure=input_doc.structure,
        material_id=input_doc.material_id,
        database_IDs=database_ids,
        theoretical=theoretical,
        authors=authors,
        history=history,
        references=references,
    )


def build_provenance_docs(
    input_documents: list[BaseBuilderInput],
    snls: list[DatabaseSNL],
) -> list[ProvenanceDoc]:
    """Build the provenance collection.

    Parameters
    -----------
    input_documents : builder inputs to match
    snls : the SNL collection to match each input against

    Returns
    -----------
    list of ProvenanceDoc, one per input document
    """
    wrapped = partial(match_against_snls, snls=snls)

    # BUG FIX: the annotation promises a list, but `map` returns a lazy
    # iterator (single-use, not len()-able); materialize it.
    return list(map(wrapped, input_documents))
120 changes: 120 additions & 0 deletions emmet-builders/emmet/builders/materials/similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import numpy as np

from emmet.builders.base import BaseBuilderInput
from emmet.core.similarity import (
CrystalNNSimilarity,
M3GNetSimilarity,
SimilarityDoc,
SimilarityEntry,
SimilarityMethod,
)

# Map each supported SimilarityMethod to its scorer class; keys are built
# through SimilarityMethod(...) so an unknown name fails loudly at import.
SIM_METHOD_TO_SCORER = {
    SimilarityMethod(k): v
    for k, v in {
        "CrystalNN": CrystalNNSimilarity,
        "M3GNet": M3GNetSimilarity,
    }.items()
}


class SimilarityBuilderInput(BaseBuilderInput):
    """Augment base builder input with extra fields."""

    # Method used to produce `feature_vector` (must be uniform across a batch).
    similarity_method: SimilarityMethod
    # Structure fingerprint used for pairwise distance ranking.
    feature_vector: list[float]


# this could probably be parallelized over `similarity_method`
def build_feature_vectors(
    input_documents: list[BaseBuilderInput],
    similarity_method: SimilarityMethod | str = SimilarityMethod.CRYSTALNN,
) -> list[SimilarityBuilderInput]:
    """Generate similarity feature vectors.

    Args:
        input_documents : list of BaseBuilderInput to process
        similarity_method : SimilarityMethod = SimilarityMethod.CRYSTALNN
            The method to use in building similarity docs.
    Returns:
        list of SimilarityBuilderInput
    """
    # Accept either a member name ("CRYSTALNN") or a value ("CrystalNN").
    if isinstance(similarity_method, str):
        if similarity_method in SimilarityMethod.__members__:
            similarity_method = SimilarityMethod[similarity_method]
        else:
            similarity_method = SimilarityMethod(similarity_method)

    scorer_cls = SIM_METHOD_TO_SCORER.get(similarity_method)
    if scorer_cls is None:
        raise ValueError(f"Unsupported {similarity_method=}")
    scorer = scorer_cls()

    return [
        SimilarityBuilderInput(
            material_id=doc.material_id,
            structure=doc.structure,
            similarity_method=similarity_method,
            feature_vector=scorer._featurize_structure(doc.structure),
        )
        for doc in input_documents
    ]


def build_similarity_docs(
input_documents: list[SimilarityBuilderInput],
num_closest: int = 100,
) -> list[SimilarityDoc]:
"""Generate similarity feature vectors.

All input docs should use the same similarity method.
A check is performed at the start to ensure this.

Args:
input_documents : list of SimilarityBuilderInput to process
num_closest : int = 100
The number of most similar materials to identify
for each material
Returns:
list of SimilarityDoc
"""

if (
len(distinct_sim_methods := {doc.similarity_method for doc in input_documents})
> 1
):
raise ValueError(
f"Multiple similarity methods found: {', '.join(distinct_sim_methods)}"
)

scorer_cls = SIM_METHOD_TO_SCORER[method := input_documents[0].similarity_method]
material_ids, vectors, structures = np.array(
[doc.material_id, doc.feature_vector, doc.structure] for doc in input_documents
).T

similarity_docs = []
for i, material_id in enumerate(material_ids):
closest_idxs, closest_dist = scorer_cls._get_closest_vectors(
i, vectors, num_closest
)
similarity_docs.append(
SimilarityDoc.from_structure(
meta_structure=structures[i],
material_id=material_id,
feature_vector=vectors[i],
method=method,
sim=[
SimilarityEntry(
task_id=material_ids[jdx],
nelements=len(structures[jdx].composition.elements),
dissimilarity=100.0 - closest_dist[j],
formula=structures[jdx].formula,
)
for j, jdx in enumerate(closest_idxs)
],
)
)
return similarity_docs
1 change: 1 addition & 0 deletions emmet-core/emmet/core/connectors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Aggregate resources for making external queries to databases."""
Loading