orbital-materials
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 6 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 6 additions & 0 deletions
diff --git a/internal/check.py b/internal/check.py
Lines changed: 8 additions & 1 deletion
diff --git a/orb_models/forcefield/atomic_system.py b/orb_models/forcefield/atomic_system.py
Lines changed: 14 additions & 4 deletions
diff --git a/orb_models/forcefield/base.py b/orb_models/forcefield/base.py
Lines changed: 7 additions & 0 deletions
diff --git a/orb_models/forcefield/featurization_utilities.py b/orb_models/forcefield/featurization_utilities.py
Lines changed: 91 additions & 15 deletions
@@ -7,6 +7,12 @@ pip install poetry  # Install Poetry if you don't have it
 poetry install
 ```
 
+Optionally, also install [cuML](https://docs.rapids.ai/install/) (requires CUDA); use the one command below that matches your CUDA version:
+```bash
+pip install --extra-index-url=https://pypi.nvidia.com "cuml-cu11==25.2.*"  # For cuda versions >=11.4, <11.8
+pip install --extra-index-url=https://pypi.nvidia.com "cuml-cu12==25.2.*"  # For cuda versions >=12.0, <13.0
+```
+
 ### Running tests
 
 The `orb_models` package uses `pytest` for testing. To run the tests, navigate to the root directory of the package and run the following command:
 
@@ -22,6 +22,12 @@ pip install orb-models
 
 Orb models are expected to work on MacOS and Linux. Windows support is not guaranteed.
 
+For simulations of large systems (≳5k atoms with PBC, or ≳30k atoms without PBC), we recommend installing [cuML](https://docs.rapids.ai/install/) (requires CUDA), which can significantly reduce graph creation time (2-10x) and improve GPU memory efficiency (2-100x). Run the one command below that matches your CUDA version:
+```bash
+pip install --extra-index-url=https://pypi.nvidia.com "cuml-cu11==25.2.*"  # For cuda versions >=11.4, <11.8
+pip install --extra-index-url=https://pypi.nvidia.com "cuml-cu12==25.2.*"  # For cuda versions >=12.0, <13.0
+```
+
 ### Updates
 
 **Oct 2024**: We have released a new version of the models, `orb-v2`. This version has 2 major changes:
 
@@ -4,6 +4,8 @@
 
 import ase
 import torch
+import numpy as np
+
 from core.dataset import atomic_system as core_atomic_system
 from core.models import load
 
@@ -21,7 +23,12 @@ def main(model: str, core_model: str):
     """
     original_orbff, _, sys_config = load.load_model(core_model)
 
-    atoms = ase.Atoms("H2O", positions=[[0, 0, 0], [0, 0, 1.1], [0, 1.1, 0]])
+    atoms = ase.Atoms(
+        "H2O",
+        positions=[[0, 0, 0], [0, 0, 1.1], [0, 1.1, 0]],
+        cell=np.eye(3) * 2,
+        pbc=True,
+    )
 
     graph_orig = core_atomic_system.ase_atoms_to_atom_graphs(atoms, sys_config)
     graph = atomic_system.ase_atoms_to_atom_graphs(atoms)
 
@@ -171,9 +171,16 @@ def ase_atoms_to_atom_graphs(
     Args:
         atoms: ase.Atoms object
         wrap: whether to wrap atomic positions into the central unit cell (if there is one).
-        edge_method: The method to use for edge creation:
-            - knn_brute_force: Use brute force to find nearest neighbors.
-            - knn_scipy (default): Use scipy to find nearest neighbors.
+        edge_method (EdgeCreationMethod, optional): The method to use for graph edge construction.
+            If None, the edge method is chosen as follows:
+            * knn_brute_force: If device is not CPU, and cuML is not installed or num_atoms is < 5000 (PBC)
+                or < 30000 (non-PBC).
+            * knn_cuml_rbc: If device is not CPU, and cuML is installed, and num_atoms is >= 5000 (PBC) or
+                >= 30000 (non-PBC).
+            * knn_scipy: If device is CPU.
+            On GPU, for num_atoms ≲ 5000 (PBC) or ≲ 30000 (non-PBC), knn_brute_force is faster than knn_cuml_*,
+            but uses more memory. For num_atoms ≳ 5000 (PBC) or ≳ 30000 (non-PBC), knn_cuml_* is faster and uses
+            less memory, but requires cuML to be installed. knn_scipy is typically fastest on the CPU.
         system_config: The system configuration to use for graph construction.
         max_num_neighbors: Maximum number of neighbors each node can send messages to.
             If None, will use system_config.max_num_neighbors.
@@ -214,13 +221,15 @@ def ase_atoms_to_atom_graphs(
     )
     positions = torch.from_numpy(atoms.positions)
     cell = torch.from_numpy(atoms.cell.array)
+    pbc = torch.from_numpy(atoms.pbc)
     lattice = torch.from_numpy(cell_to_cellpar(cell))
-    if wrap and torch.any(cell != 0):
+    if wrap and (torch.any(cell != 0) and torch.any(pbc)):
         positions = feat_util.map_to_pbc_cell(positions, cell)
 
     edge_index, edge_vectors, unit_shifts = feat_util.compute_pbc_radius_graph(
         positions=positions,
         cell=cell,
+        pbc=pbc,
         radius=system_config.radius,
         max_number_neighbors=max_num_neighbors,
         edge_method=edge_method,
@@ -248,6 +257,7 @@ def ase_atoms_to_atom_graphs(
     graph_feats = {
         **atoms.info.get("graph_features", {}),
         "cell": cell,
+        "pbc": pbc,
         "lattice": lattice,
     }
 
 
@@ -113,6 +113,12 @@ def cell(self, val: torch.Tensor):
         assert self.system_features
         self.system_features["cell"] = val
 
+    @property
+    def pbc(self):
+        """Get the "pbc" system feature, or None if it is not present."""
+        assert self.system_features
+        return self.system_features.get("pbc")
+
     def compute_differentiable_edge_vectors(
         self,
         use_stress_displacement: bool = True,
@@ -477,6 +483,7 @@ def refeaturize_atomgraphs(
         ) = featurization_utilities.batch_compute_pbc_radius_graph(
             positions=positions,
             cells=cell,
+            pbc=atoms.pbc,
             radius=atoms.radius,
             n_node=num_atoms,
             max_number_neighbors=atoms.max_num_neighbors,
 
@@ -1,11 +1,17 @@
 """Featurization utilities for molecular models."""
 
+import typing
 from typing import Optional, Tuple, Union, Literal, List
 
 import ase
 import numpy as np
 import torch
 
+try:
+    import cuml
+except ImportError:
+    cuml = None
+
 from scipy.spatial import KDTree as SciKDTree
 
 
@@ -29,6 +35,25 @@ def get_device(
     return torch.device(requested_device)
 
 
+def get_default_edge_method(
+    device: torch.device, num_atoms: int, is_periodic: bool
+) -> EdgeCreationMethod:
+    """Get the default edge method for a given device, number of atoms, and periodicity."""
+    if device.type != "cpu":
+        if (
+            cuml is None
+            or (is_periodic and num_atoms < 5_000)
+            or (not is_periodic and num_atoms < 30_000)
+        ):
+            edge_method = "knn_brute_force"
+        else:
+            edge_method = "knn_cuml_rbc"
+    else:
+        edge_method = "knn_scipy"
+    assert edge_method in typing.get_args(EdgeCreationMethod)
+    return edge_method  # type: ignore
+
+
 def get_atom_embedding(atoms: ase.Atoms, k_hot: bool = False) -> torch.Tensor:
     """Get an atomic embedding."""
     atomic_numbers = torch.from_numpy(atoms.numbers).to(torch.long)
@@ -433,6 +458,7 @@ def compute_pbc_radius_graph(
     *,
     positions: torch.Tensor,
     cell: torch.Tensor,
+    pbc: torch.Tensor,
     radius: Union[float, torch.Tensor],
     max_number_neighbors: int,
     edge_method: Optional[EdgeCreationMethod] = None,
@@ -446,13 +472,21 @@ def compute_pbc_radius_graph(
     Args:
         positions (torch.Tensor): 3D positions of particles. Shape [num_particles, 3].
         cell (torch.Tensor): A 3x3 matrix where the lattice vectors are rows or columns.
+        pbc (torch.Tensor): A boolean tensor of shape [3] indicating which directions are periodic.
         radius (Union[float, torch.tensor]): The radius within which to connect atoms.
-        max_number_neighbors (int, optional): The maximum number of neighbors for each particle. Defaults to 20.
+        max_number_neighbors (int): The maximum number of neighbors for each particle.
         edge_method (EdgeCreationMethod, optional): The method to use for graph edge construction.
-            Defaults to None, in which case knn_brute_force is used if we are on GPU (2-6x faster),
-            otherwise knn_scipy. More details here: https://github.com/orbital-materials/orb/pull/766
-        n_workers (int, optional): The number of workers to use for KDTree construction. Defaults to 1.
-        device (Optional[Union[torch.device, str, int]], optional): The device to use for computation.
+            Defaults to None, in which case edge method is chosen as follows:
+            * knn_brute_force: If device is not CPU, and cuML is not installed or num_atoms is < 5000 (PBC)
+                or < 30000 (non-PBC).
+            * knn_cuml_rbc: If device is not CPU, and cuML is installed, and num_atoms is >= 5000 (PBC) or
+                >= 30000 (non-PBC).
+            * knn_scipy: If device is CPU.
+            On GPU, for num_atoms ≲ 5000 (PBC) or ≲ 30000 (non-PBC), knn_brute_force is faster than knn_cuml_*,
+            but uses more memory. For num_atoms ≳ 5000 (PBC) or ≳ 30000 (non-PBC), knn_cuml_* is faster and uses
+            less memory, but requires cuML to be installed. knn_scipy is typically fastest on the CPU.
+        n_workers (int, optional): The number of workers for KDTree construction in knn_scipy. Defaults to 1.
+        device (Union[torch.device, str, int], optional): The device to use for computation.
             Defaults to None, in which case GPU is used if available.
         half_supercell (bool): Whether to use half the supercell for graph construction, and then symmetrize.
             This flag does not affect the resulting graph; it is purely an optimization that can double
@@ -474,16 +508,16 @@ def compute_pbc_radius_graph(
 
     natoms = positions.shape[0]
     half_supercell = half_supercell and bool(torch.any(cell != 0.0))
+    is_periodic = bool(torch.any(cell != 0.0).item() and torch.any(pbc).item())
 
     device = get_device(requested_device=device)
-    if edge_method is None:
-        edge_method = "knn_brute_force" if device.type != "cpu" else "knn_scipy"
-    if edge_method == "knn_brute_force":
-        # if knn brute force, then try to place tensors on the gpu
+    edge_method = edge_method or get_default_edge_method(device, natoms, is_periodic)
+    if edge_method == "knn_brute_force" or edge_method.startswith("knn_cuml_"):
+        # if knn_brute_force or knn_cuml_*, then try to place tensors on the gpu if device is not provided
         positions = positions.to(device)
         cell = cell.to(device)
 
-    if torch.any(cell != 0.0):
+    if is_periodic:
         if half_supercell:
             supercell_positions, integer_offsets = construct_half_3x3x3_supercell(
                 positions=positions, cell=cell
@@ -575,6 +609,8 @@ def compute_supercell_neighbors(
         edge_method (EdgeCreationMethod): The method to use for graph edge construction:
             - knn_brute_force: Use brute force knn implementation: compute all pairwise distances between
             positions and supercell_positions, and subsequently filter edges based on radius and max_num_neighbors.
+            - knn_cuml_rbc: Use cuML's random-ball algorithm implementation.
+            - knn_cuml_brute: Use cuML's brute force implementation.
             - knn_scipy: Use scipy's KDTree implementation.
         n_workers (int, optional): The number of workers to use for KDTree construction. Defaults to 1.
     """
@@ -588,6 +624,33 @@ def compute_supercell_neighbors(
         within_radius = distances[:, 1:] < (radius + 1e-6)
         num_neighbors_per_sender = within_radius.sum(-1)
         supercell_receivers = supercell_receivers[:, 1:][within_radius]
+    elif edge_method.startswith("knn_cuml_"):
+        if cuml is None:
+            raise ImportError(
+                "cuML is not installed. Please install cuML: https://docs.rapids.ai/install/."
+            )
+        assert (
+            supercell_positions.device.type == "cuda"
+            and central_cell_positions.device.type == "cuda"
+        ), "cuML KNN is only supported on CUDA devices"
+        algorithm = edge_method.split("_")[-1]
+        k = min(max_num_neighbors + 1, len(supercell_positions))
+        knn = cuml.neighbors.NearestNeighbors(
+            n_neighbors=k,
+            algorithm=algorithm,
+            metric="euclidean",
+        )
+        knn.fit(supercell_positions)
+        distances, supercell_receivers = knn.kneighbors(
+            central_cell_positions, return_distance=True
+        )
+        # Convert from CuPy arrays to PyTorch tensors
+        distances = torch.as_tensor(distances)
+        supercell_receivers = torch.as_tensor(supercell_receivers)
+        # remove self-edges and edges beyond radius
+        within_radius = distances[:, 1:] < (radius + 1e-6)
+        num_neighbors_per_sender = within_radius.sum(-1)
+        supercell_receivers = supercell_receivers[:, 1:][within_radius]
     elif edge_method == "knn_scipy":
         tree_data = supercell_positions.clone().detach().cpu().numpy()
         tree_query = central_cell_positions.clone().detach().cpu().numpy()
@@ -600,6 +663,9 @@ def compute_supercell_neighbors(
             workers=n_workers,
             p=2,
         )
+        if len(supercell_receivers.shape) == 1:
+            supercell_receivers = supercell_receivers[None, :]
+
         # Remove the self-edge that will be closest
         supercell_receivers = np.array(supercell_receivers)[:, 1:]  # type: ignore
 
@@ -688,6 +754,7 @@ def batch_compute_pbc_radius_graph(
     *,
     positions: torch.Tensor,
     cells: torch.Tensor,
+    pbc: torch.Tensor,
     radius: Union[float, torch.Tensor],
     n_node: torch.Tensor,
     max_number_neighbors: torch.Tensor,
@@ -704,13 +771,21 @@ def batch_compute_pbc_radius_graph(
     Args:
         positions (torch.Tensor): 3D positions of a batch of particles. Shape [num_particles, 3].
         cells (torch.Tensor): A batch of 3x3 matrices where the lattice vectors are rows.
+        pbc (torch.Tensor): A batch of boolean tensors of shape [3] indicating which directions are periodic.
         radius (Union[float, torch.tensor]): The radius within which to connect atoms.
         n_node (torch.Tensor): A vector where each element indicates the number of particles in each element of
             the batch. Of size len(batch).
         max_number_neighbors (torch.Tensor): The maximum number of neighbors for each particle.
         edge_method (EdgeCreationMethod, optional): The method to use for graph edge construction.
-            Defaults to None, in which case knn_brute_force is used if we are on GPU (2-6x faster),
-            otherwise knn_scipy. More details here: https://github.com/orbital-materials/orb/pull/766
+            Defaults to None, in which case edge method is chosen as follows:
+            * knn_brute_force: If device is not CPU, and cuML is not installed or num_atoms is < 5000 (PBC)
+                or < 30000 (non-PBC).
+            * knn_cuml_rbc: If device is not CPU, and cuML is installed, and num_atoms is >= 5000 (PBC) or
+                >= 30000 (non-PBC).
+            * knn_scipy: If device is CPU.
+            On GPU, for num_atoms ≲ 5000 (PBC) or ≲ 30000 (non-PBC), knn_brute_force is faster than knn_cuml_*,
+            but uses more memory. For num_atoms ≳ 5000 (PBC) or ≳ 30000 (non-PBC), knn_cuml_* is faster and uses
+            less memory, but requires cuML to be installed. knn_scipy is typically fastest on the CPU.
         half_supercell (bool): Whether to use half the supercell for graph construction, and then symmetrize.
             This flag does not affect the resulting graph; it is purely an optimization that can double
             throughput and halve memory for very large cells (e.g. 10k+ atoms). For smaller systems, it can harm
@@ -731,16 +806,17 @@ def batch_compute_pbc_radius_graph(
     num_edges = []
     all_unit_shifts = []
 
-    device = positions.device
-    for p, pbc, mn in zip(
+    for p, cell, pbc, mn in zip(
         torch.tensor_split(positions, torch.cumsum(n_node, 0)[:-1].cpu()),
         cells,
+        pbc,
         max_number_neighbors,
         strict=True,
     ):
         edges, vectors, unit_shifts = compute_pbc_radius_graph(
             positions=p,
-            cell=pbc,
+            cell=cell,
+            pbc=pbc,
             radius=radius,
             max_number_neighbors=int(mn),
             edge_method=edge_method,