Skip to content

Commit cc29d76

Browse files
committed
Fix typing
1 parent 6db2e40 commit cc29d76

33 files changed

+243
-268
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454

5555
# Unit testing
5656
- name: Run unit tests & collect coverage
57-
run: pytest tests --cov=manify --cov-report=xml:coverage.xml
57+
run: BEARTYPE_ENABLE=true pytest tests --cov=manify --cov-report=xml:coverage.xml
5858

5959
# Check docstrings are in Google style
6060
- name: Check docstrings are in Google style

manify/__init__.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,11 @@
11
"""Manify: A Python Library for Learning Non-Euclidean Representations."""
22

3-
from jaxtyping import install_import_hook
3+
if os.getenv("BEARTYPE_ENABLE", "false").lower() == "true":  # NOTE(review): this uses `os`, but the diff adds no `import os` to this module — confirm it exists in the full file
4+
from jaxtyping import install_import_hook
45

5-
install_import_hook("manify", "beartype.beartype")
6+
install_import_hook("manify", "beartype.beartype")
67

7-
from manify.curvature_estimation import (
8-
delta_hyperbolicity,
9-
greedy_signature_selection,
10-
sampled_delta_hyperbolicity,
11-
sectional_curvature,
12-
)
8+
from manify.curvature_estimation import greedy_signature_selection, sampled_delta_hyperbolicity, sectional_curvature
139
from manify.embedders import CoordinateLearning, ProductSpaceVAE, SiameseNetwork
1410
from manify.manifolds import Manifold, ProductManifold
1511
from manify.predictors import KappaGCN, ProductSpaceDT, ProductSpacePerceptron, ProductSpaceRF, ProductSpaceSVM

manify/clustering/fuzzy_kmeans.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,9 @@
2020

2121
from __future__ import annotations
2222

23-
from typing import Literal, Optional, Union
24-
2523
import numpy as np
2624
import torch
25+
from beartype.typing import Literal
2726
from geoopt import ManifoldParameter
2827
from geoopt.optim import RiemannianAdam
2928
from jaxtyping import Float, Int
@@ -66,13 +65,13 @@ class RiemannianFuzzyKMeans(BaseEstimator, ClusterMixin):
6665
def __init__(
6766
self,
6867
n_clusters: int,
69-
manifold: Union[Manifold, ProductManifold],
68+
manifold: Manifold | ProductManifold,
7069
m: float = 2.0,
7170
lr: float = 0.1,
7271
max_iter: int = 100,
7372
tol: float = 1e-4,
7473
optimizer: Literal["adan", "adam"] = "adan",
75-
random_state: Optional[int] = None,
74+
random_state: int | None = None,
7675
verbose: bool = False,
7776
):
7877
self.n_clusters = n_clusters

manify/curvature_estimation/delta_hyperbolicity.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@
1111

1212
from __future__ import annotations
1313

14-
from typing import Tuple
15-
1614
import torch
17-
from jaxtyping import Float
15+
from jaxtyping import Float, Int
1816

1917

2018
def sampled_delta_hyperbolicity(
2119
D: Float[torch.Tensor, "n_points n_points"], n_samples: int = 1000, reference_idx: int = 0, relative: bool = True
22-
) -> Tuple[Float[torch.Tensor, "n_samples"], Float[torch.Tensor, "n_samples 3"]]:
20+
) -> tuple[Float[torch.Tensor, "n_samples"], Int[torch.Tensor, "n_samples 3"]]:
2321
r"""Computes $\delta$-hyperbolicity by sampling random point triplets.
2422
2523
For large metric spaces, this approximates $\delta$-hyperbolicity by randomly sampling triplets. For each triplet

manify/curvature_estimation/greedy_method.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
from __future__ import annotations
88

9-
from typing import Any, Tuple
10-
119
import torch
1210
from jaxtyping import Float
1311

@@ -17,9 +15,9 @@
1715
def greedy_signature_selection(
1816
pm: ProductManifold,
1917
dists: Float[torch.Tensor, "n_points n_points"],
20-
candidate_components: Tuple[Tuple[float, int], ...] = ((-1.0, 2), (0.0, 2), (1.0, 2)),
18+
candidate_components: tuple[tuple[float, int], ...] = ((-1.0, 2), (0.0, 2), (1.0, 2)),
2119
max_components: int = 3,
22-
) -> Any:
20+
) -> None:
2321
r"""Greedily estimates an optimal product manifold signature.
2422
2523
This implements the greedy signature selection algorithm that incrementally builds a product manifold

manify/embedders/_base.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from __future__ import annotations
44

55
from abc import ABC, abstractmethod
6-
from typing import Any, Dict, List, Optional
76

87
import torch
8+
from beartype.typing import Any
99
from jaxtyping import Float
1010
from sklearn.base import BaseEstimator, TransformerMixin
1111

@@ -27,18 +27,18 @@ class BaseEmbedder(BaseEstimator, TransformerMixin, ABC):
2727
is_fitted_: Boolean flag indicating if the embedder has been fitted.
2828
"""
2929

30-
def __init__(self, pm: ProductManifold, random_state: Optional[int] = None, device: Optional[str] = None) -> None:
30+
def __init__(self, pm: ProductManifold, random_state: int | None = None, device: str | None = None) -> None:
3131
self.pm = pm
3232
self.random_state = random_state
3333
self.device = pm.device if device is None else device
34-
self.loss_history_: Dict[str, List[float]] = {}
34+
self.loss_history_: dict[str, list[float]] = {}
3535
self.is_fitted_: bool = False
3636

3737
@abstractmethod
3838
def fit(
3939
self,
40-
X: Optional[Float[torch.Tensor, "n_points n_features"]] = None,
41-
D: Optional[Float[torch.Tensor, "n_points n_points"]] = None,
40+
X: Float[torch.Tensor, "n_points n_features"] | None = None,
41+
D: Float[torch.Tensor, "n_points n_points"] | None = None,
4242
lr: float = 1e-2,
4343
burn_in_lr: float = 1e-3,
4444
curvature_lr: float = 0.0, # Off by default
@@ -67,7 +67,7 @@ def fit(
6767

6868
@abstractmethod
6969
def transform(
70-
self, X: Optional[Float[torch.Tensor, "n_points n_features"]]
70+
self, X: Float[torch.Tensor, "n_points n_features"] | None
7171
) -> Float[torch.Tensor, "n_points embedding_dim"]:
7272
"""Apply embedding to new data. Not defined for coordinate learning.
7373
@@ -81,8 +81,8 @@ def transform(
8181

8282
def fit_transform(
8383
self,
84-
X: Optional[Float[torch.Tensor, "n_points n_features"]] = None,
85-
D: Optional[Float[torch.Tensor, "n_points n_points"]] = None,
84+
X: Float[torch.Tensor, "n_points n_features"] | None = None,
85+
D: Float[torch.Tensor, "n_points n_points"] | None = None,
8686
**fit_kwargs: Any,
8787
) -> Float[torch.Tensor, "n_points embedding_dim"]:
8888
"""Fit the embedder and transform the data in one step.

manify/embedders/_losses.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@
77

88
from __future__ import annotations
99

10-
from typing import List
11-
1210
import networkx as nx
1311
import torch
1412
from jaxtyping import Float
1513

1614
from ..manifolds import ProductManifold
1715

16+
# TODO: Fix shape annotations for Float tensors with "..." placeholders
17+
1818

1919
def distortion_loss(
20-
D_est: Float[torch.Tensor, "n_points n_points"],
21-
D_true: Float[torch.Tensor, "n_points n_points"],
20+
D_est: Float[torch.Tensor, "..."],
21+
D_true: Float[torch.Tensor, "..."],
2222
pairwise: bool = False,
2323
) -> Float[torch.Tensor, ""]:
2424
r"""Computes the distortion loss between estimated and true squared distances.
@@ -59,8 +59,8 @@ def distortion_loss(
5959

6060

6161
def d_avg(
62-
D_est: Float[torch.Tensor, "n_points n_points"],
63-
D_true: Float[torch.Tensor, "n_points n_points"],
62+
D_est: Float[torch.Tensor, "..."],
63+
D_true: Float[torch.Tensor, "..."],
6464
pairwise: bool = False,
6565
) -> Float[torch.Tensor, ""]:
6666
r"""Computes the average relative distance error (D_avg).
@@ -102,7 +102,9 @@ def d_avg(
102102
return torch.mean(torch.abs(D_est - D_true) / D_true)
103103

104104

105-
def mean_average_precision(x_embed: Float[torch.Tensor, "n_points n_dim"], graph: nx.Graph) -> Float[torch.Tensor, ""]:
105+
def mean_average_precision(
106+
x_embed: Float[torch.Tensor, "n_points_dists n_dim"], graph: nx.Graph
107+
) -> Float[torch.Tensor, ""]:
106108
r"""Computes the mean average precision (mAP) for graph embedding evaluation.
107109
108110
This metric is used to evaluate how well an embedding preserves the neighborhood structure of a graph, as described
@@ -121,7 +123,9 @@ def mean_average_precision(x_embed: Float[torch.Tensor, "n_points n_dim"], graph
121123
raise NotImplementedError
122124

123125

124-
def dist_component_by_manifold(pm: ProductManifold, x_embed: Float[torch.Tensor, "n_points n_dim"]) -> List[float]:
126+
def dist_component_by_manifold(
127+
pm: ProductManifold, x_embed: Float[torch.Tensor, "n_points_dists n_dim"]
128+
) -> list[float]:
125129
r"""Computes the proportion of variance in pairwise distances explained by each manifold component.
126130
127131
The contribution is calculated as the ratio of the sum of squared distances in each component to the total squared

manify/embedders/coordinate_learning.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
"""Implementation for direct coordinate optimization in Riemannian manifolds.
22
33
This module provides functions for learning optimal embeddings in product manifolds by directly optimizing the
4-
coordinates using Riemannian optimization. This approach is particularly useful for embedding graphs using metric learning
5-
to maintain pairwise distances in the target space. The optimization is performed using Riemannian gradient descent
6-
with support for non-transductive training, in which gradients from the test set to the training set are masked out.
4+
coordinates using Riemannian optimization. This approach is particularly useful for embedding graphs using metric
5+
learning to maintain pairwise distances in the target space. The optimization is performed using Riemannian gradient
6+
descent with support for non-transductive training, in which gradients from the test set to the training set are masked
7+
out.
78
"""
89

910
from __future__ import annotations
1011

1112
import sys
1213
import warnings
13-
from typing import Any, Dict, List, Optional
1414

1515
import geoopt
1616
import numpy as np
1717
import torch
18+
from beartype.typing import Any
1819
from jaxtyping import Float, Int
1920

2021
from ..manifolds import ProductManifold
@@ -62,7 +63,7 @@ class CoordinateLearning(BaseEmbedder):
6263
device: Optional device for tensor computations.
6364
"""
6465

65-
def __init__(self, pm: ProductManifold, random_state: Optional[int] = None, device: Optional[str] = None) -> None:
66+
def __init__(self, pm: ProductManifold, random_state: int | None = None, device: str | None = None) -> None:
6667
super().__init__(pm=pm, random_state=random_state, device=device)
6768

6869
def fit( # type: ignore[override]
@@ -105,7 +106,8 @@ def fit( # type: ignore[override]
105106
raise ValueError("Distance matrix D is needed for coordinate learning")
106107
if X is not None:
107108
warnings.warn(
108-
"Input X has been given. This will be ignored during fitting. If you have provided a distance matrix, please run embedder.fit(None, D) instead."
109+
"Input X has been given. This will be ignored during fitting. If you have provided a distance matrix,"
110+
"please run embedder.fit(None, D) instead."
109111
)
110112

111113
# Set random seed if provided
@@ -115,7 +117,7 @@ def fit( # type: ignore[override]
115117
# Move everything to the device; initialize random embeddings
116118
n = D.shape[0]
117119
covs = [torch.stack([torch.eye(M.dim) / self.pm.dim] * n).to(self.device) for M in self.pm.P]
118-
means = torch.stack([self.pm.mu0] * n).to(self.device)
120+
means = torch.vstack([self.pm.mu0] * n).to(self.device)
119121
X_embed, _ = self.pm.sample(z_mean=means, sigma_factorized=covs)
120122
D = D.to(self.device)
121123

@@ -134,7 +136,7 @@ def fit( # type: ignore[override]
134136
my_tqdm = tqdm(total=burn_in_iterations + training_iterations, leave=False)
135137

136138
# Outer training loop - mostly setting optimizer learning rates up here
137-
losses: Dict[str, List[float]] = {"train_train": [], "test_test": [], "train_test": [], "total": []}
139+
losses: dict[str, list[float]] = {"train_train": [], "test_test": [], "train_test": [], "total": []}
138140

139141
# Actual training loop
140142
for i in range(burn_in_iterations + training_iterations):

manify/embedders/siamese.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from __future__ import annotations
1212

1313
import sys
14-
from typing import Dict, List, Optional, Tuple
1514

1615
import numpy as np
1716
import torch
@@ -59,10 +58,10 @@ def __init__(
5958
self,
6059
pm: ProductManifold,
6160
encoder: torch.nn.Module,
62-
decoder: Optional[torch.nn.Module] = None,
61+
decoder: torch.nn.Module | None = None,
6362
reconstruction_loss: str = "mse",
6463
beta: float = 1.0,
65-
random_state: Optional[int] = None,
64+
random_state: int | None = None,
6665
device: str = "cpu",
6766
):
6867
# Init both base classes

manify/embedders/vae.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from __future__ import annotations
1212

1313
import sys
14-
from typing import Dict, List, Optional, Tuple
1514

1615
import numpy as np
1716
import torch
@@ -74,7 +73,7 @@ def __init__(
7473
pm: ProductManifold,
7574
encoder: torch.nn.Module,
7675
decoder: torch.nn.Module,
77-
random_state: Optional[int] = None,
76+
random_state: int | None = None,
7877
device: str = "cpu",
7978
beta: float = 1.0,
8079
reconstruction_loss: torch.nn.modules.loss._Loss = torch.nn.MSELoss(reduction="none"),
@@ -102,7 +101,7 @@ def __init__(
102101

103102
def encode(
104103
self, x: Float[torch.Tensor, "batch_size n_features"]
105-
) -> Tuple[Float[torch.Tensor, "batch_size n_latent"], Float[torch.Tensor, "batch_size n_latent"]]:
104+
) -> tuple[Float[torch.Tensor, "batch_size n_latent"], Float[torch.Tensor, "batch_size n_latent"]]:
106105
r"""Encodes input data to obtain latent means and log-variances in the manifold.
107106
108107
This method processes input data through the encoder network to obtain parameters of the approximate posterior
@@ -140,10 +139,10 @@ def decode(self, z: Float[torch.Tensor, "batch_size n_ambient"]) -> Float[torch.
140139
"""
141140
return self.decoder(z)
142141

143-
def forward(self, x: Float[torch.Tensor, "batch_size n_features"]) -> Tuple[
142+
def forward(self, x: Float[torch.Tensor, "batch_size n_features"]) -> tuple[
144143
Float[torch.Tensor, "batch_size n_features"],
145144
Float[torch.Tensor, "batch_size n_ambient"],
146-
List[Float[torch.Tensor, "n_latent n_latent"]],
145+
list[Float[torch.Tensor, "batch_size n_latent n_latent"]],
147146
]:
148147
r"""Performs the forward pass of the VAE in product manifold space.
149148
@@ -181,7 +180,7 @@ def forward(self, x: Float[torch.Tensor, "batch_size n_features"]) -> Tuple[
181180
def kl_divergence(
182181
self,
183182
z_mean: Float[torch.Tensor, "batch_size n_latent"],
184-
sigma_factorized: List[Float[torch.Tensor, "n_latent n_latent"]],
183+
sigma_factorized: list[Float[torch.Tensor, "batch_size manifold_dim manifold_dim"]],
185184
) -> Float[torch.Tensor, "batch_size"]:
186185
r"""Computes the KL divergence between posterior and prior distributions in the manifold.
187186
@@ -214,7 +213,7 @@ def kl_divergence(
214213

215214
def elbo(
216215
self, x: Float[torch.Tensor, "batch_size n_features"]
217-
) -> Tuple[Float[torch.Tensor, ""], Float[torch.Tensor, ""], Float[torch.Tensor, ""]]:
216+
) -> tuple[Float[torch.Tensor, ""], Float[torch.Tensor, ""], Float[torch.Tensor, ""]]:
218217
r"""Computes the Evidence Lower Bound (ELBO) for the VAE objective.
219218
220219
The ELBO is the standard objective function for variational autoencoders, consisting of a reconstruction term

0 commit comments

Comments
 (0)