Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift"]
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift", "OPTICS"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
ANOMALYDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"]

Expand Down
192 changes: 186 additions & 6 deletions geochemistrypi/data_mining/model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from numpy.typing import ArrayLike
from rich import print
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift
from sklearn.cluster import DBSCAN, OPTICS, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift

from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig, save_text
Expand All @@ -20,6 +20,7 @@
from .func.algo_clustering._enum import ClusteringCommonFunction, KMeansSpecialFunction, MeanShiftSpecialFunction
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters
from .func.algo_clustering._meanshift import meanshift_manual_hyper_parameters
from .func.algo_clustering._optics import OPTICS_manual_hyper_parameters


class ClusteringWorkflowBase(WorkflowBase):
Expand Down Expand Up @@ -818,6 +819,190 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
)


class OPTICSClustering(ClusteringWorkflowBase):
    """The automated workflow wrapper around scikit-learn's OPTICS clustering estimator.

    OPTICS (Ordering Points To Identify the Clustering Structure) finds
    core samples of high density, and is closely related to DBSCAN but
    supports varying neighborhood radii via a reachability ordering.
    """

    # Identifier used by the model-selection layer to dispatch to this workflow.
    name = "OPTICS"
    # No algorithm-specific post-fit plots/reports are registered yet.
    special_function = []

    def __init__(
        self,
        min_samples: int = 5,
        max_eps: float = np.inf,
        metric: str = "minkowski",
        p: float = 2,
        metric_params: Optional[Dict] = None,
        cluster_method: str = "xi",
        eps: Optional[float] = None,
        xi: float = 0.05,
        predecessor_correction: bool = True,
        min_cluster_size: Optional[int] = None,
        algorithm: str = "auto",
        leaf_size: int = 30,
        memory: Optional[str] = None,
        n_jobs: Optional[int] = None,
    ) -> None:
        """
        Parameters
        ----------
        min_samples : int > 1 or float between 0 and 1, default=5
            The number of samples in a neighborhood for a point to be considered as
            a core point. Also, up and down steep regions can't have more than
            ``min_samples`` consecutive non-steep points. Expressed as an absolute
            number or a fraction of the number of samples (rounded to be at least 2).

        max_eps : float, default=np.inf
            The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. Default value of ``np.inf`` will
            identify clusters across all scales; reducing ``max_eps`` will result
            in shorter run times.

        metric : str or callable, default='minkowski'
            Metric to use for distance computation. Any metric from scikit-learn
            or scipy.spatial.distance can be used.

            If metric is a callable function, it is called on each
            pair of instances (rows) and the resulting value recorded. The callable
            should take two arrays as input and return one value indicating the
            distance between them. This works for Scipy's metrics, but is less
            efficient than passing the metric name as a string. If metric is
            "precomputed", `X` is assumed to be a distance matrix and must be square.

            Valid values for metric are:
            - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
              'manhattan']
            - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
              'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
              'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']

            Sparse matrices are only supported by scikit-learn metrics.
            See the documentation for scipy.spatial.distance for details on these
            metrics.

        p : float, default=2
            Parameter for the Minkowski metric from
            :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
            equivalent to using manhattan_distance (l1), and euclidean_distance
            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        metric_params : dict, default=None
            Additional keyword arguments for the metric function.

        cluster_method : str, default='xi'
            The extraction method used to extract clusters using the calculated
            reachability and ordering. Possible values are "xi" and "dbscan".

        eps : float, default=None
            The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. By default it assumes the same value
            as ``max_eps``.
            Used only when ``cluster_method='dbscan'``.

        xi : float between 0 and 1, default=0.05
            Determines the minimum steepness on the reachability plot that
            constitutes a cluster boundary. For example, an upwards point in the
            reachability plot is defined by the ratio from one point to its
            successor being at most 1-xi.
            Used only when ``cluster_method='xi'``.

        predecessor_correction : bool, default=True
            Correct clusters according to the predecessors calculated by OPTICS.
            This parameter has minimal effect on most datasets.
            Used only when ``cluster_method='xi'``.

        min_cluster_size : int > 1 or float between 0 and 1, default=None
            Minimum number of samples in an OPTICS cluster, expressed as an
            absolute number or a fraction of the number of samples (rounded to be
            at least 2). If ``None``, the value of ``min_samples`` is used instead.
            Used only when ``cluster_method='xi'``.

        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
            Algorithm used to compute the nearest neighbors:

            - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
            - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
            - 'brute' will use a brute-force search.
            - 'auto' (default) will attempt to decide the most appropriate
              algorithm based on the values passed to :meth:`fit` method.

            Note: fitting on sparse input will override the setting of
            this parameter, using brute force.

        leaf_size : int, default=30
            Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
            :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
            construction and query, as well as the memory required to store the
            tree. The optimal value depends on the nature of the problem.

        memory : str or object with the joblib.Memory interface, default=None
            Used to cache the output of the computation of the tree.
            By default, no caching is done. If a string is given, it is the
            path to the caching directory.

        n_jobs : int, default=None
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
            for more details.

        References
        ----------
        Scikit-learn API: sklearn.cluster.OPTICS
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
        """
        super().__init__()
        self.min_samples = min_samples
        self.max_eps = max_eps
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.cluster_method = cluster_method
        self.eps = eps
        self.xi = xi
        self.predecessor_correction = predecessor_correction
        self.min_cluster_size = min_cluster_size
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.memory = memory
        self.n_jobs = n_jobs

        # Underlying scikit-learn estimator, configured with the stored
        # hyper-parameters; fitted later by the shared workflow machinery.
        self.model = OPTICS(
            min_samples=self.min_samples,
            max_eps=self.max_eps,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            cluster_method=self.cluster_method,
            eps=self.eps,
            xi=self.xi,
            predecessor_correction=self.predecessor_correction,
            min_cluster_size=self.min_cluster_size,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            memory=self.memory,
            n_jobs=self.n_jobs,
        )

        self.naming = OPTICSClustering.name

    @classmethod
    def manual_hyper_parameters(cls) -> Dict:
        """Prompt the user for hyper-parameters and return them as a dict."""
        print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]")
        hyper_parameters = OPTICS_manual_hyper_parameters()
        clear_output()
        return hyper_parameters

    def special_components(self, **kwargs) -> None:
        """Invoke all special application functions for this algorithms by Scikit-learn framework."""
        # No OPTICS-specific diagnostics are implemented yet (special_function is empty).
        pass


class SpectralClustering(ClusteringWorkflowBase):
    # Placeholder workflow: spectral clustering is not implemented yet.
    name = "Spectral"
    pass
Expand All @@ -828,11 +1013,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
pass


class OPTICSClustering(ClusteringWorkflowBase):
    # NOTE(review): superseded placeholder stub — the full OPTICS workflow
    # implementation is defined earlier in this file.
    name = "OPTICS"
    pass


class GaussianMixturesClustering(ClusteringWorkflowBase):
    # Placeholder workflow: Gaussian mixture clustering is not implemented yet.
    name = "GaussianMixtures"
    pass
Expand Down
120 changes: 120 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_clustering/_optics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from typing import Dict

import numpy as np
from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, int_input, num_input, str_input


def OPTICS_manual_hyper_parameters() -> Dict:
    """Interactively collect hyper-parameters for ``sklearn.cluster.OPTICS``.

    Prompts the user on the command line for each tunable parameter, only
    asking for parameters that are relevant to the previously chosen options
    (e.g. ``p`` only for the Minkowski metric, ``eps`` only for the "dbscan"
    extraction method, ``xi``/``min_cluster_size`` only for the "xi" method).

    Returns
    -------
    hyper_parameters : dict
        Keyword arguments suitable for constructing an OPTICS estimator.
        Parameters not applicable to the chosen configuration are ``None``.
    """
    print("max_eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
    print("Default value of ``np.inf`` will identify clusters across all scales; reducing ``max_eps`` will result in shorter run times.")
    max_eps = float_input(np.inf, SECTION[2], "max_eps: ")

    print("min_samples: The number of samples in a neighborhood for a point to be considered as a core point")
    print("A good starting value could be int > 1, such as 5.")
    min_samples = int_input(5, SECTION[2], "min_samples: ")

    print("algorithm: Algorithm used to compute the nearest neighbors")
    print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
    algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
    algorithm = str_input(algorithms, SECTION[2])

    print("metric: The metric to use when calculating distance between instances in a feature array.")
    print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'minkowski'.")
    # The valid metric set depends on the chosen neighbor-search algorithm:
    # KDTree supports fewer metrics than BallTree; brute/auto fall through to
    # the generic pairwise-distance metrics.
    if algorithm == "kd_tree":
        metrics = ["euclidean", "l2", "minkowski", "p", "manhattan", "cityblock", "l1", "chebyshev", "infinity"]
    elif algorithm == "ball_tree":
        metrics = [
            "euclidean",
            "l2",
            "minkowski",
            "p",
            "manhattan",
            "cityblock",
            "l1",
            "chebyshev",
            "infinity",
            "seuclidean",
            "mahalanobis",
            "hamming",
            "canberra",
            "braycurtis",
            "jaccard",
            "dice",
            "rogerstanimoto",
            "russellrao",
            "sokalmichener",
            "sokalsneath",
            "haversine",
        ]
    else:
        metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
    metric = str_input(metrics, SECTION[2])

    print("cluster_method: The extraction method used to extract clusters using the calculated reachability and ordering.")
    print("Please specify the method. It is generally recommended to leave it as 'xi'.")
    cluster_methods = ["xi", "dbscan"]
    cluster_method = str_input(cluster_methods, SECTION[2])

    print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
    print("Please specify the leaf size. A good starting range could be between 10 and 30, such as 30.")
    leaf_size = num_input(SECTION[2], "Leaf Size: ")

    # Minkowski power parameter is only meaningful for the minkowski metric.
    p = None
    if metric == "minkowski":
        print("P: The power of the Minkowski metric to be used to calculate distance between points.")
        print("Please specify the power of the Minkowski metric. A good starting range could be between 1 and 2, such as 2.")
        p = num_input(SECTION[2], "P: ")

    # Method-specific parameters: exactly one of the two branches below runs,
    # so the parameters of the other method are left as None.
    eps = None
    xi = None
    predecessor_correction = None
    min_cluster_size = None

    if cluster_method == "dbscan":
        print("Eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
        print("Please specify the maximum distance. A good starting range could be between 0.1 and 1.0, such as 0.5.")
        eps = float_input(0.5, SECTION[2], "Eps: ")
    elif cluster_method == "xi":
        print("xi: minimum steepness on the reachability plot that constitutes a cluster boundary.")
        print("A good starting range would be float between 0 and 1, such as 0.05.")
        xi = float_input(0.05, SECTION[2], "xi: ")

        # predecessor_correction has minimal effect on most datasets, so it is
        # fixed to scikit-learn's default rather than prompted for.
        predecessor_correction = True

        print("min_cluster_size: Minimum number of samples in an OPTICS cluster, expressed as an absolute number or a fraction of the number of samples")
        print("A good starting range would be int > 1 or float between 0 and 1, such as None")
        min_cluster_size = int_input(None, SECTION[2], "min_cluster_size: ")

    hyper_parameters = {
        "min_samples": min_samples,
        "max_eps": max_eps,
        "metric": metric,
        "p": p,
        "cluster_method": cluster_method,
        "eps": eps,
        "xi": xi,
        "predecessor_correction": predecessor_correction,
        "min_cluster_size": min_cluster_size,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
    }
    return hyper_parameters
17 changes: 16 additions & 1 deletion geochemistrypi/data_mining/process/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pandas as pd

from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering
from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering, OPTICSClustering
from ._base import ModelSelectionBase


Expand Down Expand Up @@ -75,6 +75,21 @@ def activate(
n_jobs=hyper_parameters["n_jobs"],
max_iter=hyper_parameters["max_iter"],
)
elif self.model_name == "OPTICS":
hyper_parameters = OPTICSClustering.manual_hyper_parameters()
self.clt_workflow = OPTICSClustering(
min_samples=hyper_parameters["min_samples"],
max_eps=hyper_parameters["max_eps"],
metric=hyper_parameters["metric"],
p=hyper_parameters["p"],
cluster_method=hyper_parameters["cluster_method"],
eps=hyper_parameters["eps"],
xi=hyper_parameters["xi"],
predecessor_correction=hyper_parameters["predecessor_correction"],
min_cluster_size=hyper_parameters["min_cluster_size"],
algorithm=hyper_parameters["algorithm"],
leaf_size=hyper_parameters["leaf_size"],
)
elif self.model_name == "":
pass

Expand Down