diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 24971881..29cb6e5c 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -61,7 +61,7 @@
     # "Decision Tree",
     # Histogram-based Gradient Boosting,
 ]
-CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift"]
+CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift", "OPTICS"]
 DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
 ANOMALYDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"]
diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py
index ce8d6091..2ebc76f4 100644
--- a/geochemistrypi/data_mining/model/clustering.py
+++ b/geochemistrypi/data_mining/model/clustering.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from numpy.typing import ArrayLike
 from rich import print
-from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift
+from sklearn.cluster import DBSCAN, OPTICS, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift
 
 from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
 from ..utils.base import clear_output, save_data, save_fig, save_text
@@ -20,6 +20,7 @@
 from .func.algo_clustering._enum import ClusteringCommonFunction, KMeansSpecialFunction, MeanShiftSpecialFunction
 from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters
 from .func.algo_clustering._meanshift import meanshift_manual_hyper_parameters
+from .func.algo_clustering._optics import OPTICS_manual_hyper_parameters
 
 
 class ClusteringWorkflowBase(WorkflowBase):
@@ -818,6 +819,190 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
         )
 
 
+class OPTICSClustering(ClusteringWorkflowBase):
+
+    name = "OPTICS"
+    special_function = []  # no OPTICS-specific plotting/metric functions registered yet; special_components() is a no-op
+
+    def __init__(
+        self,
+        min_samples: int = 5,
+        max_eps: float = np.inf,
+        metric: str = "minkowski",
+        p: float = 2,
+        metric_params: Optional[Dict] = None,
+        cluster_method: str = "xi",
+        eps: Optional[float] = None,
+        xi: float = 0.05,
+        predecessor_correction: bool = True,
+        min_cluster_size: Optional[int] = None,
+        algorithm: str = "auto",
+        leaf_size: int = 30,
+        memory: Optional[str] = None,
+        n_jobs: Optional[int] = None,
+    ) -> None:
+
+        """
+        Parameters
+        ----------
+        min_samples : int > 1 or float between 0 and 1, default=5
+            The number of samples in a neighborhood for a point to be considered as
+            a core point. Also, up and down steep regions can't have more than
+            ``min_samples`` consecutive non-steep points. Expressed as an absolute
+            number or a fraction of the number of samples (rounded to be at least
+            2).
+
+        max_eps : float, default=np.inf
+            The maximum distance between two samples for one to be considered as
+            in the neighborhood of the other. Default value of ``np.inf`` will
+            identify clusters across all scales; reducing ``max_eps`` will result
+            in shorter run times.
+
+        metric : str or callable, default='minkowski'
+            Metric to use for distance computation. Any metric from scikit-learn
+            or scipy.spatial.distance can be used.
+
+            If metric is a callable function, it is called on each
+            pair of instances (rows) and the resulting value recorded. The callable
+            should take two arrays as input and return one value indicating the
+            distance between them. This works for Scipy's metrics, but is less
+            efficient than passing the metric name as a string. If metric is
+            "precomputed", `X` is assumed to be a distance matrix and must be
+            square.
+
+            Valid values for metric are:
+            - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
+              'manhattan']
+            - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
+              'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
+              'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
+              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
+              'yule']
+
+            Sparse matrices are only supported by scikit-learn metrics.
+            See the documentation for scipy.spatial.distance for details on these
+            metrics.
+
+        p : float, default=2
+            Parameter for the Minkowski metric from
+            :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
+            equivalent to using manhattan_distance (l1), and euclidean_distance
+            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
+
+        metric_params : dict, default=None
+            Additional keyword arguments for the metric function.
+
+        cluster_method : str, default='xi'
+            The extraction method used to extract clusters using the calculated
+            reachability and ordering. Possible values are "xi" and "dbscan".
+
+        eps : float, default=None
+            The maximum distance between two samples for one to be considered as
+            in the neighborhood of the other. By default it assumes the same value
+            as ``max_eps``.
+            Used only when ``cluster_method='dbscan'``.
+
+        xi : float between 0 and 1, default=0.05
+            Determines the minimum steepness on the reachability plot that
+            constitutes a cluster boundary. For example, an upwards point in the
+            reachability plot is defined by the ratio from one point to its
+            successor being at most 1-xi.
+            Used only when ``cluster_method='xi'``.
+
+        predecessor_correction : bool, default=True
+            Correct clusters according to the predecessors calculated by OPTICS.
+            This parameter has minimal effect on most datasets.
+            Used only when ``cluster_method='xi'``.
+
+        min_cluster_size : int > 1 or float between 0 and 1, default=None
+            Minimum number of samples in an OPTICS cluster, expressed as an
+            absolute number or a fraction of the number of samples (rounded to be
+            at least 2). If ``None``, the value of ``min_samples`` is used instead.
+            Used only when ``cluster_method='xi'``.
+
+        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+            Algorithm used to compute the nearest neighbors:
+
+            - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
+            - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
+            - 'brute' will use a brute-force search.
+            - 'auto' (default) will attempt to decide the most appropriate
+              algorithm based on the values passed to :meth:`fit` method.
+
+            Note: fitting on sparse input will override the setting of
+            this parameter, using brute force.
+
+        leaf_size : int, default=30
+            Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
+            :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
+            construction and query, as well as the memory required to store the
+            tree. The optimal value depends on the nature of the problem.
+
+        memory : str or object with the joblib.Memory interface, default=None
+            Used to cache the output of the computation of the tree.
+            By default, no caching is done. If a string is given, it is the
+            path to the caching directory.
+
+        n_jobs : int, default=None
+            The number of parallel jobs to run for neighbors search.
+            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+            for more details.
+
+        References
+        ----------
+        Scikit-learn API: sklearn.cluster.OPTICS
+        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
+        """
+
+        super().__init__()
+        self.min_samples = min_samples
+        self.max_eps = max_eps
+        self.metric = metric
+        self.p = p
+        self.metric_params = metric_params
+        self.cluster_method = cluster_method
+        self.eps = eps
+        self.xi = xi
+        self.predecessor_correction = predecessor_correction
+        self.min_cluster_size = min_cluster_size
+        self.algorithm = algorithm
+        self.leaf_size = leaf_size
+        self.memory = memory
+        self.n_jobs = n_jobs
+
+        self.model = OPTICS(
+            min_samples=self.min_samples,
+            max_eps=self.max_eps,
+            metric=self.metric,
+            p=self.p,
+            metric_params=self.metric_params,
+            cluster_method=self.cluster_method,
+            eps=self.eps,
+            xi=self.xi,
+            predecessor_correction=self.predecessor_correction,
+            min_cluster_size=self.min_cluster_size,
+            algorithm=self.algorithm,
+            leaf_size=self.leaf_size,
+            memory=self.memory,
+            n_jobs=self.n_jobs,
+        )
+
+        self.naming = OPTICSClustering.name
+
+    @classmethod
+    def manual_hyper_parameters(cls) -> Dict:
+        """Manual hyper-parameters specification."""
+        print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]")
+        hyper_parameters = OPTICS_manual_hyper_parameters()
+        clear_output()
+        return hyper_parameters
+
+    def special_components(self, **kwargs) -> None:
+        """Invoke all special application functions for this algorithm by Scikit-learn framework."""
+        pass
+
+
 class SpectralClustering(ClusteringWorkflowBase):
     name = "Spectral"
     pass
@@ -828,11 +1013,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
     pass
 
 
-class OPTICSClustering(ClusteringWorkflowBase):
-    name = "OPTICS"
-    pass
-
-
 class GaussianMixturesClustering(ClusteringWorkflowBase):
     name = "GaussianMixtures"
     pass
diff --git a/geochemistrypi/data_mining/model/func/algo_clustering/_optics.py b/geochemistrypi/data_mining/model/func/algo_clustering/_optics.py
new file mode 100644
index 00000000..f2b4f92c
--- /dev/null
+++ b/geochemistrypi/data_mining/model/func/algo_clustering/_optics.py
@@ -0,0 +1,120 @@
+from typing import Dict
+
+import numpy as np
+from rich import print
+
+from ....constants import SECTION
+from ....data.data_readiness import float_input, int_input, num_input, str_input
+
+
+def OPTICS_manual_hyper_parameters() -> Dict:
+    """Manually set hyperparameters.
+
+    Returns
+    -------
+    hyper_parameters : dict
+
+    """
+    print("max_eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
+    print("Default value of ``np.inf`` will identify clusters across all scales; reducing ``max_eps`` will result in shorter run times.")
+    max_eps = float_input(np.inf, SECTION[2], "max_eps: ")
+
+    print("min_samples: The number of samples in a neighborhood for a point to be considered as a core point")
+    print("A good starting value could be int > 1, such as 5.")
+    min_samples = int_input(5, SECTION[2], "min_samples: ")
+
+    print("algorithm: Algorithm used to compute the nearest neighbors")
+    print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
+    algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
+    algorithm = str_input(algorithms, SECTION[2])
+
+    print("metric: The metric to use when calculating distance between instances in a feature array.")
+    print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'minkowski'.")
+    # Restrict the offered metrics to those supported by the chosen tree algorithm.
+    if algorithm == "kd_tree":
+        metrics = ["euclidean", "l2", "minkowski", "p", "manhattan", "cityblock", "l1", "chebyshev", "infinity"]
+    elif algorithm == "ball_tree":
+        metrics = [
+            "euclidean",
+            "l2",
+            "minkowski",
+            "p",
+            "manhattan",
+            "cityblock",
+            "l1",
+            "chebyshev",
+            "infinity",
+            "seuclidean",
+            "mahalanobis",
+            "hamming",
+            "canberra",
+            "braycurtis",
+            "jaccard",
+            "dice",
+            "rogerstanimoto",
+            "russellrao",
+            "sokalmichener",
+            "sokalsneath",
+            "haversine",
+        ]
+    else:
+        metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
+    metric = str_input(metrics, SECTION[2])
+
+    print("cluster_method: The extraction method used to extract clusters using the calculated reachability and ordering.")
+    print("Please specify the method. It is generally recommended to leave it as 'xi'.")
+    cluster_methods = ["xi", "dbscan"]
+    cluster_method = str_input(cluster_methods, SECTION[2])
+
+    print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
+    print("Please specify the leaf size. A good starting range could be between 10 and 30, such as 30.")
+    leaf_size = int(num_input(SECTION[2], "Leaf Size: "))  # cast: sklearn validates leaf_size as an integer, num_input may return a float
+
+    p = 2  # sklearn default; only consulted when metric == "minkowski", but must stay a valid number for parameter validation
+    if metric == "minkowski":
+        print("P: The power of the Minkowski metric to be used to calculate distance between points.")
+        print("Please specify the power of the Minkowski metric. A good starting range could be between 1 and 2, such as 2.")
+        p = num_input(SECTION[2], "P: ")
+
+    eps = None  # None is a valid OPTICS default (falls back to max_eps when cluster_method='dbscan')
+    xi = 0.05  # keep a valid default: sklearn validates xi (0 <= xi <= 1) even when cluster_method == "dbscan"
+
+    if cluster_method == "dbscan":
+        print("Eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
+        print("Please specify the maximum distance. A good starting range could be between 0.1 and 1.0, such as 0.5.")
+        eps = float_input(0.5, SECTION[2], "Eps: ")
+
+    predecessor_correction = True  # keep a valid default: sklearn validates this as a bool even when cluster_method == "dbscan"
+    min_cluster_size = None
+
+    if cluster_method == "xi":
+        print("xi: minimum steepness on the reachability plot that constitutes a cluster boundary.")
+        print("A good starting range would be float between 0 and 1, such as 0.05.")
+        xi = float_input(0.05, SECTION[2], "xi: ")
+
+        # predecessor_correction is deliberately not prompted for: it has
+        # minimal effect on most datasets and data_readiness has no bool_input
+        # helper yet, so it stays fixed at the sklearn default (True).
+        # Re-enable a prompt here once a bool_input helper exists.
+        predecessor_correction = True
+
+        print("min_cluster_size: Minimum number of samples in an OPTICS cluster, expressed as an absolute number or a fraction of the number of samples")
+        print("A good starting range would be int > 1 or float between 0 and 1, such as None")
+        min_cluster_size = int_input(None, SECTION[2], "min_cluster_size: ")  # NOTE(review): confirm int_input accepts None as its default value
+
+    # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html
+
+    hyper_parameters = {
+        "min_samples": min_samples,
+        "max_eps": max_eps,
+        "metric": metric,
+        "p": p,
+        "cluster_method": cluster_method,
+        "eps": eps,
+        "xi": xi,
+        "predecessor_correction": predecessor_correction,
+        "min_cluster_size": min_cluster_size,
+        "algorithm": algorithm,
+        "leaf_size": leaf_size,
+    }
+    return hyper_parameters
diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py
index 5c5bdb39..e7e5fc31 100644
--- a/geochemistrypi/data_mining/process/cluster.py
+++ b/geochemistrypi/data_mining/process/cluster.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering
+from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering, OPTICSClustering
 from ._base import ModelSelectionBase
@@ -75,6 +75,21 @@
                 n_jobs=hyper_parameters["n_jobs"],
                 max_iter=hyper_parameters["max_iter"],
             )
+        elif self.model_name == "OPTICS":
+            hyper_parameters = OPTICSClustering.manual_hyper_parameters()
+            self.clt_workflow = OPTICSClustering(
+                min_samples=hyper_parameters["min_samples"],
+                max_eps=hyper_parameters["max_eps"],
+                metric=hyper_parameters["metric"],
+                p=hyper_parameters["p"],
+                cluster_method=hyper_parameters["cluster_method"],
+                eps=hyper_parameters["eps"],
+                xi=hyper_parameters["xi"],
+                predecessor_correction=hyper_parameters["predecessor_correction"],
+                min_cluster_size=hyper_parameters["min_cluster_size"],
+                algorithm=hyper_parameters["algorithm"],
+                leaf_size=hyper_parameters["leaf_size"],
+            )
         elif self.model_name == "":
             pass