Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift"]
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation", "MeanShift", "OPTICS"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
ANOMALYDETECTION_MODELS = ["Isolation Forest", "Local Outlier Factor"]

Expand Down
192 changes: 186 additions & 6 deletions geochemistrypi/data_mining/model/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from numpy.typing import ArrayLike
from rich import print
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift
from sklearn.cluster import DBSCAN, OPTICS, AffinityPropagation, AgglomerativeClustering, KMeans, MeanShift

from ..constants import MLFLOW_ARTIFACT_DATA_PATH, MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH
from ..utils.base import clear_output, save_data, save_fig, save_text
Expand All @@ -20,6 +20,7 @@
from .func.algo_clustering._enum import ClusteringCommonFunction, KMeansSpecialFunction, MeanShiftSpecialFunction
from .func.algo_clustering._kmeans import kmeans_manual_hyper_parameters
from .func.algo_clustering._meanshift import meanshift_manual_hyper_parameters
from .func.algo_clustering._optics import OPTICS_manual_hyper_parameters


class ClusteringWorkflowBase(WorkflowBase):
Expand Down Expand Up @@ -818,6 +819,190 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
)


class OPTICSClustering(ClusteringWorkflowBase):
    """The automated workflow wrapper around scikit-learn's OPTICS clustering estimator.

    OPTICS (Ordering Points To Identify the Clustering Structure) finds
    core samples of high density, and is closely related to DBSCAN but
    supports varying neighborhood radii via a reachability ordering.
    """

    # Identifier used by the model-selection layer to dispatch to this workflow.
    name = "OPTICS"
    # No algorithm-specific post-fit plots/reports are registered yet.
    special_function = []

    def __init__(
        self,
        min_samples: int = 5,
        max_eps: float = np.inf,
        metric: str = "minkowski",
        p: float = 2,
        metric_params: Optional[Dict] = None,
        cluster_method: str = "xi",
        eps: Optional[float] = None,
        xi: float = 0.05,
        predecessor_correction: bool = True,
        min_cluster_size: Optional[int] = None,
        algorithm: str = "auto",
        leaf_size: int = 30,
        memory: Optional[str] = None,
        n_jobs: Optional[int] = None,
    ) -> None:
        """
        Parameters
        ----------
        min_samples : int > 1 or float between 0 and 1, default=5
            The number of samples in a neighborhood for a point to be considered as
            a core point. Also, up and down steep regions can't have more than
            ``min_samples`` consecutive non-steep points. Expressed as an absolute
            number or a fraction of the number of samples (rounded to be at least 2).

        max_eps : float, default=np.inf
            The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. Default value of ``np.inf`` will
            identify clusters across all scales; reducing ``max_eps`` will result
            in shorter run times.

        metric : str or callable, default='minkowski'
            Metric to use for distance computation. Any metric from scikit-learn
            or scipy.spatial.distance can be used.

            If metric is a callable function, it is called on each
            pair of instances (rows) and the resulting value recorded. The callable
            should take two arrays as input and return one value indicating the
            distance between them. This works for Scipy's metrics, but is less
            efficient than passing the metric name as a string. If metric is
            "precomputed", `X` is assumed to be a distance matrix and must be square.

            Valid values for metric are:
            - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
              'manhattan']
            - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
              'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
              'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']

            Sparse matrices are only supported by scikit-learn metrics.
            See the documentation for scipy.spatial.distance for details on these
            metrics.

        p : float, default=2
            Parameter for the Minkowski metric from
            :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
            equivalent to using manhattan_distance (l1), and euclidean_distance
            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        metric_params : dict, default=None
            Additional keyword arguments for the metric function.

        cluster_method : str, default='xi'
            The extraction method used to extract clusters using the calculated
            reachability and ordering. Possible values are "xi" and "dbscan".

        eps : float, default=None
            The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. By default it assumes the same value
            as ``max_eps``.
            Used only when ``cluster_method='dbscan'``.

        xi : float between 0 and 1, default=0.05
            Determines the minimum steepness on the reachability plot that
            constitutes a cluster boundary. For example, an upwards point in the
            reachability plot is defined by the ratio from one point to its
            successor being at most 1-xi.
            Used only when ``cluster_method='xi'``.

        predecessor_correction : bool, default=True
            Correct clusters according to the predecessors calculated by OPTICS.
            This parameter has minimal effect on most datasets.
            Used only when ``cluster_method='xi'``.

        min_cluster_size : int > 1 or float between 0 and 1, default=None
            Minimum number of samples in an OPTICS cluster, expressed as an
            absolute number or a fraction of the number of samples (rounded to be
            at least 2). If ``None``, the value of ``min_samples`` is used instead.
            Used only when ``cluster_method='xi'``.

        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
            Algorithm used to compute the nearest neighbors:

            - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
            - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
            - 'brute' will use a brute-force search.
            - 'auto' (default) will attempt to decide the most appropriate
              algorithm based on the values passed to :meth:`fit` method.

            Note: fitting on sparse input will override the setting of
            this parameter, using brute force.

        leaf_size : int, default=30
            Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
            :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
            construction and query, as well as the memory required to store the
            tree. The optimal value depends on the nature of the problem.

        memory : str or object with the joblib.Memory interface, default=None
            Used to cache the output of the computation of the tree.
            By default, no caching is done. If a string is given, it is the
            path to the caching directory.

        n_jobs : int, default=None
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
            for more details.

        References
        ----------
        Scikit-learn API: sklearn.cluster.OPTICS
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
        """
        super().__init__()
        self.min_samples = min_samples
        self.max_eps = max_eps
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.cluster_method = cluster_method
        self.eps = eps
        self.xi = xi
        self.predecessor_correction = predecessor_correction
        self.min_cluster_size = min_cluster_size
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.memory = memory
        self.n_jobs = n_jobs

        # Underlying scikit-learn estimator, configured with the stored
        # hyper-parameters; fitted later by the shared workflow machinery.
        self.model = OPTICS(
            min_samples=self.min_samples,
            max_eps=self.max_eps,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            cluster_method=self.cluster_method,
            eps=self.eps,
            xi=self.xi,
            predecessor_correction=self.predecessor_correction,
            min_cluster_size=self.min_cluster_size,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            memory=self.memory,
            n_jobs=self.n_jobs,
        )

        self.naming = OPTICSClustering.name

    @classmethod
    def manual_hyper_parameters(cls) -> Dict:
        """Prompt the user for hyper-parameters and return them as a dict."""
        print(f"[bold green]-*-*- {cls.name} - Hyper-parameters Specification -*-*-[/bold green]")
        hyper_parameters = OPTICS_manual_hyper_parameters()
        clear_output()
        return hyper_parameters

    def special_components(self, **kwargs) -> None:
        """Invoke all special application functions for this algorithms by Scikit-learn framework."""
        # No OPTICS-specific diagnostics are implemented yet (special_function is empty).
        pass


class SpectralClustering(ClusteringWorkflowBase):
    # Placeholder workflow: spectral clustering is not implemented yet.
    name = "Spectral"
    pass
Expand All @@ -828,11 +1013,6 @@ class WardHierarchicalClustering(ClusteringWorkflowBase):
pass


class OPTICSClustering(ClusteringWorkflowBase):
    # NOTE(review): superseded placeholder stub — the full OPTICS workflow
    # implementation is defined earlier in this file.
    name = "OPTICS"
    pass


class GaussianMixturesClustering(ClusteringWorkflowBase):
    # Placeholder workflow: Gaussian mixture clustering is not implemented yet.
    name = "GaussianMixtures"
    pass
Expand Down
120 changes: 120 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_clustering/_optics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from typing import Dict

import numpy as np
from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, int_input, num_input, str_input


def OPTICS_manual_hyper_parameters() -> Dict:
    """Interactively collect hyper-parameters for ``sklearn.cluster.OPTICS``.

    Prompts the user on the command line for each tunable parameter, only
    asking for parameters that are relevant to the previously chosen options
    (e.g. ``p`` only for the Minkowski metric, ``eps`` only for the "dbscan"
    extraction method, ``xi``/``min_cluster_size`` only for the "xi" method).

    Returns
    -------
    hyper_parameters : dict
        Keyword arguments suitable for constructing an OPTICS estimator.
        Parameters not applicable to the chosen configuration are ``None``.
    """
    print("max_eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
    print("Default value of ``np.inf`` will identify clusters across all scales; reducing ``max_eps`` will result in shorter run times.")
    max_eps = float_input(np.inf, SECTION[2], "max_eps: ")

    print("min_samples: The number of samples in a neighborhood for a point to be considered as a core point")
    print("A good starting value could be int > 1, such as 5.")
    min_samples = int_input(5, SECTION[2], "min_samples: ")

    print("algorithm: Algorithm used to compute the nearest neighbors")
    print("Please specify the algorithm. It is generally recommended to leave it as 'auto'.")
    algorithms = ["auto", "ball_tree", "kd_tree", "brute"]
    algorithm = str_input(algorithms, SECTION[2])

    print("metric: The metric to use when calculating distance between instances in a feature array.")
    print("Please specify the metric to use when calculating distance between instances in a feature array. It is generally recommended to leave it as 'minkowski'.")
    # The valid metric set depends on the chosen neighbor-search algorithm:
    # KDTree supports fewer metrics than BallTree; brute/auto fall through to
    # the generic pairwise-distance metrics.
    if algorithm == "kd_tree":
        metrics = ["euclidean", "l2", "minkowski", "p", "manhattan", "cityblock", "l1", "chebyshev", "infinity"]
    elif algorithm == "ball_tree":
        metrics = [
            "euclidean",
            "l2",
            "minkowski",
            "p",
            "manhattan",
            "cityblock",
            "l1",
            "chebyshev",
            "infinity",
            "seuclidean",
            "mahalanobis",
            "hamming",
            "canberra",
            "braycurtis",
            "jaccard",
            "dice",
            "rogerstanimoto",
            "russellrao",
            "sokalmichener",
            "sokalsneath",
            "haversine",
        ]
    else:
        metrics = ["euclidean", "manhattan", "chebyshev", "minkowski", "cosine", "correlation"]
    metric = str_input(metrics, SECTION[2])

    print("cluster_method: The extraction method used to extract clusters using the calculated reachability and ordering.")
    print("Please specify the method. It is generally recommended to leave it as 'xi'.")
    cluster_methods = ["xi", "dbscan"]
    cluster_method = str_input(cluster_methods, SECTION[2])

    print("Leaf Size: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree.")
    print("Please specify the leaf size. A good starting range could be between 10 and 30, such as 30.")
    leaf_size = num_input(SECTION[2], "Leaf Size: ")

    # Minkowski power parameter is only meaningful for the minkowski metric.
    p = None
    if metric == "minkowski":
        print("P: The power of the Minkowski metric to be used to calculate distance between points.")
        print("Please specify the power of the Minkowski metric. A good starting range could be between 1 and 2, such as 2.")
        p = num_input(SECTION[2], "P: ")

    # Method-specific parameters: exactly one of the two branches below runs,
    # so the parameters of the other method are left as None.
    eps = None
    xi = None
    predecessor_correction = None
    min_cluster_size = None

    if cluster_method == "dbscan":
        print("Eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.")
        print("Please specify the maximum distance. A good starting range could be between 0.1 and 1.0, such as 0.5.")
        eps = float_input(0.5, SECTION[2], "Eps: ")
    elif cluster_method == "xi":
        print("xi: minimum steepness on the reachability plot that constitutes a cluster boundary.")
        print("A good starting range would be float between 0 and 1, such as 0.05.")
        xi = float_input(0.05, SECTION[2], "xi: ")

        # predecessor_correction has minimal effect on most datasets, so it is
        # fixed to scikit-learn's default rather than prompted for.
        predecessor_correction = True

        print("min_cluster_size: Minimum number of samples in an OPTICS cluster, expressed as an absolute number or a fraction of the number of samples")
        print("A good starting range would be int > 1 or float between 0 and 1, such as None")
        min_cluster_size = int_input(None, SECTION[2], "min_cluster_size: ")

    hyper_parameters = {
        "min_samples": min_samples,
        "max_eps": max_eps,
        "metric": metric,
        "p": p,
        "cluster_method": cluster_method,
        "eps": eps,
        "xi": xi,
        "predecessor_correction": predecessor_correction,
        "min_cluster_size": min_cluster_size,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
    }
    return hyper_parameters
17 changes: 16 additions & 1 deletion geochemistrypi/data_mining/process/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pandas as pd

from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering
from ..model.clustering import AffinityPropagationClustering, Agglomerative, ClusteringWorkflowBase, DBSCANClustering, KMeansClustering, MeanShiftClustering, OPTICSClustering
from ._base import ModelSelectionBase


Expand Down Expand Up @@ -75,6 +75,21 @@ def activate(
n_jobs=hyper_parameters["n_jobs"],
max_iter=hyper_parameters["max_iter"],
)
elif self.model_name == "OPTICS":
hyper_parameters = OPTICSClustering.manual_hyper_parameters()
self.clt_workflow = OPTICSClustering(
min_samples=hyper_parameters["min_samples"],
max_eps=hyper_parameters["max_eps"],
metric=hyper_parameters["metric"],
p=hyper_parameters["p"],
cluster_method=hyper_parameters["cluster_method"],
eps=hyper_parameters["eps"],
xi=hyper_parameters["xi"],
predecessor_correction=hyper_parameters["predecessor_correction"],
min_cluster_size=hyper_parameters["min_cluster_size"],
algorithm=hyper_parameters["algorithm"],
leaf_size=hyper_parameters["leaf_size"],
)
elif self.model_name == "":
pass

Expand Down