-
Notifications
You must be signed in to change notification settings - Fork 4
Documentation (Python)
Leonardo Xavier Kuffo Rivero edited this page Feb 26, 2026
·
1 revision
Below is a minimal example that runs k-means clustering on data read from an `.hdf5` file. In our Python bindings, the `.train()` method decides at runtime whether to use vanilla Super K-Means or Hierarchical Super K-Means, depending on the number of data points. You can override this behavior with the `hierarchical` constructor parameter.
import numpy as np
import sys
import h5py
import math
from superkmeans import SuperKMeans
def read_hdf5_data(hdf5_path):
    """Load the "train" dataset, and "test" when present, from an HDF5 file.

    Both datasets are materialized as float32 NumPy arrays. The second
    element of the returned tuple is None when the file has no "test"
    dataset.
    """
    with h5py.File(hdf5_path, "r") as handle:
        train_vectors = np.array(handle["train"], dtype=np.float32)
        # "test" is optional; callers must be ready for None.
        if "test" in handle:
            test_vectors = np.array(handle["test"], dtype=np.float32)
        else:
            test_vectors = None
    return train_vectors, test_vectors
def main():
    """Cluster the vectors stored in the HDF5 file named on the command line.

    Reads the "train" vectors from ``sys.argv[1]``, trains centroids with
    SuperKMeans and assigns every vector to a cluster.

    Returns:
        1 when the HDF5 path argument is missing; None otherwise.
    """
    if len(sys.argv) < 2:
        print("Error: Missing HDF5 file path")
        return 1
    hdf5_path = sys.argv[1]
    print(f"Loading data from {hdf5_path}...")
    # Only the training vectors are clustered; the optional test split is unused here.
    data, _ = read_hdf5_data(hdf5_path)
    n, d = data.shape
    # A cluster count has to be a whole number, but math.sqrt() returns a
    # float, so truncate it. Also never request more clusters than points
    # (4 * sqrt(n) exceeds n for very small inputs).
    k = min(n, int(math.sqrt(n) * 4))
    print(f"Loaded {n:,} vectors with {d} dimensions")
    print(f"Number of clusters: {k}")
    kmeans = SuperKMeans(
        n_clusters=k,
        dimensionality=d,
    )
    print("Generating centroids...")
    centroids = kmeans.train(data)
    print("Assigning points to clusters...")
    assignments = kmeans.assign(data, centroids)
    print("Done!")
if __name__ == "__main__":
    main()

We provide many parameters for customizing your clustering. You can pass these to the constructor:
kmeans = SuperKMeans(
    n_clusters=k,
    dimensionality=d,
    # Use spherical k-means (default=False)
    angular=False,
    # Number of iterations in the core loop (default=10)
    iters=10,
    # Fraction of points to sample (default=0.3)
    sampling_fraction=0.3,
    # Number of threads to use (default=0, uses max available)
    n_threads=32,
    # Random seed for reproducibility
    seed=42,
    # Whether to enable early termination mechanisms (default=True)
    early_termination=True,
    # Tolerance for WCSS improvement rate and centroids shift before
    # stopping (default=1e-4)
    tol=1e-4,
    # Prints debug logs
    verbose=False,
    # If True, assumes input data is already rotated (default=False)
    data_already_rotated=False,
    # If True, disables SuperKMeans PRUNING phase (default=False)
    use_blas_only=False,
    # If False, forces the use of vanilla k-means (default=None:
    # hierarchical is used depending on data size)
    hierarchical=False,
    # Mesoclustering iterations for hierarchical k-means (default=3)
    iters_mesoclustering=3,
    # Fineclustering iterations for hierarchical k-means (default=5)
    iters_fineclustering=5,
    # Additional vanilla k-means iterations to refine the clusters resulting
    # from hierarchical k-means. In practice, these are very expensive and
    # do not lead to substantial gains (default=0)
    iters_refinement=0,
)