Skip to content
This repository was archived by the owner on Jan 8, 2026. It is now read-only.

Commit f450555

Browse files
committed
Redesign progress bar on clustering
1 parent 42efbeb commit f450555

File tree

2 files changed

+38
-15
lines changed

2 files changed

+38
-15
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Improvements
4646
^^^^^^^^^^^^
4747

4848
* Select appropriate colormaps based on visualization best practices for clustering.
49+
* Redesign progress bar on clustering to be more informative and less noisy.
4950

5051
Improved Documentation
5152
^^^^^^^^^^^^^^^^^^^^^^

clusx/clustering/models.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -346,26 +346,48 @@ def fit(self, texts: list[str]) -> tuple[list[int], dict]:
346346
- List of cluster assignments for each text
347347
- Dictionary of cluster parameters
348348
"""
349-
logger.info(
350-
"Processing %s texts with %s...", len(texts), self.__class__.__name__
351-
)
349+
logger.info("Start processing %d texts ...", len(texts))
352350

353351
# Reset state for a fresh run
354352
self.clusters = []
355353
self.cluster_params = {}
356354

357-
# Process texts in batches for better progress reporting
358-
batch_size = 100
359-
total_batches = (len(texts) - 1) // batch_size + 1
360-
for i in range(0, len(texts), batch_size):
361-
batch = texts[i : i + batch_size]
362-
batch_num = i // batch_size + 1
363-
for text in tqdm(
364-
batch,
365-
desc=f"Clustering batch {batch_num}/{total_batches}",
366-
total=len(batch),
367-
):
368-
self.assign_cluster(text)
355+
def format_process(class_name: str) -> str:
356+
"""
357+
Format a class name into a human-readable progress description.
358+
359+
Converts CamelCase class names to space-separated words and handles
360+
special cases, e.g. mapping 'PitmanYorProcess' to 'Pitman-Yor Process'.
361+
362+
Args:
363+
class_name (str): The name of the class to format
364+
365+
Returns:
366+
str: Formatted string with timestamp and readable class name
367+
in the format:
368+
"YYYY-MM-DD HH:MM:SS - INFO - Clustering with {formatted_name}"
369+
"""
370+
import re
371+
from datetime import datetime
372+
373+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
374+
formatted_name = (
375+
"Pitman-Yor Process" # Special case for Pitman-Yor
376+
if class_name == "PitmanYorProcess"
377+
else re.sub(r"(?<!^)(?=[A-Z])", " ", class_name)
378+
)
379+
380+
return f"{timestamp} - INFO - Clustering with {formatted_name}"
381+
382+
# Process all texts with a single progress bar
383+
for text in tqdm(
384+
texts,
385+
desc=format_process(self.__class__.__name__),
386+
total=len(texts),
387+
disable=None, # Disable on non-TTY
388+
unit=" texts",
389+
):
390+
self.assign_cluster(text)
369391

370392
return self.clusters, self.cluster_params
371393

0 commit comments

Comments
 (0)