@@ -346,26 +346,48 @@ def fit(self, texts: list[str]) -> tuple[list[int], dict]:
346346 - List of cluster assignments for each text
347347 - Dictionary of cluster parameters
348348 """
349- logger .info (
350- "Processing %s texts with %s..." , len (texts ), self .__class__ .__name__
351- )
349+ logger .info ("Start processing %d texts ..." , len (texts ))
352350
353351 # Reset state for a fresh run
354352 self .clusters = []
355353 self .cluster_params = {}
356354
357- # Process texts in batches for better progress reporting
358- batch_size = 100
359- total_batches = (len (texts ) - 1 ) // batch_size + 1
360- for i in range (0 , len (texts ), batch_size ):
361- batch = texts [i : i + batch_size ]
362- batch_num = i // batch_size + 1
363- for text in tqdm (
364- batch ,
365- desc = f"Clustering batch { batch_num } /{ total_batches } " ,
366- total = len (batch ),
367- ):
368- self .assign_cluster (text )
def format_process(class_name: str) -> str:
    """
    Build the progress-bar description for a clustering run.

    Turns a CamelCase class name into space-separated words and prefixes
    it with the current local time and a literal ``INFO`` label, so the
    progress bar visually lines up with log output. The result is a plain
    string; nothing is sent through the logging system here.

    Word boundaries are placed between a non-uppercase character and an
    uppercase one, and before the last capital of an uppercase run that is
    followed by a lowercase letter. Acronyms therefore stay intact:
    ``"DPMeans"`` becomes ``"DP Means"`` rather than ``"D P Means"``.
    ``"PitmanYorProcess"`` is special-cased to ``"Pitman-Yor Process"``.

    Args:
        class_name: The name of the class to format.

    Returns:
        A string of the form
        ``"YYYY-MM-DD HH:MM:SS - INFO - Clustering with {formatted_name}"``.
    """
    import re
    from datetime import datetime

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if class_name == "PitmanYorProcess":
        # Hyphenated proper name; cannot be derived from CamelCase alone.
        formatted_name = "Pitman-Yor Process"
    else:
        # First alternative: boundary after any non-capital (never matches at
        # position 0, where there is no preceding character).
        # Second alternative: boundary before the final capital of an
        # acronym when a lowercase letter follows ("HTTPServer" -> "HTTP Server").
        formatted_name = re.sub(
            r"(?<=[^A-Z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])",
            " ",
            class_name,
        )

    return f"{timestamp} - INFO - Clustering with {formatted_name}"
381+
382+ # Process all texts with a single progress bar
383+ for text in tqdm (
384+ texts ,
385+ desc = format_process (self .__class__ .__name__ ),
386+ total = len (texts ),
387+ disable = None , # Disable on non-TTY
388+ unit = " texts" ,
389+ ):
390+ self .assign_cluster (text )
369391
370392 return self .clusters , self .cluster_params
371393
0 commit comments