@@ -242,7 +242,7 @@ def optimize_structure(
242242 except Exception as e :
243243 logger .warning (
244244 f"[Reorganize] Cluster processing "
245- f"failed: { e } , trace: { traceback .format_exc ()} "
245+ f"failed: { e } , cluster_nodes: { cluster_nodes } , trace: { traceback .format_exc ()} "
246246 )
247247 logger .info ("[GraphStructure Reorganize] Structure optimization finished." )
248248
@@ -333,7 +333,9 @@ def _process_cluster_and_write(
333333
334334 logger .info ("[Reorganizer] Cluster relation/reasoning done." )
335335
336- def _local_subcluster (self , cluster_nodes : list [GraphDBNode ]) -> list [list [GraphDBNode ]]:
336+ def _local_subcluster (
337+ self , cluster_nodes : list [GraphDBNode ], max_length : int = 8000
338+ ) -> (list )[list [GraphDBNode ]]:
337339 """
338340 Use LLM to split a large cluster into semantically coherent sub-clusters.
339341 """
@@ -347,7 +349,9 @@ def _local_subcluster(self, cluster_nodes: list[GraphDBNode]) -> list[list[Graph
347349 scene_lines .append (line )
348350
349351 joined_scene = "\n " .join (scene_lines )
350- prompt = LOCAL_SUBCLUSTER_PROMPT .replace ("{joined_scene}" , joined_scene )
352+ if len (joined_scene ) > max_length :
353+ logger .warning (f"Sub-cluster too long: { joined_scene } " )
354+ prompt = LOCAL_SUBCLUSTER_PROMPT .replace ("{joined_scene}" , joined_scene [:max_length ])
351355
352356 messages = [{"role" : "user" , "content" : prompt }]
353357 response_text = self .llm .generate (messages )
@@ -394,41 +398,73 @@ def _partition(self, nodes, min_cluster_size: int = 10, max_cluster_size: int =
394398 )
395399 return [nodes ]
396400
def recursive_clustering(nodes_list, depth=0):
    """Recursively split ``nodes_list`` until every cluster holds at most
    ``max_cluster_size`` nodes.

    ``min_cluster_size`` / ``max_cluster_size`` are closed over from the
    enclosing ``_partition`` scope. Nodes without an embedding cannot be
    clustered; they are attached to the largest sibling cluster so no node
    is dropped.

    Returns a list of clusters, each a list of nodes.
    """
    indent = "  " * depth
    logger.info(
        f"{indent}[Recursive] Start clustering {len(nodes_list)} nodes at depth {depth}"
    )

    if len(nodes_list) <= max_cluster_size:
        logger.info(
            f"{indent}[Recursive] Node count <= {max_cluster_size}, stop splitting."
        )
        return [nodes_list]

    # Only nodes that actually carry an embedding can be fed to kmeans.
    x_nodes = [n for n in nodes_list if n.metadata.embedding]
    x = np.array([n.metadata.embedding for n in x_nodes])

    if len(x) < min_cluster_size:
        logger.info(
            f"{indent}[Recursive] Too few embeddings ({len(x)}), skipping clustering."
        )
        return [nodes_list]

    # k = ceil(len(nodes) / max_cluster_size), capped by the number of points.
    k = min(len(x), (len(nodes_list) + max_cluster_size - 1) // max_cluster_size)

    # BUGFIX: with k == 1 every node lands in a single label, so the recursive
    # call would receive the identical input and never terminate. Stop here.
    if k < 2:
        logger.info(f"{indent}[Recursive] k={k} cannot split further, stop.")
        return [nodes_list]

    try:
        logger.info(f"{indent}[Recursive] Clustering with k={k} on {len(x)} points.")
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=256, random_state=42)
        labels = kmeans.fit_predict(x)

        label_groups = defaultdict(list)
        for node, label in zip(x_nodes, labels, strict=False):
            label_groups[label].append(node)

        # Nodes excluded from kmeans (no embedding) join the largest cluster
        # so they are not silently dropped from the result.
        no_embedding_nodes = [n for n in nodes_list if not n.metadata.embedding]
        if no_embedding_nodes:
            logger.warning(
                f"{indent}[Recursive] {len(no_embedding_nodes)} nodes have no embedding. Added to largest cluster."
            )
            largest_label = max(label_groups.items(), key=lambda kv: len(kv[1]))[0]
            label_groups[largest_label].extend(no_embedding_nodes)

        # BUGFIX: kmeans may collapse everything into one label even when
        # k >= 2; recursing on an unchanged cluster would never terminate.
        if len(label_groups) == 1:
            logger.info(f"{indent}[Recursive] Split produced a single cluster, stop.")
            return [nodes_list]

        result = []
        for label, sub_group in label_groups.items():
            logger.info(f"{indent}Cluster-{label}: {len(sub_group)} nodes")
            result.extend(recursive_clustering(sub_group, depth=depth + 1))
        return result

    except Exception as e:
        # Best-effort: any clustering failure degrades to "keep as one cluster".
        logger.warning(
            f"{indent}[Recursive] Clustering failed: {e}, fallback to one cluster."
        )
        return [nodes_list]
425456
426457 raw_clusters = recursive_clustering (nodes )
427458 filtered_clusters = [c for c in raw_clusters if len (c ) > min_cluster_size ]
459+
460+ logger .info (f"[KMeansPartition] Total clusters before filtering: { len (raw_clusters )} " )
461+ for i , cluster in enumerate (raw_clusters ):
462+ logger .info (f"[KMeansPartition] Cluster-{ i } : { len (cluster )} nodes" )
463+
428464 logger .info (
429- f"[KMeansPartition] Total clusters created: { len (raw_clusters )} , "
430- f"kept { len (filtered_clusters )} (>{ min_cluster_size } )."
465+ f"[KMeansPartition] Clusters after filtering (>{ min_cluster_size } ): { len (filtered_clusters )} "
431466 )
467+
432468 return filtered_clusters
433469
434470 def _summarize_cluster (self , cluster_nodes : list [GraphDBNode ], scope : str ) -> GraphDBNode :
0 commit comments