
Commit 3d1a0d2

anistark and aubford authored

Knowledge graph/optimize for large corpus (#2267)

contd... #1967

Co-authored-by: Aubrey <[email protected]>

1 parent 0710f21 commit 3d1a0d2

File tree

7 files changed: +1452 -16 lines changed

.gitignore

Lines changed: 7 additions & 1 deletion

@@ -164,6 +164,11 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
+# Cursor
+.cursorignore
+.cursor/*
+!.cursor/rules/
+
 # Ragas specific
 _experiments/
 **/fil-result/
@@ -174,7 +179,8 @@ examples/ragas_examples/_version.py
 .envrc
 uv.lock
 .cache/
-.claude
+.claude/*
+!.claude/commands/
 node_modules
 
 # Ragas examples

src/ragas/testset/graph.py

Lines changed: 186 additions & 5 deletions
@@ -1,4 +1,6 @@
+import hashlib
 import json
+import random
 import typing as t
 import uuid
 from collections import defaultdict
@@ -312,19 +314,21 @@ def get_node_clusters(
 
         # NOTE: the upstream sknetwork Dataset has some issues with type hints,
         # so we use type: ignore to bypass them.
+        # Use hex representation to ensure proper UUID strings for clustering
         graph: SKDataset = from_edge_list(  # type: ignore
-            [(str(rel.source.id), str(rel.target.id)) for rel in relationships],
+            [(rel.source.id.hex, rel.target.id.hex) for rel in relationships],
             directed=True,
         )
 
         # Apply Leiden clustering
         leiden = Leiden(random_state=42)
-        cluster_labels: np.ndarray = leiden.fit_predict(graph.adjacency)
+        cluster_labels: np.ndarray = leiden.fit_predict(graph["adjacency"])
 
         # Group nodes by cluster
         clusters: defaultdict[int, set[uuid.UUID]] = defaultdict(set)
-        for label, node_id in zip(cluster_labels, graph.names):
-            clusters[int(label)].add(uuid.UUID(node_id))
+        for label, node_id_hex in zip(cluster_labels, graph["names"]):
+            # node_id_hex is the hex string representation of the UUID
+            clusters[int(label)].add(uuid.UUID(hex=node_id_hex))
 
         return dict(clusters)
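As an aside, the switch to .hex hands sknetwork's from_edge_list plain 32-character strings instead of hyphenated UUIDs, and uuid.UUID(hex=...) restores the original IDs losslessly. A minimal round-trip sketch (standalone, not part of the commit):

import uuid

node_id = uuid.uuid4()
assert node_id.hex == str(node_id).replace("-", "")  # .hex is the UUID without hyphens
assert uuid.UUID(hex=node_id.hex) == node_id         # lossless round trip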

@@ -440,7 +444,8 @@ def sample_paths_from_graph(
         for _cluster_label, cluster_nodes in tqdm(
             clusters.items(), desc="Processing clusters"
         ):
-            if len(cluster_nodes) < depth_limit:
+            # Skip clusters that are too small to form any meaningful paths (need at least 2 nodes)
+            if len(cluster_nodes) < 2:
                 continue
 
             subgraph = to_nx_digraph(
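Note the relaxed guard: the old check dropped any cluster smaller than depth_limit, even though such clusters can still yield valid paths of two or more nodes (this is also what the updated unit test below exercises). A tiny standalone illustration of the difference, with a hypothetical 3-node cluster:

cluster_nodes = {"A", "B", "C"}  # hypothetical cluster
depth_limit = 4

old_skip = len(cluster_nodes) < depth_limit  # True  -> cluster used to be dropped
new_skip = len(cluster_nodes) < 2            # False -> cluster is now kept
print(old_skip, new_skip)  # True False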
@@ -463,6 +468,182 @@ def sample_paths_from_graph(
 
         return [set(path_nodes) for path_nodes in cluster_sets]
 
+    def find_n_indirect_clusters(
+        self,
+        n: int,
+        relationship_condition: t.Callable[[Relationship], bool] = lambda _: True,
+        depth_limit: int = 3,
+    ) -> t.List[t.Set[Node]]:
+        """
+        Return n indirect clusters of nodes in the knowledge graph based on a relationship condition.
+        Optimized for large datasets by using an adjacency index for lookups and limiting path exploration
+        relative to n.
+
+        A cluster represents a path through the graph. For example, if A -> B -> C -> D exists in the graph,
+        then {A, B, C, D} forms a cluster. If there's also a path A -> B -> C -> E, it forms a separate cluster.
+
+        The method returns a list of up to n sets, where each set contains nodes forming a complete path
+        from a starting node to a leaf node or a path segment up to depth_limit nodes long. The result may contain
+        fewer than n clusters if the graph is very sparse or if there aren't enough nodes to form n distinct clusters.
+
+        To maximize diversity in the results:
+        1. Random starting nodes are selected
+        2. Paths from each starting node are grouped
+        3. Clusters are selected in round-robin fashion from each group until n unique clusters are found
+        4. Duplicate clusters are eliminated
+        5. When a superset cluster is found (e.g., {A,B,C,D}), any existing subset clusters (e.g., {A,B,C})
+           are removed to avoid redundancy
+
+        Parameters
+        ----------
+        n : int
+            Target number of clusters to return. Must be at least 1. Should return n clusters unless the graph is
+            extremely sparse.
+        relationship_condition : Callable[[Relationship], bool], optional
+            A function that takes a Relationship and returns a boolean, by default lambda _: True
+        depth_limit : int, optional
+            Maximum depth for path exploration, by default 3. Must be at least 2 to form clusters by definition.
+
+        Returns
+        -------
+        List[Set[Node]]
+            A list of sets, where each set contains nodes that form a cluster.
+
+        Raises
+        ------
+        ValueError
+            If depth_limit < 2, n < 1, or no relationships match the provided condition.
+        """
+        if depth_limit < 2:
+            raise ValueError("depth_limit must be at least 2 to form valid clusters")
+
+        if n < 1:
+            raise ValueError("n must be at least 1")
+
+        # Filter relationships once upfront
+        filtered_relationships: list[Relationship] = [
+            rel for rel in self.relationships if relationship_condition(rel)
+        ]
+
+        if not filtered_relationships:
+            raise ValueError(
+                "No relationships match the provided condition. Cannot form clusters."
+            )
+
+        # Build an adjacency list for faster neighbor lookup - optimized for large datasets
+        adjacency_list: dict[Node, set[Node]] = {}
+        unique_edges: set[frozenset[Node]] = set()
+        for rel in filtered_relationships:
+            # Lazy initialization, since we only care about nodes with relationships
+            if rel.source not in adjacency_list:
+                adjacency_list[rel.source] = set()
+            adjacency_list[rel.source].add(rel.target)
+            unique_edges.add(frozenset({rel.source, rel.target}))
+            if rel.bidirectional:
+                if rel.target not in adjacency_list:
+                    adjacency_list[rel.target] = set()
+                adjacency_list[rel.target].add(rel.source)
+
+        # Aggregate clusters for each start node
+        start_node_clusters: dict[Node, set[frozenset[Node]]] = {}
+        # Sample enough starting nodes to handle the worst-case grouping scenario, where nodes are grouped
+        # in independent clusters of size equal to depth_limit. This only surfaces when there are fewer
+        # unique edges than nodes.
+        connected_nodes: set[Node] = set().union(*unique_edges)
+        sample_size: int = (
+            (n - 1) * depth_limit + 1
+            if len(unique_edges) < len(connected_nodes)
+            else max(n, depth_limit, 10)
+        )
+
+        def dfs(node: Node, start_node: Node, current_path: t.Set[Node]):
+            # Terminate exploration once enough usable clusters exist so complexity doesn't spiral
+            if len(start_node_clusters.get(start_node, [])) > sample_size:
+                return
+
+            current_path.add(node)
+            path_length = len(current_path)
+            at_max_depth = path_length >= depth_limit
+            neighbors = adjacency_list.get(node, None)
+
+            # If this is a leaf node or we've reached the depth limit,
+            # and we have a valid path of at least 2 nodes, add it as a cluster
+            if path_length > 1 and (
+                at_max_depth
+                or not neighbors
+                or all(n in current_path for n in neighbors)
+            ):
+                # Lazy initialization of the set for this start node
+                if start_node not in start_node_clusters:
+                    start_node_clusters[start_node] = set()
+                start_node_clusters[start_node].add(frozenset(current_path))
+            elif neighbors:
+                for neighbor in neighbors:
+                    # Block cycles
+                    if neighbor not in current_path:
+                        dfs(neighbor, start_node, current_path)
+
+            # Backtrack by removing the current node from the path
+            current_path.remove(node)
+
+        # Shuffle nodes for random starting points.
+        # Use the adjacency list, since it has filtered out isolated nodes.
+        # Sort by node ID for consistent ordering while maintaining algorithm effectiveness.
+        start_nodes = sorted(adjacency_list.keys(), key=lambda n: n.id.hex)
+        # Use a hash-based seed for reproducible but varied shuffling based on the nodes themselves
+        node_ids_str = "".join(n.id.hex for n in start_nodes)
+        node_hash = hashlib.sha256(node_ids_str.encode("utf-8")).hexdigest()
+        rng = random.Random(int(node_hash[:8], 16))  # Use the first 8 hex chars as the seed
+        rng.shuffle(start_nodes)
+        samples: list[Node] = start_nodes[:sample_size]
+        for start_node in samples:
+            dfs(start_node, start_node, set())
+
+        start_node_clusters_list: list[set[frozenset[Node]]] = list(
+            start_node_clusters.values()
+        )
+
+        # Iteratively pop from each start node's cluster set until we have n unique clusters.
+        # Avoid adding duplicates and subset/superset pairs so we have diversity; we
+        # favor supersets over subsets when given a choice.
+        unique_clusters: set[frozenset[Node]] = set()
+        i = 0
+        while len(unique_clusters) < n and start_node_clusters_list:
+            # Cycle through the start node clusters
+            current_index = i % len(start_node_clusters_list)
+
+            current_start_node_clusters: set[frozenset[Node]] = (
+                start_node_clusters_list[current_index]
+            )
+            cluster: frozenset[Node] = current_start_node_clusters.pop()
+
+            # Check whether the new cluster is a subset of any existing cluster,
+            # and collect any existing clusters that are subsets of this cluster
+            is_subset = False
+            subsets_to_remove: set[frozenset[Node]] = set()
+
+            for existing in unique_clusters:
+                if cluster.issubset(existing):
+                    is_subset = True
+                    break
+                elif cluster.issuperset(existing):
+                    subsets_to_remove.add(existing)
+
+            # Only add the new cluster if it's not a subset of any existing cluster
+            if not is_subset:
+                # Remove any subsets of the new cluster
+                unique_clusters -= subsets_to_remove
+                unique_clusters.add(cluster)
+
+            # If this set is now empty, remove it
+            if not current_start_node_clusters:
+                start_node_clusters_list.pop(current_index)
+                # Don't increment i, since we removed an element, to account for the shift
+            else:
+                i += 1
+
+        return [set(cluster) for cluster in unique_clusters]
+
     def remove_node(
         self, node: Node, inplace: bool = True
     ) -> t.Optional["KnowledgeGraph"]:
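To make the new API concrete, here is a minimal usage sketch. The node properties, the "next" relationship type, and the constructor calls are illustrative assumptions, not part of the commit:

from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship

# Hypothetical chain A -> B -> C -> D
nodes = [
    Node(type=NodeType.CHUNK, properties={"page_content": name})
    for name in "ABCD"
]
kg = KnowledgeGraph(nodes=nodes)
for src, dst in zip(nodes, nodes[1:]):
    kg.relationships.append(
        Relationship(source=src, target=dst, type="next", properties={})
    )

# Ask for up to 2 clusters drawn from paths of at most 3 nodes
clusters = kg.find_n_indirect_clusters(n=2, depth_limit=3)
print([{node.properties["page_content"] for node in c} for c in clusters])

For this sparse chain (3 unique edges across 4 connected nodes), the sampling formula takes its worst-case branch: sample_size = (2 - 1) * 3 + 1 = 4, capped here by the 3 available start nodes.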

src/ragas/testset/synthesizers/multi_hop/abstract.py

Lines changed: 20 additions & 9 deletions
@@ -8,7 +8,6 @@
 
 from ragas.prompt import PydanticPrompt
 from ragas.testset.graph import KnowledgeGraph, Node
-from ragas.testset.graph_queries import get_child_nodes
 from ragas.testset.persona import Persona
 from ragas.testset.synthesizers.multi_hop.base import (
     MultiHopQuerySynthesizer,
@@ -39,11 +38,17 @@ class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer):
     concept_combination_prompt: PydanticPrompt = ConceptCombinationPrompt()
     theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
 
-    def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]:
-        """Identify clusters of nodes based on the specified relationship condition."""
-        node_clusters = knowledge_graph.find_indirect_clusters(
-            relationship_condition=lambda rel: bool(
-                rel.get_property(self.relation_property)
+    def get_node_clusters(
+        self,
+        knowledge_graph: KnowledgeGraph,
+        n: int = 1,
+    ) -> t.List[t.Set[Node]]:
+        """Find n indirect clusters of nodes based on a relationship condition."""
+
+        node_clusters = knowledge_graph.find_n_indirect_clusters(
+            n,
+            relationship_condition=lambda rel: (
+                True if rel.get_property(self.relation_property) else False
             ),
             depth_limit=3,
         )
@@ -61,7 +66,7 @@ async def _generate_scenarios(
         Generate a list of scenarios of type MultiHopScenario.
 
         Steps to generate scenarios:
-        1. Find indirect clusters of nodes based on relationship condition
+        1. Find n indirect clusters of nodes based on relationship condition
         2. Calculate the number of samples that should be created per cluster to get n samples in total
         3. For each cluster of nodes
             a. Find the child nodes of the cluster nodes
@@ -70,7 +75,7 @@ async def _generate_scenarios(
         4. Sample diverse combinations of scenarios to get n samples
         """
 
-        node_clusters = self.get_node_clusters(knowledge_graph)
+        node_clusters = self.get_node_clusters(knowledge_graph, n)
         scenarios = []
 
         if len(node_clusters) == 0:
@@ -79,12 +84,18 @@ async def _generate_scenarios(
         )
         num_sample_per_cluster = int(np.ceil(n / len(node_clusters)))
 
+        child_relationships = [
+            rel for rel in knowledge_graph.relationships if rel.type == "child"
+        ]
+
         for cluster in node_clusters:
             if len(scenarios) >= n:
                 break
             nodes = []
             for node in cluster:
-                child_nodes = get_child_nodes(node, knowledge_graph, level=1)
+                child_nodes = [
+                    rel.target for rel in child_relationships if rel.source == node
+                ]
                 if child_nodes:
                     nodes.extend(child_nodes)
                 else:
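The replacement of per-node get_child_nodes calls with one precomputed child_relationships list is a filter-once optimization: the full relationship list is scanned a single time instead of once per node per cluster. A standalone sketch of the same pattern, plus an optional extra step (not in the commit) that indexes children by source for O(1) lookups; the Rel class and sample data are hypothetical:

from collections import defaultdict
from dataclasses import dataclass

@dataclass(frozen=True)
class Rel:  # stand-in for ragas' Relationship, illustration only
    source: str
    target: str
    type: str

rels = [
    Rel("doc1", "chunk1", "child"),
    Rel("doc1", "chunk2", "child"),
    Rel("doc2", "chunk3", "child"),
]

# What the commit does: filter the "child" relationships once, outside the loop
child_relationships = [r for r in rels if r.type == "child"]

# Going one step further: index by source so each per-node lookup is O(1)
children_by_source = defaultdict(list)
for r in child_relationships:
    children_by_source[r.source].append(r.target)

print(children_by_source["doc1"])  # ['chunk1', 'chunk2']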

tests/unit/test_graph.py

Lines changed: 10 additions & 1 deletion
@@ -173,7 +173,16 @@ def simple_graph(self):
         ),
         (
             4,
-            [],  # depth_limit=4 > max(cluster_size), so no paths are identified
+            [
+                ("A", "C"),
+                ("E", "F", "G"),
+                ("B", "C"),
+                ("A", "B"),
+                ("F", "G"),
+                ("A", "B", "C"),
+                ("E", "F"),
+                ("E", "G"),
+            ],
         ),
     ],
 )
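Since the clustering methods return results in no guaranteed order, comparisons against expected tuples like these presumably normalize both sides first; a minimal standalone sketch of that idea (helper name hypothetical):

def normalize(clusters):
    # Order-insensitive form: a set of frozensets of node names
    return {frozenset(c) for c in clusters}

expected = [("A", "C"), ("E", "F", "G"), ("B", "C")]
actual = [{"C", "A"}, {"G", "F", "E"}, {"C", "B"}]
assert normalize(expected) == normalize(actual)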
