Skip to content

Commit 3237d50

Browse files
authored
feature!: knowledge graph flexibility (#2030)
The current implementations of the synthesizers (in particular, `MultiHopAbstractQuerySynthesizer` and `MultiHopSpecificQuerySynthesizer`) have hard-coded values for the node- and relationship- properties/types they look for when generating scenarios and samples. This works when using the default knowledge graph transforms, but may cause unexpected behavior (or force users to use the expected hard-coded values when they do not actually correspond to the reality of property lineage) when using customized knowledge graph transformations. This PR allows flexibility in defining the node-property, relationship-property, and relationship-type expected by the *QuerySynthesizers, while retaining the current behavior as default. The bulk of this PR is nonbreaking, but there may be a breaking typo fix in the `SingleHopSpecificQuerySynthesizer` default name. - feat: allow user-defined node- and relationship- properties in MultiHopAbstractQuerySynthesizer - Users should be able to set the relationship property to use for identifying clusters and the node property used for identifying abstract concepts. - This will not change default behavior, but allows users to override. - feat: allow user-defined node- and relationship- properties in MultiHopSpecificQuerySynthesizer - Users should be able to set the relationship type to use for identifying clusters and the relationship property used for identifying overlapping concepts within the triple. - This will not change default behavior, but allows users to override. - fix!: typo in SingleHopSpecificQuerySynthesizer name (`single_hop_specifc_query_synthesizer` -> `single_hop_specific_query_synthesizer`) - docs: minor docstring updates to SingleHopSpecificQuerySynthesizer
1 parent 4b9d826 commit 3237d50

File tree

4 files changed

+38
-31
lines changed

4 files changed

+38
-31
lines changed

src/ragas/testset/synthesizers/multi_hop/abstract.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,21 +31,19 @@
3131

3232
@dataclass
3333
class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer):
34-
"""
35-
Synthesizes abstract multi-hop queries from given knowledge graph.
36-
37-
Attributes
38-
----------
39-
"""
34+
"""Synthesize abstract multi-hop queries from given knowledge graph."""
4035

4136
name: str = "multi_hop_abstract_query_synthesizer"
37+
relation_property: str = "summary_similarity"
38+
abstract_property_name: str = "themes"
4239
concept_combination_prompt: PydanticPrompt = ConceptCombinationPrompt()
4340
theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
4441

4542
def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]:
43+
"""Identify clusters of nodes based on the specified relationship condition."""
4644
node_clusters = knowledge_graph.find_indirect_clusters(
47-
relationship_condition=lambda rel: (
48-
True if rel.get_property("summary_similarity") else False
45+
relationship_condition=lambda rel: bool(
46+
rel.get_property(self.relation_property)
4947
),
5048
depth_limit=3,
5149
)
@@ -60,7 +58,8 @@ async def _generate_scenarios(
6058
callbacks: Callbacks,
6159
) -> t.List[MultiHopScenario]:
6260
"""
63-
Generates a list of scenarios on type MultiHopAbstractQuerySynthesizer
61+
Generate a list of scenarios of type MultiHopScenario.
62+
6463
Steps to generate scenarios:
6564
1. Find indirect clusters of nodes based on relationship condition
6665
2. Calculate the number of samples that should be created per cluster to get n samples in total
@@ -92,7 +91,9 @@ async def _generate_scenarios(
9291
nodes.append(node)
9392

9493
base_scenarios = []
95-
node_themes = [node.properties.get("themes", []) for node in nodes]
94+
node_themes = [
95+
node.properties.get(self.abstract_property_name, []) for node in nodes
96+
]
9697
prompt_input = ConceptsList(
9798
lists_of_concepts=node_themes, max_combinations=num_sample_per_cluster
9899
)
@@ -116,7 +117,7 @@ async def _generate_scenarios(
116117
concept_combination.combinations,
117118
personas=persona_list,
118119
persona_item_mapping=persona_concepts.mapping,
119-
property_name="themes",
120+
property_name=self.abstract_property_name,
120121
)
121122
base_scenarios = self.sample_diverse_combinations(
122123
base_scenarios, num_sample_per_cluster

src/ragas/testset/synthesizers/multi_hop/specific.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import logging
44
import typing as t
5+
from collections.abc import Iterable
56
from dataclasses import dataclass
67

78
import numpy as np
@@ -27,27 +28,19 @@
2728

2829
@dataclass
2930
class MultiHopSpecificQuerySynthesizer(MultiHopQuerySynthesizer):
30-
"""
31-
Synthesizes overlap based queries by choosing specific chunks and generating a
32-
keyphrase from them and then generating queries based on that.
33-
34-
Attributes
35-
----------
36-
generate_query_prompt : PydanticPrompt
37-
The prompt used for generating the query.
38-
"""
31+
"""Synthesize multi-hop queries based on a chunk cluster defined by entity overlap."""
3932

4033
name: str = "multi_hop_specific_query_synthesizer"
41-
relation_type: str = "entities_overlap"
4234
property_name: str = "entities"
35+
relation_type: str = "entities_overlap"
36+
relation_overlap_property: str = "overlapped_items"
4337
theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
4438
generate_query_reference_prompt: PydanticPrompt = QueryAnswerGenerationPrompt()
4539

4640
def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Tuple]:
41+
"""Identify clusters of nodes based on the specified relationship condition."""
4742
node_clusters = knowledge_graph.find_two_nodes_single_rel(
48-
relationship_condition=lambda rel: (
49-
True if rel.type == self.relation_type else False
50-
)
43+
relationship_condition=lambda rel: rel.type == self.relation_type
5144
)
5245
logger.info("found %d clusters", len(node_clusters))
5346
return node_clusters
@@ -60,7 +53,8 @@ async def _generate_scenarios(
6053
callbacks: Callbacks,
6154
) -> t.List[MultiHopScenario]:
6255
"""
63-
Generates a list of scenarios on type MultiHopSpecificQuerySynthesizer
56+
Generate a list of scenarios of type MultiHopScenario.
57+
6458
Steps to generate scenarios:
6559
1. Filter the knowledge graph to find cluster of nodes or defined relation type. Here entities_overlap
6660
2. Calculate the number of samples that should be created per cluster to get n samples in total
@@ -86,9 +80,18 @@ async def _generate_scenarios(
8680
if len(scenarios) < n:
8781
node_a, node_b = triplet[0], triplet[-1]
8882
overlapped_items = []
89-
overlapped_items = triplet[1].properties["overlapped_items"]
83+
overlapped_items = triplet[1].properties[self.relation_overlap_property]
9084
if overlapped_items:
91-
themes = list(dict(overlapped_items).keys())
85+
if not all(
86+
isinstance(item, (str, Iterable)) for item in overlapped_items
87+
):
88+
logger.debug("Overlapped items are not strings or iterables.")
89+
continue
90+
themes = (
91+
list(overlapped_items.keys())
92+
if isinstance(overlapped_items, dict)
93+
else overlapped_items
94+
)
9295
prompt_input = ThemesPersonasInput(
9396
themes=themes, personas=persona_list
9497
)
@@ -97,10 +100,13 @@ async def _generate_scenarios(
97100
data=prompt_input, llm=self.llm, callbacks=callbacks
98101
)
99102
)
100-
overlapped_items = [list(item) for item in overlapped_items]
103+
combinations = [
104+
[item] if isinstance(item, str) else list(item)
105+
for item in themes
106+
]
101107
base_scenarios = self.prepare_combinations(
102108
[node_a, node_b],
103-
overlapped_items,
109+
combinations,
104110
personas=persona_list,
105111
persona_item_mapping=persona_concepts.mapping,
106112
property_name=self.property_name,

src/ragas/testset/synthesizers/single_hop/specific.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class SingleHopScenario(BaseScenario):
3939

4040
@dataclass
4141
class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer):
42-
name: str = "single_hop_specifc_query_synthesizer"
42+
name: str = "single_hop_specific_query_synthesizer"
4343
theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
4444
property_name: str = "entities"
4545

tests/unit/test_analytics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def test_testset_generation_tracking(monkeypatch):
136136
)
137137

138138
assert testset_event_payload.model_dump()["evolution_names"] == [
139-
"single_hop_specifc_query_synthesizer",
139+
"single_hop_specific_query_synthesizer",
140140
"multi_hop_abstract_query_synthesizer",
141141
"multi_hop_specific_query_synthesizer",
142142
]

0 commit comments

Comments
 (0)