fix: handle tuple-formatted entities in SingleHopSpecificQuerySynthesizer (#2377)

anistark · web-flow · commit b113f5249dff · 2025-10-27T14:13:27.000+05:30
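## Issue Link / Problem Description  - Fixes #2368: tuple-formatted entities caused a ValidationError in `SingleHopSpecificQuerySynthesizer` ## Changes Made  - Add helper method `_extract_themes_from_items` that flattens entity data (tuples, lists, strings, or dict keys) into a list of unique theme strings - Skip nodes from which no themes can be extracted ## Testing  ### How to Test - [x] Automated tests added/updated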
diff --git a/src/ragas/testset/synthesizers/single_hop/specific.py b/src/ragas/testset/synthesizers/single_hop/specific.py
@@ -43,6 +43,44 @@ class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer):
     theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt()
     property_name: str = "entities"
 
+    def _extract_themes_from_items(self, items: t.Any) -> t.List[str]:
+        """
+        Extract unique theme names from various formats.
+
+        Handles multiple data formats that might appear during synthesis:
+        - List[Tuple[str, str]]: Entity pairs (from overlap detection)
+        - List[List[str]]: Entity pairs as lists
+        - List[str]: Direct entity names
+        - Dict[str, Any]: Keys as entity names
+
+        Parameters
+        ----------
+        items : t.Any
+            The items to extract themes from.
+
+        Returns
+        -------
+        t.List[str]
+            Unique theme strings (order not guaranteed); empty for unsupported input.
+        """
+        if isinstance(items, dict):
+            return list(items.keys())
+
+        if not isinstance(items, list):
+            return []
+
+        unique_themes = set()
+        for item in items:
+            if isinstance(item, (tuple, list)):
+                # Extract strings from pairs/sequences
+                for element in item:
+                    if isinstance(element, str):
+                        unique_themes.add(element)
+            elif isinstance(item, str):
+                unique_themes.add(item)
+
+        return list(unique_themes)
+
     def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]:
         node_type_dict = defaultdict(int)
         for node in knowledge_graph.nodes:
@@ -101,7 +139,14 @@ async def _generate_scenarios(
         for node in nodes:
             if len(scenarios) >= n:
                 break
-            themes = node.properties.get(self.property_name, [""])
+            raw_themes = node.properties.get(self.property_name, [])
+            # Extract themes from potentially mixed data types (handles tuples, lists, strings)
+            themes = self._extract_themes_from_items(raw_themes)
+
+            if not themes:  # Skip if no themes extracted
+                logger.debug("No themes extracted from node %s. Skipping.", node.id)
+                continue
+
             prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list)
             persona_concepts = await self.theme_persona_matching_prompt.generate(
                 data=prompt_input, llm=self.llm, callbacks=callbacks
diff --git a/tests/unit/test_single_hop_query_synthesizer.py b/tests/unit/test_single_hop_query_synthesizer.py
@@ -0,0 +1,147 @@
+import typing as t
+
+import pytest
+
+from ragas.prompt import PydanticPrompt
+from ragas.testset.graph import KnowledgeGraph, Node, NodeType
+from ragas.testset.persona import Persona
+from ragas.testset.synthesizers.prompts import PersonaThemesMapping, ThemesPersonasInput
+from ragas.testset.synthesizers.single_hop.specific import (
+    SingleHopSpecificQuerySynthesizer,
+)
+
+
+class MockThemePersonaMatchingPrompt(PydanticPrompt):
+    async def generate(self, data: ThemesPersonasInput, llm, callbacks=None):
+        themes: t.List[str] = data.themes
+        personas: t.List[Persona] = data.personas
+        return PersonaThemesMapping(
+            mapping={persona.name: themes for persona in personas}
+        )
+
+
+def test_extract_themes_from_items_with_strings(fake_llm):
+    """Test _extract_themes_from_items with string input."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    items = ["Theme1", "Theme2", "Theme3"]
+    themes = synthesizer._extract_themes_from_items(items)
+
+    assert set(themes) == {"Theme1", "Theme2", "Theme3"}
+
+
+def test_extract_themes_from_items_with_tuples(fake_llm):
+    """Test _extract_themes_from_items with tuple input (the bug fix)."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    # This is the format that was causing the ValidationError in issue #2368
+    items = [("Entity1", "Entity1"), ("Entity2", "Entity2")]
+    themes = synthesizer._extract_themes_from_items(items)
+
+    assert set(themes) == {"Entity1", "Entity2"}
+
+
+def test_extract_themes_from_items_with_mixed_formats(fake_llm):
+    """Test _extract_themes_from_items with mixed formats."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    items = ["Theme1", ("Entity2", "Entity2"), ["Entity3", "Entity3"]]
+    themes = synthesizer._extract_themes_from_items(items)
+
+    assert set(themes) == {"Theme1", "Entity2", "Entity3"}
+
+
+def test_extract_themes_from_items_with_dict(fake_llm):
+    """Test _extract_themes_from_items with dict input."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    items = {"Theme1": "value1", "Theme2": "value2"}
+    themes = synthesizer._extract_themes_from_items(items)
+
+    assert set(themes) == {"Theme1", "Theme2"}
+
+
+def test_extract_themes_from_items_empty_input(fake_llm):
+    """Test _extract_themes_from_items with empty input."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    assert synthesizer._extract_themes_from_items([]) == []
+    assert synthesizer._extract_themes_from_items(None) == []
+    assert synthesizer._extract_themes_from_items("invalid") == []
+
+
+def test_extract_themes_from_items_with_nested_empty_tuples(fake_llm):
+    """Test _extract_themes_from_items skips non-string elements."""
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+
+    items = [("Theme1", 123), (456, "Theme2"), ("Theme3", "Theme3")]
+    themes = synthesizer._extract_themes_from_items(items)
+
+    # Only string elements should be extracted
+    assert set(themes) == {"Theme1", "Theme2", "Theme3"}
+
+
+@pytest.mark.asyncio
+async def test_generate_scenarios_with_tuple_entities(fake_llm):
+    """Test that _generate_scenarios handles tuple-formatted entities correctly.
+
+    This test validates the fix for issue #2368 where entities property
+    containing tuples would cause ValidationError.
+    """
+    # Create a node with tuple-formatted entities (the problematic case)
+    node = Node(type=NodeType.CHUNK)
+    node.add_property("entities", [("Entity1", "Entity1"), ("Entity2", "Entity2")])
+
+    kg = KnowledgeGraph(nodes=[node])
+
+    personas = [
+        Persona(
+            name="Researcher",
+            role_description="A researcher interested in entities.",
+        ),
+    ]
+
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+    synthesizer.theme_persona_matching_prompt = MockThemePersonaMatchingPrompt()
+
+    # This should not raise ValidationError
+    scenarios = await synthesizer._generate_scenarios(
+        n=2,
+        knowledge_graph=kg,
+        persona_list=personas,
+        callbacks=None,
+    )
+
+    # Should generate scenarios successfully
+    assert len(scenarios) > 0
+
+
+@pytest.mark.asyncio
+async def test_generate_scenarios_with_string_entities(fake_llm):
+    """Test that _generate_scenarios still works with string-formatted entities."""
+    # Create a node with string-formatted entities (the normal case)
+    node = Node(type=NodeType.CHUNK)
+    node.add_property("entities", ["Entity1", "Entity2", "Entity3"])
+
+    kg = KnowledgeGraph(nodes=[node])
+
+    personas = [
+        Persona(
+            name="Researcher",
+            role_description="A researcher interested in entities.",
+        ),
+    ]
+
+    synthesizer = SingleHopSpecificQuerySynthesizer(llm=fake_llm)
+    synthesizer.theme_persona_matching_prompt = MockThemePersonaMatchingPrompt()
+
+    # This should work as before
+    scenarios = await synthesizer._generate_scenarios(
+        n=2,
+        knowledge_graph=kg,
+        persona_list=personas,
+        callbacks=None,
+    )
+
+    # Should generate scenarios successfully
+    assert len(scenarios) > 0