|
| 1 | +# Create custom single-hop queries from your documents |
| 2 | + |
| 3 | +### Load sample documents |
| 4 | +I am using documents from the [GitLab handbook](https://huggingface.co/datasets/explodinggradients/Sample_Docs_Markdown). You can download it by running the command below.
| 5 | + |
| 6 | + |
| 7 | +```python |
| 8 | +from langchain_community.document_loaders import DirectoryLoader |
| 9 | + |
| 10 | + |
| 11 | +path = "Sample_Docs_Markdown/" |
| 12 | +loader = DirectoryLoader(path, glob="**/*.md") |
| 13 | +docs = loader.load() |
| 14 | +``` |
| 15 | + |
| 16 | +### Create KG |
| 17 | + |
| 18 | +Create a base knowledge graph with the documents |
| 19 | + |
| 20 | + |
| 21 | +```python |
| 22 | +from ragas.testset.graph import KnowledgeGraph |
| 23 | +from ragas.testset.graph import Node, NodeType |
| 24 | + |
| 25 | + |
| 26 | +kg = KnowledgeGraph() |
| 27 | +for doc in docs: |
| 28 | + kg.nodes.append( |
| 29 | + Node( |
| 30 | + type=NodeType.DOCUMENT, |
| 31 | + properties={ |
| 32 | + "page_content": doc.page_content, |
| 33 | + "document_metadata": doc.metadata, |
| 34 | + }, |
| 35 | + ) |
| 36 | + ) |
| 37 | +``` |
| 38 | + |
| 39 | + /opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html |
| 40 | + from .autonotebook import tqdm as notebook_tqdm |
| 41 | + |
| 42 | + |
| 43 | +### Set up the LLM and Embedding Model |
| 44 | +You may use models of [your choice](/docs/howtos/customizations/customize_models.md); here I am using models from OpenAI.
| 45 | + |
| 46 | + |
| 47 | +```python |
| 48 | +from ragas.llms.base import llm_factory |
| 49 | +from ragas.embeddings.base import embedding_factory |
| 50 | + |
| 51 | +llm = llm_factory() |
| 52 | +embedding = embedding_factory() |
| 53 | +``` |
| 54 | + |
| 55 | +### Setup the transforms |
| 56 | + |
| 57 | + |
| 58 | +Here we are using two extractors and one splitter.
| 59 | +- Headline extractor: Extracts headlines from the documents
| 60 | +- Keyphrase extractor: Extracts keyphrases from the documents
| 61 | +- Headline splitter: Splits the documents into nodes based on headlines
| 62 | + |
| 63 | + |
| 64 | + |
| 65 | +```python |
| 66 | +from ragas.testset.transforms import apply_transforms |
| 67 | +from ragas.testset.transforms import ( |
| 68 | + HeadlinesExtractor, |
| 69 | + HeadlineSplitter, |
| 70 | + KeyphrasesExtractor, |
| 71 | +) |
| 72 | + |
| 73 | + |
| 74 | +headline_extractor = HeadlinesExtractor(llm=llm) |
| 75 | +headline_splitter = HeadlineSplitter(min_tokens=300, max_tokens=1000) |
| 76 | +keyphrase_extractor = KeyphrasesExtractor( |
| 77 | + llm=llm, property_name="keyphrases", max_num=10 |
| 78 | +) |
| 79 | +``` |
| 80 | + |
| 81 | + |
| 82 | +```python |
| 83 | +transforms = [ |
| 84 | + headline_extractor, |
| 85 | + headline_splitter, |
| 86 | + keyphrase_extractor, |
| 87 | +] |
| 88 | + |
| 89 | +apply_transforms(kg, transforms=transforms) |
| 90 | +``` |
| 91 | + |
| 92 | + Applying KeyphrasesExtractor: 6%| | 2/36 [00:01<00:20, 1Property 'keyphrases' already exists in node '514fdc'. Skipping! |
| 93 | + Applying KeyphrasesExtractor: 11%| | 4/36 [00:01<00:10, 2Property 'keyphrases' already exists in node '84a0f6'. Skipping! |
| 94 | + Applying KeyphrasesExtractor: 64%|▋| 23/36 [00:03<00:01, Property 'keyphrases' already exists in node '93f19d'. Skipping! |
| 95 | + Applying KeyphrasesExtractor: 72%|▋| 26/36 [00:04<00:00, 1Property 'keyphrases' already exists in node 'a126bf'. Skipping! |
| 96 | + Applying KeyphrasesExtractor: 81%|▊| 29/36 [00:04<00:00, Property 'keyphrases' already exists in node 'c230df'. Skipping! |
| 97 | + Applying KeyphrasesExtractor: 89%|▉| 32/36 [00:04<00:00, 1Property 'keyphrases' already exists in node '4f2765'. Skipping! |
| 98 | + Property 'keyphrases' already exists in node '4a4777'. Skipping! |
| 99 | + |
| 100 | + |
| 101 | +### Configure personas |
| 102 | + |
| 103 | +You can also do this automatically by using the [automatic persona generator](/docs/howtos/customizations/testgenerator/_persona_generator.md) |
| 104 | + |
| 105 | + |
| 106 | +```python |
| 107 | +from ragas.testset.persona import Persona |
| 108 | + |
| 109 | +person1 = Persona( |
| 110 | + name="gitlab employee", |
| 111 | + role_description="A junior gitlab employee curious on workings on gitlab", |
| 112 | +) |
| 113 | +persona2 = Persona( |
| 114 | + name="Hiring manager at gitlab", |
| 115 | + role_description="A hiring manager at gitlab trying to underestand hiring policies in gitlab", |
| 116 | +) |
| 117 | +persona_list = [person1, persona2] |
| 118 | +``` |
| 119 | + |
| 120 | +
| 121 | + |
| 122 | +## SingleHop Query |
| 123 | + |
| 124 | +Inherit from `SingleHopQuerySynthesizer` and modify the function that generates scenarios for query creation. |
| 125 | + |
| 126 | +**Steps**: |
| 127 | +- Find a qualified set of nodes for query creation. Here I am selecting all nodes with keyphrases extracted.
| 128 | +- For each qualified node:
| 129 | +  - Match the keyphrases with one or more personas.
| 130 | +  - Create all possible combinations of (Node, Persona, Query Style, Query Length).
| 131 | +  - Sample the required number of queries from the combinations.
| 132 | + |
| 133 | + |
| 134 | +```python |
| 135 | +from ragas.testset.synthesizers.single_hop import ( |
| 136 | + SingleHopQuerySynthesizer, |
| 137 | + SingleHopScenario, |
| 138 | +) |
| 139 | +from dataclasses import dataclass |
| 140 | +from ragas.testset.synthesizers.prompts import ( |
| 141 | + ThemesPersonasInput, |
| 142 | + ThemesPersonasMatchingPrompt, |
| 143 | +) |
| 144 | + |
| 145 | + |
| 146 | +@dataclass |
| 147 | +class MySingleHopScenario(SingleHopQuerySynthesizer): |
| 148 | + |
| 149 | + theme_persona_matching_prompt = ThemesPersonasMatchingPrompt() |
| 150 | + |
| 151 | + async def _generate_scenarios(self, n, knowledge_graph, persona_list, callbacks): |
| 152 | + |
| 153 | + property_name = "keyphrases" |
| 154 | + nodes = [] |
| 155 | + for node in knowledge_graph.nodes: |
| 156 | + if node.type.name == "CHUNK" and node.get_property(property_name): |
| 157 | + nodes.append(node) |
| 158 | + |
| 159 | + number_of_samples_per_node = max(1, n // len(nodes)) |
| 160 | + |
| 161 | + scenarios = [] |
| 162 | + for node in nodes: |
| 163 | + if len(scenarios) >= n: |
| 164 | + break |
| 165 | + themes = node.properties.get(property_name, [""]) |
| 166 | + prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list) |
| 167 | + persona_concepts = await self.theme_persona_matching_prompt.generate( |
| 168 | + data=prompt_input, llm=self.llm, callbacks=callbacks |
| 169 | + ) |
| 170 | + base_scenarios = self.prepare_combinations( |
| 171 | + node, |
| 172 | + themes, |
| 173 | + personas=persona_list, |
| 174 | + persona_concepts=persona_concepts.mapping, |
| 175 | + ) |
| 176 | + scenarios.extend( |
| 177 | + self.sample_combinations(base_scenarios, number_of_samples_per_node) |
| 178 | + ) |
| 179 | + |
| 180 | + return scenarios |
| 181 | +``` |
| 182 | + |
| 183 | + |
| 184 | +```python |
| 185 | +query = MySingleHopScenario(llm=llm) |
| 186 | +``` |
| 187 | + |
| 188 | + |
| 189 | +```python |
| 190 | +scenarios = await query.generate_scenarios( |
| 191 | + n=5, knowledge_graph=kg, persona_list=persona_list |
| 192 | +) |
| 193 | +``` |
| 194 | + |
| 195 | + |
| 196 | +```python |
| 197 | +scenarios[0] |
| 198 | +``` |
| 199 | + |
| 200 | + |
| 201 | + |
| 202 | + |
| 203 | + SingleHopScenario( |
| 204 | + nodes=1 |
| 205 | + term=what is an ally |
| 206 | + persona=name='Hiring manager at gitlab' role_description='A hiring manager at gitlab trying to underestand hiring policies in gitlab' |
| 207 | + style=Web search like queries |
| 208 | + length=long) |
| 209 | + |
| 210 | + |
| 211 | + |
| 212 | + |
| 213 | +```python |
| 214 | +result = await query.generate_sample(scenario=scenarios[-1]) |
| 215 | +``` |
| 216 | + |
| 217 | +### Modify prompt to customize the query style |
| 218 | +Here I am replacing the default prompt with an instruction to generate only Yes/No questions. This is an optional step. |
| 219 | + |
| 220 | + |
| 221 | +```python |
| 222 | +instruction = """Generate a Yes/No query and answer based on the specified conditions (persona, term, style, length) |
| 223 | +and the provided context. Ensure the answer is entirely faithful to the context, using only the information |
| 224 | +directly from the provided context. |
| 225 | +
|
| 226 | +### Instructions: |
| 227 | +1. **Generate a Yes/No Query**: Based on the context, persona, term, style, and length, create a question |
| 228 | +that aligns with the persona's perspective, incorporates the term, and can be answered with 'Yes' or 'No'. |
| 229 | +2. **Generate an Answer**: Using only the content from the provided context, provide a 'Yes' or 'No' answer |
| 230 | +to the query. Do not add any information not included in or inferable from the context.""" |
| 231 | +``` |
| 232 | + |
| 233 | + |
| 234 | +```python |
| 235 | +prompt = query.get_prompts()["generate_query_reference_prompt"] |
| 236 | +prompt.instruction = instruction |
| 237 | +query.set_prompts(**{"generate_query_reference_prompt": prompt}) |
| 238 | +``` |
| 239 | + |
| 240 | + |
| 241 | +```python |
| 242 | +result = await query.generate_sample(scenario=scenarios[-1]) |
| 243 | +``` |
| 244 | + |
| 245 | + |
| 246 | +```python |
| 247 | +result.user_input |
| 248 | +``` |
| 249 | + |
| 250 | + |
| 251 | + |
| 252 | + |
| 253 | + 'Does the Diversity, Inclusion & Belonging (DIB) Team at GitLab have a structured approach to encourage collaborations among team members through various communication methods?' |
| 254 | + |
| 255 | + |
| 256 | + |
| 257 | + |
| 258 | +```python |
| 259 | +result.reference |
| 260 | +``` |
| 261 | + |
| 262 | + |
| 263 | + |
| 264 | + |
| 265 | + 'Yes' |
| 266 | + |
| 267 | + |
| 268 | + |
| 269 | + |
| 270 | +```python |
| 271 | + |
| 272 | +``` |
0 commit comments