Skip to content

Commit 33fbdfc

Browse files
authored
fixes: add extraction limit to Extractors (#1673)
1 parent 22de85b commit 33fbdfc

File tree

1 file changed

+36
-26
lines changed

1 file changed

+36
-26
lines changed

src/ragas/testset/transforms/extractors/llm_based.py

Lines changed: 36 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,11 @@
88
from ragas.testset.transforms.base import LLMBasedExtractor
99

1010

11-
# define prompts
11+
class TextWithExtractionLimit(BaseModel):
12+
text: str
13+
max_num: int = 10
14+
15+
1216
class SummaryExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
1317
instruction: str = "Summarize the given text in less than 10 sentences."
1418
input_model: t.Type[StringIO] = StringIO
@@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
2933
keyphrases: t.List[str]
3034

3135

32-
class KeyphrasesExtractorPrompt(PydanticPrompt[StringIO, Keyphrases]):
33-
instruction: str = "Extract top 5 keyphrases from the given text."
34-
input_model: t.Type[StringIO] = StringIO
36+
class KeyphrasesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Keyphrases]):
37+
instruction: str = "Extract top max_num keyphrases from the given text."
38+
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
3539
output_model: t.Type[Keyphrases] = Keyphrases
36-
examples: t.List[t.Tuple[StringIO, Keyphrases]] = [
40+
examples: t.List[t.Tuple[TextWithExtractionLimit, Keyphrases]] = [
3741
(
38-
StringIO(
39-
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
42+
TextWithExtractionLimit(
43+
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.",
44+
max_num=5,
4045
),
4146
Keyphrases(
4247
keyphrases=[
@@ -66,17 +71,20 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
6671

6772

6873
class Headlines(BaseModel):
69-
headlines: t.List[str]
74+
headlines: t.List[str]
7075

7176

72-
class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
73-
instruction: str = "Extract only level 2 and level 3 headings from the given text."
77+
class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
78+
instruction: str = (
79+
"Extract the most important max_num headlines from the given text that can be used to split the text into independent sections."
80+
"Focus on Level 2 and Level 3 headings."
81+
)
7482

75-
input_model: t.Type[StringIO] = StringIO
83+
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
7684
output_model: t.Type[Headlines] = Headlines
77-
examples: t.List[t.Tuple[StringIO, Headlines]] = [
85+
examples: t.List[t.Tuple[TextWithExtractionLimit, Headlines]] = [
7886
(
79-
StringIO(
87+
TextWithExtractionLimit(
8088
text="""\
8189
Introduction
8290
Overview of the topic...
@@ -98,30 +106,24 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
98106
99107
Conclusion
100108
Final remarks and summary.
101-
"""
109+
""",
110+
max_num=6,
102111
),
103112
Headlines(
104113
headlines=[
114+
"Introduction",
105115
"Main Concepts",
106116
"Detailed Analysis",
107-
"Subsection: Specialized Techniques",
108117
"Future Directions",
109-
"Subsection: Next Steps in Research",
110-
]
118+
],)
111119
),
112-
),
113120
]
114121

115122

116123
class NEROutput(BaseModel):
117124
entities: t.List[str]
118125

119126

120-
class TextWithExtractionLimit(BaseModel):
121-
text: str
122-
max_num: int = 10
123-
124-
125127
class NERPrompt(PydanticPrompt[TextWithExtractionLimit, NEROutput]):
126128
instruction: str = (
127129
"Extract the named entities from the given text, limiting the output to the top entities. "
@@ -190,12 +192,15 @@ class KeyphrasesExtractor(LLMBasedExtractor):
190192

191193
property_name: str = "keyphrases"
192194
prompt: KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt()
195+
max_num: int = 5
193196

194197
async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
195198
node_text = node.get_property("page_content")
196199
if node_text is None:
197200
return self.property_name, None
198-
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
201+
result = await self.prompt.generate(
202+
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
203+
)
199204
return self.property_name, result.keyphrases
200205

201206

@@ -238,12 +243,15 @@ class HeadlinesExtractor(LLMBasedExtractor):
238243

239244
property_name: str = "headlines"
240245
prompt: HeadlinesExtractorPrompt = HeadlinesExtractorPrompt()
246+
max_num: int = 5
241247

242248
async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
243249
node_text = node.get_property("page_content")
244250
if node_text is None:
245251
return self.property_name, None
246-
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
252+
result = await self.prompt.generate(
253+
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
254+
)
247255
if result is None:
248256
return self.property_name, None
249257
return self.property_name, result.headlines
@@ -282,7 +290,9 @@ class TopicDescription(BaseModel):
282290

283291

284292
class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]):
285-
instruction: str = "Provide a concise description of the main topic(s) discussed in the following text."
293+
instruction: str = (
294+
"Provide a concise description of the main topic(s) discussed in the following text."
295+
)
286296
input_model: t.Type[StringIO] = StringIO
287297
output_model: t.Type[TopicDescription] = TopicDescription
288298
examples: t.List[t.Tuple[StringIO, TopicDescription]] = [

0 commit comments

Comments (0)