88from ragas .testset .transforms .base import LLMBasedExtractor
99
1010
11- # define prompts
class TextWithExtractionLimit(BaseModel):
    # Input payload shared by the extraction prompts: the text to process
    # together with the limit the prompt instructions refer to as `max_num`.
    # Documented with comments rather than a docstring so the model's
    # generated schema is left exactly as it was.

    # Raw text handed to the prompt.
    text: str
    # Maximum number of items to extract; 10 unless the caller overrides it.
    max_num: int = 10
14+
15+
1216class SummaryExtractorPrompt (PydanticPrompt [StringIO , StringIO ]):
1317 instruction : str = "Summarize the given text in less than 10 sentences."
1418 input_model : t .Type [StringIO ] = StringIO
@@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
2933 keyphrases : t .List [str ]
3034
3135
32- class KeyphrasesExtractorPrompt (PydanticPrompt [StringIO , Keyphrases ]):
33- instruction : str = "Extract top 5 keyphrases from the given text."
34- input_model : t .Type [StringIO ] = StringIO
36+ class KeyphrasesExtractorPrompt (PydanticPrompt [TextWithExtractionLimit , Keyphrases ]):
37+ instruction : str = "Extract top max_num keyphrases from the given text."
38+ input_model : t .Type [TextWithExtractionLimit ] = TextWithExtractionLimit
3539 output_model : t .Type [Keyphrases ] = Keyphrases
36- examples : t .List [t .Tuple [StringIO , Keyphrases ]] = [
40+ examples : t .List [t .Tuple [TextWithExtractionLimit , Keyphrases ]] = [
3741 (
38- StringIO (
39- text = "Artificial intelligence\n \n Artificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
42+ TextWithExtractionLimit (
43+ text = "Artificial intelligence\n \n Artificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations." ,
44+ max_num = 5 ,
4045 ),
4146 Keyphrases (
4247 keyphrases = [
@@ -66,17 +71,20 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
6671
6772
class Headlines(BaseModel):
    # Output model for the headlines prompt. Documented with comments rather
    # than a docstring so the model's generated schema is left unchanged.

    # Extracted headline strings, as returned by the LLM.
    headlines: t.List[str]
7075
7176
72- class HeadlinesExtractorPrompt (PydanticPrompt [StringIO , Headlines ]):
73- instruction : str = "Extract only level 2 and level 3 headings from the given text."
77+ class HeadlinesExtractorPrompt (PydanticPrompt [TextWithExtractionLimit , Headlines ]):
# Instruction text for the headlines prompt. The two literals below are joined
# by implicit string concatenation, so the first one must end with a space;
# without it the sentences run together as "...sections.Focus on...".
instruction: str = (
    "Extract the most important max_num headlines from the given text that can be used to split the text into independent sections. "
    "Focus on Level 2 and Level 3 headings."
)
7482
75- input_model : t .Type [StringIO ] = StringIO
83+ input_model : t .Type [TextWithExtractionLimit ] = TextWithExtractionLimit
7684 output_model : t .Type [Headlines ] = Headlines
77- examples : t .List [t .Tuple [StringIO , Headlines ]] = [
85+ examples : t .List [t .Tuple [TextWithExtractionLimit , Headlines ]] = [
7886 (
79- StringIO (
87+ TextWithExtractionLimit (
8088 text = """\
8189 Introduction
8290 Overview of the topic...
@@ -98,30 +106,24 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
98106
99107 Conclusion
100108 Final remarks and summary.
101- """
109+ """ ,
110+ max_num = 6 ,
102111 ),
103112 Headlines (
104113 headlines = [
114+ "Introduction" ,
105115 "Main Concepts" ,
106116 "Detailed Analysis" ,
107- "Subsection: Specialized Techniques" ,
108117 "Future Directions" ,
109- "Subsection: Next Steps in Research" ,
110- ]
118+ ],)
111119 ),
112- ),
113120 ]
114121
115122
class NEROutput(BaseModel):
    # Output model used by NERPrompt. Documented with comments rather than a
    # docstring so the model's generated schema is left unchanged.

    # Named entities extracted from the input text.
    entities: t.List[str]
118125
119126
120- class TextWithExtractionLimit (BaseModel ):
121- text : str
122- max_num : int = 10
123-
124-
125127class NERPrompt (PydanticPrompt [TextWithExtractionLimit , NEROutput ]):
126128 instruction : str = (
127129 "Extract the named entities from the given text, limiting the output to the top entities. "
@@ -190,12 +192,15 @@ class KeyphrasesExtractor(LLMBasedExtractor):
190192
191193 property_name : str = "keyphrases"
192194 prompt : KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt ()
195+ max_num : int = 5
193196
async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
    """Extract keyphrases from the node's ``page_content`` property.

    The text is sent to ``self.prompt`` together with ``self.max_num`` as
    the extraction limit.

    Returns:
        A ``(self.property_name, value)`` pair. ``value`` is ``None`` when
        the node has no ``page_content`` or the prompt produced no result;
        otherwise it is the ``keyphrases`` attribute of the prompt result.
    """
    node_text = node.get_property("page_content")
    if node_text is None:
        return self.property_name, None
    result = await self.prompt.generate(
        self.llm,
        data=TextWithExtractionLimit(text=node_text, max_num=self.max_num),
    )
    # Same guard HeadlinesExtractor.extract applies to this call: avoid an
    # AttributeError on `.keyphrases` if no result came back.
    # NOTE(review): assumes `generate` can yield None — confirm against the
    # PydanticPrompt implementation.
    if result is None:
        return self.property_name, None
    return self.property_name, result.keyphrases
200205
201206
@@ -238,12 +243,15 @@ class HeadlinesExtractor(LLMBasedExtractor):
238243
239244 property_name : str = "headlines"
240245 prompt : HeadlinesExtractorPrompt = HeadlinesExtractorPrompt ()
246+ max_num : int = 5
241247
async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
    """Extract headlines from the node's ``page_content`` property.

    The text is sent to ``self.prompt`` with ``self.max_num`` as the
    extraction limit.

    Returns:
        A ``(self.property_name, value)`` pair. ``value`` stays ``None``
        when the node has no ``page_content`` or the prompt returns
        ``None``; otherwise it is the result's ``headlines`` attribute.
    """
    headlines = None
    page_text = node.get_property("page_content")
    if page_text is not None:
        response = await self.prompt.generate(
            self.llm,
            data=TextWithExtractionLimit(text=page_text, max_num=self.max_num),
        )
        if response is not None:
            headlines = response.headlines
    return self.property_name, headlines
@@ -282,7 +290,9 @@ class TopicDescription(BaseModel):
282290
283291
284292class TopicDescriptionPrompt (PydanticPrompt [StringIO , TopicDescription ]):
285- instruction : str = "Provide a concise description of the main topic(s) discussed in the following text."
293+ instruction : str = (
294+ "Provide a concise description of the main topic(s) discussed in the following text."
295+ )
286296 input_model : t .Type [StringIO ] = StringIO
287297 output_model : t .Type [TopicDescription ] = TopicDescription
288298 examples : t .List [t .Tuple [StringIO , TopicDescription ]] = [
0 commit comments