@@ -70,35 +70,44 @@ class Headlines(BaseModel):
7070
7171
# Prompt definition that asks the model for the level 2 and level 3 headings
# of a text. It takes a StringIO payload as input and produces a Headlines
# payload as output.
class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
    instruction: str = "Extract only level 2 and level 3 headings from the given text."

    input_model: t.Type[StringIO] = StringIO
    output_model: t.Type[Headlines] = Headlines
    # One worked example. In the expected output "Introduction" and
    # "Conclusion" are left out, while the remaining section titles and both
    # "Subsection: ..." titles are kept, in document order.
    examples: t.List[t.Tuple[StringIO, Headlines]] = [
        (
            StringIO(
                # NOTE(review): the example text is reproduced flush-left, as
                # displayed; leading whitespace inside this literal could not
                # be confirmed -- verify against the repository before merging.
                text="""\
Introduction
Overview of the topic...

Main Concepts
Explanation of core ideas...

Detailed Analysis
Techniques and methods for analysis...

Subsection: Specialized Techniques
Further details on specialized techniques...

Future Directions
Insights into upcoming trends...

Subsection: Next Steps in Research
Discussion of new areas of study...

Conclusion
Final remarks and summary.
"""
            ),
            Headlines(
                headlines=[
                    "Main Concepts",
                    "Detailed Analysis",
                    "Subsection: Specialized Techniques",
                    "Future Directions",
                    "Subsection: Next Steps in Research",
                ]
            ),
        ),
    ]
@@ -108,15 +117,24 @@ class NEROutput(BaseModel):
108117 entities : t .List [str ]
109118
110119
111- class NERPrompt (PydanticPrompt [StringIO , NEROutput ]):
112- instruction : str = "Extract named entities from the given text."
113- input_model : t .Type [StringIO ] = StringIO
# Input payload that pairs the text to analyse with an upper bound on how many
# items should be extracted from it. Used as the input model of NERPrompt and
# ThemesAndConceptsExtractorPrompt in this module.
class TextWithExtractionLimit(BaseModel):
    # Text the extraction prompt is run against.
    text: str
    # Maximum number of items requested from the model; defaults to 10.
    max_num: int = 10
123+
124+
125+ class NERPrompt (PydanticPrompt [TextWithExtractionLimit , NEROutput ]):
126+ instruction : str = (
127+ "Extract the named entities from the given text, limiting the output to the top entities. "
128+ "Ensure the number of entities does not exceed the specified maximum."
129+ )
130+ input_model : t .Type [TextWithExtractionLimit ] = TextWithExtractionLimit
114131 output_model : t .Type [NEROutput ] = NEROutput
115- examples : t .List [t .Tuple [StringIO , NEROutput ]] = [
132+ examples : t .List [t .Tuple [TextWithExtractionLimit , NEROutput ]] = [
116133 (
117- StringIO (
134+ TextWithExtractionLimit (
118135 text = """Elon Musk, the CEO of Tesla and SpaceX, announced plans to expand operations to new locations in Europe and Asia.
119- This expansion is expected to create thousands of jobs, particularly in cities like Berlin and Shanghai."""
136+ This expansion is expected to create thousands of jobs, particularly in cities like Berlin and Shanghai.""" ,
137+ max_num = 10 ,
120138 ),
121139 NEROutput (
122140 entities = [
@@ -246,12 +264,16 @@ class NERExtractor(LLMBasedExtractor):
246264
247265 property_name : str = "entities"
248266 prompt : NERPrompt = NERPrompt ()
267+ max_num_entities : int = 10
249268
async def extract(self, node: Node) -> t.Tuple[str, t.List[str]]:
    """Extract named entities from a node's page content.

    Reads the node's ``"page_content"`` property and, when it is present,
    sends it to ``self.prompt`` together with ``self.max_num_entities`` as
    the requested limit.

    Returns:
        A ``(property_name, entities)`` pair. ``entities`` is an empty list
        when the node has no ``"page_content"`` property.

    Note:
        The limit is only passed to the prompt; this method does not
        truncate the returned list itself.
    """
    page_content = node.get_property("page_content")
    if page_content is None:
        # Nothing to analyse: skip the LLM call entirely.
        return self.property_name, []

    request = TextWithExtractionLimit(
        text=page_content,
        max_num=self.max_num_entities,
    )
    response = await self.prompt.generate(self.llm, data=request)
    return self.property_name, response.entities
256278
257279
@@ -305,14 +327,17 @@ class ThemesAndConcepts(BaseModel):
305327 output : t .List [str ]
306328
307329
308- class ThemesAndConceptsExtractorPrompt (PydanticPrompt [StringIO , ThemesAndConcepts ]):
330+ class ThemesAndConceptsExtractorPrompt (
331+ PydanticPrompt [TextWithExtractionLimit , ThemesAndConcepts ]
332+ ):
309333 instruction : str = "Extract the main themes and concepts from the given text."
310- input_model : t .Type [StringIO ] = StringIO
334+ input_model : t .Type [TextWithExtractionLimit ] = TextWithExtractionLimit
311335 output_model : t .Type [ThemesAndConcepts ] = ThemesAndConcepts
312- examples : t .List [t .Tuple [StringIO , ThemesAndConcepts ]] = [
336+ examples : t .List [t .Tuple [TextWithExtractionLimit , ThemesAndConcepts ]] = [
313337 (
314- StringIO (
315- text = "Artificial intelligence is transforming industries by automating tasks requiring human intelligence. AI analyzes vast data quickly and accurately, driving innovations like self-driving cars and personalized recommendations."
338+ TextWithExtractionLimit (
339+ text = "Artificial intelligence is transforming industries by automating tasks requiring human intelligence. AI analyzes vast data quickly and accurately, driving innovations like self-driving cars and personalized recommendations." ,
340+ max_num = 10 ,
316341 ),
317342 ThemesAndConcepts (
318343 output = [
@@ -343,10 +368,14 @@ class ThemesExtractor(LLMBasedExtractor):
343368
344369 property_name : str = "themes"
345370 prompt : ThemesAndConceptsExtractorPrompt = ThemesAndConceptsExtractorPrompt ()
371+ max_num_themes : int = 10
346372
async def extract(self, node: Node) -> t.Tuple[str, t.List[str]]:
    """Extract themes and concepts from a node's page content.

    Reads the node's ``"page_content"`` property and, when it is present,
    sends it to ``self.prompt`` together with ``self.max_num_themes`` as the
    requested limit.

    Returns:
        A ``(property_name, themes)`` pair, where ``themes`` is the
        ``output`` list of the prompt result. It is an empty list when the
        node has no ``"page_content"`` property.

    Note:
        The limit is only passed to the prompt; this method does not
        truncate the returned list itself.
    """
    page_content = node.get_property("page_content")
    if page_content is None:
        # Nothing to analyse: skip the LLM call entirely.
        return self.property_name, []

    request = TextWithExtractionLimit(
        text=page_content,
        max_num=self.max_num_themes,
    )
    response = await self.prompt.generate(self.llm, data=request)
    return self.property_name, response.output
0 commit comments