1+ # SPDX-FileCopyrightText: 2025 Aaron Kaplan, Institute for Common Good Technology
12#
23# SPDX-License-Identifier: AGPL-3.0-or-later
34
1213
1314from intelmq .lib .bot import ParserBot , utils
1415from intelmq .lib .exceptions import InvalidArgument
15- from intelmq .lib .harmonization import ClassificationType
1616from intelmq .lib .exceptions import MissingDependencyError
1717
18- from pydantic import BaseModel , AfterValidator
19- from typing import Annotated , List
18+ from pydantic import BaseModel , ValidationError
19+ from typing import List
2020from pydantic_ai import Agent
2121
2222from intelmq .lib .basemodel import IntelMQEventModel
23- from requests import api
2423
2524from pprint import pprint
2625
3635)
3736
3837
def extract_data(text: str, model: str, api_key: str, maximum_attempts: int = 5) -> List[IntelMQEventModel]:
    """Use an LLM (part of the config which one) to extract IDF-style events from the raw text.

    We use ai.pydantic.dev for telling the LLM to extract and map all information
    from the (unstructured) `text` to the IntelMQ Data Format
    (see https://docs.intelmq.org/latest/user/event/ for a description of the
    IntelMQ Data Format (IDF)).

    Parameters:
        text: The raw, unstructured text handed to the agent.
        model: Model identifier passed to ``Agent`` (the script section of this
            file uses "openai:gpt-4o").
        api_key: API key for the model provider. It is not referenced by the
            statements visible in this function.
        maximum_attempts: How many times the agent run is attempted when the
            result fails validation. Must be at least 1.

    Returns:
        A one-element list containing the object returned by ``agent.run_sync``.
        NOTE(review): the annotation says ``List[IntelMQEventModel]``, but the
        run result itself is returned and the caller in this file reads
        ``result.response`` from it; confirm which one is intended.

    Raises:
        ValueError: If ``maximum_attempts`` is smaller than 1.
        ValidationError: Re-raised from the final attempt when no attempt
            produced a valid result.
    """
    # NOTE(review): the diff this block was reconstructed from elides a few
    # lines between the docstring text above and the statements below; check
    # the complete file for anything that belongs here.
    if maximum_attempts < 1:
        # Without at least one attempt there is no result to return.
        raise ValueError(f'maximum_attempts must be >= 1, got {maximum_attempts!r}.')

    Agent.instrument_all()
    agent = Agent(model, output_type=IntelMQEventModel)

    # Attempts are counted from 1 so the progress message reads "1/5", "2/5", ...
    for attempt in range(1, maximum_attempts + 1):
        try:
            result = agent.run_sync(text)
        except ValidationError as exc:
            if attempt == maximum_attempts:
                # Out of attempts: surface the real validation error instead
                # of failing later on an unbound ``result``.
                raise
            print(f'Got invalid result: ({exc!r}). Trying again ({attempt}/{maximum_attempts}).')
        else:
            break
    print(f'Usage: {result.usage()}')
    return [result]
5660
5761
@@ -85,8 +89,9 @@ def process(self):
8589with open ("intelmq/bots/parsers/unstructured_text/test_data/sample.txt" , "r" ) as f :
8690 SAMPLE_CONTENT = f .read ()
8791
88- result = extract_data (
92+ results = extract_data (
8993 SAMPLE_CONTENT , model = "openai:gpt-4o" , api_key = os .getenv ("OPENAI_API_KEY" )
9094 )
91- for r in result :
92- print (r )
95+ for i , result in enumerate (results ):
96+ print (f'Result { i } :' )
97+ pprint (result .response .parts [0 ].args_as_dict ())
0 commit comments