1313
1414import bs4 # type: ignore
1515from bs4 .element import ResultSet # type: ignore
16+ from charset_normalizer import from_bytes
1617from dateutil .parser import isoparse
1718from icalendar import Calendar # type: ignore
1819from pydantic import BaseModel , PrivateAttr
@@ -305,6 +306,7 @@ class LLM(Parser):
305306 """LLM parser."""
306307
307308 _data_types = PrivateAttr (["text/html" , "html" , "text/plain" ])
309+ _tokens_used = PrivateAttr (default = 0 )
308310
309311 _llm_question = """Please, could you extract a JSON form without any other comment,
310312 with the following JSON schema (start and end times are datetime objects and should be displayed in UTC):
@@ -374,7 +376,18 @@ def parser_hook(self, raw: bytes, content_type: str):
@staticmethod
def get_text_hook(raw: bytes) -> str:
    """Turn the raw payload of a data part into text; can be overwritten by subclasses.

    The payload is quoted-printable (QP) decoded only when it is syntactically
    valid QP, and the resulting bytes are then decoded with the charset detected
    by ``charset_normalizer``.

    Args:
        raw: Payload bytes of the data part.

    Returns:
        The decoded text.
    """
    # Function-scope stdlib imports keep this method self-contained.
    import quopri
    import re

    try:
        # Valid QP is 7-bit ASCII in which every "=" starts either a two-hex-digit
        # escape or a soft line break (RFC 2045, section 6.7). If the payload
        # breaks that rule (8-bit text, HTML with attr="value", ...) it is not QP,
        # and running the QP decoder anyway would corrupt incidental "=XX"
        # sequences such as URL query strings.
        # NOTE(review): plain ASCII text containing "=" + two hex digits (e.g.
        # "id=10") is still indistinguishable from QP here and is decoded as
        # before; a definitive answer needs the part's Content-Transfer-Encoding.
        if isinstance(raw, (bytes, bytearray)):
            not_qp = re.search(
                rb"[\x80-\xff]|=(?![0-9A-Fa-f]{2}|[ \t]*(?:\r?\n|\Z))", raw
            )
        else:
            # Non-bytes input keeps the previous behaviour (decoder decides).
            not_qp = None
        payload = raw if not_qp is not None else quopri.decodestring(raw)

        # Auto-detect the charset and decode.
        best_match = from_bytes(payload).best()
        if best_match is not None:
            return str(best_match)
        # latin-1 maps every byte to a code point, so this cannot fail.
        return payload.decode("latin-1", errors="replace")
    except (UnicodeDecodeError, ValueError, TypeError, AttributeError):
        # Final fallback if all above methods fail
        return raw.decode("utf-8", errors="replace")
378391
379392 @staticmethod
380393 def get_key_with_string (dictionary : dict , string : str ):
@@ -401,6 +414,16 @@ def llm_question(self):
401414
402415 return self ._llm_question
403416
@property
def tokens_used(self) -> int:
    """Token count currently stored in the private ``_tokens_used`` attribute."""
    return self._tokens_used

@tokens_used.setter
def tokens_used(self, value: int) -> None:
    """Store ``value`` as the token count in ``_tokens_used``."""
    self._tokens_used = value
426+
def get_llm_response(self, content):
    """Return the LLM's answer for ``content``.

    This base implementation provides no backend and always raises.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
@@ -482,7 +505,6 @@ def _get_maintenance_id(self, generated_json: dict, start, end, circuits):
482505 maintenance_key = self .get_key_with_string (generated_json , "maintenance" )
483506 if maintenance_key and generated_json ["maintenance_id" ] != "N/A" :
484507 return generated_json ["maintenance_id" ]
485-
486508 maintenance_id = str (start ) + str (end ) + "" .join (list (circuits ))
487509 return hashlib .sha256 (maintenance_id .encode ("utf-8" )).hexdigest () # nosec
488510
@@ -508,6 +530,7 @@ def parse_content(self, content):
508530 "summary" : str (self ._get_summary (generated_json )),
509531 "status" : self ._get_status (generated_json ),
510532 "account" : str (self ._get_account (generated_json )),
533+ "_llm_tokens_used" : self .tokens_used ,
511534 }
512535
513536 # Generate maintenance ID for main window
0 commit comments