@@ -109,7 +109,7 @@ def free_gpu():
109109def get_best_gpu (min_memory_gb : int = 16 ) -> str :
110110 """
111111 Get the GPU with most available memory.
112-
112+
113113 Returns device string like 'cuda:0' or 'cpu' if no GPU available.
114114 """
115115 if not torch .cuda .is_available ():
@@ -332,12 +332,12 @@ def get_raw_text(
332332) -> Tuple [str , str ]:
333333 """
334334 Get raw text from a poster file.
335-
335+
336336 Args:
337337 poster_path: Path to poster file (PDF, JPG, PNG)
338338 poster_id: Optional ID for caching
339339 output_dir: Optional directory for cached results
340-
340+
341341 Returns:
342342 Tuple of (text, source) where source indicates extraction method
343343 """
@@ -480,7 +480,9 @@ def _generate(model, tokenizer, prompt: str, max_tokens: int) -> str:
480480 )
481481 elapsed = time .time () - t0
482482 tokens_generated = outputs .shape [1 ] - inputs ["input_ids" ].shape [1 ]
483- log (f" Generated { tokens_generated } tokens in { elapsed :.2f} s ({ tokens_generated / elapsed :.1f} tok/s)" )
483+ log (
484+ f" Generated { tokens_generated } tokens in { elapsed :.2f} s ({ tokens_generated / elapsed :.1f} tok/s)"
485+ )
484486
485487 return tokenizer .decode (outputs [0 ][inputs ["input_ids" ].shape [1 ] :], skip_special_tokens = True )
486488
@@ -739,9 +741,22 @@ def _clean_unicode_artifacts(text: str) -> str:
739741 return text
740742
741743 bidi_chars = [
742- "\u200e " , "\u200f " , "\u202a " , "\u202b " , "\u202c " , "\u202d " , "\u202e " ,
743- "\u2066 " , "\u2067 " , "\u2068 " , "\u2069 " , "\u200b " , "\u200c " , "\u200d " ,
744- "\ufeff " , "\u00ad " ,
744+ "\u200e " ,
745+ "\u200f " ,
746+ "\u202a " ,
747+ "\u202b " ,
748+ "\u202c " ,
749+ "\u202d " ,
750+ "\u202e " ,
751+ "\u2066 " ,
752+ "\u2067 " ,
753+ "\u2068 " ,
754+ "\u2069 " ,
755+ "\u200b " ,
756+ "\u200c " ,
757+ "\u200d " ,
758+ "\ufeff " ,
759+ "\u00ad " ,
745760 ]
746761 for char in bidi_chars :
747762 text = text .replace (char , "" )
@@ -811,7 +826,9 @@ def _postprocess_json(data: dict) -> dict:
811826 content = section .get ("sectionContent" , "" )
812827 if isinstance (content , list ):
813828 content = " " .join (str (c ) for c in content )
814- content = _clean_unicode_artifacts (content .strip () if isinstance (content , str ) else "" )
829+ content = _clean_unicode_artifacts (
830+ content .strip () if isinstance (content , str ) else ""
831+ )
815832 if content and len (content ) > 10 :
816833 cleaned_sections .append ({"sectionTitle" : title , "sectionContent" : content })
817834 result ["posterContent" ]["sections" ] = cleaned_sections
@@ -839,7 +856,7 @@ def _postprocess_json(data: dict) -> dict:
839856def extract_json_with_retry (raw_text : str , model , tokenizer ) -> dict :
840857 """
841858 Send raw poster text to the LLM and robustly parse the JSON response.
842-
859+
843860 This function:
844861 1. Calls the model with a full prompt
845862 2. Retries with more tokens if truncation is detected
@@ -876,16 +893,16 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
876893def extract_poster (poster_path : str ) -> dict :
877894 """
878895 Extract structured JSON metadata from a scientific poster.
879-
896+
880897 This is the main entry point for poster extraction.
881-
898+
882899 Args:
883900 poster_path: Path to the poster file (PDF, JPG, or PNG)
884-
901+
885902 Returns:
886903 Dictionary containing structured poster metadata conforming to
887904 the poster-json-schema.
888-
905+
889906 Example:
890907 >>> result = extract_poster("poster.pdf")
891908 >>> print(result["titles"][0]["title"])
@@ -930,4 +947,3 @@ def extract_poster(poster_path: str) -> dict:
930947 traceback .print_exc ()
931948 unload_json_model ()
932949 return {"error" : str (e )}
933-
0 commit comments