Skip to content

Commit 2c0d886

Browse files
committed
refactor: clean up whitespace and improve code formatting in extract.py
1 parent 94ce030 commit 2c0d886

File tree

1 file changed

+30
-14
lines changed

1 file changed

+30
-14
lines changed

poster2json/extract.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def free_gpu():
109109
def get_best_gpu(min_memory_gb: int = 16) -> str:
110110
"""
111111
Get the GPU with most available memory.
112-
112+
113113
Returns device string like 'cuda:0' or 'cpu' if no GPU available.
114114
"""
115115
if not torch.cuda.is_available():
@@ -332,12 +332,12 @@ def get_raw_text(
332332
) -> Tuple[str, str]:
333333
"""
334334
Get raw text from a poster file.
335-
335+
336336
Args:
337337
poster_path: Path to poster file (PDF, JPG, PNG)
338338
poster_id: Optional ID for caching
339339
output_dir: Optional directory for cached results
340-
340+
341341
Returns:
342342
Tuple of (text, source) where source indicates extraction method
343343
"""
@@ -480,7 +480,9 @@ def _generate(model, tokenizer, prompt: str, max_tokens: int) -> str:
480480
)
481481
elapsed = time.time() - t0
482482
tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
483-
log(f" Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)")
483+
log(
484+
f" Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)"
485+
)
484486

485487
return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
486488

@@ -739,9 +741,22 @@ def _clean_unicode_artifacts(text: str) -> str:
739741
return text
740742

741743
bidi_chars = [
742-
"\u200e", "\u200f", "\u202a", "\u202b", "\u202c", "\u202d", "\u202e",
743-
"\u2066", "\u2067", "\u2068", "\u2069", "\u200b", "\u200c", "\u200d",
744-
"\ufeff", "\u00ad",
744+
"\u200e",
745+
"\u200f",
746+
"\u202a",
747+
"\u202b",
748+
"\u202c",
749+
"\u202d",
750+
"\u202e",
751+
"\u2066",
752+
"\u2067",
753+
"\u2068",
754+
"\u2069",
755+
"\u200b",
756+
"\u200c",
757+
"\u200d",
758+
"\ufeff",
759+
"\u00ad",
745760
]
746761
for char in bidi_chars:
747762
text = text.replace(char, "")
@@ -811,7 +826,9 @@ def _postprocess_json(data: dict) -> dict:
811826
content = section.get("sectionContent", "")
812827
if isinstance(content, list):
813828
content = " ".join(str(c) for c in content)
814-
content = _clean_unicode_artifacts(content.strip() if isinstance(content, str) else "")
829+
content = _clean_unicode_artifacts(
830+
content.strip() if isinstance(content, str) else ""
831+
)
815832
if content and len(content) > 10:
816833
cleaned_sections.append({"sectionTitle": title, "sectionContent": content})
817834
result["posterContent"]["sections"] = cleaned_sections
@@ -839,7 +856,7 @@ def _postprocess_json(data: dict) -> dict:
839856
def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
840857
"""
841858
Send raw poster text to the LLM and robustly parse the JSON response.
842-
859+
843860
This function:
844861
1. Calls the model with a full prompt
845862
2. Retries with more tokens if truncation is detected
@@ -876,16 +893,16 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
876893
def extract_poster(poster_path: str) -> dict:
877894
"""
878895
Extract structured JSON metadata from a scientific poster.
879-
896+
880897
This is the main entry point for poster extraction.
881-
898+
882899
Args:
883900
poster_path: Path to the poster file (PDF, JPG, or PNG)
884-
901+
885902
Returns:
886903
Dictionary containing structured poster metadata conforming to
887904
the poster-json-schema.
888-
905+
889906
Example:
890907
>>> result = extract_poster("poster.pdf")
891908
>>> print(result["titles"][0]["title"])
@@ -930,4 +947,3 @@ def extract_poster(poster_path: str) -> dict:
930947
traceback.print_exc()
931948
unload_json_model()
932949
return {"error": str(e)}
933-

0 commit comments

Comments (0)