Commit 5675e97

Image reader and image support in gather_evidence (#1046)
1 parent a69d5e0 commit 5675e97

Showing 16 changed files with 8,875 additions and 8,276 deletions.

README.md

Lines changed: 1 addition & 0 deletions
@@ -880,6 +880,7 @@ will return much faster than the first query and we'll be certain the authors ma
 | `answer.evidence_retrieval` | `True` | Use retrieval vs processing all docs. |
 | `answer.evidence_summary_length` | `"about 100 words"` | Length of evidence summary. |
 | `answer.evidence_skip_summary` | `False` | Whether to skip summarization. |
+| `answer.evidence_text_only_fallback` | `False` | Whether to allow context creation to retry without media present. |
 | `answer.answer_max_sources` | `5` | Max number of sources for an answer. |
 | `answer.max_answer_attempts` | `None` | Max attempts to generate an answer. |
 | `answer.answer_length` | `"about 200 words, but can be longer"` | Length of final answer. |
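
For context, enabling the new flag from the Python API would presumably look like the sketch below, assuming the documented `Settings`/`answer` nesting (not taken from this commit):

```python
from paperqa import Settings

# Hedged sketch: opt in to the new fallback so that, if a multimodal evidence
# call is rejected (e.g. the model cannot accept images), context creation is
# retried with text only instead of failing the whole gather_evidence step.
settings = Settings(answer={"evidence_text_only_fallback": True})
```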

packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ def test_parse_pdf_to_pages() -> None:
     parsed_text = parse_pdf_to_pages(filepath)
     assert isinstance(parsed_text.content, dict)
     assert "1" in parsed_text.content, "Parsed text should contain page 1"
+    assert isinstance(parsed_text.content["1"], str)
     matches = re.findall(
         r"Abstract\nWe introduce PaSa, an advanced Paper ?Search"
         r"\nagent powered by large language models.",

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -61,7 +61,7 @@ dev = [
     "ipython>=8", # Pin to keep recent
     "litellm>=1.68,<1.71", # Lower pin for PydanticDeprecatedSince20 fixes, upper pin for VCR cassette breaks (https://github.com/BerriAI/litellm/issues/11724)
     "mypy>=1.8", # Pin for mutable-override
-    "paper-qa[ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
+    "paper-qa[image,ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
     "pre-commit>=3.4", # Pin to keep recent
     "pydantic~=2.11", # Pin for start of model_fields deprecation
     "pylint-pydantic",
@@ -78,6 +78,9 @@ dev = [
     "typeguard",
     "vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884
 ]
+image = [
+    "pillow>=10.3.0", # Pin for py.typed
+]
 ldp = [
     "ldp>=0.25.0", # For new LLM client interface
 ]
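
Since `image` is declared as an optional dependency group, the Pillow requirement should be installable on its own via `pip install "paper-qa[image]"`, and the dev extra above now pulls it in automatically.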

src/paperqa/core.py

Lines changed: 60 additions & 16 deletions
@@ -2,16 +2,21 @@
 
 import contextlib
 import json
+import logging
 import re
 from collections.abc import Callable, Sequence
 from typing import Any, cast
 
+import litellm
 from aviary.core import Message
 from lmi import LLMModel
 
+from paperqa.prompts import text_with_tables_prompt_template
 from paperqa.types import Context, LLMResult, Text
 from paperqa.utils import extract_score, strip_citations
 
+logger = logging.getLogger(__name__)
+
 
 def llm_parse_json(text: str) -> dict:
     """Read LLM output and extract JSON data from it."""
@@ -136,6 +141,7 @@ async def map_fxn_summary(
     parser: Callable[[str], dict[str, Any]] | None = None,
     callbacks: Sequence[Callable[[str], None]] | None = None,
     skip_citation_strip: bool = False,
+    evidence_text_only_fallback: bool = False,
 ) -> tuple[Context, LLMResult]:
     """Parses the given text and returns a context object with the parser and prompt runner.
 
@@ -154,6 +160,8 @@ async def map_fxn_summary(
             Should return dict with at least 'summary' field.
         callbacks: Optional sequence of callback functions to execute during LLM calls.
         skip_citation_strip: Optional skipping of citation stripping, if you want to keep in the context.
+        evidence_text_only_fallback: Opt-in flag to allow retrying context creation
+            without media in the completion.
 
     Returns:
         The context object and LLMResult to get info about the LLM execution.
@@ -163,25 +171,61 @@ async def map_fxn_summary(
     extras: dict[str, Any] = {}
     citation = text.name + ": " + text.doc.formatted_citation
     success = False
+    used_text_only_fallback = False
 
+    # Strip newlines in case chunking led to blank lines,
+    # but not spaces, to preserve text alignment
+    cleaned_text = text.text.strip("\n")
     if summary_llm_model and prompt_templates:
+        media_text: list[str] = [m.text for m in text.media if m.text]
         data = {
             "question": question,
             "citation": citation,
-            # Strip newlines in case chunking led to blank lines,
-            # but not spaces, to preserve text alignment
-            "text": text.text.strip("\n"),
+            "text": (
+                text_with_tables_prompt_template.format(
+                    text=cleaned_text,
+                    citation=citation,
+                    tables="\n\n----\n\n".join(media_text),
+                )
+                if media_text
+                else cleaned_text
+            ),
         } | (extra_prompt_data or {})
-        message_prompt, system_prompt = prompt_templates
-        messages = [
-            Message(role="system", content=system_prompt.format(**data)),
-            Message(role="user", content=message_prompt.format(**data)),
-        ]
-        llm_result = await summary_llm_model.call_single(
-            messages=messages,
-            callbacks=callbacks,
-            name="evidence:" + text.name,
-        )
+        message_prompt, system_prompt = (pt.format(**data) for pt in prompt_templates)
+        try:
+            llm_result = await summary_llm_model.call_single(
+                messages=[
+                    Message(role="system", content=system_prompt),
+                    Message.create_message(
+                        text=message_prompt,
+                        images=(
+                            [i.to_image_url() for i in text.media]
+                            if text.media
+                            else None
+                        ),
+                    ),
+                ],
+                callbacks=callbacks,
+                name="evidence:" + text.name,
+            )
+        except litellm.BadRequestError as exc:
+            if not evidence_text_only_fallback:
+                raise
+            logger.warning(
+                f"LLM call to create a context failed with exception {exc!r}"
+                f" on text named {text.name!r}"
+                f" with doc name {text.doc.docname!r} and doc key {text.doc.dockey!r}."
+                f" Retrying without media."
+            )
+            llm_result = await summary_llm_model.call_single(
+                messages=[
+                    Message(role="system", content=system_prompt),
+                    Message(content=message_prompt),
+                ],
+                callbacks=callbacks,
+                name="evidence:" + text.name,
+            )
+            used_text_only_fallback = True
        context = cast("str", llm_result.text)
        result_data = parser(context) if parser else {}
        success = bool(result_data)
@@ -199,9 +243,7 @@ async def map_fxn_summary(
        except KeyError:
            success = False
    else:
-        # Strip newlines in case chunking led to blank lines,
-        # but not spaces, to preserve text alignment
-        context = text.text.strip("\n")
+        context = cleaned_text
    # If we don't assign scores, just default to 5.
    # why 5? Because we filter out 0s in another place
    # and 5/10 is the other default I could come up with
@@ -213,6 +255,8 @@ async def map_fxn_summary(
 
    if not success:
        score = extract_score(context)
+    if used_text_only_fallback:
+        extras["used_images"] = False
 
    return (
        Context(
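
To make the new prompt assembly concrete, here is a small standalone sketch of how the `text` field is built when a chunk carries table media, mirroring the logic above; the media object is a stand-in for whatever class this commit actually uses, so treat those names as illustrative:

```python
from __future__ import annotations

from dataclasses import dataclass

# Template copied from src/paperqa/prompts.py in this commit
text_with_tables_prompt_template = (
    "{text}\n\n------------\n\nMarkdown tables from {citation}."
    " If the markdown is poorly formatted, defer to the images"
    "\n\n------------\n\n{tables}"
)


@dataclass
class MediaStub:  # hypothetical stand-in for the commit's media type
    text: str | None = None  # e.g. a table rendered as markdown


media = [MediaStub(text="| a | b |\n|---|---|\n| 1 | 2 |"), MediaStub()]
media_text = [m.text for m in media if m.text]
cleaned_text = "Chunk body...\n".strip("\n")
prompt_text = (
    text_with_tables_prompt_template.format(
        text=cleaned_text,
        citation="Doe2024 pages 3-4: Example citation",
        tables="\n\n----\n\n".join(media_text),
    )
    if media_text
    else cleaned_text
)
print(prompt_text)
```

When the text-only fallback fires, the same formatted prompt is sent again without image URLs attached, and `extras["used_images"]` is forced to `False` so downstream consumers can tell images were not considered.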

src/paperqa/docs.py

Lines changed: 4 additions & 2 deletions
@@ -380,17 +380,18 @@ async def aadd(  # noqa: PLR0912
                doc, **(query_kwargs | kwargs)
            )
 
-        texts = await read_doc(
+        texts, metadata = await read_doc(
            path,
            doc,
            chunk_chars=parse_config.chunk_size,
            overlap=parse_config.overlap,
            page_size_limit=parse_config.page_size_limit,
            use_block_parsing=parse_config.pdfs_use_block_parsing,
            parse_pdf=parse_config.parse_pdf,
+            include_metadata=True,
        )
        # loose check to see if document was loaded
-        if (
+        if metadata.parse_type != "image" and (
            not texts
            or len(texts[0].text) < 10  # noqa: PLR2004
            or (
@@ -669,6 +670,7 @@ async def aget_evidence(
                    parser=llm_parse_json if prompt_config.use_json else None,
                    callbacks=callbacks,
                    skip_citation_strip=answer_config.skip_evidence_citation_strip,
+                    evidence_text_only_fallback=answer_config.evidence_text_only_fallback,
                )
                for m in matches
            ],
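
In the first hunk, the `parse_type != "image"` guard presumably exists because an image-only parse can legitimately yield little or no extracted text, which would otherwise trip the loose "was the document actually loaded" check right after `read_doc`. In the second, the new `answer.evidence_text_only_fallback` setting is simply threaded through `aget_evidence` into `map_fxn_summary`.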

src/paperqa/prompts.py

Lines changed: 24 additions & 14 deletions
@@ -1,20 +1,26 @@
 from datetime import datetime
 
-# ruff: noqa: E501
-
 summary_prompt = (
     "Summarize the excerpt below to help answer a question.\n\nExcerpt from"
-    " {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly"
+    " {citation}\n\n------------\n\n{text}\n\n------------"
+    "\n\nQuestion: {question}\n\nDo not directly"
     " answer the question, instead summarize to give evidence to help answer the"
     " question. Stay detailed; report specific numbers, equations, or direct quotes"
     ' (marked with quotation marks). Reply "Not applicable" if the excerpt is'
     " irrelevant. At the end of your response, provide an integer score from 1-10 on a"
     " newline indicating relevance to question. Do not explain your score.\n\nRelevant"
     " Information Summary ({summary_length}):"
 )
+# This prompt template integrates with `text` variable of the above `summary_prompt`
+text_with_tables_prompt_template = (
+    "{text}\n\n------------\n\nMarkdown tables from {citation}."
+    " If the markdown is poorly formatted, defer to the images"
+    "\n\n------------\n\n{tables}"
+)
 
 summary_json_prompt = (
-    "Excerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\n"
+    "Excerpt from {citation}\n\n------------\n\n{text}\n\n------------"
+    "\n\nQuestion: {question}\n\n"
 )
 
 # The below "cannot answer" sentinel phrase should:
@@ -45,7 +51,7 @@
 
 qa_prompt = (
     "Answer the question below with the context.\n\n"
-    "Context:\n\n{context}\n\n----\n\n"
+    "Context:\n\n{context}\n\n------------\n\n"
     "Question: {question}\n\n"
     "Write an answer based on the context. "
     "If the context provides insufficient information reply "
@@ -99,15 +105,19 @@
 )
 
 # NOTE: we use double curly braces here so it's not considered an f-string template
-summary_json_system_prompt = """\
-Provide a summary of the relevant information that could help answer the question based on the excerpt. Respond with the following JSON format:
-
-{{
-  "summary": "...",
-  "relevance_score": "..."
-}}
-
-where `summary` is relevant information from the text - {summary_length} words. `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."""
+summary_json_system_prompt = (
+    "Provide a summary of the relevant information"
+    " that could help answer the question based on the excerpt."
+    " Your summary, combined with many others,"
+    " will be given to the model to generate an answer."
+    " Respond with the following JSON format:"
+    '\n\n{{\n "summary": "...",\n "relevance_score": "..."\n "used_images"\n}}'
+    "\n\nwhere `summary` is relevant information from the text - {summary_length} words."
+    " `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."
+    " `used_images` is a boolean flag indicating"
+    " if any images present in a multimodal message were used,"
+    " and if no images were present it should be false."
+)
 
 env_system_prompt = (
    # Matching https://github.com/langchain-ai/langchain/blob/langchain%3D%3D0.2.3/libs/langchain/langchain/agents/openai_functions_agent/base.py#L213-L215
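
As the NOTE in this file says, the doubled curly braces keep the JSON skeleton literal when the prompt later goes through `str.format`; a quick standalone illustration (not from the commit):

```python
# {{ and }} survive str.format as literal braces, while {summary_length} is filled in.
template = 'Respond with JSON:\n{{\n  "summary": "{summary_length} words"\n}}'
print(template.format(summary_length="about 100"))
# Respond with JSON:
# {
#   "summary": "about 100 words"
# }
```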
